In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os

src_path = '/content/drive/My\ Drive/CMSC422/HW1/20_newsgroups.tar.gz'
extract_path = '/content/20_newsgroups'

if not os.path.exists(extract_path):
    os.makedirs(extract_path)

!tar -xzf {src_path} -C {extract_path}


In [3]:
dataset_path = '/content/20_newsgroups/20_newsgroups'

newsgroup_classes = os.listdir(dataset_path)
print("Newsgroup Categories:", newsgroup_classes)

sample_class_path = os.path.join(dataset_path, newsgroup_classes[0])
sample_files = os.listdir(sample_class_path)
print("\nSample files in category '{}':".format(newsgroup_classes[0]), sample_files[:5])


Newsgroup Categories: ['talk.politics.mideast', 'comp.sys.mac.hardware', 'comp.os.ms-windows.misc', 'talk.politics.misc', 'rec.sport.baseball', 'comp.windows.x', 'rec.sport.hockey', 'sci.electronics', 'comp.sys.ibm.pc.hardware', 'rec.autos', 'sci.space', 'talk.politics.guns', 'misc.forsale', 'talk.religion.misc', 'soc.religion.christian', 'rec.motorcycles', 'sci.med', 'alt.atheism', 'sci.crypt', 'comp.graphics']

Sample files in category 'talk.politics.mideast': ['76346', '76416', '77263', '76376', '76331']


In [4]:
# Print out to see how the data looks
sample_class_path = os.path.join(dataset_path, newsgroup_classes[0])
sample_files = os.listdir(sample_class_path)
sample_file_path = os.path.join(sample_class_path, sample_files[0])


with open(sample_file_path, 'r', encoding='latin1') as file:
    lines = file.readlines()
    print("\nFirst 10 lines of '{}':".format(sample_files[0]))
    for line in lines[:30]:
        print(line.strip())



First 10 lines of '76346':
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!zaphod.mps.ohio-state.edu!cs.utexas.edu!sdd.hp.com!sgiblab!sgigate!sgi!cdp!cpr
From: Center for Policy Research <cpr@igc.apc.org>
Newsgroups: talk.politics.mideast
Date: 22 Apr 93 17:31 PDT
Subject: Re: rejoinder. Questions to Israelis
Message-ID: <1483500353@igc.apc.org>
References: <1483500352@igc.apc.org>
Sender: Notesfile to Usenet Gateway <notes@igc.apc.org>
Nf-ID: #R:cdp:1483500352:cdp:1483500353:000:3689
Nf-From: cdp.UUCP!cpr    Apr 22 17:31:00 1993
Lines: 83


From: Center for Policy Research <cpr>
Subject: rejoinder. Questions to Israelis


Dear Josh

I appreciate the fact that you sought to answer my questions.

Having said that, I am not totally happy with your answers.

1.   You did not fully answer my question whether Israeli ID cards
identify the holders as Jews or Arabs. You imply that U.S.
citizens must identify themselves by 

In [5]:
# Preprocessing
import os
import string
from collections import Counter

def process_all_files(dataset_path):

    all_words = []
    cleaned_data = {}

    for newsgroup in os.listdir(dataset_path):
        newsgroup_path = os.path.join(dataset_path, newsgroup)
        if os.path.isdir(newsgroup_path):
            cleaned_data[newsgroup] = []

            for filename in os.listdir(newsgroup_path):
                file_path = os.path.join(newsgroup_path, filename)

                with open(file_path, 'r', encoding='latin1') as file:
                    lines = file.readlines()

                # Remove the first four lines
                content = ''.join(lines[4:])

                # Convert to lowercase
                content = content.lower()

                # Remove punctuation
                content = content.translate(str.maketrans('', '', string.punctuation))

                # Split into words
                words = content.split()

                # Collect words for stop word calculation
                all_words.extend(words)


                cleaned_data[newsgroup].append((words, file_path))

    # Finding the top 200 words as stop words
    word_counter = Counter(all_words)
    top_200_stop_words = set([word for word, _ in word_counter.most_common(200)])

    final_cleaned_data = {}
    for newsgroup, files in cleaned_data.items():
        final_cleaned_data[newsgroup] = []

        for words, file_path in files:
            # Remove stop words, short words, and email addresses
            words_filtered = [
                word for word in words
                if word not in top_200_stop_words and
                len(word) > 2 and
                '@' not in word
            ]

            # Join filtered words
            cleaned_content = ' '.join(words_filtered)

            final_cleaned_data[newsgroup].append((cleaned_content, file_path))

    return final_cleaned_data




In [6]:
cleaned_dataset = process_all_files(dataset_path)

# Display a cleaned file
example_class = os.listdir(dataset_path)[0]
cleaned_content, file_path = cleaned_dataset[example_class][0]

print(f"\nSample cleaned email from '{example_class}':\n")
print(f"File Path: {file_path}")
print(f"Content: {cleaned_content[:500]}")


Sample cleaned email from 'talk.politics.mideast':

File Path: /content/20_newsgroups/20_newsgroups/talk.politics.mideast/76346
Content: rejoinder questions israelis 1483500353igcapcorg 1483500352igcapcorg notesfile gateway notesigcapcorg nfid rcdp1483500352cdp14835003530003689 nffrom cdpuucpcpr 173100 center policy research cpr rejoinder questions israelis dear josh appreciate fact sought answer questions having totally happy answers fully answer whether israeli cards identify holders jews arabs imply citizens identify themselves race true trying mislead reader democratic country asked reveal ethnical religious identity public o


In [7]:
#Spliting the datasets


train ={}
test = {}

for category, documents in cleaned_dataset.items():
  train[category] = documents[:500]
  test[category] = documents[500:]


example_class = list(cleaned_dataset.keys())[0]
print(f"Number of documents in '{example_class}' - Training: {len(train[example_class])}, Testing: {len(test[example_class])}")


Number of documents in 'talk.politics.mideast' - Training: 500, Testing: 500


In [8]:
#Training
import math
from collections import defaultdict

Ndoc = 500*20
Cdoc = 500
bigdoc = defaultdict(Counter)
V = set()
logprior = math.log(Cdoc / Ndoc)
loglikelihood = defaultdict(dict)


for c, doc in train.items():


  words_count= []

  for content, _ in doc:
    words = content.split()
    words_count.extend(words)
    V.update(words)

  bigdoc[c] = Counter(words_count)

for c in train:

  total_words = sum(bigdoc[c].values())

  for words in V:
    words_count = bigdoc[c][words] +1
    deno = total_words + len(V)
    loglikelihood[c][words] = math.log(words_count/ deno)







In [9]:
#Testing

correct_predictions = 0
total_predictions = 0

for c, doc in test.items():

  score = {}
  for content, _ in doc:

    for class_label in loglikelihood:
      score[class_label] = logprior

    words = content.split()
    word_count_in_vocab = sum(1 for w in words if w in V)

    for w in words:
      if w in V:

         for class_label in loglikelihood:
          score[class_label] += loglikelihood[class_label].get(w, 0)



    best_class = max(score, key= score.get)

    if best_class == c:
      correct_predictions +=1
    total_predictions +=1

accuracy = correct_predictions / total_predictions * 100
print(f"Model accuracy on test data: {accuracy:.2f}")





Model accuracy on test data: 83.61
