In [18]:
import math
import os
import re
from collections import Counter


# Pseudo Code

# store the documents as lists of words, one after another, one list per doc

# For each paper source in 20_newsgroups
#     For each paper in the source
#         remove the first 4 lines (lines starting with Newsgroup, document_id, From, Subject)
#         save into dataset (source, paper)
# for each source in dataset
#     split in half, (train, test)

# dataset complete, good to go

def read_20_newsgroups(directory):
    word_counter = Counter()
    dataset = {}
    for curr_dir, classes, files in os.walk(directory):
        #print(f"Reading directory: {curr_dir}, files: {files}, dirs: {classes}")
        curr_class = curr_dir.rsplit('/', 1)[-1]
        dataset[curr_class] = []
        for file in files:
            file_path = os.path.join(curr_dir, file)
            # print(f"Reading file: {file_path}")
            read_file = []
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    for line in f:
                        bad = ['Newsgroup','document_id', 'From', 'Subject']
                        if any(line.startswith(badd) for badd in bad):
                            continue
                        
                        # print(f'line: {line.strip()}')  # Process each line here
                        words = re.findall(r'\b\w+\b', line.lower())  # Convert to lowercase to count case-insensitively
                        word_counter.update(words)  # Update the counter with words from the line
                        read_file.extend(words)
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
            dataset[curr_class].append(read_file)
    stop_words = word_counter.most_common(200)
    print(f'{dataset.keys()}')
    del dataset['20_newsgroups']
    for c, files in dataset.items():
        print(f"Class {c} has {len(files)} files")
        for file in files:
            for word in stop_words:
                
                if word[0] in file:
                    file.remove(word[0])
    C = list(dataset.keys())[1:]
    return C, dataset
C, D = read_20_newsgroups("/home/jems/cmsc422/p1/20_newsgroups")
train, test = {}, {}
# split D in half to get train and test
for c, files in D.items():
    train[c] = files[:len(files)//2]
    test[c] = files[len(files)//2:]

dict_keys(['20_newsgroups', 'talk.politics.mideast', 'rec.autos', 'comp.sys.mac.hardware', 'rec.sport.hockey', 'talk.politics.misc', 'sci.crypt', 'sci.med', 'rec.motorcycles', 'comp.graphics', 'sci.space', 'misc.forsale', 'sci.electronics', 'comp.windows.x', 'alt.atheism', 'talk.religion.misc', 'comp.sys.ibm.pc.hardware', 'rec.sport.baseball', 'talk.politics.guns', 'soc.religion.christian', 'comp.os.ms-windows.misc'])
Class talk.politics.mideast has 1000 files
Class rec.autos has 1000 files
Class comp.sys.mac.hardware has 1000 files
Class rec.sport.hockey has 1000 files
Class talk.politics.misc has 1000 files
Class sci.crypt has 1000 files
Class sci.med has 1000 files
Class rec.motorcycles has 1000 files
Class comp.graphics has 1000 files
Class sci.space has 1000 files
Class misc.forsale has 1000 files
Class sci.electronics has 1000 files
Class comp.windows.x has 1000 files
Class alt.atheism has 1000 files
Class talk.religion.misc has 1000 files
Class comp.sys.ibm.pc.hardware has 1000 

In [19]:
# Pseudo Code
import pandas as pd
import numpy as np
# Naive Bayes Implementations

# returns V, log P(c), log P(w|c)
def train_naive_bayes(D, C): # D is the dataset, C is the classes
    # Initialize the count of each class
    ndoc = sum([len(D[c]) for c in D]) # for class in the dataset count the number of documents and sum them
    logprior = {} # initialize the logprior
    bigdoc = {} # initialize the bigdoc
    V = set() # initialize the vocabulary
    for c, files in D.items():
        for file in files:
            for word in file:
                V.add(word)

    count = pd.DataFrame(np.zeros((len(V), len(C))), columns=C, index=list(V))
    loglikelihood = pd.DataFrame(np.zeros((len(V), len(C))), columns=C, index=list(V))
    print(f'vocab: {list(V)[0]}')
    print(f'starting bigdoc')
    for c in C:
        bigdoc[c] = []
        for doc in D[c]:
            bigdoc[c].append(doc)
        print(f'ended bigdoc for {c}')
        for word in V:
            count.loc[word, c] = bigdoc[c].count(word)
        print(f'ended word count for {c}')
    print(f'starting prob calcs')
    for c in C: # for each class in the classes
        print(f'starting for {c}')
        ndoc_c = len(D[c]) # count the number of documents in the class
        logprior[c] = math.log(ndoc_c / ndoc)
        sumcount = count[c].sum() + 1
        for word in V:
            localsum = count.loc[word, c] + 1
            loglikelihood.loc[word, c] = math.log(localsum/sumcount)
        print(f'ending for {c}')
    print(f'ending word calcs')

    return V, logprior, loglikelihood

V, logprior, loglikelihood = train_naive_bayes(train, C)

vocab: 3056
starting bigdoc
ended bigdoc for rec.autos
ended word count for rec.autos
ended bigdoc for comp.sys.mac.hardware
ended word count for comp.sys.mac.hardware
ended bigdoc for rec.sport.hockey
ended word count for rec.sport.hockey
ended bigdoc for talk.politics.misc
ended word count for talk.politics.misc
ended bigdoc for sci.crypt
ended word count for sci.crypt
ended bigdoc for sci.med
ended word count for sci.med
ended bigdoc for rec.motorcycles
ended word count for rec.motorcycles
ended bigdoc for comp.graphics
ended word count for comp.graphics
ended bigdoc for sci.space
ended word count for sci.space
ended bigdoc for misc.forsale
ended word count for misc.forsale
ended bigdoc for sci.electronics
ended word count for sci.electronics
ended bigdoc for comp.windows.x
ended word count for comp.windows.x
ended bigdoc for alt.atheism
ended word count for alt.atheism
ended bigdoc for talk.religion.misc
ended word count for talk.religion.misc
ended bigdoc for comp.sys.ibm.pc.hardw

In [20]:
def test_naive_bayes(testdoc, C, V, logprior, loglikelihood):
    summ = logprior
    #print(f'logprior: {logprior=}')
    for c in C:
        for word in testdoc:
            if word in V:
                summ[c] = summ[c] + loglikelihood.loc[word, c]
    #print(f'{summ=}')
    total = float('-inf')
    best = None
    for c, val in summ.items():
        if isinstance(val, (int, float)) and val > total:
            best = c
            total = val
    return best
c, files = list(test.items())[0]
file = files[0]
trial = test_naive_bayes(file, C, V, logprior=logprior, loglikelihood=loglikelihood)
print(f'{trial=}')

trial='rec.autos'


In [21]:
# def debugging(testset, logprior, likelihood, C, V):
#     counter = 2
#     for c, files in testset.items():
#         for file in files:
#             if counter < 0:
#                 break
#             right = test_naive_bayes(file, C, V, logprior=logprior, loglikelihood=loglikelihood)
#             print(f'result = {right=}')
#             counter -= 1
#         if counter < 0:
#             break
# debugging(testset=test, C=C, V=V, logprior=logprior, likelihood=loglikelihood)

In [22]:
def eval_bayes(testset, logprior, loglikelihood, C, V):
    correct = 0
    total = 0
    #print(f'{dict(list(testset.items())[:2])=}')
    for c, files in testset.items():
        for file in files:
            #print(f'curr file {file=}')
            right = test_naive_bayes(file, C, V, logprior=logprior, loglikelihood=loglikelihood)
            #print(f'the right class: {c}, what we got: {right}')
            if c == right:
                correct += 1
            else:
                print(f'right: {c}, wrong: {right}')
            total += 1
    return correct/total
    

accuracy = eval_bayes(testset=test, C=C, V=V, logprior=logprior, loglikelihood=loglikelihood)
print(accuracy)

right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.politics.mideast, wrong: rec.autos
right: talk.p

KeyboardInterrupt: 