In [247]:
import numpy as np
import pandas as pd
import nltk
import re


In [248]:
df = pd.read_csv("SMSSpamCollection", sep="\t", names=["label","text"])
stopwords = set(nltk.corpus.stopwords.words('english'))
textdata = df["text"]
label = df["label"]
N = len(textdata)
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Preprocessing
Preprocess the text data in the dataframe `df` by cleaning, tokenizing, removing stop words, lemmatizing, and vectorizing it. Then, split the preprocessed data into training and testing sets.

In [249]:
for i in range(N):
  textdata[i] = textdata[i].lower()

def cleanTexts(text):
  cleanText = re.sub(r'[^a-z\s]', '', text)
  return cleanText

textdata = textdata.apply(cleanTexts)

textdata.head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in  a wkly comp to win fa cup final...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: text, dtype: object

#### Tokenizing

In [250]:
def getVocab(textdata):
  vocab = {}
  i = 0
  for txt in textdata:
      for token in re.findall(r"\b[a-z]+\b", txt):
          if len(token) > 1 and token not in stopwords and token not in vocab:
                vocab[token] = i
                i += 1
  return vocab

def vectorize(text,vocab):
    vector = np.zeros(len(vocab))
    for token in re.findall(r"\b[a-z]+\b", text):
        if token in vocab:
            index = vocab[token]
            vector[index] = 1
    return vector

vocab = getVocab(textdata)

def splitDataset(X,split:float):
    N = len(X)
    n = int(split*N)
    X_train = np.stack(X[:n].apply(vectorize, vocab=vocab))
    Y_train = np.where(label[:n] == "spam", 1, 0)

    X_test = np.stack(X[n:].apply(vectorize, vocab=vocab))
    Y_test = np.where(label[n:] == "spam", 1, 0)
    
    return (X_train,Y_train),(X_test,Y_test)

(X_train,Y_train),(X_test,Y_test) = splitDataset(textdata,0.9)

## Calculating priors and likelihoods

In [251]:
def trainModel(X_train, Y_train, classes: tuple):
    n, d = X_train.shape
    model = {}
    for c in classes:
        x_c = X_train[Y_train == c]
        n_c = x_c.shape[0]
        prior = n_c/n

        wordcnt = np.sum(x_c, axis=0)

        ProbWordsinc = (wordcnt +0.5) / (n_c + d)
        model[c] = (prior,ProbWordsinc)
    return model


def predictClass(x_test, model):
    log_posteriors = {}
    for c, params in model.items():
        prior = params[0]
        wordProbs = params[1]

        log_prob = np.log(prior)

        log_prob += np.sum(x_test * np.log(wordProbs) + (1 - x_test) * np.log(1 - wordProbs))

        log_posteriors[c] = log_prob
    return max(log_posteriors, key=log_posteriors.get) # type: ignore

In [252]:
model = trainModel(X_train, Y_train, classes=(0,1))

predictions = []
for x_test in X_test:
  prediction = predictClass(x_test, model)
  predictions.append(prediction)

predictions = np.array(predictions)

In [253]:
def ConfusionMatrix(Y_test, predictions):
    TP = np.sum((Y_test == 1) & (predictions == 1))
    TN = np.sum((Y_test == 0) & (predictions == 0))
    FP = np.sum((Y_test == 0) & (predictions == 1))
    FN = np.sum((Y_test == 1) & (predictions == 0))

    accuracy = (TP + TN) / (TP + TN + FP + FN)
    R = TP / (TP + FN)
    P = TP / (TP + FP)
    F1 = 2 * (P * R) / (P + R)
    print(f"Confusion Matrix:\nTP: {TP}, FP: {FP}\nFN: {FN}, TN: {TN}")
    print(f"Accuracy: {accuracy*100}")
    print(f"Recall: {R*100}")
    print(f"Precision: {P*100}")
    print(f"F1 Score: {F1*100}")
    return ([[TP, FP], [FN, TN]], accuracy, R, P, F1)

evaluation = ConfusionMatrix(Y_test, predictions)

Confusion Matrix:
TP: 66, FP: 6
FN: 6, TN: 480
Accuracy: 97.84946236559139
Recall: 91.66666666666666
Precision: 91.66666666666666
F1 Score: 91.66666666666666


In [254]:
def testModel(text):
    text = vectorize(cleanTexts(text),vocab)
    return "it's a SPAM!!" if predictClass(text,model) else "it's a HAM"

print(testModel("See you tomorrow at lunch"))

it's a HAM


## MultiClass Naive Bayes

In [255]:
import os

classes = ("business", "entertainment", "politics", "sport", "tech")


def load_bbc_dataset(base_path):
    data = {}
    for category in os.listdir(base_path):
        texts = []
        labels = []
        category_path = os.path.join(base_path, category)
        if os.path.isdir(category_path):
            for file_name in os.listdir(category_path):
                file_path = os.path.join(category_path, file_name)
                with open(file_path, 'r', encoding='latin1') as f:
                    text = f.read()
                    texts.append(text)
                    labels.append(category)
        data[f"{category}"] = pd.DataFrame({'text': texts, 'label': labels})

    return data

# Example usage
base_path = 'BBCNewsSummary/NewsArticles'
bbc = load_bbc_dataset(base_path)
business = bbc["business"]
entertainment = bbc["entertainment"]
politics = bbc["politics"]
sport = bbc["sport"]
tech = bbc["tech"]

In [256]:
split = 0.9
bbc_train = pd.concat([business[:int(len(business) * split)], 
                       entertainment[:int(len(entertainment) * split)], 
                       politics[:int(len(politics) * split)], 
                       sport[:int(len(sport) * split)], 
                       tech[:int(len(tech) * split)]], ignore_index=True)
bbc_test = pd.concat([business[int(len(business) * split):], 
                       entertainment[int(len(entertainment) * split):], 
                       politics[int(len(politics) * split):], 
                       sport[int(len(sport) * split):], 
                       tech[int(len(tech) * split):]], ignore_index=True)

bbcX_train = bbc_train["text"].apply(cleanTexts)
bbcY_train = bbc_train["label"]
bbcX_test = bbc_test["text"].apply(cleanTexts)
bbcY_test = bbc_test["label"]

In [257]:
bbc_vocab = getVocab(bbcX_train)

def vectorize(text,vocab):
    vector = np.zeros(len(vocab))
    for token in re.findall(r"\b[a-z]+\b", text):
        if token in vocab:
            index = vocab[token]
            vector[index] = 1
    return vector

bbcX_train = np.stack(bbcX_train.apply(vectorize, vocab=bbc_vocab)) # type: ignore
bbcY_train = np.array([classes.index(label) for label in bbcY_train])
bbcX_test = np.stack(bbcX_test.apply(vectorize, vocab=bbc_vocab)) # type: ignore
bbcY_test = np.array([classes.index(label) for label in bbcY_test])

In [258]:
bbc_model = trainModel(bbcX_train, bbcY_train, classes=tuple(range(len(classes))))

bbc_predictions = []
for x_test in bbcX_test:
  prediction = predictClass(x_test, bbc_model)
  bbc_predictions.append(prediction)

bbc_predictions = np.array(bbc_predictions)

In [259]:
def bbc_confusionMatrix(Y_test, predictions):
	matrix = [[np.sum((Y_test == 0) & (predictions == 0)), np.sum((Y_test == 0) & (predictions == 1)),np.sum((Y_test == 0) & (predictions == 2)),np.sum((Y_test == 0) & (predictions == 3)),np.sum((Y_test == 0) & (predictions == 4))],
			[np.sum((Y_test == 1) & (predictions == 0)), np.sum((Y_test == 1) & (predictions == 1)),np.sum((Y_test == 1) & (predictions == 2)),np.sum((Y_test == 1) & (predictions == 3)),np.sum((Y_test == 1) & (predictions == 4))],
			[np.sum((Y_test == 2) & (predictions == 0)), np.sum((Y_test == 2) & (predictions == 1)),np.sum((Y_test == 2) & (predictions == 2)),np.sum((Y_test == 2) & (predictions == 3)),np.sum((Y_test == 2) & (predictions == 4))],
			[np.sum((Y_test == 3) & (predictions == 0)), np.sum((Y_test == 3) & (predictions == 1)),np.sum((Y_test == 3) & (predictions == 2)),np.sum((Y_test == 3) & (predictions == 3)),np.sum((Y_test == 3) & (predictions == 4))],
			[np.sum((Y_test == 4) & (predictions == 0)), np.sum((Y_test == 4) & (predictions == 1)),np.sum((Y_test == 4) & (predictions == 2)),np.sum((Y_test == 4) & (predictions == 3)),np.sum((Y_test == 4) & (predictions == 4))]]
	
	accuracy = np.sum(np.diag(matrix)) / np.sum(matrix)
	df = pd.DataFrame(matrix, index=classes, columns=classes)
	return (df, accuracy)
 
df, acc = bbc_confusionMatrix(bbcY_test, bbc_predictions)
print("accuracy:", acc*100)
df

accuracy: 96.0


Unnamed: 0,business,entertainment,politics,sport,tech
business,50,0,1,0,0
entertainment,0,35,2,0,2
politics,0,0,41,0,1
sport,1,0,1,50,0
tech,1,0,0,0,40


In [260]:
def testBBCModel(text):
	text = vectorize(cleanTexts(text),bbc_vocab)
	return f"it's a {classes[predictClass(text,bbc_model)]} article"

testBBCModel("news article about sports")

"it's a business article"