In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from numpy.linalg import norm
from nltk.stem import WordNetLemmatizer
from collections import Counter
import string
import math
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


# Fetch every NLTK resource this notebook requests, in the original order.
for resource in ('wordnet', 'stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)


# Module-level helpers shared by preprocess(): the WordNet lemmatizer and the
# English stop-word list stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fahad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fahad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fahad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\fahad\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Read the training CSV and discard the ArticleId column in one chained
# expression, so re-running the cell always starts from the file on disk.
df = pd.read_csv("BBC News Train.csv").drop(columns=['ArticleId'])

In [3]:
# Preview the first rows of the frame after dropping ArticleId.
df.head()

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business


In [4]:
def preprocess(text):
    """Turn a raw document string into a list of cleaned, lemmatized tokens.

    The text is lower-cased and split with ``word_tokenize``. A token is
    discarded when it is in the module-level ``stop_words`` set or when it
    occurs inside ``string.punctuation`` (a substring test, so single
    punctuation characters are removed). Every surviving token is passed
    through ``lemmatizer.lemmatize``.

    Args:
        text: The document as a single string.

    Returns:
        list: The lemmatized tokens, in their original order.
    """
    cleaned = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        if token in string.punctuation:
            continue
        cleaned.append(lemmatizer.lemmatize(token))
    return cleaned

In [5]:
# Token lists (one per article) and the matching category labels.
X = list(map(preprocess, df["Text"]))
y = df['Category'].tolist()

In [6]:
# Names of the five news categories, kept as a plain list.
Category = ['business', 'tech', 'politics', 'sport', 'entertainment']

In [7]:
class TfICf:
    """Term-frequency / inverse-class-frequency statistics for a labelled corpus.

    After ``fit`` the instance exposes:

    * ``vocab``       - set of every term seen in the corpus.
    * ``classes``     - sorted unique labels (``np.unique`` of the labels).
    * ``tf``          - ``{label: Counter}`` of raw term counts per class.
    * ``class_freq``  - ``{term: number of classes whose documents contain it}``.
    * ``icf``         - ``{term: log(num_classes / class_freq[term])}``.
    * ``tficf_score`` - ``{label: {term: icf[term] * tf[label][term]}}``.
    * ``index_dict``  - ``{term: position}`` from enumerating ``vocab``; the
      order is the set's iteration order and is therefore arbitrary.
    """

    def __init__(self):
        self.index_dict = {}
        self.icf = {}
        self.tf = {}
        self.class_freq = {}
        self.tficf_score = {}

    @staticmethod
    def _docs_in_class(corpus, category, label):
        """Iterate over the documents whose label equals ``label``.

        Pairing documents with labels via ``zip`` avoids building a NumPy
        object array of the corpus, which turns into a 2-D array (and breaks
        list concatenation) when every document has the same length.
        """
        return (doc for doc, doc_label in zip(corpus, category) if doc_label == label)

    def TF(self, corpus, category):
        """Count term occurrences per class and store them in ``self.tf``.

        Args:
            corpus: List of token lists.
            category: Labels aligned with ``corpus``.
        """
        for label in self.classes:
            counts = Counter()
            for doc in self._docs_in_class(corpus, category, label):
                counts.update(doc)
            self.tf[label] = counts

    def CF(self, corpus, category):
        """Add, for every vocabulary term, the number of classes containing it.

        Counts are accumulated into ``self.class_freq``; ``fit`` clears that
        dict first so each fit starts from zero.

        Args:
            corpus: List of token lists.
            category: Labels aligned with ``corpus``.
        """
        for label in self.classes:
            # A set gives O(1) membership instead of scanning a token list
            # once per vocabulary term.
            class_terms = set()
            for doc in self._docs_in_class(corpus, category, label):
                class_terms.update(doc)
            for term in class_terms.intersection(self.vocab):
                self.class_freq[term] = self.class_freq.get(term, 0) + 1

    def tficfScore(self):
        """Fill ``self.tficf_score`` with ``icf[term] * tf[label][term]``.

        A term without an ICF value scores 0, as does a term that never
        occurs in the class.
        """
        for label in self.classes:
            counts = self.tf.get(label, {})
            class_scores = {}
            for term in self.vocab:
                if term in self.icf:
                    class_scores[term] = self.icf[term] * counts.get(term, 0)
                else:
                    class_scores[term] = 0
            self.tficf_score[label] = class_scores

    def fit(self, corpus, category):
        """Compute all TF-ICF statistics for ``corpus``.

        Args:
            corpus: List of documents, each a list of tokens.
            category: NumPy array of labels, one per document.

        Returns:
            TfICf: ``self``, to allow call chaining.

        Raises:
            AssertionError: If ``category`` is not a NumPy object, ``corpus``
                is not a list, or their lengths differ.
        """
        assert type(category).__module__ == np.__name__
        assert isinstance(corpus, list)
        assert len(corpus) == len(category), "corpus and category must have the same length"

        # Start from a clean slate so fitting the same instance twice does not
        # double-count class frequencies or keep labels from an earlier fit.
        self.tf = {}
        self.class_freq = {}
        self.tficf_score = {}

        # set().union(...) also works for an empty corpus.
        self.vocab = set().union(*corpus)
        self.classes = np.unique(category)
        self.N_docs = len(corpus)
        num_classes = len(self.classes)

        self.TF(corpus, category)
        self.CF(corpus, category)

        # Every vocabulary term occurs in at least one class, so the
        # denominator is never zero.
        self.icf = {term : math.log(num_classes / self.class_freq[term]) for term in self.vocab}
        self.index_dict = {term : idx for idx, term in enumerate(self.vocab)}
        self.tficfScore()

        return self

In [70]:
class NaiveBayes:
    """Naive-Bayes-style classifier driven by precomputed per-class term weights.

    The evidence a term contributes to a class is
    ``tf_score[label][term] * tf[label][term]``. A document is assigned the
    class that maximises
    ``log(prior) + log((evidence + 1) / (Ny[label] + V))``.

    Args:
        vocab: Collection of known terms; only its length (``V``) is used.
        tf_score: ``{label: {term: weight}}``, e.g. TF-ICF or mean TF-IDF.
        tf: ``{label: {term: count}}``, raw term counts per class.
    """

    def __init__(self, vocab, tf_score, tf):
        self.vocab = vocab
        self.V = len(vocab)
        self.tf_score = tf_score
        self.tf = tf
        self.Ny = {}

    def _evidence(self, label, terms):
        """Sum ``tf_score * tf`` over ``terms`` for one class.

        Terms with no score for ``label`` are skipped, and a term with no
        recorded count contributes zero. This is the case the original bare
        ``except: pass`` was covering, now handled explicitly.
        """
        scores = self.tf_score.get(label, {})
        counts = self.tf.get(label, {})
        total = 0
        for term in terms:
            if term in scores:
                total += scores[term] * counts.get(term, 0)
        return total

    def fit(self, corpus, y):
        """Estimate class priors and the per-class normaliser ``Ny``.

        Args:
            corpus: List of documents, each a list of tokens.
            y: NumPy array of labels, one per document.

        Returns:
            NaiveBayes: ``self``, to allow call chaining.

        Raises:
            AssertionError: If ``y`` is not a NumPy object, ``corpus`` is not
                a list of lists, or the lengths differ.
        """
        assert type(y).__module__ == np.__name__
        assert isinstance(corpus, list)
        assert all(isinstance(doc, list) for doc in corpus)
        assert len(corpus) == len(y), "corpus and y must have the same length"

        self.n_sample = y.shape[0]
        self.classes = np.unique(y)
        self.n_classes = len(self.classes)

        instances_per_class = Counter(y)
        self.prior = {label : instances_per_class[label] / self.n_sample for label in self.classes}

        # Rebuild from scratch so a refit never keeps stale labels.
        self.Ny = {}
        for label in self.classes:
            # Distinct terms of the class, gathered without a NumPy object
            # array (which becomes 2-D when all documents share one length).
            class_terms = set()
            for doc, doc_label in zip(corpus, y):
                if doc_label == label:
                    class_terms.update(doc)
            self.Ny[label] = self._evidence(label, class_terms)

        return self

    def predict(self, X):
        """Predict a label for every document in ``X``.

        Args:
            X: Iterable of documents, each an iterable of tokens.

        Returns:
            numpy.ndarray: Predicted labels, one per document. Ties go to the
            first class in ``self.classes`` order.
        """
        # The log-prior does not depend on the document; compute it once.
        log_prior = {label: np.log(self.prior[label]) for label in self.classes}

        y_pred = []
        for doc in X:
            class_scores = [
                log_prior[label]
                + np.log((self._evidence(label, doc) + 1) / (self.Ny[label] + self.V))
                for label in self.classes
            ]
            y_pred.append(self.classes[np.argmax(class_scores)])

        return np.array(y_pred)

<h1>Naive Bayes with TF-ICF</h1>

<h2>80 : 20 Split

In [62]:
# 80:20 stratified split with a fixed seed; labels become NumPy arrays
# because both fit() methods assert on that.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
y_train, y_test = np.array(y_train), np.array(y_test)

# TF-ICF statistics of the training fold drive the classifier.
tficf = TfICf().fit(X_train, y_train)
clf = NaiveBayes(tficf.vocab, tficf.tficf_score, tficf.tf).fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# One banner-framed classification report per fold.
for heading, y_true, y_hat in (
    ("TRAINING REPORT", y_train, y_train_pred),
    ("TESTING REPORT", y_test, y_test_pred),
):
    print('*' * 100)
    print(heading)
    print('*' * 100)
    print(classification_report(y_true, y_hat, zero_division = 1))

****************************************************************************************************
TRAINING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     business       0.95      0.94      0.94       269
entertainment       0.91      0.94      0.93       218
     politics       0.95      0.93      0.94       219
        sport       0.98      1.00      0.99       277
         tech       0.90      0.89      0.90       209

     accuracy                           0.94      1192
    macro avg       0.94      0.94      0.94      1192
 weighted avg       0.94      0.94      0.94      1192

****************************************************************************************************
TESTING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     busine

<h2>70 : 30 Split

In [63]:
# 70:30 stratified split with a fixed seed; labels become NumPy arrays
# because both fit() methods assert on that.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)
y_train, y_test = np.array(y_train), np.array(y_test)

# TF-ICF statistics of the training fold drive the classifier.
tficf = TfICf().fit(X_train, y_train)
clf = NaiveBayes(tficf.vocab, tficf.tficf_score, tficf.tf).fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# One banner-framed classification report per fold.
for heading, y_true, y_hat in (
    ("TRAINING REPORT", y_train, y_train_pred),
    ("TESTING REPORT", y_test, y_test_pred),
):
    print('*' * 100)
    print(heading)
    print('*' * 100)
    print(classification_report(y_true, y_hat, zero_division = 1))

****************************************************************************************************
TRAINING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     business       0.94      0.93      0.93       235
entertainment       0.91      0.94      0.92       191
     politics       0.95      0.91      0.93       192
        sport       0.96      1.00      0.98       242
         tech       0.92      0.89      0.90       183

     accuracy                           0.94      1043
    macro avg       0.94      0.93      0.93      1043
 weighted avg       0.94      0.94      0.94      1043

****************************************************************************************************
TESTING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     busine

<h2>60 : 40 Split

In [64]:
# 60:40 stratified split with a fixed seed; labels become NumPy arrays
# because both fit() methods assert on that.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, stratify=y, random_state=42
)
y_train, y_test = np.array(y_train), np.array(y_test)

# TF-ICF statistics of the training fold drive the classifier.
tficf = TfICf().fit(X_train, y_train)
clf = NaiveBayes(tficf.vocab, tficf.tficf_score, tficf.tf).fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# One banner-framed classification report per fold.
for heading, y_true, y_hat in (
    ("TRAINING REPORT", y_train, y_train_pred),
    ("TESTING REPORT", y_test, y_test_pred),
):
    print('*' * 100)
    print(heading)
    print('*' * 100)
    print(classification_report(y_true, y_hat, zero_division = 1))

****************************************************************************************************
TRAINING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     business       0.94      0.92      0.93       201
entertainment       0.91      0.93      0.92       164
     politics       0.95      0.95      0.95       164
        sport       0.97      0.98      0.97       208
         tech       0.92      0.91      0.92       157

     accuracy                           0.94       894
    macro avg       0.94      0.94      0.94       894
 weighted avg       0.94      0.94      0.94       894

****************************************************************************************************
TESTING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     busine

<h2>50 : 50 Split

In [65]:
# 50:50 stratified split with a fixed seed; labels become NumPy arrays
# because both fit() methods assert on that.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.50, stratify=y, random_state=42
)
y_train, y_test = np.array(y_train), np.array(y_test)

# TF-ICF statistics of the training fold drive the classifier.
tficf = TfICf().fit(X_train, y_train)
clf = NaiveBayes(tficf.vocab, tficf.tficf_score, tficf.tf).fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# One banner-framed classification report per fold.
for heading, y_true, y_hat in (
    ("TRAINING REPORT", y_train, y_train_pred),
    ("TESTING REPORT", y_test, y_test_pred),
):
    print('*' * 100)
    print(heading)
    print('*' * 100)
    print(classification_report(y_true, y_hat, zero_division = 1))

****************************************************************************************************
TRAINING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     business       0.96      0.92      0.94       168
entertainment       0.92      0.93      0.93       136
     politics       0.96      0.96      0.96       137
        sport       0.96      0.99      0.98       173
         tech       0.93      0.92      0.93       131

     accuracy                           0.95       745
    macro avg       0.95      0.95      0.95       745
 weighted avg       0.95      0.95      0.95       745

****************************************************************************************************
TESTING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     busine

<h1>Naive Bayes with TF-IDF</h1>

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

<h2>80 : 20 Split

In [71]:
# 80:20 stratified split with a fixed seed; labels become NumPy arrays.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
y_train, y_test = np.array(y_train), np.array(y_test)

# TfidfVectorizer expects strings, so re-join each token list with spaces.
corpus = [' '.join(tokens) for tokens in X_train]

tfidf = TfidfVectorizer(use_idf=True)
score = tfidf.fit_transform(corpus).toarray()
vocab = tfidf.get_feature_names_out()
classes = np.unique(y_train)

# Per class: mean TF-IDF weight of each vocabulary term over that class's rows.
tf_score = {
    label: dict(zip(vocab, score[y_train == label].mean(axis=0)))
    for label in classes
}

# Per class: raw token counts over the concatenated training documents.
X_t = np.array(X_train, dtype = object)
tf = {label: Counter(sum(X_t[y_train == label], [])) for label in classes}

clf = NaiveBayes(vocab, tf_score, tf).fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# One banner-framed classification report per fold.
for heading, y_true, y_hat in (
    ("TRAINING REPORT", y_train, y_train_pred),
    ("TESTING REPORT", y_test, y_test_pred),
):
    print('*' * 100)
    print(heading)
    print('*' * 100)
    print(classification_report(y_true, y_hat, zero_division = 1))

****************************************************************************************************
TRAINING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     business       0.87      0.44      0.58       269
entertainment       0.97      0.52      0.67       218
     politics       0.34      0.99      0.50       219
        sport       0.95      0.64      0.77       277
         tech       0.94      0.49      0.65       209

     accuracy                           0.61      1192
    macro avg       0.81      0.62      0.63      1192
 weighted avg       0.82      0.61      0.64      1192

****************************************************************************************************
TESTING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     busine

<h2>70 : 30 Split

In [59]:
# 70:30 stratified split with a fixed seed; labels become NumPy arrays.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)
y_train, y_test = np.array(y_train), np.array(y_test)

# TfidfVectorizer expects strings, so re-join each token list with spaces.
corpus = [' '.join(tokens) for tokens in X_train]

tfidf = TfidfVectorizer(use_idf=True)
score = tfidf.fit_transform(corpus).toarray()
vocab = tfidf.get_feature_names_out()
classes = np.unique(y_train)

# Per class: mean TF-IDF weight of each vocabulary term over that class's rows.
tf_score = {
    label: dict(zip(vocab, score[y_train == label].mean(axis=0)))
    for label in classes
}

# Per class: raw token counts over the concatenated training documents.
X_t = np.array(X_train, dtype = object)
tf = {label: Counter(sum(X_t[y_train == label], [])) for label in classes}

clf = NaiveBayes(vocab, tf_score, tf).fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# One banner-framed classification report per fold.
for heading, y_true, y_hat in (
    ("TRAINING REPORT", y_train, y_train_pred),
    ("TESTING REPORT", y_test, y_test_pred),
):
    print('*' * 100)
    print(heading)
    print('*' * 100)
    print(classification_report(y_true, y_hat, zero_division = 1))

****************************************************************************************************
TRAINING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     business       0.89      0.41      0.56       235
entertainment       0.96      0.51      0.66       191
     politics       0.33      0.99      0.49       192
        sport       0.93      0.64      0.76       242
         tech       0.93      0.45      0.61       183

     accuracy                           0.60      1043
    macro avg       0.81      0.60      0.62      1043
 weighted avg       0.82      0.60      0.62      1043

****************************************************************************************************
TESTING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     busine

<h2>60 : 40 Split

In [61]:
# 60:40 stratified split with a fixed seed; labels become NumPy arrays.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, stratify=y, random_state=42
)
y_train, y_test = np.array(y_train), np.array(y_test)

# TfidfVectorizer expects strings, so re-join each token list with spaces.
corpus = [' '.join(tokens) for tokens in X_train]

tfidf = TfidfVectorizer(use_idf=True)
score = tfidf.fit_transform(corpus).toarray()
vocab = tfidf.get_feature_names_out()
classes = np.unique(y_train)

# Per class: mean TF-IDF weight of each vocabulary term over that class's rows.
tf_score = {
    label: dict(zip(vocab, score[y_train == label].mean(axis=0)))
    for label in classes
}

# Per class: raw token counts over the concatenated training documents.
X_t = np.array(X_train, dtype = object)
tf = {label: Counter(sum(X_t[y_train == label], [])) for label in classes}

clf = NaiveBayes(vocab, tf_score, tf).fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# One banner-framed classification report per fold.
for heading, y_true, y_hat in (
    ("TRAINING REPORT", y_train, y_train_pred),
    ("TESTING REPORT", y_test, y_test_pred),
):
    print('*' * 100)
    print(heading)
    print('*' * 100)
    print(classification_report(y_true, y_hat, zero_division = 1))

****************************************************************************************************
TRAINING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     business       0.88      0.45      0.60       201
entertainment       0.96      0.54      0.69       164
     politics       0.34      0.99      0.50       164
        sport       0.94      0.64      0.76       208
         tech       0.96      0.46      0.63       157

     accuracy                           0.61       894
    macro avg       0.81      0.62      0.64       894
 weighted avg       0.82      0.61      0.64       894

****************************************************************************************************
TESTING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     busine

<h2> 50 : 50 Split

In [60]:
# 50:50 stratified split with a fixed seed; labels become NumPy arrays.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.50, stratify=y, random_state=42
)
y_train, y_test = np.array(y_train), np.array(y_test)

# TfidfVectorizer expects strings, so re-join each token list with spaces.
corpus = [' '.join(tokens) for tokens in X_train]

tfidf = TfidfVectorizer(use_idf=True)
score = tfidf.fit_transform(corpus).toarray()
vocab = tfidf.get_feature_names_out()
classes = np.unique(y_train)

# Per class: mean TF-IDF weight of each vocabulary term over that class's rows.
tf_score = {
    label: dict(zip(vocab, score[y_train == label].mean(axis=0)))
    for label in classes
}

# Per class: raw token counts over the concatenated training documents.
X_t = np.array(X_train, dtype = object)
tf = {label: Counter(sum(X_t[y_train == label], [])) for label in classes}

clf = NaiveBayes(vocab, tf_score, tf).fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# One banner-framed classification report per fold.
for heading, y_true, y_hat in (
    ("TRAINING REPORT", y_train, y_train_pred),
    ("TESTING REPORT", y_test, y_test_pred),
):
    print('*' * 100)
    print(heading)
    print('*' * 100)
    print(classification_report(y_true, y_hat, zero_division = 1))

****************************************************************************************************
TRAINING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     business       0.88      0.42      0.56       168
entertainment       0.96      0.51      0.67       136
     politics       0.33      1.00      0.50       137
        sport       0.96      0.62      0.76       173
         tech       0.97      0.50      0.66       131

     accuracy                           0.60       745
    macro avg       0.82      0.61      0.63       745
 weighted avg       0.83      0.60      0.63       745

****************************************************************************************************
TESTING REPORT
****************************************************************************************************
               precision    recall  f1-score   support

     busine