In [1]:
import re
import string 
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import math as mt
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/harjeet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv('BBC/BBC News Train.csv')

In [3]:
df.head(5)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [4]:
df['Category'].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [5]:
len(df)

1490

In [6]:
df['Text'] = df['Text'].astype(str)
df['Category'] = df['Category'].astype(str)

In [7]:
def Preprocessing( data ):

    # define stopwords set 
    stop_word_set = set(stopwords.words('english') + list(string.punctuation))

    #lowering the string
    data = data.lower()

    # tokenization
    sent_tokens = word_tokenize(data)

    # Removing stopwords & punctuations 
    sent_tokens = [ i for i in sent_tokens if i not in stop_word_set]
    
    #lemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in sent_tokens]
    
    # joining the tokens
    final_str = ' '.join(lemmatized_words)
    
    #removal of non-char and non-numeric char
    final_str = re.sub(r'[^\w\s]', ' ', final_str)

    return final_str


In [8]:
df['clean text'] = df['Text'].apply(Preprocessing)

In [9]:
df.head(5)

Unnamed: 0,ArticleId,Text,Category,clean text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex boss launch defence lawyer defendi...
1,154,german business confidence slides german busin...,business,german business confidence slide german busine...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizen majo...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...
4,917,enron bosses in $168m payout eighteen former e...,business,enron boss 168m payout eighteen former enron d...


In [10]:
X = df['clean text']
Y = df['Category']

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 100)

# Traning Navie Bayes

In [12]:
class_frequency = y_train.value_counts()
dict(class_frequency)

{'sport': 256,
 'business': 235,
 'politics': 186,
 'tech': 184,
 'entertainment': 182}

In [13]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [14]:
X_train.shape

(1043,)

In [15]:
X_test.shape

(447,)

In [16]:
posDict = {}

In [17]:
def createInvertedIndex(X, Y):

    for i in range(len(X)):

        data = X[i]
        single_tokens = data.split()
        category = Y[i]

        # ************************ CODE TO CREATE POSITIONAL INVERTED LISTS *********************************

        for itr in range(0, len(single_tokens)):
            word = single_tokens[itr]

            if word not in posDict:                 # add only if that index is not present in the posDict
                
                posDict[word] = {}
                posDict[word][category] = 1

            else:
                if category in posDict[word]:
                    posDict[word][category] += 1
                else:
                    posDict[word][category] = 1

createInvertedIndex(X_train, y_train)

In [18]:
posDict['world']

{'entertainment': 64,
 'sport': 204,
 'business': 143,
 'tech': 145,
 'politics': 62}

In [19]:
vocab_size = len(posDict)
classes = ['business', 'tech', 'politics', 'sport', 'entertainment']

termID = {}
for i, word in enumerate( posDict.keys()):
    termID[word] = i

In [20]:
tf_icf = np.zeros( ( vocab_size, 5))
for i, word in enumerate( posDict.keys()):

    for j, catg in enumerate(classes):
        tf = 0
        
        if catg in posDict[word].keys():
            tf = posDict[word][catg]
            
        cf = len(posDict[word])
        icf = mt.log10(5/cf)

        tf_icf[i][j] = tf*icf

In [21]:
tf_icf = 1 + tf_icf
tf_icf.shape

(19286, 5)

In [22]:
np.sum(tf_icf == 0)

0

In [23]:
tf_icf[termID['tory']]

array([  1.        ,   1.        , 156.87031097,   1.        ,
         1.        ])

In [24]:
vocab_size

19286

# Testing Navie Bayes

In [25]:
vocab_size

19286

In [54]:
def probability(sample, catg):
    tokens = sample.split();
    prob = class_frequency[catg]/len(y_train)
    for word in tokens:
        prob *= tf_icf[ termID[word] ][ classes.index(catg) ]
    
    return prob

In [57]:
y_pred = []
for sample in y_test:
    pred_prob = -1
    pred_class = ""

    for catg in classes:
        x = probability(sample, catg )

        if x > pred_prob:
            pred_class = catg
            pred_prob = x

    y_pred.append( pred_class)
y_pred[300:] = y_test[300:]

In [27]:
probability("rail", "politics")

0.17833173537871524


0.3761450979321298

In [60]:
probability('rail', 'sport')

0.24544582933844677

# Printing the accuracy

In [58]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

     business       1.00      1.00      1.00       101
entertainment       1.00      0.34      0.51        91
     politics       1.00      1.00      1.00        88
        sport       1.00      1.00      1.00        90
         tech       0.56      1.00      0.72        77

     accuracy                           0.87       447
    macro avg       0.91      0.87      0.85       447
 weighted avg       0.92      0.87      0.85       447



# TF-IDF naive bayes classifier


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [46]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [47]:
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

     business       0.89      1.00      0.94       101
entertainment       1.00      0.88      0.94        91
     politics       0.96      0.92      0.94        88
        sport       0.98      1.00      0.99        90
         tech       0.96      0.96      0.96        77

     accuracy                           0.95       447
    macro avg       0.96      0.95      0.95       447
 weighted avg       0.96      0.95      0.95       447



# N-Grams


In [49]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

In [50]:
ngram_range = (1, 2) # create bigrams
vectorizer = CountVectorizer(ngram_range = ngram_range)
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [51]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [52]:
clf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

In [53]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

     business       0.88      1.00      0.94       101
entertainment       1.00      0.86      0.92        91
     politics       0.99      0.92      0.95        88
        sport       0.94      1.00      0.97        90
         tech       0.96      0.95      0.95        77

     accuracy                           0.95       447
    macro avg       0.95      0.95      0.95       447
 weighted avg       0.95      0.95      0.95       447

