In [1]:
import math

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split as split_data
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag

from tabulate import tabulate

In [2]:
def calc_info(p):
    # Formula: I(x) = -log2(p(x))
    ans = {x: -math.log2(px) for x, px in p}
    
    # Printing Output
    for key, value in ans.items():
        print(key, " : ", value)
        
    return ans

def calc_entropy(p, q):
    # Formula: CE(p, q) = -sum(p(x)*log2(q(x)) for x in p)
    H = -sum(px * math.log2(qx) for (_, px), (_, qx) in zip(p, q))
    
    # Printing Output
    print("Entropy: ", H)
    return H
    
def calc_dkl(p, q):
    # Formula: DKL(p, q) = sum(p(x)*log2(p(x)/q(x)) for x in p)
    dkl = sum(px * math.log2(px/qx) for (_, px), (_, qx) in zip(p, q))
    
    # Printing Output
    print("KL Divergence: ", dkl)
    return dkl

In [3]:
p = [('noun', 0.441),('verb', 0.255),('adj', 0.132),('adv', 0.172)]

In [4]:
print("Information of p")
info = calc_info(p)

Information of p
noun  :  1.1811494391045665
verb  :  1.9714308478032292
adj  :  2.9213901653036336
adv  :  2.539519529959989


In [5]:
print("Entropy of P with itself")
H = calc_entropy(p, p)

# for cross entropy let us define a new distribution 'q'
q = [('noun', 0.34), ('verb', 0.3), ('adj', 0.2), ('adv', 0.31)]

print("\nEntropy of (p, q)")
CE = calc_entropy(p, q)

Entropy of P with itself
Entropy:  1.8460226298081348

Entropy of (p, q)
Entropy:  1.7264117009914783


In [6]:
# Divergence with self distribution will be 0
dkl = calc_dkl(p, p)

print("\nDivergence of (p, q)")
dkl_pq = calc_dkl(p, q)
# DKL(p, q) != DKL(q, p)
print("\nDivergence of (q, p)")
dkl_qp = calc_dkl(q, p)

KL Divergence:  0.0

Divergence of (p, q)
KL Divergence:  -0.11961092881665658

Divergence of (q, p)
KL Divergence:  0.3261055526489861


In [7]:
df = pd.read_csv('C:/Users/kbkdf/Documents/Courses/NLP/Assignment-1/spam.csv')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.rename(columns = {"v1" : "label", "v2":"data"}, inplace = True)
df.head()

Unnamed: 0,label,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])

df.head()

Unnamed: 0,label,data
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df.describe()

Unnamed: 0,label
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [11]:
#Finding duplicates
print("Number of duplicate entries in data: ", df.duplicated().sum())

#dropping duplicates
df = df.drop_duplicates(keep = 'first')
print("After dropping: ", df.duplicated().sum())

Number of duplicate entries in data:  403
After dropping:  0


In [12]:
#Splitting the data
X = df["data"]
y = df["label"]

X_train, X_test, y_train, y_test = split_data(X, y, test_size = 0.3, random_state = 42)

In [13]:
models = ['0', '1', '2', '3']
bayes_accu = []
logre_accu = []

#### Bayes Classifier - Baseline Model0

In [14]:
vectorizer = CountVectorizer(stop_words="english")
X_train_cv = vectorizer.fit_transform(X_train)
X_test_cv = vectorizer.transform(X_test)

In [15]:
# Model 0 - Baseline model using unigram and other optional parameters
model0 = MultinomialNB()
model0.fit(X_train_cv, y_train)

MultinomialNB()

In [16]:
y_pred = model0.predict(X_test_cv)
bayes_accu.append(accuracy_score(y_test, y_pred))

print('Accuracy:', accuracy_score(y_test, y_pred))
print("-"*50)
print(classification_report(y_test, y_pred))

Accuracy: 0.9845261121856866
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1331
           1       0.97      0.92      0.94       220

    accuracy                           0.98      1551
   macro avg       0.98      0.96      0.97      1551
weighted avg       0.98      0.98      0.98      1551



#### Bayes Classifier - only Nouns Model1

In [17]:
def ext_nouns(text):
    tokens = word_tokenize(text)
    return ' '.join([word for word, pos in pos_tag(tokens) if pos.startswith('N')])

In [18]:
X1 = X.apply(ext_nouns)
X1_train, X1_test, y1_train, y1_test = split_data(X1, y, test_size = 0.3, random_state = 42)

In [19]:
print("Before extracting Nouns:\n","-"*30,"\n", X.head())
print("="*60)
print("After extracting Nouns:\n","-"*30,"\n", X1.head())

Before extracting Nouns:
 ------------------------------ 
 0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: data, dtype: object
After extracting Nouns:
 ------------------------------ 
 0    Go point .. Available bugis world la buffet Ci...
1                                    Ok lar Joking oni
2    entry comp FA Cup tkts May FA entry question t...
3                                            dun hor U
4                                                  Nah
Name: data, dtype: object


In [20]:
vectorizer = CountVectorizer(stop_words="english")
X1_train_cv = vectorizer.fit_transform(X1_train)
X1_test_cv = vectorizer.transform(X1_test)

In [21]:
# Model 1 - Use only nouns and other optional parameters
model1 = MultinomialNB()
model1.fit(X1_train_cv, y1_train)

MultinomialNB()

In [22]:
y1_pred = model1.predict(X1_test_cv)
bayes_accu.append(accuracy_score(y1_test, y1_pred))

print('Accuracy:', accuracy_score(y1_test, y1_pred))
print("-"*50)
print(classification_report(y1_test, y1_pred))

Accuracy: 0.9819471308833011
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1331
           1       0.96      0.91      0.93       220

    accuracy                           0.98      1551
   macro avg       0.97      0.95      0.96      1551
weighted avg       0.98      0.98      0.98      1551



#### Bayes Classifier - only Verbs Model2

In [23]:
def ext_verbs(text):
    tokens = word_tokenize(text)
    return ' '.join([word for word, pos in pos_tag(tokens) if pos.startswith('V')])

In [24]:
X2 = X.apply(ext_verbs)
X2_train, X2_test, y2_train, y2_test = split_data(X2, y, test_size = 0.3, random_state = 42)

In [25]:
print("Before extracting Verbs:\n","-"*30,"\n", X.head())
print("="*60)
print("After extracting Verbs:\n","-"*30,"\n", X2.head())

Before extracting Verbs:
 ------------------------------ 
 0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: data, dtype: object
After extracting Verbs:
 ------------------------------ 
 0                        got
1                           
2     win Text receive apply
3                  say c say
4    do think goes usf lives
Name: data, dtype: object


In [26]:
vectorizer = CountVectorizer(stop_words="english")
X2_train_cv = vectorizer.fit_transform(X2_train)
X2_test_cv = vectorizer.transform(X2_test)

In [27]:
# Model 2 - Use only verbs and other optional parameters
model2 = MultinomialNB()
model2.fit(X2_train_cv, y2_train)

MultinomialNB()

In [28]:
y2_pred = model2.predict(X2_test_cv)
bayes_accu.append(accuracy_score(y2_test, y2_pred))

print('Accuracy:', accuracy_score(y2_test, y2_pred))
print("-"*50)
print(classification_report(y2_test, y2_pred))

Accuracy: 0.9284332688588007
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1331
           1       0.96      0.52      0.67       220

    accuracy                           0.93      1551
   macro avg       0.94      0.76      0.82      1551
weighted avg       0.93      0.93      0.92      1551



#### Bayes Classifier - Choice of Configuration Model3

In [29]:
def ext_NounVerbs(text):
    tokens = word_tokenize(text)
    return ' '.join([word for word, pos in pos_tag(tokens) if pos.startswith(('N', 'V'))])

In [30]:
X3 = X.apply(ext_NounVerbs)
X3_train, X3_test, y3_train, y3_test = split_data(X3, y, test_size = 0.3, random_state = 42)

In [31]:
print("Before extracting Nouns and verbs:\n","-"*30,"\n", X.head())
print("="*60)
print("After extracting Nouns and Verbs:\n","-"*30,"\n", X3.head())

Before extracting Nouns and verbs:
 ------------------------------ 
 0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: data, dtype: object
After extracting Nouns and Verbs:
 ------------------------------ 
 0    Go point .. Available bugis world la buffet Ci...
1                                    Ok lar Joking oni
2    entry comp win FA Cup tkts May Text FA receive...
3                                  dun say hor U c say
4                          Nah do think goes usf lives
Name: data, dtype: object


In [32]:
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words="english")
X3_train_cv = vectorizer.fit_transform(X_train)
X3_test_cv = vectorizer.transform(X_test)

In [33]:
# Model 3 - Choice of configuration and other optional parameters
model3 = MultinomialNB()
model3.fit(X3_train_cv, y3_train)

MultinomialNB()

In [34]:
y3_pred = model3.predict(X3_test_cv)
bayes_accu.append(accuracy_score(y3_test, y3_pred))

print('Accuracy:', accuracy_score(y3_test, y3_pred))
print("-"*50)
print(classification_report(y3_test, y3_pred))

Accuracy: 0.9858156028368794
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1331
           1       0.98      0.92      0.95       220

    accuracy                           0.99      1551
   macro avg       0.98      0.96      0.97      1551
weighted avg       0.99      0.99      0.99      1551



#### Logistic Regression - Comparable to Baseline Model0

In [35]:
vectorizer = CountVectorizer(stop_words="english")
log_reg = LogisticRegression()

In [36]:
model4 = make_pipeline(vectorizer, log_reg)
model4.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer(stop_words='english')),
                ('logisticregression', LogisticRegression())])

In [37]:
y4_pred = model4.predict(X_test)
logre_accu.append(accuracy_score(y_test, y4_pred))

print('Accuracy:', accuracy_score(y_test, y4_pred))
print("-"*50)
print(classification_report(y_test, y4_pred))

Accuracy: 0.9754996776273372
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1331
           1       0.99      0.84      0.91       220

    accuracy                           0.98      1551
   macro avg       0.98      0.92      0.95      1551
weighted avg       0.98      0.98      0.97      1551



#### Logistic Regression - Comparable to only Nouns Model1

In [38]:
vectorizer = CountVectorizer(stop_words="english")
log_reg = LogisticRegression()

In [39]:
model5 = make_pipeline(vectorizer, log_reg)
model5.fit(X1_train, y1_train)

Pipeline(steps=[('countvectorizer', CountVectorizer(stop_words='english')),
                ('logisticregression', LogisticRegression())])

In [40]:
y5_pred = model5.predict(X1_test)
logre_accu.append(accuracy_score(y1_test, y5_pred))

print('Accuracy:', accuracy_score(y1_test, y5_pred))
print("-"*50)
print(classification_report(y1_test, y5_pred))

Accuracy: 0.9613152804642167
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1331
           1       0.98      0.75      0.85       220

    accuracy                           0.96      1551
   macro avg       0.97      0.87      0.91      1551
weighted avg       0.96      0.96      0.96      1551



#### Logistic Regression - Comparable to only Verbs Model2

In [41]:
vectorizer = CountVectorizer(stop_words="english")
log_reg = LogisticRegression()

In [42]:
model6 = make_pipeline(vectorizer, log_reg)
model6.fit(X2_train, y2_train)

Pipeline(steps=[('countvectorizer', CountVectorizer(stop_words='english')),
                ('logisticregression', LogisticRegression())])

In [43]:
y6_pred = model6.predict(X2_test)
logre_accu.append(accuracy_score(y2_test, y6_pred))

print('Accuracy:', accuracy_score(y2_test, y6_pred))
print("-"*50)
print(classification_report(y2_test, y6_pred))

Accuracy: 0.9097356544165055
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1331
           1       0.95      0.38      0.55       220

    accuracy                           0.91      1551
   macro avg       0.93      0.69      0.75      1551
weighted avg       0.91      0.91      0.89      1551



#### Logistic Regression - Comparable to Choice of Configuration Model3

In [44]:
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words="english")
log_reg = LogisticRegression()

In [45]:
model7 = make_pipeline(vectorizer, log_reg)
model7.fit(X3_train, y3_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(ngram_range=(1, 3), stop_words='english')),
                ('logisticregression', LogisticRegression())])

In [46]:
y7_pred = model4.predict(X3_test)
logre_accu.append(accuracy_score(y3_test, y7_pred))

print('Accuracy:', accuracy_score(y3_test, y7_pred))
print("-"*50)
print(classification_report(y3_test, y7_pred))

Accuracy: 0.9522888459058672
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1331
           1       0.99      0.67      0.80       220

    accuracy                           0.95      1551
   macro avg       0.97      0.84      0.89      1551
weighted avg       0.95      0.95      0.95      1551



In [47]:
table = []
for i in range(len(models)):
    table.append([models[i], bayes_accu[i], logre_accu[i]])
    
print(tabulate(table, headers=["Model", "Naive Bayes", "Logistic regression"], tablefmt="fancy_grid"))

╒═════════╤═══════════════╤═══════════════════════╕
│   Model │   Naive Bayes │   Logistic regression │
╞═════════╪═══════════════╪═══════════════════════╡
│       0 │      0.984526 │              0.9755   │
├─────────┼───────────────┼───────────────────────┤
│       1 │      0.981947 │              0.961315 │
├─────────┼───────────────┼───────────────────────┤
│       2 │      0.928433 │              0.909736 │
├─────────┼───────────────┼───────────────────────┤
│       3 │      0.985816 │              0.952289 │
╘═════════╧═══════════════╧═══════════════════════╛
