In [100]:
import pandas as pd
import numpy as np
import regex as re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 
import string
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
# from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [126]:
train=pd.read_csv("./train.csv")
val=pd.read_csv("./validation.csv")
test=pd.read_csv("./test.csv")

## Multinomial Naive Bayes Model (tfidf tokenizer) 

In [127]:
vectorizer = CountVectorizer()
vectorizer.fit(train.X_train)


CountVectorizer()

### Tf-Idf Tokenizer

In [128]:
bagofwords=vectorizer.vocabulary_
X_train=vectorizer.transform(train.X_train)
X_val=vectorizer.transform(val.X_val)
X_test=vectorizer.transform(test.X_test)
print(len(bagofwords))
X_train.shape,X_val.shape

7330


((4512, 7330), (502, 7330))

In [129]:
tfidf_transformer = TfidfTransformer().fit(X_train)
X_train_tfidf=tfidf_transformer.transform(X_train)
X_val_tfidf=tfidf_transformer.transform(X_val)
X_test_tfidf=tfidf_transformer.transform(X_test)
X_train_tfidf.shape,X_val_tfidf.shape

((4512, 7330), (502, 7330))

### Model Fitting and Hyperparameter Tuning

In [130]:
for i in np.arange(0.25, 2.25, 0.25):
    spam_detector = MultinomialNB(alpha=i).fit(X_train_tfidf, train.y_train)
    y_pred = spam_detector.predict(X_val_tfidf)
    print(accuracy_score(val.y_val, y_pred), '\n', classification_report(val.y_val, y_pred))


0.9880478087649402 
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       441
        spam       0.98      0.92      0.95        61

    accuracy                           0.99       502
   macro avg       0.99      0.96      0.97       502
weighted avg       0.99      0.99      0.99       502

0.9900398406374502 
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       441
        spam       1.00      0.92      0.96        61

    accuracy                           0.99       502
   macro avg       0.99      0.96      0.98       502
weighted avg       0.99      0.99      0.99       502

0.9880478087649402 
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       441
        spam       1.00      0.90      0.95        61

    accuracy                           0.99       502
   macro avg       0.99      0.95      0.97       502
weighted av

In [131]:
#Alpha = 0.5 gives best results. It maximises accuracy and recall for Spam mails.

In [132]:
spam_detector = MultinomialNB(alpha=0.5).fit(X_train_tfidf, train.y_train)

In [133]:
y_pred = spam_detector.predict(X_val_tfidf)
accuracy_score(val.y_val, y_pred)


0.9900398406374502

In [134]:
print(classification_report(val.y_val, y_pred))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       441
        spam       1.00      0.92      0.96        61

    accuracy                           0.99       502
   macro avg       0.99      0.96      0.98       502
weighted avg       0.99      0.99      0.99       502



In [136]:
y_pred_test = spam_detector.predict(X_test_tfidf)
accuracy_score(test.y_test, y_pred_test)

0.9731182795698925

### Model Evaluation on Test Data

In [137]:
print(classification_report(test.y_test, y_pred_test))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       485
        spam       1.00      0.79      0.89        73

    accuracy                           0.97       558
   macro avg       0.98      0.90      0.94       558
weighted avg       0.97      0.97      0.97       558



## Multinomial Naive Bayes Model (Count tokenizer)

In [138]:
vectorizer = CountVectorizer()
vectorizer.fit(train.X_train)

CountVectorizer()

In [139]:
bagofwords=vectorizer.vocabulary_
X_train=vectorizer.transform(train.X_train)
X_val=vectorizer.transform(val.X_val)
X_test=vectorizer.transform(test.X_test)
print(len(bagofwords))
X_train.shape,X_val.shape

7330


((4512, 7330), (502, 7330))

In [140]:
for i in np.arange(0.25, 2.25, 0.25):
    spam_detector = MultinomialNB(alpha=i).fit(X_train, train.y_train)
    y_pred = spam_detector.predict(X_val)
    print(accuracy_score(val.y_val, y_pred), '\n', classification_report(val.y_val, y_pred))


0.9860557768924303 
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       441
        spam       0.94      0.95      0.94        61

    accuracy                           0.99       502
   macro avg       0.96      0.97      0.97       502
weighted avg       0.99      0.99      0.99       502

0.9880478087649402 
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       441
        spam       0.95      0.95      0.95        61

    accuracy                           0.99       502
   macro avg       0.97      0.97      0.97       502
weighted avg       0.99      0.99      0.99       502

0.9880478087649402 
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       441
        spam       0.95      0.95      0.95        61

    accuracy                           0.99       502
   macro avg       0.97      0.97      0.97       502
weighted av

In [141]:
# The model performs similar for all values of alpha from 0.5 to 2

## Support Vector Classifier Model

In [142]:
clf = SVC(C=1)
clf.fit(X_train_tfidf, train.y_train)


SVC(C=1)

In [143]:
y_predicted = clf.predict(X_val_tfidf)

In [144]:
print(accuracy_score(val.y_val, y_pred), '\n', classification_report(val.y_val, y_pred))

0.9880478087649402 
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       441
        spam       0.95      0.95      0.95        61

    accuracy                           0.99       502
   macro avg       0.97      0.97      0.97       502
weighted avg       0.99      0.99      0.99       502



In [145]:
for i in range(1,10000,1000):
    clf = SVC(C=i)
    clf.fit(X_train_tfidf, train.y_train)
    y_predicted = clf.predict(X_val_tfidf)
    print(accuracy_score(val.y_val, y_pred), '\n', classification_report(val.y_val, y_pred))

0.9880478087649402 
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       441
        spam       0.95      0.95      0.95        61

    accuracy                           0.99       502
   macro avg       0.97      0.97      0.97       502
weighted avg       0.99      0.99      0.99       502

0.9880478087649402 
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       441
        spam       0.95      0.95      0.95        61

    accuracy                           0.99       502
   macro avg       0.97      0.97      0.97       502
weighted avg       0.99      0.99      0.99       502

0.9880478087649402 
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       441
        spam       0.95      0.95      0.95        61

    accuracy                           0.99       502
   macro avg       0.97      0.97      0.97       502
weighted av

In [146]:
# Model performs similar for the different values of regularization parameter.

In [147]:
# Multinomial Naive Bayes Model (tfidf tokenizer) performs best, out of the three model.