In [113]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,roc_auc_score,accuracy_score

In [66]:
data = pd.read_csv(r'E:\programming\dataset\sms_spam_kaggle\spam.csv',encoding='ISO-8859-1')
# Loading the file without an explicit encoding raised a UnicodeDecodeError; the fix
# found after searching for the problem was to pass encoding='ISO-8859-1'.
# The loaded frame also contains some extra columns (cause not yet investigated),
# which are removed in a later cell.
# NOTE(review): the path above is an absolute path on a local drive; adjust it
# before running this notebook on another machine.

In [67]:
data.shape

(5572, 5)

In [68]:
# Keep only the label column (v1) and the message text column (v2).
# Take an explicit copy: later cells add new columns to this frame, and assigning
# to a slice of the original frame triggers pandas' SettingWithCopyWarning.
data = data[['v1','v2']].copy()

In [69]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [70]:
data['v2'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

## Cleaning the text and preprocessing

In [71]:
#removing all words which contain numbers or special characters
#lemmatizing the words
#removing human names from the corpus

In [72]:
def letters_only(astr):
    """Return True when `astr` is non-empty and made up solely of alphabetic characters."""
    is_alphabetic = astr.isalpha()
    return is_alphabetic

# Names from the NLTK `names` corpus, used to filter human names out of the messages.
# Stored as a set so each membership test is constant-time; the cleaning loop checks
# every word of every message against this collection.
all_names = set(names.words())
lemmatizer = WordNetLemmatizer()

In [73]:
# Build the cleaned version of every message: split it on whitespace, keep only the
# purely alphabetic tokens that are not in the names collection, lemmatize what
# remains and join the tokens back together with single spaces.
# Iterating over the column directly avoids the label-based lookup data['v2'][i],
# which only works while the index is exactly 0..len(data)-1.
cleaned_text = [
    " ".join(
        lemmatizer.lemmatize(word)
        for word in message.split()
        if letters_only(word) and word not in all_names
    )
    for message in data['v2']
]

In [74]:
data['v2'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [75]:
cleaned_text[0]

'Go until jurong Available only in bugis n great world la e Cine there got amore'

In [76]:
data['cleaned_text'] = cleaned_text

In [77]:
data.head()

Unnamed: 0,v1,v2,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong Available only in bugis n grea...
1,ham,Ok lar... Joking wif u oni...,Ok Joking wif u
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final ...
3,ham,U dun say so early hor... U c already then say...,U dun say so early U c already then
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I think he go to he life around here though


In [78]:
#looking for any missing values
data.isnull().sum()

v1              0
v2              0
cleaned_text    0
dtype: int64

In [79]:
#we don't have any missing values

In [80]:
#label encoding for target
encoding = {'ham':0,'spam':1}
data['target'] = data['v1'].map(encoding)

In [81]:
data.head()

Unnamed: 0,v1,v2,cleaned_text,target
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong Available only in bugis n grea...,0
1,ham,Ok lar... Joking wif u oni...,Ok Joking wif u,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final ...,1
3,ham,U dun say so early hor... U c already then say...,U dun say so early U c already then,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I think he go to he life around here though,0


In [82]:
#we don't need the v1 and v2 columns any more
#rebinding the result (instead of dropping in place) and ignoring columns that are
#already gone keeps this cell safe to re-run
data = data.drop(columns = ['v1','v2'],errors = 'ignore')

In [83]:
data.head()

Unnamed: 0,cleaned_text,target
0,Go until jurong Available only in bugis n grea...,0
1,Ok Joking wif u,0
2,Free entry in a wkly comp to win FA Cup final ...,1
3,U dun say so early U c already then,0
4,Nah I think he go to he life around here though,0


### Splitting the data into train and test

In [85]:
X_train,X_test,y_train,y_test = train_test_split(data['cleaned_text'],data['target'],test_size = 0.3,random_state = 0)

In [86]:
print(X_train.shape,X_test.shape)

(3900,) (1672,)


## Vectorizing the features

In [90]:
cv = CountVectorizer(stop_words = 'english',max_features = 500)

In [91]:
train_vectorized = cv.fit_transform(X_train)
test_vectorized = cv.transform(X_test)

In [93]:
#train_vectorized and test_vectorized are sparse matrices
print(train_vectorized.shape,test_vectorized.shape)

(3900, 500) (1672, 500)


In [94]:
# we can view the 500 features that are used
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2 in favour of get_feature_names_out(); use the new method when it exists and
# fall back to the old one on earlier versions.
if hasattr(cv, 'get_feature_names_out'):
    # .tolist() converts the returned array into a plain list of Python strings
    features = cv.get_feature_names_out().tolist()
else:
    features = cv.get_feature_names()
print(features)

['able', 'abt', 'account', 'actually', 'address', 'afternoon', 'aight', 'alright', 'angry', 'answer', 'anytime', 'apply', 'ard', 'ask', 'asked', 'attempt', 'await', 'award', 'awarded', 'away', 'babe', 'baby', 'bad', 'beautiful', 'bed', 'believe', 'best', 'better', 'big', 'birthday', 'bit', 'blue', 'bonus', 'book', 'booked', 'bored', 'bout', 'box', 'boy', 'break', 'bring', 'bslvyl', 'bt', 'bus', 'busy', 'buy', 'called', 'caller', 'callertune', 'calling', 'came', 'camera', 'car', 'card', 'care', 'cash', 'cause', 'chance', 'change', 'charge', 'chat', 'check', 'choose', 'claim', 'class', 'close', 'code', 'collect', 'colour', 'come', 'coming', 'comp', 'company', 'completely', 'congrats', 'congratulations', 'contact', 'content', 'cool', 'copy', 'cos', 'cost', 'couple', 'coz', 'crave', 'currently', 'customer', 'da', 'dad', 'darlin', 'dat', 'date', 'dating', 'day', 'dear', 'decided', 'den', 'did', 'didnt', 'dinner', 'direct', 'dis', 'dnt', 'doe', 'dogging', 'doing', 'dont', 'double', 'download

## Basic  Model


In [96]:
#alpha is the smoothing factor
nb = MultinomialNB(alpha = 1,fit_prior = True)

In [97]:
nb.fit(train_vectorized,y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [99]:
prediction_probs = nb.predict_proba(test_vectorized)

In [100]:
predicted_classes = nb.predict(test_vectorized)

In [103]:
report = classification_report(y_test,predicted_classes)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1434
           1       0.85      0.84      0.85       238

    accuracy                           0.96      1672
   macro avg       0.91      0.91      0.91      1672
weighted avg       0.96      0.96      0.96      1672



In [105]:
acc = accuracy_score(y_test,predicted_classes)
print('the accuracy on test examples is {}'.format(acc))

the accuracy on test examples is 0.9569377990430622


In [111]:
roc_score = roc_auc_score(y_test,prediction_probs[:,1])
print('the roc score obtained is {}'.format(roc_score))

the roc score obtained is 0.9728341127245878


## Finding the best possible values of hyperparameters using k fold validation

In [114]:
k = 10
k_fold = StratifiedKFold(n_splits = k)

#creating numpy arrays for better slicing
cleaned_emails_np = np.array(data['cleaned_text'])
labels_np = np.array(data['target'])


In [117]:
max_features_option = [2000,4000,8000]
smoothing_factor_option = [0.5, 1.0, 1.5, 2.0]
fit_prior_option = [True, False]
auc_record = {}

In [118]:
# Grid search over (max_features, smoothing, fit_prior), scored by ROC AUC.
# auc_record[max_features][smoothing][fit_prior] holds the SUM of the per-fold AUCs;
# divide by k to obtain the mean over the folds.
# NOTE(review): the folds are drawn from the full data set, which includes the rows
# later used as the hold-out test set of the final model.

# Reset the accumulator so that re-running this cell does not add onto earlier sums.
auc_record = {}

for train_indices,test_indices in k_fold.split(cleaned_emails_np,labels_np):
    # Fold-local names, so the notebook-level train/test split, vectorizer and
    # feature list are not silently overwritten by the search.
    fold_X_train,fold_X_test = cleaned_emails_np[train_indices],cleaned_emails_np[test_indices]
    fold_y_train,fold_y_test = labels_np[train_indices],labels_np[test_indices]
    for n_features in max_features_option:
        smoothing_record = auc_record.setdefault(n_features, {})
        # The vocabulary is fitted on the training part of the fold only.
        fold_cv = CountVectorizer(stop_words = 'english',max_features = n_features)
        train_doc = fold_cv.fit_transform(fold_X_train)
        test_doc = fold_cv.transform(fold_X_test)
        for smoothing in smoothing_factor_option:
            prior_record = smoothing_record.setdefault(smoothing, {})
            for fit_prior in fit_prior_option:
                clf = MultinomialNB(alpha=smoothing, fit_prior=fit_prior)
                clf.fit(train_doc,fold_y_train)
                # Column 1 of predict_proba is the probability of class 1 (spam).
                pos_prob = clf.predict_proba(test_doc)[:,1]
                auc = roc_auc_score(fold_y_test,pos_prob)
                prior_record[fit_prior] = auc + prior_record.get(fit_prior, 0.0)


In [119]:
print('Max_features   smoothing    fit_prior      auc')
for max_features,max_features_data in auc_record.items():
    for smoothing,smoothing_data in max_features_data.items():
        for fit_prior,auc in smoothing_data.items():
            print("{0}           {1}           {2}        {3:.4f}".format(max_features,smoothing,fit_prior,auc/k))

Max_features   smoothing    fit_prior      auc
2000           0.5           True        0.9699
2000           0.5           False        0.9699
2000           1.0           True        0.9696
2000           1.0           False        0.9696
2000           1.5           True        0.9689
2000           1.5           False        0.9689
2000           2.0           True        0.9680
2000           2.0           False        0.9680
4000           0.5           True        0.9735
4000           0.5           False        0.9735
4000           1.0           True        0.9718
4000           1.0           False        0.9718
4000           1.5           True        0.9698
4000           1.5           False        0.9698
4000           2.0           True        0.9680
4000           2.0           False        0.9680
8000           0.5           True        0.9733
8000           0.5           False        0.9733
8000           1.0           True        0.9713
8000           1.0           Fal

In [120]:
# by looking at the table above, we can pick the combination of hyperparameters that has the maximum score
max_features_best = 4000
smoothing_best = 0.5
fit_prior_best = True

## Final Model

In [121]:
X_train,X_test,y_train,y_test = train_test_split(data['cleaned_text'],data['target'],test_size = 0.3,random_state = 0)

In [122]:
cv = CountVectorizer(max_features = max_features_best,stop_words = 'english')

In [124]:
train_vectorized = cv.fit_transform(X_train)
test_vectorized = cv.transform(X_test)

In [125]:
print(train_vectorized.shape,test_vectorized.shape)

(3900, 4000) (1672, 4000)


In [126]:
nb = MultinomialNB(alpha = smoothing_best,fit_prior=fit_prior_best)
nb.fit(train_vectorized,y_train)

MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)

In [128]:
predicted_classes = nb.predict(test_vectorized)
predicted_probas = nb.predict_proba(test_vectorized)
probas_spam = predicted_probas[:,1]

In [129]:
final_score = roc_auc_score(y_test,probas_spam)
print('the roc_auc score is ',final_score)

the roc_auc score is  0.9821194168043788


In [133]:
report = classification_report(y_test,predicted_classes)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1434
           1       0.89      0.89      0.89       238

    accuracy                           0.97      1672
   macro avg       0.94      0.93      0.94      1672
weighted avg       0.97      0.97      0.97      1672

