In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Read the raw SMS spam CSV (decoded as Latin-1) and preview the first rows.
csv_path = r"D:\GeakMinds Internship\real world projects\datasets\sms_spam.csv"
df = pd.read_csv(csv_path, encoding="latin1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Discard every column that holds at least one missing value, then display the frame.
df = df.dropna(axis="columns", how="any")
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

In [7]:
# Give the two raw columns descriptive names (df is modified in place).
df.rename({"v1": "Class", "v2": "Text"}, axis="columns", inplace=True)
df.head()

Unnamed: 0,Class,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Shared NLTK normalisers: ps is the Porter stemmer, wn the WordNet lemmatizer.
ps, wn = PorterStemmer(), WordNetLemmatizer()

In [9]:
# Filled on first use so the NLTK stop-word list is read once instead of
# once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English NLTK stop words as a frozenset, loading them lazily."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def clean_text(st, process="lemma"):
    """Normalise a raw message into a single space-joined string of tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace. Depending on ``process`` the
    tokens are then filtered and normalised.

    Parameters
    ----------
    st : str
        Raw message text.
    process : str, default "lemma"
        "stem"  -> drop English stop words, then apply ``ps.stem``.
        "lemma" -> drop English stop words, then apply ``wn.lemmatize``.
        Any other value leaves the lower-cased tokens untouched (stop words
        are kept).

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # The previous pattern "[^A-z]" spanned the ASCII gap between "Z" and "a",
    # so the characters [ \ ] ^ _ ` were kept and leaked into tokens.
    cleaned = re.sub("[^A-Za-z]", " ", st)
    cleaned = cleaned.lower().split()

    if process == "stem":
        stop_words = _english_stopwords()
        cleaned = [ps.stem(wrd) for wrd in cleaned if wrd not in stop_words]
    elif process == "lemma":
        stop_words = _english_stopwords()
        cleaned = [wn.lemmatize(wrd) for wrd in cleaned if wrd not in stop_words]

    return " ".join(cleaned)

# Derive one stemmed and one lemmatized variant of every message.
df["Stemmed"] = df["Text"].apply(clean_text, args=("stem",))
df["Lemmatized"] = df["Text"].apply(clean_text, args=("lemma",))


In [10]:
# Preview the first five rows, including the derived text columns.
df.head(n=5)

Unnamed: 0,Class,Text,Stemmed,Lemmatized
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though,nah think go usf life around though


In [12]:
# Per-fold accuracy for each "<text variant>_<classifier>" combination.
metrics = {
    "stem_NB":[], "lemma_NB":[],
    "stem_SGD":[], "lemma_SGD":[],
}

# Binary target: the second dummy column of Class ("spam" when the labels are
# ham/spam). It is the same for every combination, so build it once.
y = pd.get_dummies(df.Class).iloc[:,1]
skf = StratifiedKFold(n_splits=5)

for key in metrics.keys():
    # The key prefix selects which preprocessed text column is used.
    if "stem" in key:
        X = df.Stemmed
    elif "lemma" in key:
        X = df.Lemmatized

    for train_index, test_index in skf.split(X, y):
        # split() yields positional indices, so select with .iloc; plain
        # X[...] is label-based and only matched while the index was 0..n-1.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the vocabulary on the training fold only.
        vect = CountVectorizer(max_features=2500)
        X_train_dtm = vect.fit_transform(X_train)
        X_test_dtm = vect.transform(X_test)

        # The key suffix selects the classifier.
        if "NB" in key:
            clf = MultinomialNB()
        elif "SGD" in key:
            # Fixed seed: SGD shuffles the data, so unseeded runs are not repeatable.
            clf = SGDClassifier(random_state=42)

        clf.fit(X_train_dtm, y_train)
        y_pred = clf.predict(X_test_dtm)

        metrics[key].append(accuracy_score(y_test, y_pred))

    scores = np.array(metrics[key])
    # Label the summary so the four result pairs can be told apart.
    print(key)
    print('Mean accuracy: ', np.mean(scores, axis=0))
    print('Std for accuracy: ', np.std(scores, axis=0))


Mean accuracy:  0.9834887409327676
Std for accuracy:  0.0025132684526388703
Mean accuracy:  0.9825913968972154
Std for accuracy:  0.001759666824840256
Mean accuracy:  0.980437320366151
Std for accuracy:  0.0020038348022536514
Mean accuracy:  0.9793612481986298
Std for accuracy:  0.001881139902406329


In [13]:
# One row per model/text combination, one column per fold.
metricsDf = pd.DataFrame(metrics).T
metricsDf

Unnamed: 0,0,1,2,3,4
stem_NB,0.981166,0.986547,0.986535,0.982047,0.981149
lemma_NB,0.981166,0.984753,0.98474,0.981149,0.981149
stem_SGD,0.980269,0.983857,0.977558,0.980251,0.980251
lemma_SGD,0.979372,0.978475,0.978456,0.977558,0.982944


In [14]:
X, y = df.Stemmed, pd.get_dummies(df.Class).iloc[:,1]

skf = StratifiedKFold(n_splits=5)
# Only the last of the five folds is kept as the train/test split, so take it
# directly instead of fitting a vectorizer on every fold and discarding four.
folds = list(skf.split(X, y))
train_index, test_index = folds[-1]

# Fold indices are positional, so select with .iloc rather than label-based [].
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# The vocabulary is learned from the training part only.
vect = CountVectorizer(max_features=2500)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [15]:
# Baseline multinomial Naive Bayes trained on the training document-term matrix.
clf = MultinomialNB()
clf.fit(X=X_train_dtm, y=y_train)

In [16]:
# Accuracy of the baseline model on the data it was trained on.
clf.score(X=X_train_dtm, y=y_train)

0.990578734858681

In [17]:
# 5-fold cross-validation of the baseline model within the training matrix.
cross_val_score(estimator=clf, X=X_train_dtm, y=y_train, cv=5)

array([0.9764574 , 0.97982063, 0.98991031, 0.98092031, 0.98877666])

In [None]:
# Candidate values for the alpha parameter of the Naive Bayes model.
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]}

# Search the grid with 5-fold cross-validation, using all available cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X=X_train_dtm, y=y_train)

In [19]:
# Parameter setting that the grid search over alpha ranked best.
grid_search.best_params_

{'alpha': 0.1}

In [20]:
# Mean cross-validated score of the selected setting (cv=5 on the training matrix).
grid_search.best_score_

0.9834002707694784

In [21]:
# Accuracy of the tuned model on the training matrix.
grid_search.best_estimator_.score(X=X_train_dtm, y=y_train)

0.9910273665320771

In [22]:
# Accuracy of the tuned model on the held-out fold.
grid_search.best_estimator_.score(X=X_test_dtm, y=y_test)

0.9802513464991023

In [23]:
# GridSearchCV was built with the default refit behaviour, so best_estimator_
# is already trained on (X_train_dtm, y_train); fitting it again on the same
# data was redundant.
fin_clf = grid_search.best_estimator_
print(f"Test Scores: {fin_clf.score(X_test_dtm, y_test)}")

y_pred = fin_clf.predict(X_test_dtm)
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Test Scores: 0.9802513464991023
Accuracy: 0.9802513464991023
