In [26]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# We'll avoid NLTK corpus downloads by using a simple regex tokenizer
import re
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [27]:
df = pd.read_csv('spam.csv')
df

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [28]:
encoder = LabelEncoder()
df['Label'] = encoder.fit_transform(df['Label'])

In [29]:
df

Unnamed: 0,Label,EmailText
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ã_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [30]:
# removing all the duplicate values and keeping only the first
df = df.drop_duplicates(keep='first')

In [31]:
df.shape

(5169, 2)

In [32]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [33]:
def get_importantFeatures(sent):
    # Lowercase and extract alphanumeric tokens using regex (avoids NLTK punkt dependency)
    sent = str(sent).lower()
    #  matches [A-Za-z0-9_], use a pattern that excludes underscores if desired
    tokens = re.findall(r"\b[0-9A-Za-z]+\b", sent)
    return tokens

def removing_stopWords(sent):
    # `sent` is expected to be a list of tokens; filter using sklearn's ENGLISH_STOP_WORDS and punctuation
    returnList = []
    for i in sent:
        if i not in ENGLISH_STOP_WORDS and i not in string.punctuation:
            returnList.append(i)
    return returnList

def potter_stem(sent):
    # `sent` is expected to be a list of tokens (or an iterable), stem and join back to a string
    returnList = []
    for i in sent:
        returnList.append(ps.stem(i))
    return " ".join(returnList)

In [34]:
# Ensure EmailText is a string and then apply the preprocessing steps
df['imp_feature'] = df['EmailText'].astype(str).apply(get_importantFeatures)
df['imp_feature'] = df['imp_feature'].apply(removing_stopWords)
df['imp_feature'] = df['imp_feature'].apply(potter_stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['imp_feature'] = df['EmailText'].astype(str).apply(get_importantFeatures)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['imp_feature'] = df['imp_feature'].apply(removing_stopWords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['imp_feature'] = df['imp_feature'].apply(potter_stem)
A value

In [35]:
df

Unnamed: 0,Label,EmailText,imp_feature
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazi avail bugi n great world la...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah don t think goe usf live
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u u won 750 pound prize...
5568,0,Will Ã_ b going to esplanade fr home?,b go esplanad fr home
5569,0,"Pity, * was in mood for that. So...any other s...",piti mood suggest
5570,0,The guy did some bitching but I acted like i'd...,guy did bitch act like d interest buy week gav...


In [36]:
from sklearn.model_selection import train_test_split
X = df['imp_feature']
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
tfidf = TfidfVectorizer()
feature = tfidf.fit_transform(X_train)

tuned_parameters = {'kernel':['linear','rbf'],'gamma':[1e-3,1e-4], 'C':[1,10,100,1000]}

model = GridSearchCV(svm.SVC(),tuned_parameters)
model.fit(feature, y_train)

0,1,2
,estimator,SVC()
,param_grid,"{'C': [1, 10, ...], 'gamma': [0.001, 0.0001], 'kernel': ['linear', 'rbf']}"
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,1000
,kernel,'rbf'
,degree,3
,gamma,0.001
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [39]:
y_predict = tfidf.transform(X_test)
print("Accuracy:",model.score(y_predict,y_test))

Accuracy: 0.9837587006960556


In [None]:
# GridSearchCV does not provide a .save() method (AttributeError).
# Use pickle to persist the trained estimator (see next cell).
# Keeping a no-op here to avoid breaking execution when running sequentially.
pass

AttributeError: 'GridSearchCV' object has no attribute 'save'

In [None]:
import pickle
filename = 'finalized_model.sav'
# Save the best estimator returned by GridSearchCV so we can load and predict later
pickle.dump(model.best_estimator_, open(filename, 'wb'))

In [None]:
from tkinter import *
import tkinter as tk

spam_model = pickle.load(open("finalized_model.sav",'rb'))

def check_spam():
    text = spam_text_Entry.get()
    is_spam = spam_model.predict(tfidf.transform([text]))
    if is_spam == 1:
        print("text is spam")
        my_string_var.set("Result: text is spam")
    else:
        print("text is not spam")
        my_string_var.set("Result: text is not spam")
win = Tk()

win.geometry("400x600")
win.configure(background="cyan")
win.title("Sms Spam Detector")

title = Label(win, text="SMS Spam Detector", bg="gray",width="300",height="2",fg="white",font=("Calibri 20 bold italic underline")).pack()

spam_text = Label(win, text="Enter your Text: ",bg="cyan", font=("Verdana 12")).place(x=12,y=100)
spam_text_Entry = Entry(win, textvariable=spam_text,width=33)
spam_text_Entry.place(x=155, y=105)

my_string_var = StringVar()
my_string_var.set("Result: ")

print_spam = Label(win, textvariable=my_string_var,bg="cyan", font=("Verdana 12")).place(x=12,y=200)

Button = Button(win, text="Submit",width="12",height="1",activebackground="red",bg="Pink",command=check_spam,font=("Verdana 12")).place(x=12,y=150)

win.mainloop()

text is spam
