# The following program demonstrates the classification of email into spam or ham using NLTK and ML algorithms

In [1]:
#importing the required libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

# Reading the Data

In [2]:
# Load the labelled messages from the CSV file (latin-1 encoded) and preview the first rows
df = pd.read_csv("spam.csv", encoding="latin1")
df.head()

Unnamed: 0,type,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Discard the three unnamed extra columns, then preview the result
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
df.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#size of the dataset as (rows, columns)
df.shape

(5572, 2)

In [5]:
#column names of the DataFrame
df.columns

Index(['type', 'text'], dtype='object')

In [6]:
# Remove exact duplicate rows and show the new size.
# The result is rebound to `df` instead of mutating the frame with inplace=True.
df = df.drop_duplicates()
df.shape

(5169, 2)

In [7]:
#count the missing values (NaN/None) in each column
df.isnull().sum()

type    0
text    0
dtype: int64

# Message Preprocessing

In [8]:
#function to tokenize a message and rewrite the "n't" contraction
def tokenizer(text):
    """Tokenize ``text`` with NLTK and return the tokens re-joined by single spaces.

    After joining, every occurrence of the substring "n't" is replaced by "not".
    """
    tokens = nltk.word_tokenize(text)
    return ' '.join(tokens).replace("n't", 'not')
                

In [9]:
#creating a function to process punctuation and stopwords in the text data
def process_stop_punc(text):
    """Strip punctuation characters and English stopwords from ``text``.

    Args:
        text: The message as a single string.

    Returns:
        A list of the remaining whitespace-separated words, in their original
        order and casing (the stopword comparison itself is case-insensitive).
    """
    # Drop every character that appears in string.punctuation.
    nopunc = ''.join(char for char in text if char not in string.punctuation)

    # Fetch the stopword list once per call and keep it in a set; the original
    # called stopwords.words('english') again for every single word.
    stop_words = set(stopwords.words('english'))

    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [10]:
#function to reduce every word to its stem using the Porter stemmer
def stemming(List):
    """Stem each word in ``List`` and return the stems joined by single spaces."""
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in List)

In [11]:
#function chaining all preprocessing steps
def process(text):
    """Run ``text`` through tokenizer, process_stop_punc and stemming, in that order."""
    tokenized = tokenizer(text)
    clean_words = process_stop_punc(tokenized)
    return stemming(clean_words)

In [12]:
#preview the full preprocessing pipeline (tokenize, clean, stem) on the first five messages
df['text'].head().apply(process)

0    Go jurong point crazi avail bugi n great world...
1                                Ok lar joke wif u oni
2    free entri 2 wkli comp win FA cup final tkt 21...
3                  U dun say earli hor U c alreadi say
4                 nah think goe usf live around though
Name: text, dtype: object

# Vectorizing the words

In [60]:
#convert the messages into a matrix of TF-IDF features
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): the vectorizer is fit on the raw df['text'] column; the `process`
# pipeline defined above is not applied here. It is also fit on the full dataset
# before the train/test split. Confirm whether both are intended.
message=TfidfVectorizer().fit_transform(df['text'])

In [61]:
#shape of the TF-IDF matrix
message.shape

(5169, 8672)

In [121]:
#vectorize the same text column with raw token counts instead of TF-IDF weights
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): the fitted CountVectorizer is not bound to a name, so its
# vocabulary cannot be reused later to transform new messages.
message1=CountVectorizer().fit_transform(df['text'])
message1

<5169x8672 sparse matrix of type '<class 'numpy.int64'>'
	with 68018 stored elements in Compressed Sparse Row format>

# Splitting data into training and testing sets

In [63]:
#80:20 train/test split of the TF-IDF features; random_state=0 makes the split repeatable
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(message,df['type'],test_size=0.2,random_state=0)

In [64]:
#80:20 train/test split of the CountVectorizer features, using the same test_size and random_state as the TF-IDF split
from sklearn.model_selection import train_test_split
X_train1,X_test1,y_train1,y_test1=train_test_split(message1,df['type'],test_size=0.2,random_state=0)

# Feeding Data and classification report of Naive Bayes classifier 

In [65]:
#fit a multinomial Naive Bayes classifier on the TF-IDF training split
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB().fit(X_train,y_train)

In [66]:
#evaluate the TF-IDF Naive Bayes model on the data it was trained on
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
pred=classifier.predict(X_train)
print(classification_report(y_train,pred))
print()
print('confusion Matrix:\n',confusion_matrix(y_train,pred))
print()
print(' training accuracy score:\n',accuracy_score(y_train,pred))


              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      3631
        spam       1.00      0.71      0.83       504

    accuracy                           0.97      4135
   macro avg       0.98      0.86      0.91      4135
weighted avg       0.97      0.97      0.96      4135


confusion Matrix:
 [[3631    0]
 [ 144  360]]

 training accuracy score:
 0.9651753325272068


In [67]:
#evaluate the TF-IDF Naive Bayes model on the held-out test split
#(`pred` is rebound here from the training predictions to the test predictions)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
pred=classifier.predict(X_test)
print(classification_report(y_test,pred))
print()
print('confusion Matrix:\n',confusion_matrix(y_test,pred))
print()
print('testing accuracy score:\n',accuracy_score(y_test,pred))


              precision    recall  f1-score   support

         ham       0.94      1.00      0.97       885
        spam       1.00      0.60      0.75       149

    accuracy                           0.94      1034
   macro avg       0.97      0.80      0.86      1034
weighted avg       0.95      0.94      0.94      1034


confusion Matrix:
 [[885   0]
 [ 60  89]]

testing accuracy score:
 0.941972920696325


In [68]:
#fit a multinomial Naive Bayes classifier on the CountVectorizer training split
from sklearn.naive_bayes import MultinomialNB
classifier1=MultinomialNB().fit(X_train1,y_train1)

In [69]:
#evaluate the CountVectorizer Naive Bayes model on the data it was trained on
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
pred1=classifier1.predict(X_train1)
print(classification_report(y_train1,pred1))
print()
print('confusion Matrix:\n',confusion_matrix(y_train1,pred1))
print()
print(' training accuracy score:\n',accuracy_score(y_train1,pred1))

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3631
        spam       0.98      0.97      0.97       504

    accuracy                           0.99      4135
   macro avg       0.99      0.98      0.99      4135
weighted avg       0.99      0.99      0.99      4135


confusion Matrix:
 [[3623    8]
 [  17  487]]

 training accuracy score:
 0.9939540507859734


In [70]:
#evaluate the CountVectorizer Naive Bayes model on the held-out test split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
pred1_test=classifier1.predict(X_test1)
print(classification_report(y_test1,pred1_test))
print()
print('confusion Matrix:\n',confusion_matrix(y_test1,pred1_test))
print()
print('testing accuracy score:\n',accuracy_score(y_test1,pred1_test))


              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       885
        spam       0.91      0.93      0.92       149

    accuracy                           0.98      1034
   macro avg       0.95      0.96      0.96      1034
weighted avg       0.98      0.98      0.98      1034


confusion Matrix:
 [[872  13]
 [ 10 139]]

testing accuracy score:
 0.9777562862669246


# Finding accuracy using 5-fold cross-validation

In [71]:
#build a shuffled 5-fold splitter and the classifier to cross-validate
from sklearn.model_selection import KFold,cross_val_score
# KFold's first positional parameter is n_splits, so the original call
# KFold(len(df['type']), n_splits=5, ...) supplied n_splits twice. The dataset
# size is not a constructor argument; it is taken from the data when splitting.
k_fold=KFold(n_splits=5,shuffle=True,random_state=0)
clf=MultinomialNB()

In [72]:
#mean cross-validation accuracy (in percent) for the dataset vectorized using the TF-IDF vectorizer
accuracy=cross_val_score(clf,message,df['type'],cv=k_fold,n_jobs=1)
accuracy.mean()*100

96.15012574966144

In [73]:
#mean cross-validation accuracy (in percent) for the dataset vectorized using CountVectorizer
accuracy1=cross_val_score(clf,message1,df['type'],cv=k_fold,n_jobs=1)
accuracy1.mean()*100

97.92996711162701

# Using support-vector-machine Algorithm

In [74]:
#grid-search the LinearSVC parameter C on the TF-IDF training split
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
param_grid={'C':[0.1,1,10,100]}
#refit=True retrains the best configuration on the whole training split once the search finishes
grid=GridSearchCV(LinearSVC(),param_grid,refit=True)
grid.fit(X_train,y_train)



GridSearchCV(estimator=LinearSVC(), param_grid={'C': [0.1, 1, 10, 100]})

In [75]:
#value of C selected by the grid search on the TF-IDF features
print(grid.best_params_)

{'C': 100}


In [76]:
#mean cross-validated score of the selected C (computed on the training split, not the test split)
print(grid.best_score_)

0.9794437726723096


In [77]:
#predict the test split with the refitted best TF-IDF model
pred2=grid.predict(X_test)

In [78]:
#evaluate the TF-IDF LinearSVC model on the held-out test split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print(classification_report(y_test,pred2))
print()
print('confusion Matrix:\n',confusion_matrix(y_test,pred2))
print()
print('accuracy score:\n',accuracy_score(y_test,pred2))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       885
        spam       0.99      0.87      0.92       149

    accuracy                           0.98      1034
   macro avg       0.99      0.93      0.96      1034
weighted avg       0.98      0.98      0.98      1034


confusion Matrix:
 [[884   1]
 [ 20 129]]

accuracy score:
 0.9796905222437138


In [79]:
#grid-search the LinearSVC parameter C on the CountVectorizer training split
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
param_grid1={'C':[0.1,1,10,100]}
# Use this cell's own grid. The original passed `param_grid` from the TF-IDF
# cell, which left `param_grid1` unused and tied this cell to the earlier one.
grid1=GridSearchCV(LinearSVC(),param_grid1,refit=True)
grid1.fit(X_train1,y_train1)



GridSearchCV(estimator=LinearSVC(), param_grid={'C': [0.1, 1, 10, 100]})

In [80]:
#value of C selected by the grid search on the CountVectorizer features
print(grid1.best_params_)

{'C': 1}


In [81]:
#mean cross-validated score of the selected C (computed on the training split, not the test split)
print(grid1.best_score_)

0.9823458282950422


In [82]:
#run the grid search on the CountVectorizer training split
# NOTE(review): grid1 was already fit on the same data when it was created, so
# this cell repeats the identical search. Confirm whether it can be removed.
grid1.fit(X_train1,y_train1)



GridSearchCV(estimator=LinearSVC(), param_grid={'C': [0.1, 1, 10, 100]})

In [122]:
#predict the test split with the refitted best CountVectorizer model and show the labels
pred3=grid1.predict(X_test1)
pred3

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [84]:
#evaluate the CountVectorizer LinearSVC model on the held-out test split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print(classification_report(y_test1,pred3))
print()
print('confusion Matrix:\n',confusion_matrix(y_test1,pred3))
print()
print('accuracy score:\n',accuracy_score(y_test1,pred3))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       885
        spam       0.99      0.91      0.95       149

    accuracy                           0.99      1034
   macro avg       0.99      0.95      0.97      1034
weighted avg       0.99      0.99      0.99      1034


confusion Matrix:
 [[884   1]
 [ 14 135]]

accuracy score:
 0.9854932301740812


In [85]:
import pickle

In [86]:
# Persist the fitted grid search (CountVectorizer-based LinearSVC) to disk.
# NOTE(review): only the classifier is saved. The CountVectorizer that produced
# its input features was never kept, so new text cannot be vectorized the same
# way for the reloaded model. Confirm how this file is meant to be used.
with open('email_predictor','wb') as f:
    pickle.dump(grid1,f)

In [87]:
# Reload the pickled model to check that it round-trips.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('email_predictor','rb') as f:
    predict_spam=pickle.load(f)