# The following program demonstrates the classification of emial into spam or ham using NLTK and ML algorithms

In [2]:
#importing the required libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

# Reading the Data

In [3]:
#reading the data(CSV file)
df=pd.read_csv("spam.csv",encoding='latin1')
df.head(5)

Unnamed: 0,type,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df=df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
df.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
#printing the size of the dataset
df.shape

(5572, 2)

In [6]:
#getting feature names
df.columns

Index(['type', 'text'], dtype='object')

In [7]:
#checking the duplicates and remove them
df.drop_duplicates(inplace=True)
df.shape

(5169, 2)

In [8]:
#show the number of missing data(NAN,NaN,na) for each column
df.isnull().sum()

type    0
text    0
dtype: int64

# Message Preprocessing

In [33]:
#function to tokenize each and every word
def tokenizer(text):
    tokenized=nltk.word_tokenize(text)
    tokenized=' '.join(tokenized)
    tokenized=tokenized.replace('n\'t','not')
    return tokenized
                

In [34]:
#creating a function to process punctuation and stopwords in the text data
def process_stop_punc(text):
    #remove punctuations
    #remove stopwords
    #return a list of clen text words
    nopunc=[char for char in text if char not in string.punctuation]
    nopunc=''.join(nopunc)
    
    clean_words=[word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    return clean_words

In [35]:
#functions to convert words into single form i.e. converting plural to singular and past ,past continuous to present
def stemming(List):
    stem_obj=nltk.stem.PorterStemmer()
    List=[stem_obj.stem(i) for i in List]
    message=' '.join(List)
    return message

In [36]:
#function to compile each and every operation
def process(text):
    return stemming(process_stop_punc(tokenizer(text)))

In [37]:
#show the tokenization
df['text'].head().apply(process)

0    Go jurong point crazi avail bugi n great world...
1                                Ok lar joke wif u oni
2    free entri 2 wkli comp win FA cup final tkt 21...
3                  U dun say earli hor U c alreadi say
4                 nah think goe usf live around though
Name: text, dtype: object

# Vectorizing the words

In [38]:
#convert a collection of data to matrix of tokens using tf-idf vectorizer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
message=TfidfVectorizer(analyzer=process).fit_transform(df['text'])

In [83]:
#getting the shape of message
message.shape

(5169, 90)

In [82]:
#using countvectorizer
from sklearn.feature_extraction.text import CountVectorizer
message1=CountVectorizer(analyzer=process).fit_transform(df['text'])

# Splitting data into training tesing set

In [39]:
#splitting the data into 80:20 train test ratio for dataset vectorized using tf-idfvectorizer
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(message,df['type'],test_size=0.2,random_state=0)

In [84]:
#splitting the data into 80:20 train test ratio for dataset vectorized using countvectorizer
from sklearn.model_selection import train_test_split
X_train1,X_test1,y_train1,y_test1=train_test_split(message1,df['type'],test_size=0.2,random_state=0)

# Feeding Data and classification report of Naive Bayes classifier 

In [41]:
#creating and training the naive bayes classifier for dataset vectorized using tf-idfvectorizer
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB().fit(X_train,y_train)

In [85]:
#evaluate the model and training dataset
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
pred=classifier.predict(X_train)
print(classification_report(y_train,pred))
print()
print('confusion Matrix:\n',confusion_matrix(y_train,pred))
print()
print(' training accuracy score:\n',accuracy_score(y_train,pred))


              precision    recall  f1-score   support

         ham       0.97      0.99      0.98      3631
        spam       0.93      0.81      0.86       504

    accuracy                           0.97      4135
   macro avg       0.95      0.90      0.92      4135
weighted avg       0.97      0.97      0.97      4135


confusion Matrix:
 [[3598   33]
 [  97  407]]

 training accuracy score:
 0.9685610640870617


In [86]:
#printing the predictions
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
pred=classifier.predict(X_test)
print(classification_report(y_test,pred))
print()
print('confusion Matrix:\n',confusion_matrix(y_test,pred))
print()
print('testing accuracy score:\n',accuracy_score(y_test,pred))


              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       885
        spam       0.97      0.77      0.86       149

    accuracy                           0.96      1034
   macro avg       0.96      0.88      0.92      1034
weighted avg       0.96      0.96      0.96      1034


confusion Matrix:
 [[881   4]
 [ 34 115]]

testing accuracy score:
 0.9632495164410058


In [91]:
#creating and training the naive bayes classifier for for dataset vectorized using countvectorizer
from sklearn.naive_bayes import MultinomialNB
classifier1=MultinomialNB().fit(X_train1,y_train1)

In [90]:
#evaluate the model and training dataset
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
pred1=classifier1.predict(X_train1)
print(classification_report(y_train1,pred1))
print()
print('confusion Matrix:\n',confusion_matrix(y_train1,pred1))
print()
print(' training accuracy score:\n',accuracy_score(y_train1,pred1))

              precision    recall  f1-score   support

         ham       0.98      0.98      0.98      3631
        spam       0.88      0.86      0.87       504

    accuracy                           0.97      4135
   macro avg       0.93      0.92      0.93      4135
weighted avg       0.97      0.97      0.97      4135


confusion Matrix:
 [[3572   59]
 [  72  432]]

 training accuracy score:
 0.9683192261185006


In [89]:
#prediction from the model
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
pred1_test=classifier1.predict(X_test1)
print(classification_report(y_test1,pred1_test))
print()
print('confusion Matrix:\n',confusion_matrix(y_test1,pred1_test))
print()
print('testing accuracy score:\n',accuracy_score(y_test1,pred1_test))


              precision    recall  f1-score   support

         ham       0.97      0.98      0.98       885
        spam       0.90      0.82      0.86       149

    accuracy                           0.96      1034
   macro avg       0.93      0.90      0.92      1034
weighted avg       0.96      0.96      0.96      1034


confusion Matrix:
 [[871  14]
 [ 27 122]]

testing accuracy score:
 0.960348162475822


# Finding accuracy using 5-fold cross-validation

In [92]:
#import library and building 5-fold cross validation 
from sklearn.model_selection import KFold,cross_val_score
k_fold=KFold(len(df['type']),n_splits=5,shuffle=True,random_state=0)
clf=MultinomialNB()

In [93]:
#printing the cross validation accuray for for dataset vectorized using tf-idfvectorizer
accuracy=cross_val_score(clf,message,df['type'],cv=k_fold,n_jobs=1)
accuracy.mean()*100

96.51770168311086

In [94]:
#printing the cross validation accuray for for dataset vectorized using countvectorizer
accuracy1=cross_val_score(clf,message1,df['type'],cv=k_fold,n_jobs=1)
accuracy1.mean()*100

96.65312439543432

# Using support-vector-machine Algorithm

In [69]:
#prediction using LinearSVC and GridsearchCV and tokens obtained fron TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
param_grid={'C':[0.1,1,10,100]}
grid=GridSearchCV(LinearSVC(),param_grid,refit=True)
grid.fit(X_train,y_train)



GridSearchCV(estimator=LinearSVC(), param_grid={'C': [0.1, 1, 10, 100]})

In [71]:
#finding best C for best parameter
print(grid.best_params_)

{'C': 100}


In [72]:
#finding best accuracy
print(grid.best_score_)

0.9782345828295043


In [73]:
#prediction of test data
pred2=grid.predict(X_test)

In [78]:
#evaluate the model and training dataset
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print(classification_report(y_test,pred2))
print()
print('confusion Matrix:\n',confusion_matrix(y_test,pred2))
print()
print('accuracy score:\n',accuracy_score(y_test,pred2))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       885
        spam       0.97      0.85      0.90       149

    accuracy                           0.97      1034
   macro avg       0.97      0.92      0.94      1034
weighted avg       0.97      0.97      0.97      1034


confusion Matrix:
 [[881   4]
 [ 23 126]]

accuracy score:
 0.9738878143133463


In [65]:
#prediction using LinearSVC and GridsearchCV and tokens obtained fron CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
param_grid={'C':[0.1,1,10,100]}
grid=GridSearchCV(LinearSVC(),param_grid,refit=True)
grid.fit(X_train1,y_train1)



GridSearchCV(estimator=LinearSVC(), param_grid={'C': [0.1, 1, 10, 100]})

In [66]:
#finding best C for best parameter
print(grid.best_params_)

{'C': 10}


In [70]:
#finding best accuracy
print(grid.best_score_)

0.9782345828295043


In [79]:
#training teh dataset
grid.fit(X_train1,y_train1)



GridSearchCV(estimator=LinearSVC(), param_grid={'C': [0.1, 1, 10, 100]})

In [80]:
#prediction of test data
pred3=grid.predict(X_test1)

In [81]:
#evaluate the model and training dataset
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print(classification_report(y_test1,pred3))
print()
print('confusion Matrix:\n',confusion_matrix(y_test1,pred3))
print()
print('accuracy score:\n',accuracy_score(y_test1,pred3))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       885
        spam       0.98      0.87      0.92       149

    accuracy                           0.98      1034
   macro avg       0.98      0.93      0.95      1034
weighted avg       0.98      0.98      0.98      1034


confusion Matrix:
 [[883   2]
 [ 20 129]]

accuracy score:
 0.9787234042553191
