In [23]:
import pandas as pd
import re
import nltk
# Fetch the NLTK resources used further down: the stop-word lists and the
# WordNet data needed by WordNetLemmatizer. The value returned by the last
# call is what the cell displays.
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
# Load the tab-separated SMS corpus, naming the two columns explicitly.
# NOTE(review): pandas' default quote handling can merge lines that contain a
# stray double quote -- confirm the row count against the dataset's own
# documentation and consider quoting=csv.QUOTE_NONE if rows are missing.
messages = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    names=["label", "message"],
)

In [25]:
# Class balance: number of messages per label.
messages.loc[:, 'label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [26]:
# Preview ten rows chosen with a fixed seed so the same rows show on every run.
messages.loc[:, ['label', 'message']].sample(n=10, random_state=5)

Unnamed: 0,label,message
2095,spam,PRIVATE! Your 2004 Account Statement for 07742...
5343,ham,No go. No openings for that room 'til after th...
564,spam,GENT! We are trying to contact you. Last weeke...
3849,ham,Can you plz tell me the ans. BSLVYL sent via f...
3317,ham,I know girls always safe and selfish know i go...
5277,ham,"Best msg: It's hard to be with a person, when ..."
1674,spam,Monthly password for wap. mobsi.com is 391784....
3753,ham,Why are u up so early?
5507,ham,I want to be inside you every night...
265,ham,Why you Dint come with us.


In [27]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# The two text normalizers compared in this notebook.
ps, lemmatizer = PorterStemmer(), WordNetLemmatizer()
# corpus1 collects the stemmed messages, corpus2 the lemmatized ones.
corpus1, corpus2 = [], []

In [28]:
# Preprocessing for the stemmed corpus (corpus1):
#   1. replace every character other than a-z / A-Z with a blank space,
#   2. change the text to lowercase and split it into words,
#   3. drop English stop words and reduce each remaining word to its stem,
#   4. join the words back into one string and add it to the corpus.

# Build the stop-word collection once, as a set. The original requested the
# full list again for every single token and scanned it linearly each time.
stop_words = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell rebuilds the corpus
# instead of appending a second copy of every message.
corpus1 = []
# Iterate over the column values directly: messages['message'][i] is a label
# lookup and only works while the index is the default 0..n-1 range.
for message in messages['message']:
    words = re.sub("[^a-zA-Z]" , " " , message).lower().split()
    corpus1.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [29]:
# Preprocessing for the lemmatized corpus (corpus2): same cleaning steps as
# for the stemmed corpus (keep letters only, lowercase, split, drop English
# stop words), but each remaining word is lemmatized instead of stemmed.

# Build the stop-word collection once, as a set. The original requested the
# full list again for every single token and scanned it linearly each time.
stop_words = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell rebuilds the corpus
# instead of appending a second copy of every message.
corpus2 = []
# Iterate over the column values directly: messages['message'][i] is a label
# lookup and only works while the index is the default 0..n-1 range.
for message in messages['message']:
    words = re.sub("[^a-zA-Z]" , " " , message).lower().split()
    corpus2.append(' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words))

In [32]:
# Creating the Bag of Words (BOW) features using CountVectorizer.
# X1 is the BOW matrix of the stemmed corpus (corpus1).
# X2 is the BOW matrix of the lemmatized corpus (corpus2).
from sklearn.feature_extraction.text import CountVectorizer

# One vectorizer per corpus: refitting a single instance on corpus2 would
# overwrite the vocabulary learned from corpus1, leaving no fitted object that
# can map new text into X1's feature space.
cv1 = CountVectorizer(max_features = 5000)
cv2 = CountVectorizer(max_features = 5000)
X1 = cv1.fit_transform(corpus1).toarray()
X2 = cv2.fit_transform(corpus2).toarray()

# Backward-compatible alias: as before, `cv` ends up fitted on corpus2.
cv = cv2

In [33]:
# Creating the Term Frequency-Inverse Document Frequency (TF-IDF) features.
# X3 is the TF-IDF matrix of the stemmed corpus (corpus1).
# X4 is the TF-IDF matrix of the lemmatized corpus (corpus2).
from sklearn.feature_extraction.text import TfidfVectorizer

# One vectorizer per corpus: refitting a single instance on corpus2 would
# overwrite the vocabulary and IDF weights learned from corpus1, leaving no
# fitted object that can map new text into X3's feature space.
tfidf1 = TfidfVectorizer(max_features = 5000)
tfidf2 = TfidfVectorizer(max_features = 5000)
X3 = tfidf1.fit_transform(corpus1).toarray()
X4 = tfidf2.fit_transform(corpus2).toarray()

# Backward-compatible alias: as before, `tfidf` ends up fitted on corpus2.
tfidf = tfidf2

In [35]:
# One-hot encode the label column: one indicator column per class.
y = pd.get_dummies(data=messages["label"])
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [36]:
# Classify the spam messages as 1 and the non-spam (ham) messages as 0.
# The target is derived straight from the label column. The previous form,
# y = y.iloc[:,1].values, overwrote the one-hot DataFrame with an array, so a
# second run of this cell failed; it also picked the spam column by position.
y = (messages['label'] == 'spam').astype(int).values

In [38]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing. Each feature matrix is split against
# the same target with the same test_size and random_state.
split_options = {'test_size': 0.25, 'random_state': 42}
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, **split_options)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, **split_options)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y, **split_options)
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y, **split_options)

In [39]:
from sklearn.naive_bayes import MultinomialNB
# Fit one multinomial Naive Bayes classifier per feature set
# (1: BOW + stemming, 2: BOW + lemmatization, 3: TF-IDF + stemming,
#  4: TF-IDF + lemmatization).
spam_model1, spam_model2, spam_model3, spam_model4 = (
    MultinomialNB().fit(features, labels)
    for features, labels in (
        (X1_train, y1_train),
        (X2_train, y2_train),
        (X3_train, y3_train),
        (X4_train, y4_train),
    )
)

In [44]:
# Predict the held-out labels with each model on its own test features.
y_pred1, y_pred2, y_pred3, y_pred4 = (
    model.predict(features)
    for model, features in (
        (spam_model1, X1_test),
        (spam_model2, X2_test),
        (spam_model3, X3_test),
        (spam_model4, X4_test),
    )
)

In [49]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for model 1 (rows: true class, columns: predicted class).
cm1 = confusion_matrix(y_true=y1_test, y_pred=y_pred1)
cm1

array([[1191,   16],
       [  10,  176]], dtype=int64)

In [50]:
# Confusion matrix for model 2 (rows: true class, columns: predicted class).
cm2 = confusion_matrix(y_true=y2_test, y_pred=y_pred2)
cm2

array([[1188,   19],
       [  10,  176]], dtype=int64)

In [52]:
# Confusion matrix for model 3 (rows: true class, columns: predicted class).
cm3 = confusion_matrix(y_true=y3_test, y_pred=y_pred3)
cm3

array([[1206,    1],
       [  41,  145]], dtype=int64)

In [53]:
# Confusion matrix for model 4 (rows: true class, columns: predicted class).
cm4 = confusion_matrix(y_true=y4_test, y_pred=y_pred4)
cm4

array([[1206,    1],
       [  38,  148]], dtype=int64)

In [55]:
from sklearn.metrics import accuracy_score
# Accuracy of model 1 on its test split, displayed as a percentage.
accuracy1 = accuracy_score(y_true=y1_test, y_pred=y_pred1)
accuracy1 * 100

98.1335247666906

In [56]:
# Accuracy of model 2 on its test split, displayed as a percentage.
accuracy2 = accuracy_score(y_true=y2_test, y_pred=y_pred2)
accuracy2 * 100

97.91816223977028

In [57]:
# Accuracy of model 3 on its test split, displayed as a percentage.
accuracy3 = accuracy_score(y_true=y3_test, y_pred=y_pred3)
accuracy3 * 100

96.98492462311557

In [58]:
# Accuracy of model 4 on its test split, displayed as a percentage.
accuracy4 = accuracy_score(y_true=y4_test, y_pred=y_pred4)
accuracy4 * 100

97.20028715003589

In [61]:
# Summary table: one row per model, with its feature type, its text
# normalization and its accuracy as a percentage.
df = pd.DataFrame(
    [
        ('BOW', 'Stemming', accuracy1*100),
        ('BOW', 'Lemmatization', accuracy2*100),
        ('TfIdf', 'Stemming', accuracy3*100),
        ('TfIdf', 'Lemmatization', accuracy4*100),
    ],
    columns=['Model Used', 'Type of Text Normalization used', 'Accuracy'],
)

In [62]:
# Display the accuracy comparison table.
df

Unnamed: 0,Model Used,Type of Text Normalization used,Accuracy
0,BOW,Stemming,98.133525
1,BOW,Lemmatization,97.918162
2,TfIdf,Stemming,96.984925
3,TfIdf,Lemmatization,97.200287


In [None]:
#We can conclude from the table above that the first model, i.e. BOW + Stemming, performs the best
#of the four models, with an accuracy score of 98.13%. The TfIdf + Stemming model performs
#the worst, with an accuracy score of 96.98%.