In [1]:
# Loading Dataset

import pandas as pd

df = pd.read_csv("Dataset/SMSSpamCollection",delimiter="\t",names=['label',"message"])

In [2]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# checking null values
df.isnull().sum()

label      0
message    0
dtype: int64

In [6]:
# text cleaning (removing special character(including digits),stopwords and doing lemmatization )
import re
import nltk
# nltk.download('stopwords') #already downloaded

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer # Using Lemmataization 

wnl=WordNetLemmatizer()

def clean(sentence):
    words = sentence.lower()
    words = re.sub('[^a-z]'," ",words)
    words = words.split()
    words = [wnl.lemmatize(word) for word in words if word not in stopwords.words("english") ]
    words = ' '.join(words)
    
    return words

In [7]:
# Example of how the cleaning is done
sample = df["message"][0]
print("Before cleaning :\n",sample)
print()
print("After cleaning :\n",clean(sample))

Before cleaning :
 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

After cleaning :
 go jurong point crazy available bugis n great world la e buffet cine got amore wat


In [8]:
# now applying cleaning function to our text
df["message"] = df["message"].apply(lambda x : clean(x))

In [9]:
df

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though
...,...,...
5567,spam,nd time tried contact u u pound prize claim ea...
5568,ham,b going esplanade fr home
5569,ham,pity mood suggestion
5570,ham,guy bitching acted like interested buying some...


In [10]:
# checking unique values of independent variable so we can transform into 0s and 1s
df["label"].unique()

array(['ham', 'spam'], dtype=object)

In [32]:
# Creating Train Test Split
from sklearn.model_selection import train_test_split
X = df["message"]
Y = df["label"].apply(lambda x : 0 if x=="ham" else 1)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)


In [33]:
# converting text to vector

#using BOW
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2),max_features=5000)

X_train_BOW = cv.fit_transform(X_train).toarray()

In [34]:
# now X_train[i] is turned into vector of 5000 dimension(Vocabulary)

print(len(cv.vocabulary_)) 
cv.vocabulary_           # to see vocabulary

5000


{'good': np.int64(1540),
 'movie': np.int64(2653),
 'ok': np.int64(2974),
 'leave': np.int64(1989),
 'ok leave': np.int64(2984),
 'free': np.int64(1370),
 'give': np.int64(1490),
 'otherwise': np.int64(3079),
 'nalla': np.int64(2737),
 'something': np.int64(4171),
 'maybe': np.int64(2392),
 'bit': np.int64(289),
 'got': np.int64(1558),
 'home': np.int64(1728),
 'babe': np.int64(191),
 'still': np.int64(4258),
 'awake': np.int64(179),
 'since': np.int64(4104),
 'already': np.int64(72),
 'workin': np.int64(4904),
 'get': np.int64(1462),
 'job': np.int64(1875),
 'get job': np.int64(1470),
 'said': np.int64(3819),
 'matter': np.int64(2375),
 'mind': np.int64(2485),
 'saying': np.int64(3870),
 'oh': np.int64(2957),
 'yeah': np.int64(4963),
 'diet': np.int64(1082),
 'window': np.int64(4862),
 'oh yeah': np.int64(2969),
 'sorry': np.int64(4185),
 'thing': np.int64(4434),
 'may': np.int64(2384),
 'pub': np.int64(3475),
 'later': np.int64(1959),
 'got thing': np.int64(1566),
 'may pub': np.int6

In [35]:
#  Now based on these vocabluary we will transform test split into vector

X_test_BOW = cv.transform(X_test).toarray()
X_test_BOW

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [36]:
# Now last step , we will model our dataset

In [37]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train_BOW,Y_train)

In [38]:
Y_pred = mnb.predict(X_test_BOW)
Y_pred

array([0, 1, 0, ..., 0, 1, 0])

In [39]:
# now we will check our model performance

In [40]:
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

accuracy_score(Y_test,Y_pred)

0.9865470852017937

In [41]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       955
           1       0.98      0.93      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [42]:
print(confusion_matrix(Y_test,Y_pred))

[[952   3]
 [ 12 148]]
