In [2]:
# Loading Dataset

import pandas as pd

df = pd.read_csv("Dataset/SMSSpamCollection",delimiter="\t",names=['label',"message"])

In [3]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# checking null values
df.isnull().sum()

label      0
message    0
dtype: int64

In [5]:
# text cleaning (removing special character(including digits),stopwords and doing lemmatization )
import re
import nltk
# nltk.download('stopwords') #already downloaded

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer # Using Lemmataization 

wnl=WordNetLemmatizer()

def clean(sentence):
    words = sentence.lower()
    words = re.sub('[^a-z]'," ",words)
    words = words.split()
    words = [wnl.lemmatize(word) for word in words if word not in stopwords.words("english") ]
    words = ' '.join(words)
    
    return words
    

In [6]:
# Example of how the cleaning is done
sample = df["message"][0]
print("Before cleaning :\n",sample)
print()
print("After cleaning :\n",clean(sample))

Before cleaning :
 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

After cleaning :
 go jurong point crazy available bugis n great world la e buffet cine got amore wat


In [7]:
# now applying cleaning function to our text
df["message"] = df["message"].apply(lambda x : clean(x))

In [8]:
df

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though
...,...,...
5567,spam,nd time tried contact u u pound prize claim ea...
5568,ham,b going esplanade fr home
5569,ham,pity mood suggestion
5570,ham,guy bitching acted like interested buying some...


In [9]:
# checking unique values of independent variable so we can transform into 0s and 1s
df["label"].unique()

array(['ham', 'spam'], dtype=object)

In [13]:
# Creating Train Test Split
from sklearn.model_selection import train_test_split
X = df["message"]
Y = df["label"].apply(lambda x : 0 if x=="ham" else 1)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)


In [14]:
# converting text to vector

#using BOW
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,2),max_features=5000)

X_train_TFIDF = tfidf.fit_transform(X_train).toarray()

In [15]:
# now X_train[i] is turned into vector of 5000 dimension(Vocabulary)

print(len(tfidf.vocabulary_)) 
tfidf.vocabulary_  

5000


{'good': 1705,
 'movie': 2776,
 'ok': 2944,
 'leave': 2301,
 'ok leave': 2952,
 'free': 1419,
 'give': 1613,
 'otherwise': 3047,
 'something': 3883,
 'maybe': 2634,
 'bit': 359,
 'got': 1723,
 'home': 1893,
 'babe': 244,
 'still': 4019,
 'awake': 232,
 'babe still': 248,
 'still awake': 4022,
 'since': 3805,
 'already': 88,
 'workin': 4869,
 'get': 1540,
 'job': 2119,
 'get job': 1561,
 'said': 3578,
 'matter': 2624,
 'mind': 2689,
 'saying': 3605,
 'oh': 2939,
 'yeah': 4951,
 'diet': 978,
 'window': 4816,
 'oh yeah': 2943,
 'sorry': 3900,
 'thing': 4294,
 'may': 2631,
 'pub': 3342,
 'later': 2261,
 'sorry got': 3906,
 'got thing': 1731,
 'thing may': 4303,
 'ill': 1993,
 'call': 491,
 'evening': 1194,
 'idea': 1981,
 'ill call': 1994,
 'dear': 883,
 'room': 3560,
 'look': 2396,
 'daddy': 820,
 'want': 4658,
 'eat': 1139,
 'night': 2885,
 'long': 2390,
 'chennai': 616,
 'surely': 4110,
 'pick': 3149,
 'competition': 708,
 'da': 809,
 'car': 555,
 'park': 3079,
 'da car': 810,
 'car par

In [16]:
#  Now based on these vocabluary we will transform test split into vector

X_test_TFIDF = tfidf.transform(X_test).toarray()
X_test_TFIDF

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
# Now last step , we will model our dataset

In [18]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train_TFIDF,Y_train)

In [19]:
Y_pred = mnb.predict(X_test_TFIDF)
Y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [20]:
# now we will check our model performance

In [21]:
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

accuracy_score(Y_test,Y_pred)

0.9721973094170404

In [22]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       955
           1       1.00      0.81      0.89       160

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [23]:
print(confusion_matrix(Y_test,Y_pred))

[[955   0]
 [ 31 129]]
