# SMS Spam Classification

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the labelled messages (label in column 'v1', message text in column 'v2').
# NOTE(review): some sampled rows below contain '�' replacement characters, which suggests the
# file's encoding may not match the reader's default — confirm and pass `encoding=` if needed.
df = pd.read_csv('emailspam.csv')

In [3]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
1496,ham,I'm always on yahoo messenger now. Just send t...,,,
3322,ham,"She said,'' do u mind if I go into the bedroom...",,,
2164,ham,"Nothing really, just making sure everybody's u...",,,
3692,ham,I was about to do it when i texted. I finished...,,,
3360,ham,You only hate me. You can call any but you did...,,,


In [4]:
df.shape

(5572, 5)

# Data Cleaning

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
# Drop the three trailing 'Unnamed' columns, which are almost entirely null (see df.info() above).
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [7]:
df.sample(5)

Unnamed: 0,v1,v2
4116,ham,Sure thing big man. i have hockey elections at...
2573,spam,Congrats 2 mobile 3G Videophones R yours. call...
5392,ham,Ooooooh I forgot to tell u I can get on yovill...
4649,ham,Finally it has happened..! Aftr decades..! BEE...
4747,ham,The beauty of life is in next second.. which h...


In [8]:
# Give the label and text columns descriptive names.
df = df.rename(columns={'v1': 'objective', 'v2': 'message'})

In [9]:
df.sample(5)

Unnamed: 0,objective,message
772,ham,"idc get over here, you are not weaseling your ..."
1511,ham,Oops sorry. Just to check that you don't mind ...
2908,spam,URGENT! Your Mobile number has been awarded wi...
998,ham,Then �_ wait 4 me at bus stop aft ur lect lar....
2683,ham,I'm okay. Chasing the dream. What's good. What...


In [10]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [11]:
# Replace the string labels with integer codes. LabelEncoder numbers classes in sorted order,
# so 'ham' becomes 0 and 'spam' becomes 1.
df['objective'] = encoder.fit_transform(df['objective'])

In [12]:
df.sample(5)

Unnamed: 0,objective,message
1233,0,Lol ok. I'll snatch her purse too.
175,0,Let me know when you've got the money so carlo...
3300,1,RCT' THNQ Adrian for U text. Rgds Vatian
1109,0,S s..first time..dhoni rocks...
654,0,Did u got that persons story


In [13]:
# finding missing values from dataset
df.isnull().sum()

objective    0
message      0
dtype: int64

In [14]:
# check for duplicate values
df.duplicated().sum()

403

In [15]:
# Remove duplicated rows, retaining the first occurrence of each (the default `keep` policy).
df = df.drop_duplicates()

In [16]:
df.shape

(5169, 2)

# Exploratory data analysis

In [17]:
df['objective'].value_counts()

0    4516
1     653
Name: objective, dtype: int64

In [18]:
df.head()

Unnamed: 0,objective,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# Data is imbalanced

In [20]:
import nltk

In [21]:
#finding number of characters in df

In [22]:
# Character count of every message, computed with the vectorized string accessor.
df['total_characters'] = df['message'].str.len()

In [23]:
df.sample(5)

Unnamed: 0,objective,message,total_characters
4651,0,Where r e meeting tmr?,22
447,0,I wont get concentration dear you know you are...,73
4732,0,Good Morning my Dear Shijutta........... Have ...,75
5030,0,Hey... Very inconvenient for your sis a not huh?,48
4508,0,�� takin linear algebra today?,30


In [24]:
#finding number of words

In [25]:
# Token count per message using NLTK's word tokenizer; punctuation marks count as tokens
# (e.g. "Ok can..." yields 3). The column is written under its correct name directly, so
# re-running this cell never leaves a misspelled 'tatal_words' column behind.
# NOTE(review): word_tokenize relies on NLTK tokenizer data; only 'stopwords' is downloaded in
# the visible cells — confirm the tokenizer data is installed.
df['total_words'] = df['message'].apply(lambda msg: len(nltk.word_tokenize(msg)))

In [26]:
# Normalize the misspelled 'tatal_words' column name to 'total_words'. DataFrame.rename ignores
# labels that are not present by default, so this is a no-op if the column is already named correctly.
df.rename(columns={'tatal_words':'total_words'},inplace=True)

In [27]:
df.sample(5)

Unnamed: 0,objective,message,total_characters,total_words
4995,0,Happy new year. Hope you are having a good sem...,51,11
3426,0,Haha okay... Today weekend leh...,34,7
3871,0,I am joining today formally.Pls keep praying.w...,61,10
1203,0,Thanks for understanding. I've been trying to ...,61,13
2013,1,Great News! Call FREEFONE 08006344447 to claim...,118,23


In [28]:
#finding number of sentences

In [29]:
# Sentence count per message, using NLTK's sentence tokenizer on the raw text.
df['total_sentences'] = df['message'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [30]:
df.sample(5)

Unnamed: 0,objective,message,total_characters,total_words,total_sentences
2429,1,Guess who am I?This is the first time I create...,152,35,3
1968,0,2 laptop... I noe infra but too slow lar... I ...,58,15,3
1593,1,PRIVATE! Your 2003 Account Statement for shows...,136,23,3
3246,0,* You gonna ring this weekend or wot?,37,10,1
3384,0,Ok can...,9,3,1


In [31]:

df[['total_characters','total_words','total_sentences']].describe()

Unnamed: 0,total_characters,total_words,total_sentences
count,5169.0,5169.0,5169.0
mean,78.924163,18.456375,1.962275
std,58.175349,13.323322,1.433892
min,2.0,1.0,1.0
25%,36.0,9.0,1.0
50%,60.0,15.0,1.0
75%,117.0,26.0,2.0
max,910.0,220.0,38.0


In [32]:
#ham

In [33]:
df[df['objective']==0][['total_characters','total_words','total_sentences']].describe()

Unnamed: 0,total_characters,total_words,total_sentences
count,4516.0,4516.0,4516.0
mean,70.457263,17.123339,1.815545
std,56.357463,13.491315,1.364098
min,2.0,1.0,1.0
25%,34.0,8.0,1.0
50%,52.0,13.0,1.0
75%,90.0,22.0,2.0
max,910.0,220.0,38.0


In [34]:
#spam

In [35]:
df[df['objective']==1][['total_characters','total_words','total_sentences']].describe()

Unnamed: 0,total_characters,total_words,total_sentences
count,653.0,653.0,653.0
mean,137.479326,27.675345,2.977029
std,30.014336,7.011513,1.493676
min,13.0,2.0,1.0
25%,131.0,25.0,2.0
50%,148.0,29.0,3.0
75%,157.0,32.0,4.0
max,223.0,46.0,9.0


# Data Preprocessing

In [40]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [45]:
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def transform_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. tokenize it with ``nltk.word_tokenize``;
      3. keep only tokens for which ``str.isalnum()`` is true, which drops
         punctuation tokens and contraction fragments such as "n't";
      4. drop English stop words;
      5. stem each remaining token with NLTK's ``PorterStemmer``.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when no
        token survives the filters.
    """
    # Looked up once instead of once per token, and as a set so that each
    # membership test is a hash lookup rather than a list scan.
    stop_words = _english_stop_words()
    stemmer = PorterStemmer()

    tokens = nltk.word_tokenize(text.lower())

    # The earlier version also tested `token not in string.punctuation`. That
    # test is always true for a non-empty alphanumeric token (the punctuation
    # string contains no alphanumeric character), so it is omitted here.
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]

    return " ".join(stemmer.stem(token) for token in kept)

In [46]:
transform_text("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")

'gon na home soon want talk stuff anymor tonight k cri enough today'

In [47]:
# Run the cleaning/stemming function over every message and keep the result in a new column
# alongside the original text.
df['transformed_text'] = df['message'].apply(transform_text)

In [48]:
df.head()

Unnamed: 0,objective,message,total_characters,total_words,total_sentences,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2,go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,29,8,2,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,49,13,1,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15,1,nah think goe usf live around though


# Model Building

In [49]:
#vectorization

In [50]:
# Bag-of-words features: one column per vocabulary term, holding that term's count in each message.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# NOTE(review): the vocabulary is fitted on the full dataset, before the train/test split that
# follows, so terms that occur only in test messages still shape the feature space — consider
# fitting on the training split only.
# .toarray() converts the sparse count matrix returned by fit_transform into a dense array.
x = cv.fit_transform(df['transformed_text']).toarray()

In [51]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [52]:
x.shape

(5169, 6677)

In [53]:
y = df['objective'].values

In [54]:
#training test split

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the classes are imbalanced (4516 vs 653 rows in the value counts above) and the
# split is not stratified — consider passing stratify=y.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)


In [57]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [58]:
# Fit a Bernoulli naive Bayes classifier on the training split, then predict the held-out split.
bnb = BernoulliNB().fit(x_train, y_train)
y_pred = bnb.predict(x_test)

# Report accuracy, the confusion matrix and precision, each starting on its own line.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    precision_score(y_test, y_pred),
    sep='\n',
)

0.971953578336557
[[894   2]
 [ 27 111]]
0.9823008849557522


In [59]:
import pickle

# Persist the fitted vectorizer and classifier so they can be reloaded together for inference.
# Context managers ensure each file is flushed and closed even if pickling raises.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(bnb, model_file)