In [27]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [6]:
df = pd.read_csv("/Users/Nidhi/Desktop/Jupyter Notebooks/SMSSpamCollection",sep='\t',names=['Status','Message'])

In [7]:
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.loc[df["Status"]=='ham',"Status"]=1
df.loc[df["Status"]=='spam',"Status"]=0

In [9]:
df.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
x = df['Message']
y = df['Status']

### Count Vectorizer is used to count the occurence of the words in the doc


In [12]:
cv = CountVectorizer()

In [21]:
#x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [22]:
x_df = cv.fit_transform(x_train)

In [23]:
x_df.toarray()
# counts the frequency of each word corresponding to each message

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
cv.get_feature_names() # gets all the words that has ever occured in the doc

In [25]:
cv.inverse_transform(x_df.toarray()[0])

[array(['an', 'for', 'good', 'hourish', 'if', 'in', 'is', 'it', 'leave',
        'movie', 'no', 'ok', 'the'], dtype='<U34')]

In [26]:
x_train.iloc[0]

"No I'm good for the movie, is it ok if I leave in an hourish?"

# Using TFID vectorization


In [69]:
tf = TfidfVectorizer(min_df=1,stop_words='english')

In [70]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [71]:
x_tf = tf.fit_transform(x_train)

In [72]:
x_tf.toarray() 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [73]:
tf.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07090201529',
 '07090298926',
 '07099833605',
 '0721072',
 '07734396839',
 '07742676969',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '08452810071',
 '08452810073',
 '08452810075over18',
 '0870',
 '08700435505150p',
 '08700621170150p',
 '08701213186',
 '08701237397',
 '08701417012',

TFIDF = Term frequency inverse distribution frequency
It reduces importance of words such as "a","the","or" etc which might occur alot of times thereby dominating our results
Therefore to avoid common words we use tfId

In [74]:
tf.inverse_transform(x_tf.toarray()[0])

[array(['good', 'hourish', 'leave', 'movie', 'ok'], dtype='<U34')]

### Output using count vectorization:
[array(['an', 'for', 'good', 'hourish', 'if', 'in', 'is', 'it', 'leave',
        'movie', 'no', 'ok', 'the'], dtype='<U34')]
Words such as 'an','if','in' etc are ommited in tfId

In [75]:
mnb = MultinomialNB()

In [76]:
y_train=y_train.astype('int')
y_test=y_test.astype('int')

In [77]:
mnb.fit(x_tf,y_train)

MultinomialNB()

In [78]:
x_testtf = tf.transform(x_test)

In [79]:
pred_y= mnb.predict(x_testtf)

In [80]:
pred_y

array([1, 1, 1, ..., 1, 0, 1])

In [81]:
from sklearn.metrics import confusion_matrix

In [82]:
cm = confusion_matrix(y_test,pred_y)

In [83]:
cm

array([[134,  26],
       [  0, 955]], dtype=int64)

In [84]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,pred_y)

0.9766816143497757

97.66% accuracy