#### Data source: https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

Adapted from: https://github.com/shreyans29/thesemicolon

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
df=pd.read_csv('smsspam',sep='\t',names=['Status','Message'])

In [3]:
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
len(df)

5572

In [5]:
len(df[df.Status=='spam'])

747

In [6]:
len(df[df.Status=='ham'])

4825

In [7]:
df.loc[df["Status"]=='ham',"Status"]=1

In [8]:
df.loc[df["Status"]=='spam',"Status"]=0

In [9]:
df.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df_x=df["Message"]
df_y=df["Status"]

In [11]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [12]:
x_train.head()

1457    U sleeping now.. Or you going to take? Haha.. ...
472     How long has it been since you screamed, princ...
2481    Urgent! call 09066612661 from landline. Your c...
243     Okay. No no, just shining on. That was meant t...
1413    Wen ur lovable bcums angry wid u, dnt take it ...
Name: Message, dtype: object

In [13]:
cv = CountVectorizer()

In [14]:
x_traincv = cv.fit_transform(["Hi How are you How are you doing","Hi what's up","Wow that's awesome"])

In [15]:
x_traincv.toarray()

array([[2, 0, 1, 1, 2, 0, 0, 0, 0, 2],
       [0, 0, 0, 1, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0, 1, 0]], dtype=int64)

In [16]:
cv.get_feature_names()

['are', 'awesome', 'doing', 'hi', 'how', 'that', 'up', 'what', 'wow', 'you']

In [17]:
cv.inverse_transform([2, 0, 1, 1, 2, 0, 0, 0, 0, 2])

[array(['are', 'doing', 'hi', 'how', 'you'], 
       dtype='<U7')]

In [18]:
x_traincv=cv.fit_transform(x_train)

In [19]:
a=x_traincv.toarray()

In [20]:
a[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
cv.inverse_transform(a[0])

[array(['checking', 'going', 'got', 'haha', 'lor', 'mails', 'me', 'now',
        'online', 'or', 'replying', 'sleeping', 'spys', 'take', 'to', 'wat',
        'you'], 
       dtype='<U27')]

In [22]:
x_train.iloc[0]

'U sleeping now.. Or you going to take? Haha.. I got spys wat.. Me online checking n replying mails lor..'

In [23]:
x_testcv=cv.transform(x_test)

In [24]:
mnb = MultinomialNB()

In [25]:
# transform to np arrays
y_train= np.int32( np.array(y_train) )
y_test = np.int32( np.array(y_test)  )

In [26]:
mnb.fit(x_traincv,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [27]:
from sklearn.metrics import accuracy_score

y_pred=mnb.predict(x_testcv)

accuracy_score(y_test, y_pred)

0.97937219730941705