In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [2]:
data = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=['class', 'msg'])
data.head()

Unnamed: 0,class,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data.groupby('class').nunique()

Unnamed: 0_level_0,msg
class,Unnamed: 1_level_1
ham,4516
spam,653


In [4]:
data.isna().sum()

class    0
msg      0
dtype: int64

In [5]:
X = data['msg']
y = data['class']

In [6]:
count_vect = CountVectorizer(stop_words='english', 
                             token_pattern=r'[a-z]+',
                             lowercase=True,
                             strip_accents='unicode')
pipeline1 = Pipeline([('vect', count_vect),
                      ('clf', MultinomialNB())
                    ])
pipeline2 = Pipeline([('vect', count_vect),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
                    ])

In [7]:
count_vect.fit(X)
count_vect.vocabulary_

{'jurong': 3356,
 'point': 4850,
 'crazy': 1397,
 'available': 433,
 'bugis': 846,
 'n': 4211,
 'great': 2669,
 'world': 7342,
 'la': 3479,
 'e': 1881,
 'buffet': 844,
 'cine': 1127,
 'got': 2630,
 'amore': 210,
 'wat': 7139,
 'ok': 4461,
 'lar': 3512,
 'joking': 3324,
 'wif': 7247,
 'u': 6839,
 'oni': 4487,
 'free': 2400,
 'entry': 2006,
 'wkly': 7300,
 'comp': 1240,
 'win': 7259,
 'fa': 2135,
 'cup': 1456,
 'final': 2254,
 'tkts': 6642,
 'st': 6116,
 'text': 6509,
 'receive': 5238,
 'question': 5132,
 'std': 6158,
 'txt': 6824,
 'rate': 5185,
 't': 6388,
 'c': 888,
 's': 5509,
 'apply': 288,
 'dun': 1866,
 'say': 5576,
 'early': 1887,
 'hor': 2951,
 'nah': 4217,
 'don': 1778,
 'think': 6562,
 'goes': 2595,
 'usf': 6957,
 'lives': 3665,
 'freemsg': 2406,
 'hey': 2869,
 'darling': 1513,
 'week': 7185,
 'word': 7333,
 'd': 1485,
 'like': 3616,
 'fun': 2461,
 'tb': 6444,
 'xxx': 7416,
 'chgs': 1081,
 'send': 5664,
 'rcv': 5197,
 'brother': 816,
 'speak': 6037,
 'treat': 6759,
 'aids': 14

In [8]:
print('Train/Test\tScore')
for n in range(10, 100, 10):
    list1 = []
    list2 = []
    for m in range(0, 10):
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=n/100, 
                                                            stratify=y)
        pipeline1.fit(X_train, y_train)
        pipeline2.fit(X_train, y_train)
        list1.append(pipeline1.score(X_test, y_test))
        list2.append(pipeline2.score(X_test, y_test))
    print(str(100-n) + '/' + str(n) + '\t\t'
          + str(sum(list1) / len(list1))
          + '\n\t\t'
          + str(sum(list2) / len(list2))
         )

Train/Test	Score
90/10		0.9811827956989247
		0.9673835125448029
80/20		0.9832286995515694
		0.971121076233184
70/30		0.984090909090909
		0.9688995215311003
60/40		0.9842081650964559
		0.9692687303723643
50/50		0.9824479540559944
		0.9647523330940416
40/60		0.9836124401913873
		0.9583732057416269
30/70		0.9815944629582158
		0.9521661112535249
20/80		0.9800358905338717
		0.9418573351278601
10/90		0.9767298105682951
		0.9126420737786642


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    stratify=y)
pipeline1.fit(X_train, y_train)
print(pipeline1.score(X_test, y_test))
print(metrics.confusion_matrix(y_test, pipeline1.predict(X_test)))
pipeline2.fit(X_train, y_train)
print(pipeline2.score(X_test, y_test))
print(metrics.confusion_matrix(y_test, pipeline2.predict(X_test)))

0.9813352476669059
[[1200    6]
 [  20  167]]
0.9691313711414213
[[1206    0]
 [  43  144]]
