In [253]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import re  #regex
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [254]:
data = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=['class', 'msg'])
data.head()

Unnamed: 0,class,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [255]:
data.groupby('class').nunique()

Unnamed: 0_level_0,msg
class,Unnamed: 1_level_1
ham,4516
spam,653


In [256]:
data.isna().sum()

class    0
msg      0
dtype: int64

In [257]:
X = data['msg']
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y)

In [258]:
count_vect = CountVectorizer(strip_accents='unicode', stop_words='english', 
                       token_pattern=r'[a-zA-Z]+')
pipeline = Pipeline([('vect', count_vect),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(stop_words='english', strip_accents='unicode',
                                 token_pattern='[a-zA-Z]+')),
                ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [259]:
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(stop_words='english', strip_accents='unicode',
                   token_pattern='[a-zA-Z]+')),
  ('tfidf', TfidfTransformer()),
  ('clf', MultinomialNB())],
 'verbose': False,
 'vect': CountVectorizer(stop_words='english', strip_accents='unicode',
                 token_pattern='[a-zA-Z]+'),
 'tfidf': TfidfTransformer(),
 'clf': MultinomialNB(),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': 'english',
 'vect__strip_accents': 'unicode',
 'vect__token_pattern': '[a-zA-Z]+',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__al

In [260]:
pipeline.score(X_test, y_test)

0.974910394265233

In [261]:
results = pd.DataFrame({'class': y_test,
                        'prediction': pipeline.predict(X_test),
                        'msg': X_test,
                        'score': pipeline.predict_proba(X_test)[:,0]})

In [268]:
results.loc[results['class'] != results['prediction']]

Unnamed: 0,class,prediction,msg,score
2430,spam,ham,Guess who am I?This is the first time I create...,0.884318
4914,spam,ham,"Goal! Arsenal 4 (Henry, 7 v Liverpool 2 Henry ...",0.730894
2699,spam,ham,FROM 88066 LOST £12 HELP,0.909383
731,spam,ham,Email AlertFrom: Jeri StewartSize: 2KBSubject:...,0.920482
4527,spam,ham,"I want some cock! My hubby's away, I need a re...",0.766904
4754,spam,ham,Cashbin.co.uk (Get lots of cash this weekend!)...,0.629347
5566,spam,ham,REMINDER FROM O2: To get 2.50 pounds free call...,0.502194
1663,spam,ham,Hi if ur lookin 4 saucy daytime fun wiv busty ...,0.726097
3302,spam,ham,RCT' THNQ Adrian for U text. Rgds Vatian,0.890725
1640,spam,ham,FreeMsg:Feelin kinda lnly hope u like 2 keep m...,0.716567


In [269]:
results.loc[(results['class'] == 'spam') & 
            (results['class'] == results['prediction'])]

Unnamed: 0,class,prediction,msg,score
910,spam,spam,"January Male Sale! Hot Gay chat now cheaper, c...",0.135253
191,spam,spam,Are you unique enough? Find out from 30th Augu...,0.382460
5468,spam,spam,URGENT! Last weekend's draw shows that you hav...,0.049412
2023,spam,spam,U can WIN £100 of Music Gift Vouchers every we...,0.019263
5141,spam,spam,FREE for 1st week! No1 Nokia tone 4 ur mobile ...,0.009607
...,...,...,...,...
2,spam,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0.036831
2705,spam,spam,FreeMsg: Fancy a flirt? Reply DATE now & join ...,0.032428
1207,spam,spam,"As a SIM subscriber, you are selected to recei...",0.134839
824,spam,spam,25p 4 alfie Moon's Children in need song on ur...,0.224717
