In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
#from nltk.tokenize import RegexpTokenizer  
#from nltk.corpus import stopwords
#from nltk.stem.snowball import SnowballStemmer

2. Import data

In [2]:
# Load only the two columns used below: v1 (the ham/spam label) and v2 (the message text).
# NOTE(review): the file is decoded as latin-1, presumably because it is not valid UTF-8 - confirm.
data = pd.read_csv(
    "./spam.csv",
    usecols=["v1", "v2"],
    encoding="latin-1",
)

In [3]:
# Preview the first rows of the raw frame (columns still carry their original v1/v2 names).
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Replace the opaque source column names with descriptive ones.
column_names = {"v1": "label", "v2": "text"}
data = data.rename(columns=column_names)

In [5]:
# Confirm the rename: the columns should now be `label` and `text`.
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Class balance: how many messages carry each label.
data["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

Convert the labels to numeric values (ham → 0, spam → 1)

In [7]:
# Encode the string labels as integers for the classifier: ham -> 0, spam -> 1.
# (scikit-learn's LabelEncoder would produce the same codes, since it orders
# classes alphabetically; the explicit mapping keeps the encoding visible.)
data['label_num'] = data.label.map({'ham':0, 'spam':1})

# Series.map turns every label that is missing from the mapping into NaN, which
# would silently corrupt the target column; fail loudly instead.
assert data['label_num'].notnull().all(), "unexpected value in 'label' column"

In [8]:
# Check that the new `label_num` column lines up with `label`.
data.head()

Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label_num"],
    test_size=0.2,
    random_state=10,
)

In [11]:
# Sanity check: features and labels must have matching lengths within each split.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# TF-IDF vectorizer with default settings; it is fitted on the training messages in the next cell.
vect = TfidfVectorizer()

In [14]:
# Fit on the training messages only, so nothing from the test split leaks into the features.
vect.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [15]:
# Peek at both ends of the learned vocabulary, ordered by feature (column) index.
# `get_feature_names()` was removed in scikit-learn 1.2, so build the ordered
# term list once from the fitted `vocabulary_` mapping (term -> column index),
# which is available on old and new releases alike.
feature_names = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
print(feature_names[0:10])
print(feature_names[-10:])

['00', '000', '000pes', '008704050406', '0089', '0121', '01223585236', '01223585334', '0125698789', '02']
['ó_', 'û_', 'û_thanks', 'ûªm', 'ûªt', 'ûªve', 'ûï', 'ûïharry', 'ûò', 'ûówell']


In [16]:
# Number of distinct terms the vectorizer learned from the training messages.
vocab_size = len(vect.vocabulary_)
print("Vocabulary size: {}".format(vocab_size))

Vocabulary size: 7757


In [17]:
# The vectorizer has already been fitted on X_train in an earlier cell, so only
# transform here; fit_transform would relearn the same vocabulary and IDF weights.
# This also mirrors how the test split is vectorised (transform only).
# NOTE: despite the `_df` suffix, the result is a sparse TF-IDF matrix, not a DataFrame.
X_train_df = vect.transform(X_train)

In [18]:
# Row and column indices of the non-zero entries in the first three rows of the training matrix.
X_train_df[:3].nonzero()

(array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32),
 array([ 938, 3514, 7699, 7514, 2369, 6813, 5414,  938, 7699, 6864, 1873,
        3592, 3749, 4318, 2802, 1313, 3726, 1024, 7601, 3073, 4831, 3752,
        6113, 1180, 2942, 2659, 7382, 4928, 7602,  741, 4923, 7430, 4782,
         913, 6829, 1566, 4932, 6901], dtype=int32))

In [19]:
from sklearn.naive_bayes import MultinomialNB

# Container for test-set predictions, keyed by classifier name.
prediction = {}

model = MultinomialNB()
# Kept as the last expression so the fitted estimator's repr is displayed.
model.fit(X_train_df, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# Vectorise the held-out messages with the already-fitted vectorizer (transform only,
# no refit), then store the model's predictions under its name.
X_test_df = vect.transform(X_test)
prediction["Multinomial"] = model.predict(X_test_df)

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [22]:
# Overall fraction of held-out messages classified correctly.
accuracy_score(y_true=y_test, y_pred=prediction["Multinomial"])

0.9596412556053812

In [23]:
# Per-class precision / recall / F1 (class 0 = ham, class 1 = spam). Accuracy alone
# is misleading on this imbalanced data, so check the spam row in particular.
print(classification_report(y_true=y_test, y_pred=prediction["Multinomial"]))

             precision    recall  f1-score   support

          0       0.96      1.00      0.98       965
          1       1.00      0.70      0.82       150

avg / total       0.96      0.96      0.96      1115



In [24]:
# Rows are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(y_test, prediction['Multinomial'])
print(conf_mat)

# Divide each row by its total so every row shows the share of that true class
# assigned to each predicted class.
row_totals = conf_mat.sum(axis=1, keepdims=True)
conf_mat_normalized = conf_mat.astype(float) / row_totals

[[965   0]
 [ 45 105]]


In [25]:
# Row-normalised confusion matrix: each row sums to 1 (per-true-class rates).
print(conf_mat_normalized)

[[1.  0. ]
 [0.3 0.7]]


In [26]:
# Compare accuracy on the training and held-out splits to gauge overfitting.
for split_name, split_X, split_y in (
    ("train", X_train_df, y_train),
    ("test", X_test_df, y_test),
):
    print(split_name + " score:", model.score(split_X, split_y))

train score: 0.974646623289208
test score: 0.9596412556053812
