In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

#text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score


In [2]:
dataset = pd.read_csv('spam.csv',encoding='latin-1')
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# remove un-required_columns
remove_cols = {'Unnamed: 2', 'Unnamed: 3','Unnamed: 4'}
dataset.drop(labels=remove_cols, axis=1,inplace=True)
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#rename working columns
col_names = { 'v1' : 'status' , 'v2':'text'}
dataset = dataset.rename(columns=col_names)
dataset.head()

Unnamed: 0,status,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
print(dataset['status'].value_counts())
dictin = { 'spam' : 1, 'ham' : 0}
dataset['is_spam'] = dataset['status'].map(dictin)
dataset.head()

ham     4825
spam     747
Name: status, dtype: int64


Unnamed: 0,status,text,is_spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
X = dataset['text']
Y = dataset['is_spam']

In [7]:
x_train,x_test, y_train,  y_test = train_test_split(X,Y, random_state=0)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((4179,), (4179,), (1393,), (1393,))

In [8]:
cVectorizer = CountVectorizer()

In [9]:
cVectorizer.fit(x_train)
#cVectorizer.get_feature_names()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
X_train_df = cVectorizer.transform(x_train)
X_test_df = cVectorizer.transform(x_test)

###### Apply machine learning algorithms

In [11]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_df,y_train)
y_predict_knn = knn.predict(X_test_df)

In [12]:
lgtscReg = LogisticRegression()
lgtscReg.fit(X_train_df,y_train)
y_predict_logisticReg = lgtscReg.predict(X_test_df)

In [13]:
decisionTreeClsfr = DecisionTreeClassifier(random_state=1)
decisionTreeClsfr.fit(X_train_df,y_train)
y_predict_decTreeClassifier = decisionTreeClsfr.predict(X_test_df)

In [14]:
randForestClsfr = RandomForestClassifier()
randForestClsfr.fit(X_train_df,y_train)
y_predict_randForestClassifier = randForestClsfr.predict(X_test_df)

In [28]:
naiveBayes = MultinomialNB()
naiveBayes.fit(X_train_df,y_train)
y_predict_naiveBayes = naiveBayes.predict(X_test_df)

In [33]:
print('k-NN Accuracy : ',accuracy_score(y_test,y_predict_knn))
print('Logistic Reg Accuracy : ',accuracy_score(y_test,y_predict_logisticReg))
print('Decision Tree Classifier Accuracy : ',accuracy_score(y_test,y_predict_decTreeClassifier))
print('Random Forest Accuracy : ',accuracy_score(y_test,y_predict_randForestClassifier))
print('MultinomialNaiveBayes Accuracy : ', accuracy_score(y_test,y_predict_naiveBayes))

k-NN Accuracy :  0.909547738693
Logistic Reg Accuracy :  0.978463747308
Decision Tree Classifier Accuracy :  0.9720028715
Random Forest Accuracy :  0.959798994975
MultinomialNaiveBayes Accuracy :  0.986360373295


### Using Naive Bayes approach because it has maximum accuracy

In [32]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_predict_randForestClassifier)

array([[1196,    0],
       [  56,  141]], dtype=int64)