In [1]:
import numpy as np
import pandas as pd
import chardet
from gensim import parsing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score,  accuracy_score

In [2]:
with open('spam.csv', 'rb') as f:
      result= chardet.detect(f.read())

df = pd.read_csv('spam.csv', encoding = result['encoding']) 

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df = df.drop(["Unnamed: 2"], axis=1 )  
df = df.drop(["Unnamed: 3"], axis=1 )  
df = df.drop(["Unnamed: 4"], axis=1 )  

In [5]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [7]:
df['v1'] = df['v1'].map({'ham' : 0, 'spam' : 1})                  #v1  0&1  v2 emails
dataset = df.values
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
np.random.shuffle(dataset)

In [9]:
dataset[:5]

array([[1,
        'Hottest pics straight to your phone!! See me getting Wet and Wanting, just for you xx Text PICS to 89555 now! txt costs 150p textoperator g696ga 18 XxX'],
       [0, 'When Ì_ login dat time... Dad fetching Ì_ home now?'],
       [0,
        'Hey no I ad a crap nite was borin without ya 2 boggy with me u boring biatch! Thanx but u wait til nxt time il ave ya '],
       [1, '08714712388 between 10am-7pm Cost 10p'],
       [0, 'Lol enjoy role playing much?']], dtype=object)

In [10]:
X = dataset[:,1]
Y = dataset[:,0]
Y = Y.astype('int32')

In [11]:
X.shape[0]

5572

In [12]:
dataset.shape

(5572, 2)

In [13]:
for i in range(X.shape[0]):
      X[i] = parsing.stem_text(X[i].lower())

In [14]:
#Data ino bag of wordformat
vectorizer = CountVectorizer()     
X_transformed = vectorizer.fit_transform(X)

In [15]:
X_transformed.shape

(5572, 8265)

In [16]:
X_train = X_transformed[0:4000, :]  
Y_train = Y[0:4000]
X_test = X_transformed[4000:, :]
Y_test = Y[4000:]

In [17]:
X_train.shape

(4000, 8265)

In [18]:
def print_metrics(Y_true, Y_predicted):
    print("Accuracy score :" + str(accuracy_score(Y_true, Y_predicted)))
    print("Precision score :" + str(precision_score(Y_true, Y_predicted)))
    print("Recall score :" + str(recall_score(Y_true, Y_predicted)))
    print("ROC AUC score :" + str(roc_auc_score(Y_true, Y_predicted))) #receiving area characteristics area under the curve
    print("Confusion Matrix : \n")
    print(confusion_matrix(Y_true, Y_predicted))

# using Naive Bayes Multinomial

In [19]:
bayes_clf = MultinomialNB()
bayes_clf.fit(X_train, Y_train)

In [20]:
Y_predicted = bayes_clf.predict(X_test)
print_metrics(Y_test, Y_predicted)

Accuracy score :0.9790076335877863
Precision score :0.9241706161137441
Recall score :0.9198113207547169
ROC AUC score :0.954023307436182
Confusion Matrix : 

[[1344   16]
 [  17  195]]


# using support vector classifier

In [21]:
svm_clf = SVC(C = 2000)
svm_clf.fit(X_train, Y_train)

In [22]:
Y_predicted_svm = svm_clf.predict(X_test)
print_metrics(Y_test, Y_predicted_svm)

Accuracy score :0.9758269720101781
Precision score :0.9943181818181818
Recall score :0.8254716981132075
ROC AUC score :0.9123682019977802
Confusion Matrix : 

[[1359    1]
 [  37  175]]


# using logistic regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
logistic_model = LogisticRegression()


In [25]:
logistic_model.fit(X_train, Y_train)


In [26]:
y_pred = logistic_model.predict(X_test)

In [27]:
print_metrics(Y_test, y_pred)

Accuracy score :0.9821882951653944
Precision score :0.9946236559139785
Recall score :0.8726415094339622
ROC AUC score :0.9359531076581576
Confusion Matrix : 

[[1359    1]
 [  27  185]]


# using j48 model

In [28]:
from sklearn.tree import DecisionTreeClassifier
j48_model = DecisionTreeClassifier(criterion='entropy')

In [29]:
j48_model.fit(X_train, Y_train)

In [30]:
y_pred_j48 = j48_model.predict(X_test)
print_metrics(Y_test, y_pred_j48)

Accuracy score :0.9618320610687023
Precision score :0.8725490196078431
Recall score :0.839622641509434
ROC AUC score :0.9102524972253052
Confusion Matrix : 

[[1334   26]
 [  34  178]]


# using naive bayes

In [31]:
from sklearn.naive_bayes import GaussianNB
naive_bayes_model = GaussianNB()


In [32]:
X_train_dense = X_train.toarray()

# Fit the model to the dense numpy array
naive_bayes_model.fit(X_train_dense, Y_train)


In [33]:
# Fit the model to the dense numpy array
X_test_dense = X_test.toarray()
y_pred_naive_byes = naive_bayes_model.predict(X_test_dense)
print_metrics(Y_test, y_pred_naive_byes)

Accuracy score :0.9026717557251909
Precision score :0.5907692307692308
Recall score :0.9056603773584906
ROC AUC score :0.9039331298557159
Confusion Matrix : 

[[1227  133]
 [  20  192]]
