In [3]:
from sklearn.datasets import load_files
import pandas as pd

data = pd.read_csv('spam.csv',encoding='latin-1')
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":"label", "v2":"text"})
data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
# Label and Feature Extraction
label = data.iloc[:,0]
feature = data.iloc[:,1]

In [5]:
# Splitting
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(
    feature, label,random_state=4)

# Logistic Regression

In [6]:
# Library
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [7]:
# Train the dataset
##vect = CountVectorizer()
vect = TfidfVectorizer(stop_words="english")
vect.fit(X_train)

newX_train = vect.transform(X_train)
newX_test = vect.transform(X_test)

In [8]:
logReg = LogisticRegression().fit(newX_train,y_train)
pred = logReg.predict(newX_test)

print("Label test: " + format(y_test[:10]))
print("Prediction test: " + format(pred[:10]))
print("Accuration: " + format(logReg.score(newX_test,y_test)))

Label test: 4004     ham
2276     ham
4498    spam
3755     ham
111      ham
3662     ham
4282     ham
1991     ham
2298     ham
2438     ham
Name: label, dtype: object
Prediction test: ['ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham']
Accuration: 0.9583632447954056


# Using SVM

In [9]:
# Library
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

In [10]:
# Train the dataset
##vect = CountVectorizer()
vect = TfidfVectorizer()
vect.fit(X_train)

newX_train = vect.transform(X_train)
newX_test = vect.transform(X_test)

In [11]:
svc = SVC(C=10).fit(newX_train,y_train)
pred = svc.predict(newX_test)

print("Label test: " + format(y_test[:10]))
print("Prediction test: " + format(pred[:10]))
print("Accuration: " + format(svc.score(newX_test,y_test)))

Label test: 4004     ham
2276     ham
4498    spam
3755     ham
111      ham
3662     ham
4282     ham
1991     ham
2298     ham
2438     ham
Name: label, dtype: object
Prediction test: ['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham']
Accuration: 0.8585786073223259


# Using Naive Bayes - BEST

In [12]:
# Library
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [13]:
# Train the dataset
##vect = CountVectorizer()
vect = TfidfVectorizer()
vect.fit(X_train)

newX_train = vect.transform(X_train)
newX_test = vect.transform(X_test)

In [14]:
nb = MultinomialNB().fit(newX_train,y_train)
pred = nb.predict(newX_test)

print("Label test: " + format(y_test[:10]))
print("Prediction test: " + format(pred[:10]))
print("Accuration: " + format(nb.score(newX_test,y_test)))

Label test: 4004     ham
2276     ham
4498    spam
3755     ham
111      ham
3662     ham
4282     ham
1991     ham
2298     ham
2438     ham
Name: label, dtype: object
Prediction test: ['ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham']
Accuration: 0.9540559942569993


# Pipeline for Naive Bayes

In [15]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(TfidfVectorizer(stop_words="english"), MultinomialNB())
pipe.fit(X_train,y_train)

pred = pipe.predict(X_test)

print("Label test: " + format(y_test[:10]))
print("Prediction test: " + format(pred[:10]))
print("Accuration: " + format(pipe.score(X_test,y_test)))

Label test: 4004     ham
2276     ham
4498    spam
3755     ham
111      ham
3662     ham
4282     ham
1991     ham
2298     ham
2438     ham
Name: label, dtype: object
Prediction test: ['ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham']
Accuration: 0.9691313711414213


In [16]:
# PKL CONVERTER
tempRest=[pipe, data]

from sklearn.externals import joblib
joblib.dump(tempRest, 'SpamSMS.pkl')

['SpamSMS.pkl']

## Model Evaluation

In [31]:
#Confusion Matrix

from sklearn.metrics import confusion_matrix
print("Confusion Matrix for MultinomialMB left ro right, up to down: spam - ham ")
confusion_matrix(y_test,pred,labels=["spam","ham"])

Confusion Matrix for MultinomialMB left ro right, up to down: spam - ham 


array([[ 154,   43],
       [   0, 1196]], dtype=int64)

In [37]:
#f1 score

from sklearn.metrics import f1_score
print("f1 score MultinomialNB pipeline: {:.2f}".format(f1_score(y_test,pred,average='macro')))

f1 score MultinomialNB pipeline: 0.93


In [38]:
#Recall score

from sklearn.metrics import recall_score
print("f1 score MultinomialNB pipeline: {:.2f}".format(recall_score(y_test,pred,average='macro')))

f1 score MultinomialNB pipeline: 0.89


In [39]:
#Precision score

from sklearn.metrics import precision_score
print("f1 score MultinomialNB pipeline: {:.2f}".format(precision_score(y_test,pred,average='macro')))

f1 score MultinomialNB pipeline: 0.98


In [44]:
# Accuracy score

from sklearn.metrics import accuracy_score
print("f1 score MultinomialNB pipeline: {:.2f}".format(accuracy_score(y_test,pred)))

f1 score MultinomialNB pipeline: 0.97
