In [19]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

In [20]:
def load_split(path):
    """Read one dataset split from a CSV file with a header row.

    The first line of the file is consumed as the header (``header=0``)
    instead of being parsed as a data row and sliced off afterwards, so
    pandas infers each column's dtype from the real records only and the
    resulting frame keeps the default 0-based index.  The columns are then
    renamed to ``message`` and ``label``; as before, this raises if the file
    does not contain exactly two columns.
    """
    frame = pandas.read_csv(path, sep=',', header=0)
    frame.columns = ["message", "label"]
    return frame


test_data = load_split('test.csv')
validation_data = load_split('validation.csv')
train_data = load_split('train.csv')

# Split every frame into its inputs (messages) and targets (labels).
msg_test, msg_validation, msg_train = test_data['message'], validation_data['message'], train_data['message']
label_test, label_validation, label_train = test_data['label'], validation_data['label'], train_data['label']

# Naive Bayes 

In [21]:
# Naive Bayes text classifier: messages -> token counts -> TF-IDF -> MultinomialNB.
nb_steps = [
    ('bow', CountVectorizer()),        # turn each message into integer token counts
    ('tfidf', TfidfTransformer()),     # re-weight the counts as TF-IDF scores
    ('classifier', MultinomialNB()),   # Naive Bayes fitted on the TF-IDF vectors
]
pipeline_nb = Pipeline(steps=nb_steps)
pipeline_nb.fit(msg_train, label_train)


In [22]:
def _print_metrics_table(y_true, y_pred, class_labels, class_names):
    """Print per-class precision, recall and F1-score for one data split."""
    # Use the string form of each label as the report key, and pass the labels
    # explicitly so every class has an entry even if it is absent from this split.
    keys = [str(label) for label in class_labels]
    report = classification_report(
        y_true, y_pred, labels=class_labels, target_names=keys, output_dict=True
    )
    print("     \t\tPrecision\tRecall\t\tF1-Score")
    print("--------------------------------------------------------")
    for position, key in enumerate(keys):
        # Fall back to the raw label when no display name was supplied for it.
        shown_name = class_names[position] if position < len(class_names) else key
        metrics = report[key]
        precision = round(metrics['precision'], 2)
        recall = round(metrics['recall'], 2)
        f1_score = round(metrics['f1-score'], 2)
        print(f"{shown_name}\t\t{precision}\t\t{recall}\t\t{f1_score}")


def score(model, class_names=("text", "spam")):
    """Print accuracy, confusion matrices and per-class metrics for a fitted model.

    The model is evaluated on the notebook-level training split
    (``msg_train`` / ``label_train``) and validation split
    (``msg_validation`` / ``label_validation``).

    Parameters
    ----------
    model : fitted estimator
        Must provide ``score(X, y)`` and ``predict(X)``.
    class_names : sequence of str, optional
        Display names for the classes, in the sorted order of the distinct
        labels found in ``label_train``.  Classes without a display name are
        shown by their label value.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("ACCURACY")
    print('Train Accuracy:', model.score(msg_train, label_train))
    print('Validation Accuracy:', model.score(msg_validation, label_validation))
    print(" ")

    # Predict once per split and reuse the result for the confusion matrix and
    # the metrics table instead of re-running the model for each of them.
    train_predictions = model.predict(msg_train)
    validation_predictions = model.predict(msg_validation)
    class_labels = np.unique(label_train)  # sorted distinct training labels

    print("CONFUSION MATRIX ")
    print("Confusion Matrix of Train:")
    print(confusion_matrix(label_train, train_predictions))
    print("Confusion Matrix of Validation:")
    print(confusion_matrix(label_validation, validation_predictions))
    print(" ")

    print("EVALUATION")
    print('Train Evaluation:')
    _print_metrics_table(label_train, train_predictions, class_labels, class_names)
    print(" ")
    print('Validation Evaluation:')
    _print_metrics_table(label_validation, validation_predictions, class_labels, class_names)
    return

# Report train/validation accuracy, confusion matrices and per-class metrics for Naive Bayes.
score(pipeline_nb)

ACCURACY
Train Accuracy: 0.889988358556461
Validation Accuracy: 0.8586387434554974
 
CONFUSION MATRIX 
Confusion Matrix of Train:
[[2617    2]
 [ 376  441]]
Confusion Matrix of Validation:
[[885   0]
 [162  99]]
 
EVALUATION
Train Evaluation:
     		Precision	Recall		F1-Score
--------------------------------------------------------
text		0.87		1.0		0.93
spam		1.0		0.54		0.7
 
Validation Evaluation:
     		Precision	Recall		F1-Score
--------------------------------------------------------
text		0.85		1.0		0.92
spam		1.0		0.38		0.55


# Support Vector Machine

In [23]:
# Support Vector text classifier: messages -> token counts -> TF-IDF -> SVC.
svm_steps = [
    ('bow', CountVectorizer()),      # turn each message into integer token counts
    ('tfidf', TfidfTransformer()),   # re-weight the counts as TF-IDF scores
    ('classifier', SVC()),           # Support Vector classifier fitted on the TF-IDF vectors
]
pipeline_svm = Pipeline(steps=svm_steps)
pipeline_svm.fit(msg_train, label_train)

In [24]:
# Report train/validation accuracy, confusion matrices and per-class metrics for the SVM.
score(pipeline_svm)

ACCURACY
Train Accuracy: 1.0
Validation Accuracy: 0.9904013961605584
 
CONFUSION MATRIX 
Confusion Matrix of Train:
[[2619    0]
 [   0  817]]
Confusion Matrix of Validation:
[[884   1]
 [ 10 251]]
 
EVALUATION
Train Evaluation:
     		Precision	Recall		F1-Score
--------------------------------------------------------
text		1.0		1.0		1.0
spam		1.0		1.0		1.0
 
Validation Evaluation:
     		Precision	Recall		F1-Score
--------------------------------------------------------
text		0.99		1.0		0.99
spam		1.0		0.96		0.98


# Decision Tree

In [25]:
# Decision Tree text classifier: messages -> token counts -> TF-IDF -> decision tree.
pipeline_DTC = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    # Fixed seed: tree construction is randomised, so without it the fitted
    # tree (and every score reported for it) can change from run to run.
    ('classifier', DecisionTreeClassifier(random_state=42)),  # train on TF-IDF vectors w/ Decision Tree classifier
])
pipeline_DTC.fit(msg_train,label_train)

In [26]:
# Report train/validation accuracy, confusion matrices and per-class metrics for the decision tree.
score(pipeline_DTC)

ACCURACY
Train Accuracy: 1.0
Validation Accuracy: 0.9406631762652705
 
CONFUSION MATRIX 
Confusion Matrix of Train:
[[2619    0]
 [   0  817]]
Confusion Matrix of Validation:
[[857  28]
 [ 40 221]]
 
EVALUATION
Train Evaluation:
     		Precision	Recall		F1-Score
--------------------------------------------------------
text		1.0		1.0		1.0
spam		1.0		1.0		1.0
 
Validation Evaluation:
     		Precision	Recall		F1-Score
--------------------------------------------------------
text		0.96		0.97		0.96
spam		0.89		0.85		0.87


# Logistic Regression

In [27]:
# Logistic Regression text classifier: messages -> token counts -> TF-IDF -> LogisticRegression.
lr_steps = [
    ('bow', CountVectorizer()),             # turn each message into integer token counts
    ('tfidf', TfidfTransformer()),          # re-weight the counts as TF-IDF scores
    ('classifier', LogisticRegression()),   # Logistic Regression fitted on the TF-IDF vectors
]
pipeline_LR = Pipeline(steps=lr_steps)
pipeline_LR.fit(msg_train, label_train)

In [28]:
# Report train/validation accuracy, confusion matrices and per-class metrics for Logistic Regression.
score(pipeline_LR)

ACCURACY
Train Accuracy: 0.9950523864959255
Validation Accuracy: 0.9773123909249564
 
CONFUSION MATRIX 
Confusion Matrix of Train:
[[2618    1]
 [  16  801]]
Confusion Matrix of Validation:
[[885   0]
 [ 26 235]]
 
EVALUATION
Train Evaluation:
     		Precision	Recall		F1-Score
--------------------------------------------------------
text		0.99		1.0		1.0
spam		1.0		0.98		0.99
 
Validation Evaluation:
     		Precision	Recall		F1-Score
--------------------------------------------------------
text		0.97		1.0		0.99
spam		1.0		0.9		0.95


# AdaBoost Classifier

In [29]:
# AdaBoost text classifier: messages -> token counts -> TF-IDF -> AdaBoost ensemble.
pipeline_ABC = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    # Fixed seed so the boosted ensemble, and the scores reported for it,
    # are reproducible across kernel restarts.
    ('classifier', AdaBoostClassifier(random_state=42)),  # train on TF-IDF vectors w/ AdaBoost classifier
])
pipeline_ABC.fit(msg_train,label_train)



In [30]:
# Report train/validation accuracy, confusion matrices and per-class metrics for AdaBoost.
score(pipeline_ABC)

ACCURACY
Train Accuracy: 0.989522700814901
Validation Accuracy: 0.9703315881326352
 
CONFUSION MATRIX 
Confusion Matrix of Train:
[[2602   17]
 [  19  798]]
Confusion Matrix of Validation:
[[869  16]
 [ 18 243]]
 
EVALUATION
Train Evaluation:
     		Precision	Recall		F1-Score
--------------------------------------------------------
text		0.99		0.99		0.99
spam		0.98		0.98		0.98
 
Validation Evaluation:
     		Precision	Recall		F1-Score
--------------------------------------------------------
text		0.98		0.98		0.98
spam		0.94		0.93		0.93


# Compare Test Accuracy of different Models

In [31]:
# Accuracy of every fitted pipeline on the held-out test split, one line per model.
# The captions carry their own padding so the printed values line up in a column.
test_comparison = [
    ('Support Vector Accuracy:     ', pipeline_svm),
    ('Logistic Regression Accuracy:', pipeline_LR),
    ('AdaBoost Accuracy:           ', pipeline_ABC),
    ('DecisionTree Accuracy:       ', pipeline_DTC),
    ('Naive Bayes Accuracy:        ', pipeline_nb),
]
for caption, fitted_pipeline in test_comparison:
    print(caption, fitted_pipeline.score(msg_test, label_test))


Support Vector Accuracy:      0.987783595113438
Logistic Regression Accuracy: 0.9738219895287958
AdaBoost Accuracy:            0.9668411867364747
DecisionTree Accuracy:        0.9458987783595113
Naive Bayes Accuracy:         0.8420593368237347


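In the run shown above, the Support Vector classifier achieves the highest accuracy on the test data (≈0.988), followed by Logistic Regression (≈0.974), AdaBoost (≈0.967), Decision Tree (≈0.946), and Naive Bayes (≈0.842).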