# Mahdi Javaheri Saber 
# 9243088017

## a) Preparing Train and Test lists

In [1]:
# corpus file must be at same directory as .ipynb file
with open('SMSSpamCollection', encoding='UTF-8') as f:
    text_file = f.read()

In [2]:
from collections import defaultdict

def create_corpus(text):
    corpus = defaultdict(list)
    text_lines = text.split(sep='\n')
    for line in text_lines:
        row = line.split(sep='\t')
        if row[0] == 'ham':
            corpus['ham'].append(row[1])
        elif row[0] == 'spam':
            corpus['spam'].append(row[1])
    return corpus  # output format -> {'ham':["sent", "sent", ...], 'spam':["sent", "sent", ...]}

In [3]:
def bag_of_words(words):
    return dict([(word, True) for word in words])

In [4]:
from nltk.tokenize import word_tokenize
from collections import defaultdict

def label_feats_from_personal_corpus(corp, feature_detector=bag_of_words):
    label_feats = defaultdict(list)
    for label in corp:
        for item in corp[label]:
            feats = feature_detector(word_tokenize(item))
            label_feats[label].append(feats)
    return label_feats  # output format -> {'ham':[{feat0}, {feat1}, {}, ...], 'spam':[]}

In [5]:
import numpy as np

# create two train and test lists by spliting labelfeats
def split_label_feats(lfeats, split=0.75):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():
        cutoff = int(len(feats) * split)
        np.random.shuffle(feats)  # shuffle list
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    return train_feats, test_feats  # output format for each -> [({feat0}, 'ham/spam'), (), (), (),  ...]

In [6]:
corpus = create_corpus(text_file)

In [7]:
lfeats = label_feats_from_personal_corpus(corpus)

In [8]:
train_feats, test_feats = split_label_feats(lfeats, split=0.8)

## b) Training Classifiers

In [56]:
from nltk.classify.util import accuracy
from nltk.classify.scikitlearn import SklearnClassifier

# contains all classifier objects plus it's name
classifier_list = list()  # structure -> [(classifier, 'classifier_name'), (), (), ...]

In [57]:
# NaiveBayes Classifier
from nltk.classify import NaiveBayesClassifier
nb_classifier = NaiveBayesClassifier.train(train_feats)
classifier_list.append((nb_classifier, 'NaiveBayes'))
accuracy(nb_classifier, test_feats)

0.9283154121863799

In [58]:
# DecisionTree Classifier
from nltk.classify import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier.train(train_feats, binary=True, 
                                  entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30)
classifier_list.append((dt_classifier, 'DecisionTree'))
accuracy(dt_classifier, test_feats)

0.8933691756272402

In [59]:
# Maxent Classifier
from nltk.classify import MaxentClassifier

maxent_classifier = MaxentClassifier.train(train_feats, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)
classifier_list.append((maxent_classifier, 'Maxent'))
accuracy(maxent_classifier, test_feats)

0.9256272401433692

In [60]:
# SkLearn Classifier -> MultinomialNB
from sklearn.naive_bayes import MultinomialNB

mn_sk_classifier = SklearnClassifier(MultinomialNB())
mn_sk_classifier.train(train_feats)
classifier_list.append((mn_sk_classifier, 'Sklearn_MultinomialNB'))
accuracy(mn_sk_classifier, test_feats)

0.989247311827957

In [61]:
# Sklearn Classifier -> BernoulliNB
from sklearn.naive_bayes import BernoulliNB

bn_sk_classifier = SklearnClassifier(BernoulliNB())
bn_sk_classifier.train(train_feats)
classifier_list.append((bn_sk_classifier, 'Sklearn_BernoulliNB'))
accuracy(bn_sk_classifier, test_feats)

0.9775985663082437

In [62]:
# Sklearn Classifier -> LogisticRegression
from sklearn.linear_model import LogisticRegression

lr_sk_classifier = SklearnClassifier(LogisticRegression())
lr_sk_classifier.train(train_feats)
classifier_list.append((lr_sk_classifier, 'Sklearn_LogisticRegression'))
accuracy(lr_sk_classifier, test_feats)

0.989247311827957

In [63]:
# Sklearn Classifier -> SVC
from sklearn.svm import SVC

svc_sk_classifier = SklearnClassifier(SVC())
svc_sk_classifier.train(train_feats)
classifier_list.append((svc_sk_classifier, 'Sklearn_SVC'))
accuracy(svc_sk_classifier, test_feats)

0.8655913978494624

In [64]:
# Sklearn Classifier -> LinearSVC
from sklearn.svm import LinearSVC

lsvc_sk_classifier = SklearnClassifier(LinearSVC())
lsvc_sk_classifier.train(train_feats)
classifier_list.append((lsvc_sk_classifier, 'Sklearn_LinearSVC'))
accuracy(lsvc_sk_classifier, test_feats)

0.9874551971326165

In [65]:
# Sklearn Classifier -> AdaBoost
from sklearn.ensemble import AdaBoostClassifier

ada_sk_classifier = SklearnClassifier(AdaBoostClassifier())
ada_sk_classifier.train(train_feats)
classifier_list.append((ada_sk_classifier, 'Sklearn_AdaBoost'))
accuracy(ada_sk_classifier, test_feats)

0.9704301075268817

In [66]:
# Sklearn Classifier -> RandomForest
from sklearn.ensemble import RandomForestClassifier

rf_sk_classifier = SklearnClassifier(RandomForestClassifier())
rf_sk_classifier.train(train_feats)
classifier_list.append((rf_sk_classifier, 'Sklearn_RandomForest'))
accuracy(rf_sk_classifier, test_feats)

0.967741935483871

In [67]:
import itertools
from nltk.classify import ClassifierI
from nltk.probability import FreqDist
class MaxVoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        self._labels = sorted(set(itertools.chain(*[c.labels() for c in classifiers])))
    def labels(self):  # required to override
        return self._labels
    def classify(self, feats):  # required to override
        counts = FreqDist()
        for classifier in self._classifiers:
            counts[classifier.classify(feats)] += 1
        return counts.max()

In [68]:
# MaxVoteClassifier -> NaiveBayes + DecisionTree + Maxent
maxvote_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, maxent_classifier)
classifier_list.append((maxvote_classifier, 'MaxVote : NaiveBayes + DecisionTree + Maxent'))
accuracy(maxvote_classifier, test_feats)

0.9345878136200717

In [69]:
# MaxVoteClassifier -> MaltinomialNB + BernuliNB + LogisticRegression
maxvote_classifier = MaxVoteClassifier(mn_sk_classifier, bn_sk_classifier, lr_sk_classifier)
classifier_list.append((maxvote_classifier, 'MaxVote : MaltinomialNB + BernuliNB + LogisticRegression'))
accuracy(maxvote_classifier, test_feats)

0.9883512544802867

In [70]:
# MaxVoteClassifier -> MaltinomialNB + LogisticRegression + LinearSVC
maxvote_classifier = MaxVoteClassifier(mn_sk_classifier, lr_sk_classifier, lsvc_sk_classifier)
classifier_list.append((maxvote_classifier, 'MaxVote : MaltinomialNB + LogisticRegression + LinearSVC'))
accuracy(maxvote_classifier, test_feats)

0.9910394265232975

In [71]:
# MaxVoteClassifier -> MaltinomialNB + LogisticRegression + LinearSVC + BernuliNB + Adaboost
maxvote_classifier = MaxVoteClassifier(mn_sk_classifier, lr_sk_classifier, 
                                       lsvc_sk_classifier, bn_sk_classifier, ada_sk_classifier)
classifier_list.append((maxvote_classifier, 'MaxVote : MaltinomialNB + LogisticRegression + LinearSVC + BernuliNB + Adaboost'))
accuracy(maxvote_classifier, test_feats)

0.9883512544802867

## c) Accuracy, Precision, Recall for each classifier

In [72]:
import collections
from nltk.metrics import precision, recall

# precision and recall scores
def precision_recall(classifier, testfeats): # the same arguments you pass to accuracy()
    refsets = collections.defaultdict(set)  # real output
    testsets = collections.defaultdict(set)  # system output 
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    precisions = {}
    recalls = {}
    for label in classifier.labels():
        precisions[label] = precision(refsets[label], testsets[label]) # len(reference.intersection(test)) / len(test)
        recalls[label] = recall(refsets[label], testsets[label])# len(reference.intersection(test)) / len(reference)
    return precisions, recalls  # output format for each -> {'label1': float, 'label2': float, 'label3':float, ...}

In [79]:
from nltk.classify.util import accuracy

for item in classifier_list:
    classifier, name = item
    precisions, recalls = precision_recall(classifier, test_feats)
    print(name, ' :')
    print('Ac = ', accuracy(classifier, test_feats))
    print("P(ham)= ", precisions['ham'], ", P(spam)= ", precisions['spam'])
    print("R(ham)= ", recalls['ham'], ", R(spam)= ", recalls['spam'], '\n')

NaiveBayes  :
Ac =  0.9283154121863799
P(ham)=  0.9977528089887641 , P(spam)=  0.6548672566371682
R(ham)=  0.9192546583850931 , R(spam)=  0.9866666666666667 

DecisionTree  :
Ac =  0.8933691756272402
P(ham)=  0.8924930491195552 , P(spam)=  0.918918918918919
R(ham)=  0.9968944099378882 , R(spam)=  0.22666666666666666 

Maxent  :
Ac =  0.9256272401433692
P(ham)=  0.9216809933142311 , P(spam)=  0.9855072463768116
R(ham)=  0.9989648033126294 , R(spam)=  0.4533333333333333 

Sklearn_MultinomialNB  :
Ac =  0.989247311827957
P(ham)=  0.9907407407407407 , P(spam)=  0.9791666666666666
R(ham)=  0.9968944099378882 , R(spam)=  0.94 

Sklearn_BernoulliNB  :
Ac =  0.9775985663082437
P(ham)=  0.9747729566094854 , P(spam)=  1.0
R(ham)=  1.0 , R(spam)=  0.8333333333333334 

Sklearn_LogisticRegression  :
Ac =  0.989247311827957
P(ham)=  0.9877300613496932 , P(spam)=  1.0
R(ham)=  1.0 , R(spam)=  0.92 

Sklearn_SVC  :
Ac =  0.8655913978494624
P(ham)=  0.8655913978494624 , P(spam)=  None
R(ham)=  1.0 , R(