# Spam Filtering

## Import libraries

In [1]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from random import randint
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

import nltk
import numpy as np
import pandas as pd
import re

  from collections import Sequence
  from numpy.core.umath_tests import inner1d


## Pre-processing

### Functions

In [2]:
def remove_stop_words(contents):
    stop_words = list(stopwords.words('english'))
    for w in stop_words:
        contents = contents.replace(w, '')
        
    return contents

In [3]:
def lemmatize(contents):
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(contents)
    
    lemmatized_tokens = []
    for token in tokens:
        lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token))
        
    return ' '.join(lemmatized_tokens)

In [4]:
def stem(contents):
    porter_stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(contents)
    
    stemmed_tokens = []
    for token in tokens:
        stemmed_tokens.append(porter_stemmer.stem(token))
    
    return ' '.join(stemmed_tokens)

#### Tokenize and remove unnecessary characters

In [5]:
def remove_unnecessary_characters(contents):
    contents = contents.replace('\n', ' ')
    contents = contents.replace('..', '')
    contents = contents.replace('--', '')
    contents = contents.replace('==', '')
    contents = contents.replace('///', '')
    contents = contents.replace('\\\\', '')
    contents = ' '.join(contents.split())
    contents = contents.strip().lower()
    
#     contents = remove_stop_words(contents)
    contents = lemmatize(contents)
    contents = stem(contents)
    tokenizer = RegexpTokenizer('[A-Za-z0-9\@\.\&\/\:\$\-\_]+')
    tokens = tokenizer.tokenize(contents)
    
    tokens = ' '.join( [i for i in tokens if len(i) > 1])
    
    return tokens

#### Replace e-mail address with "this_is_email"

In [6]:
def replace_email(content):
    pattern = re.compile('[\w\/\.\-]+\@[\w\/\.\-]+\.[\w]+')
    replaced_content = re.sub(pattern, 'this_is_email', content)
    return replaced_content

#### Replace link address with "this_is_link"

In [7]:
def replace_link(content):
    pattern = re.compile('(http[s]?:\/\/|www\.)?[\w\/\.\-]+\.(com|html|php)([\/][\w\/\.\-]*)*')
    replaced_content = re.sub(pattern, 'this_is_link', content)
    return replaced_content

### Main program

#### Read CSV data for train data and test data

In [8]:
train_data = pd.read_csv('dataset/train_data.csv')
test_data = pd.read_csv('dataset/test_data.csv')

#### Tokenize remove unnecessary characters for train data and test data

In [9]:
preproc_train_data = train_data['content'].copy().apply(remove_unnecessary_characters).apply(replace_email).apply(replace_link)

print('Train data')
preproc_train_data.head()

Train data


0    daytip poem-a-day 09/13/02 sponsor child today...
1    jodi sent you messagejodi sent you messag tri ...
2    re tricki perl question ascend orderjozsi vadk...
3    suscribeyordanisp dmesd.vcl.rimed.cu to unsubs...
4    re re moment of silenc for the first amend fwd...
Name: content, dtype: object

In [10]:
preproc_train_data[0]

'daytip poem-a-day 09/13/02 sponsor child today through child intern give desper poor child hope for brighter futur for onli 15 month you can make differ http this_is_link this_is_link daili list poem-a-day info this_is_link http this_is_link septemb 13 2002 i. from fairest creatur we desir increas that therebi beauti rose might never die but the riper should by time deceas hi tender heir might bear hi memori but thou contract to thine own bright eye feed st thi light st flame with self-substanti fuel make famin where abund lie thyself thi foe to thi sweet self too cruel thou that art now the world fresh ornament and onli herald to the gaudi spring within thine own bud buriest thi content and tender churl makest wast in niggard piti the world or els thi glutton be to eat the world due by the grave and thee shakespear william sonnet william shakespear 1564-1616 wa famou english poet dramatist and actor often regard the greatest english writer of all time shakespear earli life wa spent i

In [11]:
preproc_test_data = test_data['content'].copy().apply(remove_unnecessary_characters).apply(replace_email).apply(replace_link)

print('Test data')
preproc_test_data.head()

Test data


0    re acroread not see printerson thu 2010-04-15 ...
1    america great misleaderurl http this_is_link 8...
2    lowestpric guarante on flea and tick med now y...
3    re problem with apt-get -f install onc upon ti...
4    ack apt-get still fail for me stump rh8 post a...
Name: content, dtype: object

## Feature extraction

### TFIDF

In [12]:
tfidf_vectorizer = TfidfVectorizer('english')
tfidf_features = tfidf_vectorizer.fit_transform(preproc_train_data)

### Count vectorizer features

In [13]:
cv_vectorizer = CountVectorizer('english')
cv_features = cv_vectorizer.fit_transform(preproc_train_data)

### Train test split

In [14]:
features_train, features_test, labels_train, labels_test = train_test_split(tfidf_features, train_data['prediction'], test_size=0.2, random_state=36)

## Model

### Function for hyperparameter tuning

In [15]:
def hyperparameter_tuning(model, tuned_parameters):
    scores = ['precision', 'recall']
    
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(model, tuned_parameters, cv=5,
                           scoring='%s_macro' % score)
        clf.fit(features_train, labels_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        prediction = clf.predict(features_test)
        print(classification_report(labels_test, prediction))
        print()

### Support Vector Classification (SVC / SVM)

#### Default setting

In [16]:
svc = SVC(kernel='sigmoid', gamma=1.0)
svc.fit(features_train, labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
prediction = svc.predict(features_test)
accuracy_score(labels_test, prediction)

0.988

In [18]:
f1_score(labels_test, prediction)

0.9914529914529915

#### Hyperparameter tuning

In [19]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [20]:
# hyperparameter_tuning(SVC(), tuned_parameters)

#### After hyperparameter tuning

In [21]:
# Recall > precision because it is more dangerous for not-spam email marked as spam than the other way around

svc = SVC(kernel='linear', C=10)
svc.fit(features_train, labels_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [22]:
prediction = svc.predict(features_test)
accuracy_score(labels_test, prediction)

0.99

In [23]:
f1_score(labels_test, prediction)

0.9928673323823112

In [24]:
confusion_matrix(labels_test, prediction, labels=[1, 0])

array([[348,   3],
       [  2, 147]])

In [25]:
misclassified_samples = labels_test[labels_test != prediction]

misclassified_samples

442     1
1440    1
1391    0
1770    0
1503    1
Name: prediction, dtype: int64

In [26]:
mis_indexes = misclassified_samples.index.tolist()

In [27]:
pd.options.display.max_colwidth = 10000

In [70]:
scores = zip(tfidf_vectorizer.get_feature_names(),
             np.asarray(tfidf_features.sum(axis=0)).ravel())
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
temp_arr = []
for item in sorted_scores:
    for word in preproc_train_data[442].split():
        if item[0] == word and word not in temp_arr:
            temp_arr.append(word)
            print(word, item[1])

the 240.33175366893477
to 190.11930259218894
of 125.72375108519061
and 124.81130872562416
you 102.83483723502889
in 97.28697137751
it 90.72693380405312
for 82.88066892536122
that 81.97376074474337
nbsp 73.19669142035976
this_is_link 72.40383161801171
http 72.24456230962299
thi 70.90620685096799
on 68.32369397757274
your 63.683416395491534
with 62.95066625509182
have 53.23009355292699
are 52.19581375513457
from 51.355316263644035
do 49.690963606230376
unsubscrib 47.12443783231154
or 46.78975560505487
use 45.45441111370598
if 44.441646381349734
list 44.07645763971184
email 42.06396823792177
can 38.416223659384265
all 36.36086027030789
by 35.587421613223555
get 35.46003240418657
here 35.21439850608956
user 31.722560691735538
no 31.25086148567395
click 30.887894003070425
what 30.48857208036214
more 29.08014516821048
2002 28.80702383507804
out 26.558524491264592
free 25.440513470209034
now 24.68535482405277
about 24.636610594200185
time 24.576602579879168
onli 24.573513911871697
new 23.9763

In [71]:
scores = zip(tfidf_vectorizer.get_feature_names(),
             np.asarray(tfidf_features.sum(axis=0)).ravel())
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
temp_arr = []
for item in sorted_scores:
    for word in preproc_train_data[1440].split():
        if item[0] == word and word not in temp_arr:
            temp_arr.append(word)
            print(word, item[1])

thi 70.90620685096799
pleas 23.342162881837357
address 17.36702250504322
instead 7.192817185279665
geeg 3.262003852791345
erus 0.5786509184067304
prerog 0.5786509184067304


In [29]:
def display_scores(vectorizer, tfidf_result):
    # http://stackoverflow.com/questions/16078015/
    scores = zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for item in sorted_scores[0:50]:
        print("{0:50} Score: {1}".format(item[0], item[1]))

In [30]:
display_scores(tfidf_vectorizer, tfidf_features)

the                                                Score: 240.33175366893477
to                                                 Score: 190.11930259218894
of                                                 Score: 125.72375108519061
and                                                Score: 124.81130872562416
you                                                Score: 102.83483723502889
is                                                 Score: 97.35270767958957
in                                                 Score: 97.28697137751
debian                                             Score: 95.69655404699763
it                                                 Score: 90.72693380405312
for                                                Score: 82.88066892536122
that                                               Score: 81.97376074474337
org                                                Score: 76.59923744033216
nbsp                                               Score: 73.19669142035976
this_is_li

In [31]:
for i in mis_indexes:
    print(i, preproc_train_data[i])

442 person your palm devicefrom nobodi thu sep 17:44:27 2018 content-typ text/html content-transfer-encod 7bit if you ca read thi email pleas go to http this_is_link palm reg edit nbsp nbsp nbsp issu 33 septemb 2002 nbsp nbsp nbsp nbsp person your palm devic thi issu featur app that are uniqu you are custom your launch screen plan trip design special menu protect your extra-speci inform and choos your game puzzl or tenni it all here in septemb handango champion newslett featur softwar what new tip of the trade champion choic nbsp nbsp infosaf plu carri import person info with you secur and alway have quick access to it 16.99 read more buy now nbsp nbsp pocket cook delux full-featur recip app with menu planner shop list and 000 suppli recip 19.95 read more buy now nbsp nbsp launcher plu custom your palm with thi cool applic launcher and use fun theme color and graphic 18.00 read more buy now nbsp nbsp flipdi fruit-flip festiv of fun come to the screen of your palm handheld price reduc 4

In [32]:
for i in range(len(mis_indexes)):
    print(train_data[train_data.index == mis_indexes[i]]['id'], train_data[train_data.index == mis_indexes[i]]['content'])

442    373
Name: id, dtype: int64 442    Personalize your Palm OS deviceFrom nobody Thu Sep  6 17:44:27 2018\nContent-Type: text/html\nContent-Transfer-Encoding: 7bit\n\n\nIf you can't read this email, please go to: http://www.handango.com/palmnewsletter \n\n\n\n   \n    \n    \n    Palm \n      OS&reg; Edition&nbsp;&nbsp;&nbsp;\n      Issue \n      #33, September 2002&nbsp;&nbsp;&nbsp;\n  \n\n\n   \n    &nbsp;\n     \n      \n         \n          \n           \n            Personalize \n              your Palm OS device! This issue features apps that are as unique \n              as you are. Customize your launch screen, plan a trip, design a \n              special menu, protect your extra-special information and choose \n              your game: puzzles or tennis. It's all here in September's Handango \n              Champion newsletter!\n            \n              \n                \n                  Featured \n                    Software\n                \n                 \n  

#### Cross-validation score

In [33]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(svc, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

In [34]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9874884167604543
Accuracy:  0.9828047712190848


### Multinomial Naive Bayes (MNB)

#### Default setting

In [35]:
mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)

MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True)

In [36]:
prediction = mnb.predict(features_test)
accuracy_score(labels_test, prediction)

0.934

In [37]:
f1_score(labels_test, prediction)

0.9549795361527967

#### Hyperparameter tuning

In [38]:
tuned_parameters = [{'alpha': [0, 0.5, 1.0, 1.5, 2.0]}]

In [39]:
# hyperparameter_tuning(MultinomialNB(), tuned_parameters)

#### After hyperparameter tuning

In [40]:
mnb = MultinomialNB(alpha=0)
mnb.fit(features_train, labels_train)

  'setting alpha = %.1e' % _ALPHA_MIN)


MultinomialNB(alpha=0, class_prior=None, fit_prior=True)

In [41]:
prediction = mnb.predict(features_test)
accuracy_score(labels_test, prediction)

0.946

In [42]:
f1_score(labels_test, prediction)

0.9612625538020086

#### Cross-validation

In [43]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(mnb, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


In [44]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9657397774528963
Accuracy:  0.9528127200508802


### Decision Tree Learning (DTL)

#### Default setting

In [45]:
dtl = DecisionTreeClassifier()
dtl.fit(features_train, labels_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [46]:
prediction = dtl.predict(features_test)
accuracy_score(labels_test, prediction)

0.922

In [47]:
f1_score(labels_test, prediction)

0.943884892086331

#### Cross-validation

In [48]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(dtl, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

In [49]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9435089690771952
Accuracy:  0.9220070800283201


### Random Forest

#### Default setting

In [50]:
rf = RandomForestClassifier(max_depth=50, random_state=0)
rf.fit(features_train, labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [51]:
prediction = rf.predict(features_test)
accuracy_score(labels_test, prediction)

0.944

In [52]:
f1_score(labels_test, prediction)

0.9605633802816902

#### Cross-validation

In [53]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(dtl, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

In [54]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.941622020811716
Accuracy:  0.9192038672154688


### K-Nearest Neighbors

#### Default setting

In [55]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(features_train, labels_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [56]:
prediction = knn.predict(features_test)
accuracy_score(labels_test, prediction)

0.906

In [57]:
f1_score(labels_test, prediction)

0.9295352323838081

#### Cross-validation 

In [58]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(knn, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)  

In [59]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9205576150705946
Accuracy:  0.8956046464185856
