# Spam Filtering

## Import libraries

In [108]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import re

## Pre-processing

### Functions

In [109]:
def remove_stop_words(contents):
    stop_words = list(stopwords.words('english'))
    for w in stop_words:
        contents = contents.replace(w, '')
        
    return contents

#### Tokenize and remove unnecessary characters

In [146]:
def remove_unnecessary_characters(contents):
    contents = contents.replace('\n', ' ')
    contents = contents.replace('..', '')
    contents = contents.replace('--', '')
    contents = contents.replace('==', '')
    contents = contents.replace('///', '')
    contents = contents.replace('\\\\', '')
    contents = ' '.join(contents.split())
    contents = contents.strip().lower()
    
#     contents = remove_stop_words(contents)
    tokenizer = RegexpTokenizer('[A-Za-z0-9\@\.\&\/\:\$\-\_]+')
    tokens = tokenizer.tokenize(contents)
    
    tokens = ' '.join( [i for i in tokens if len(i) > 1])
    
    return tokens

In [3]:
def replace_email(content):
    pattern = re.compile('[\w\/\.\-]+\@[\w\/\.\-]+\.[\w]+')
    replaced_content = re.sub(pattern, 'this_is_email', content)
    return replaced_content

In [4]:
def replace_link(content):
    pattern = re.compile('(http[s]?:\/\/|www\.)?[\w\/\.\-]+\.(com|html|php)([\/][\w\/\.\-]*)*')
    replaced_content = re.sub(pattern, 'this_is_link', content)
    return replaced_content

### Main program

#### Read CSV data for train data and test data

In [5]:
train_data = pd.read_csv('dataset/train_data.csv')
test_data = pd.read_csv('dataset/test_data.csv')

#### Tokenize remove unnecessary characters for train data and test data

In [147]:
preproc_train_data = train_data['content'].copy().apply(remove_unnecessary_characters).apply(replace_email).apply(replace_link)

print('Train data')
preproc_train_data.head()

Train data


0    daytips poem-a-day: 09/13/02 sponsor child tod...
1    jody sent you messagejody sent you message. tr...
2    re: tricky perl question ascending orderjozsi ...
3    this_is_email to unsubscribe email to this_is_...
4    re: re moment of silence for the first amendme...
Name: content, dtype: object

In [148]:
preproc_test_data = test_data['content'].copy().apply(remove_unnecessary_characters).apply(replace_email).apply(replace_link)

print('Test data')
preproc_test_data.head()

Test data


0    re: acroread not seeing printerson thu 2010-04...
1    america great misleaderurl: this_is_link 86740...
2    lowestprices guaranteed on flea and tick meds ...
3    re: problems with apt-get -f install once upon...
4    ack apt-get still failing for me stumped. rh8 ...
Name: content, dtype: object

## Feature extraction

In [149]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### TFIDF

In [150]:
vectorizer = TfidfVectorizer('english')
tfidf_features = vectorizer.fit_transform(preproc_train_data)

### Count vectorizer features

In [151]:
vectorizer = CountVectorizer('english')
cv_features = vectorizer.fit_transform(preproc_train_data)

### Train test split

In [193]:
features_train, features_test, labels_train, labels_test = train_test_split(tfidf_features, train_data['prediction'], test_size=0.2, random_state=24)

## Model

In [194]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [195]:
def hyperparameter_tuning(model, tuned_parameters):
    scores = ['precision', 'recall']
    
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(model, tuned_parameters, cv=5,
                           scoring='%s_macro' % score)
        clf.fit(features_train, labels_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        prediction = clf.predict(features_test)
        print(classification_report(labels_test, prediction))
        print()

### Support Vector Classification (SVC / SVM)

In [196]:
svc = SVC(kernel='sigmoid', gamma=1.0)
svc.fit(features_train, labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [197]:
prediction = svc.predict(features_test)
accuracy_score(labels_test, prediction)

0.978

In [198]:
f1_score(labels_test, prediction)

0.983941605839416

#### Hyperparameter tuning

In [199]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [200]:
hyperparameter_tuning(SVC(), tuned_parameters)

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Best parameters set found on development set:

{'C': 10, 'kernel': 'linear'}

Grid scores on development set:

0.345 (+/-0.001) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.345 (+/-0.001) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.345 (+/-0.001) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.345 (+/-0.001) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.932 (+/-0.013) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.345 (+/-0.001) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.980 (+/-0.018) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.932 (+/-0.013) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.976 (+/-0.021) for {'C': 1, 'kernel': 'linear'}
0.980 (+/-0.019) for {'C': 10, 'kernel': 'linear'}
0.980 (+/-0.019) for {'C': 100, 'kernel': 'linear'}
0.980 (+/-0.019) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    

In [201]:
# Recall > precision because it is more dangerous for not-spam email marked as spam than the other way around

svc = SVC(kernel='linear', C=10)
svc.fit(features_train, labels_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [202]:
prediction = svc.predict(features_test)
accuracy_score(labels_test, prediction)

0.982

In [203]:
f1_score(labels_test, prediction)

0.9868613138686132

### Multinomial Naive Bayes (MNB)

In [204]:
mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)

MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True)

In [205]:
prediction = mnb.predict(features_test)
accuracy_score(labels_test, prediction)

0.94

In [206]:
f1_score(labels_test, prediction)

0.9577464788732395

### Hyperparameter tuning

In [207]:
tuned_parameters = [{'alpha': [0, 0.5, 1.0, 1.5, 2.0]}]

In [208]:
hyperparameter_tuning(MultinomialNB(), tuned_parameters)

# Tuning hyper-parameters for precision



  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Best parameters set found on development set:

{'alpha': 0}

Grid scores on development set:

0.944 (+/-0.018) for {'alpha': 0}
0.910 (+/-0.015) for {'alpha': 0.5}
0.890 (+/-0.012) for {'alpha': 1.0}
0.881 (+/-0.009) for {'alpha': 1.5}
0.874 (+/-0.011) for {'alpha': 2.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.92      0.91      0.92       160
          1       0.96      0.96      0.96       340

avg / total       0.95      0.95      0.95       500


# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'alpha': 0}

Grid scores on development set:

0.938 (+/-0.027) for {'alpha': 0}
0.764 (+/-0.058) for {'alpha': 0.5}
0.690 (+/-0.063) for {'alpha': 1.0}
0.652 (+/-0.035) for {'alpha': 1.5}
0.623 (+/-0.043) for {'alpha': 2.0}

Detailed classification report:

The model is trained on the ful

  'setting alpha = %.1e' % _ALPHA_MIN)


In [209]:
mnb = MultinomialNB(alpha=0)
mnb.fit(features_train, labels_train)

  'setting alpha = %.1e' % _ALPHA_MIN)


MultinomialNB(alpha=0, class_prior=None, fit_prior=True)

In [210]:
prediction = mnb.predict(features_test)
accuracy_score(labels_test, prediction)

0.948

In [211]:
f1_score(labels_test, prediction)

0.9618768328445748

### Decision Tree Learning (DTL)

In [212]:
dtl = DecisionTreeClassifier()
dtl.fit(features_train, labels_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [213]:
prediction = dtl.predict(features_test)
accuracy_score(labels_test, prediction)

0.946

In [214]:
f1_score(labels_test, prediction)

0.9604685212298683

### Random Forest

In [215]:
rf = RandomForestClassifier(max_depth=50, random_state=0)
rf.fit(features_train, labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [216]:
prediction = rf.predict(features_test)
accuracy_score(labels_test, prediction)

0.94

In [217]:
f1_score(labels_test, prediction)

0.9572649572649573

### K-Nearest Neighbors

In [218]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(features_train, labels_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [219]:
prediction = knn.predict(features_test)
accuracy_score(labels_test, prediction)

0.89

In [220]:
f1_score(labels_test, prediction)

0.9144634525660965