# Spam Filtering

## Import libraries

In [73]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from random import randint
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

import numpy as np
import pandas as pd
import re

## Pre-processing

### Functions

In [50]:
def remove_stop_words(contents):
    stop_words = list(stopwords.words('english'))
    for w in stop_words:
        contents = contents.replace(w, '')
        
    return contents

#### Tokenize and remove unnecessary characters

In [51]:
def remove_unnecessary_characters(contents):
    contents = contents.replace('\n', ' ')
    contents = contents.replace('..', '')
    contents = contents.replace('--', '')
    contents = contents.replace('==', '')
    contents = contents.replace('///', '')
    contents = contents.replace('\\\\', '')
    contents = ' '.join(contents.split())
    contents = contents.strip().lower()
    
#     contents = remove_stop_words(contents)
    tokenizer = RegexpTokenizer('[A-Za-z0-9\@\.\&\/\:\$\-\_]+')
    tokens = tokenizer.tokenize(contents)
    
    tokens = ' '.join( [i for i in tokens if len(i) > 1])
    
    return tokens

#### Replace e-mail address with "this_is_email"

In [52]:
def replace_email(content):
    pattern = re.compile('[\w\/\.\-]+\@[\w\/\.\-]+\.[\w]+')
    replaced_content = re.sub(pattern, 'this_is_email', content)
    return replaced_content

#### Replace link address with "this_is_link"

In [53]:
def replace_link(content):
    pattern = re.compile('(http[s]?:\/\/|www\.)?[\w\/\.\-]+\.(com|html|php)([\/][\w\/\.\-]*)*')
    replaced_content = re.sub(pattern, 'this_is_link', content)
    return replaced_content

### Main program

#### Read CSV data for train data and test data

In [54]:
train_data = pd.read_csv('dataset/train_data.csv')
test_data = pd.read_csv('dataset/test_data.csv')

#### Tokenize remove unnecessary characters for train data and test data

In [55]:
preproc_train_data = train_data['content'].copy().apply(remove_unnecessary_characters).apply(replace_email).apply(replace_link)

print('Train data')
preproc_train_data.head()

Train data


0    daytips poem-a-day: 09/13/02 sponsor child tod...
1    jody sent you messagejody sent you message. tr...
2    re: tricky perl question ascending orderjozsi ...
3    this_is_email to unsubscribe email to this_is_...
4    re: re moment of silence for the first amendme...
Name: content, dtype: object

In [56]:
preproc_test_data = test_data['content'].copy().apply(remove_unnecessary_characters).apply(replace_email).apply(replace_link)

print('Test data')
preproc_test_data.head()

Test data


0    re: acroread not seeing printerson thu 2010-04...
1    america great misleaderurl: this_is_link 86740...
2    lowestprices guaranteed on flea and tick meds ...
3    re: problems with apt-get -f install once upon...
4    ack apt-get still failing for me stumped. rh8 ...
Name: content, dtype: object

## Feature extraction

### TFIDF

In [58]:
vectorizer = TfidfVectorizer('english')
tfidf_features = vectorizer.fit_transform(preproc_train_data)

### Count vectorizer features

In [59]:
vectorizer = CountVectorizer('english')
cv_features = vectorizer.fit_transform(preproc_train_data)

### Train test split

In [102]:
features_train, features_test, labels_train, labels_test = train_test_split(tfidf_features, train_data['prediction'], test_size=0.2, random_state=randint(0, 100))

## Model

### Function for hyperparameter tuning

In [103]:
def hyperparameter_tuning(model, tuned_parameters):
    scores = ['precision', 'recall']
    
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(model, tuned_parameters, cv=5,
                           scoring='%s_macro' % score)
        clf.fit(features_train, labels_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        prediction = clf.predict(features_test)
        print(classification_report(labels_test, prediction))
        print()

### Support Vector Classification (SVC / SVM)

#### Default setting

In [104]:
svc = SVC(kernel='sigmoid', gamma=1.0)
svc.fit(features_train, labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [105]:
prediction = svc.predict(features_test)
accuracy_score(labels_test, prediction)

0.974

In [106]:
f1_score(labels_test, prediction)

0.9808541973490427

#### Hyperparameter tuning

In [107]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [108]:
# hyperparameter_tuning(SVC(), tuned_parameters)

#### After hyperparameter tuning

In [109]:
# Recall > precision because it is more dangerous for not-spam email marked as spam than the other way around

svc = SVC(kernel='linear', C=10)
svc.fit(features_train, labels_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [110]:
prediction = svc.predict(features_test)
accuracy_score(labels_test, prediction)

0.976

In [111]:
f1_score(labels_test, prediction)

0.9823529411764705

#### Cross-validation score

In [112]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(svc, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

In [113]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9874994639745711
Accuracy:  0.9828015712062849


### Multinomial Naive Bayes (MNB)

#### Default setting

In [114]:
mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)

MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True)

In [115]:
prediction = mnb.predict(features_test)
accuracy_score(labels_test, prediction)

0.944

In [116]:
f1_score(labels_test, prediction)

0.9598853868194843

#### Hyperparameter tuning

In [117]:
tuned_parameters = [{'alpha': [0, 0.5, 1.0, 1.5, 2.0]}]

In [118]:
# hyperparameter_tuning(MultinomialNB(), tuned_parameters)

#### After hyperparameter tuning

In [119]:
mnb = MultinomialNB(alpha=0)
mnb.fit(features_train, labels_train)

  'setting alpha = %.1e' % _ALPHA_MIN)


MultinomialNB(alpha=0, class_prior=None, fit_prior=True)

In [120]:
prediction = mnb.predict(features_test)
accuracy_score(labels_test, prediction)

0.954

In [121]:
f1_score(labels_test, prediction)

0.9662261380323054

#### Cross-validation

In [122]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(mnb, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


In [123]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.963450668623338
Accuracy:  0.9496119216476867


### Decision Tree Learning (DTL)

#### Default setting

In [124]:
dtl = DecisionTreeClassifier()
dtl.fit(features_train, labels_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [125]:
prediction = dtl.predict(features_test)
accuracy_score(labels_test, prediction)

0.922

In [126]:
f1_score(labels_test, prediction)

0.9422222222222223

#### Cross-validation

In [127]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(dtl, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

In [128]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9445164539442132
Accuracy:  0.923619046476186


### Random Forest

#### Default setting

In [129]:
rf = RandomForestClassifier(max_depth=50, random_state=0)
rf.fit(features_train, labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [130]:
prediction = rf.predict(features_test)
accuracy_score(labels_test, prediction)

0.95

In [131]:
f1_score(labels_test, prediction)

0.9636098981077147

#### Cross-validation

In [132]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(dtl, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

In [133]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9465885384725949
Accuracy:  0.9264166544666178


### K-Nearest Neighbors

#### Default setting

In [134]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(features_train, labels_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [135]:
prediction = knn.predict(features_test)
accuracy_score(labels_test, prediction)

0.884

In [136]:
f1_score(labels_test, prediction)

0.9088050314465409

#### Cross-validation 

In [137]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(knn, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)  

In [138]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9115089685450982
Accuracy:  0.8852110304441216
