# Spam Filtering

## Import libraries

In [1]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from random import randint
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

import nltk
import numpy as np
import pandas as pd
import re

  from collections import Sequence
  from numpy.core.umath_tests import inner1d


## Pre-processing

### Functions

In [2]:
def remove_stop_words(contents):
    stop_words = list(stopwords.words('english'))
    for w in stop_words:
        contents = contents.replace(w, '')
        
    return contents

In [3]:
def lemmatize(contents):
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(contents)
    
    lemmatized_tokens = []
    for token in tokens:
        lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token))
        
    return ' '.join(lemmatized_tokens)

In [4]:
def stem(contents):
    porter_stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(contents)
    
    stemmed_tokens = []
    for token in tokens:
        stemmed_tokens.append(porter_stemmer.stem(token))
    
    return ' '.join(stemmed_tokens)

#### Tokenize and remove unnecessary characters

In [5]:
def remove_unnecessary_characters(contents):
    contents = contents.replace('\n', ' ')
    contents = contents.replace('..', '')
    contents = contents.replace('--', '')
    contents = contents.replace('==', '')
    contents = contents.replace('///', '')
    contents = contents.replace('\\\\', '')
    contents = ' '.join(contents.split())
    contents = contents.strip().lower()
    
#     contents = remove_stop_words(contents)
    contents = lemmatize(contents)
    contents = stem(contents)
    tokenizer = RegexpTokenizer('[A-Za-z0-9\@\.\&\/\:\$\-\_]+')
    tokens = tokenizer.tokenize(contents)
    
    tokens = ' '.join( [i for i in tokens if len(i) > 1])
    
    return tokens

#### Replace e-mail address with "this_is_email"

In [6]:
def replace_email(content):
    pattern = re.compile('[\w\/\.\-]+\@[\w\/\.\-]+\.[\w]+')
    replaced_content = re.sub(pattern, 'this_is_email', content)
    return replaced_content

#### Replace link address with "this_is_link"

In [7]:
def replace_link(content):
    pattern = re.compile('(http[s]?:\/\/|www\.)?[\w\/\.\-]+\.(com|html|php)([\/][\w\/\.\-]*)*')
    replaced_content = re.sub(pattern, 'this_is_link', content)
    return replaced_content

### Main program

#### Read CSV data for train data and test data

In [8]:
train_data = pd.read_csv('dataset/train_data.csv')
test_data = pd.read_csv('dataset/test_data.csv')

#### Tokenize remove unnecessary characters for train data and test data

In [9]:
preproc_train_data = train_data['content'].copy().apply(remove_unnecessary_characters).apply(replace_email).apply(replace_link)

print('Train data')
preproc_train_data.head()

Train data


0    daytip poem-a-day 09/13/02 sponsor child today...
1    jodi sent you messagejodi sent you messag tri ...
2    re tricki perl question ascend orderjozsi vadk...
3    suscribeyordanisp dmesd.vcl.rimed.cu to unsubs...
4    re re moment of silenc for the first amend fwd...
Name: content, dtype: object

In [10]:
preproc_test_data = test_data['content'].copy().apply(remove_unnecessary_characters).apply(replace_email).apply(replace_link)

print('Test data')
preproc_test_data.head()

Test data


0    re acroread not see printerson thu 2010-04-15 ...
1    america great misleaderurl http this_is_link 8...
2    lowestpric guarante on flea and tick med now y...
3    re problem with apt-get -f install onc upon ti...
4    ack apt-get still fail for me stump rh8 post a...
Name: content, dtype: object

## Feature extraction

### TFIDF

In [11]:
vectorizer = TfidfVectorizer('english')
tfidf_features = vectorizer.fit_transform(preproc_train_data)

### Count vectorizer features

In [12]:
vectorizer = CountVectorizer('english')
cv_features = vectorizer.fit_transform(preproc_train_data)

### Train test split

In [13]:
features_train, features_test, labels_train, labels_test = train_test_split(tfidf_features, train_data['prediction'], test_size=0.2, random_state=randint(0, 100))

## Model

### Function for hyperparameter tuning

In [14]:
def hyperparameter_tuning(model, tuned_parameters):
    scores = ['precision', 'recall']
    
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(model, tuned_parameters, cv=5,
                           scoring='%s_macro' % score)
        clf.fit(features_train, labels_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        prediction = clf.predict(features_test)
        print(classification_report(labels_test, prediction))
        print()

### Support Vector Classification (SVC / SVM)

#### Default setting

In [15]:
svc = SVC(kernel='sigmoid', gamma=1.0)
svc.fit(features_train, labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
prediction = svc.predict(features_test)
accuracy_score(labels_test, prediction)

0.988

In [17]:
f1_score(labels_test, prediction)

0.9914529914529915

#### Hyperparameter tuning

In [18]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [19]:
# hyperparameter_tuning(SVC(), tuned_parameters)

#### After hyperparameter tuning

In [20]:
# Recall > precision because it is more dangerous for not-spam email marked as spam than the other way around

svc = SVC(kernel='linear', C=10)
svc.fit(features_train, labels_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
prediction = svc.predict(features_test)
accuracy_score(labels_test, prediction)

0.99

In [22]:
f1_score(labels_test, prediction)

0.9928673323823112

#### Cross-validation score

In [23]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(svc, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

In [24]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9874884167604543
Accuracy:  0.9828047712190848


### Multinomial Naive Bayes (MNB)

#### Default setting

In [25]:
mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)

MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True)

In [26]:
prediction = mnb.predict(features_test)
accuracy_score(labels_test, prediction)

0.934

In [27]:
f1_score(labels_test, prediction)

0.9549795361527967

#### Hyperparameter tuning

In [28]:
tuned_parameters = [{'alpha': [0, 0.5, 1.0, 1.5, 2.0]}]

In [29]:
# hyperparameter_tuning(MultinomialNB(), tuned_parameters)

#### After hyperparameter tuning

In [30]:
mnb = MultinomialNB(alpha=0)
mnb.fit(features_train, labels_train)

  'setting alpha = %.1e' % _ALPHA_MIN)


MultinomialNB(alpha=0, class_prior=None, fit_prior=True)

In [31]:
prediction = mnb.predict(features_test)
accuracy_score(labels_test, prediction)

0.946

In [32]:
f1_score(labels_test, prediction)

0.9612625538020086

#### Cross-validation

In [33]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(mnb, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


In [34]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9657397774528963
Accuracy:  0.9528127200508802


### Decision Tree Learning (DTL)

#### Default setting

In [35]:
dtl = DecisionTreeClassifier()
dtl.fit(features_train, labels_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [36]:
prediction = dtl.predict(features_test)
accuracy_score(labels_test, prediction)

0.918

In [37]:
f1_score(labels_test, prediction)

0.9410071942446044

#### Cross-validation

In [38]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(dtl, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

In [39]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9435910189322184
Accuracy:  0.9220070736282946


### Random Forest

#### Default setting

In [40]:
rf = RandomForestClassifier(max_depth=50, random_state=0)
rf.fit(features_train, labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [41]:
prediction = rf.predict(features_test)
accuracy_score(labels_test, prediction)

0.944

In [42]:
f1_score(labels_test, prediction)

0.9605633802816902

#### Cross-validation

In [43]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(dtl, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)

In [44]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9441058673530656
Accuracy:  0.9228086768347075


### K-Nearest Neighbors

#### Default setting

In [45]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(features_train, labels_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [46]:
prediction = knn.predict(features_test)
accuracy_score(labels_test, prediction)

0.906

In [47]:
f1_score(labels_test, prediction)

0.9295352323838081

#### Cross-validation 

In [48]:
scoring = {'acc': 'accuracy',
           'f1': 'f1',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(knn, tfidf_features, train_data['prediction'], scoring=scoring,
                         cv=5, return_train_score=True)  

In [49]:
print('F1-score: ', np.mean(scores['test_f1']))
print('Accuracy: ', np.mean(scores['test_acc']))

F1-score:  0.9205576150705946
Accuracy:  0.8956046464185856
