In [26]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import re

## Pre-processing

### Functions

In [2]:
def remove_stop_words(contents):
    stop_words = list(stopwords.words('english'))
    for w in stop_words:
        contents = contents.replace(w, '')
        
    return contents

#### Tokenize and remove unnecessary characters

In [3]:
def remove_unnecessary_characters(contents):
    contents = contents.replace('\n', ' ')
    contents = contents.replace('..', '')
    contents = contents.replace('--', '')
    contents = contents.replace('==', '')
    contents = contents.replace('///', '')
    contents = contents.replace('\\\\', '')
    contents = ' '.join(contents.split())
    contents = contents.strip().lower()
    
#     contents = remove_stop_words(contents)
    tokenizer = RegexpTokenizer('[A-Za-z0-9\@\.\&\/\:\$\-\_]+')
    tokens = tokenizer.tokenize(contents)
    
    tokens = ' '.join( [i for i in tokens if len(i) > 1])
    
    return tokens

In [4]:
def replace_email(content):
    pattern = re.compile('[\w\/\.\-]+\@[\w\/\.\-]+\.[\w]+')
    replaced_content = re.sub(pattern, 'this_is_email', content)
    return replaced_content

In [5]:
def replace_link(content):
    pattern = re.compile('(http[s]?:\/\/|www\.)?[\w\/\.\-]+\.(com|html|php)([\/][\w\/\.\-]*)*')
    replaced_content = re.sub(pattern, 'this_is_link', content)
    return replaced_content

### Main program

#### Read CSV data for train data and test data

In [6]:
train_data = pd.read_csv('dataset/train_data.csv')
test_data = pd.read_csv('dataset/test_data.csv')

#### Tokenize remove unnecessary characters for train data and test data

In [7]:
preproc_train_data = train_data['content'].copy().apply(remove_unnecessary_characters).apply(replace_email).apply(replace_link)

print('Train data')
preproc_train_data.head()

Train data


0    daytips poem-a-day: 09/13/02 sponsor child tod...
1    jody sent you messagejody sent you message. tr...
2    re: tricky perl question ascending orderjozsi ...
3    this_is_email to unsubscribe email to this_is_...
4    re: re moment of silence for the first amendme...
Name: content, dtype: object

In [8]:
preproc_test_data = test_data['content'].copy().apply(remove_unnecessary_characters).apply(replace_email).apply(replace_link)

print('Test data')
preproc_test_data.head()

Test data


0    re: acroread not seeing printerson thu 2010-04...
1    america great misleaderurl: this_is_link 86740...
2    lowestprices guaranteed on flea and tick meds ...
3    re: problems with apt-get -f install once upon...
4    ack apt-get still failing for me stumped. rh8 ...
Name: content, dtype: object

In [9]:
corpus = []

In [10]:
for row in preproc_train_data.iteritems():
    corpus.append(row[1].split())

In [11]:
for row in preproc_test_data.iteritems():
    corpus.append(row[1].split())

In [12]:
import gensim

In [13]:
model_cbow = gensim.models.Word2Vec(
    corpus,
    size=50,
    sg=0,
    window=2,
    min_count=5,
    workers=4
)

In [14]:
model_cbow.train(corpus, total_examples=len(corpus), epochs=50)

(43343874, 58472250)

In [15]:
w2v_cbow = dict(zip(model_cbow.wv.index2word, model_cbow.wv.syn0))

  """Entry point for launching an IPython kernel.


In [16]:
model_sg = gensim.models.Word2Vec(
    corpus,
    size=50,
    sg=1,
    window=2,
    min_count=5,
    workers=4
)

In [17]:
model_sg.train(corpus, total_examples=len(corpus), epochs=50)

(43340437, 58472250)

In [18]:
w2v_sg = dict(zip(model_sg.wv.index2word, model_sg.wv.syn0))

  """Entry point for launching an IPython kernel.


In [19]:
len(w2v_sg.items())

16893

In [49]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = 50

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.max([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


In [50]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC

etree_w2v_cbow = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v_cbow)),
    ("svc", SVC(kernel='sigmoid', gamma=1.0))])
etree_w2v_sg = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v_sg)),
    ("svc", SVC(kernel='sigmoid', gamma=1.0))])

In [51]:
features_train, features_test, labels_train, labels_test = train_test_split(preproc_train_data, train_data['prediction'], test_size=0.2, random_state=24)

In [52]:
etree_w2v_cbow.fit(features_train, labels_train)
etree_w2v_sg.fit(features_train, labels_train)

Pipeline(memory=None,
     steps=[('word2vec vectorizer', <__main__.MeanEmbeddingVectorizer object at 0x7fa97560f588>), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

## CBOW

In [53]:
prediction = etree_w2v_cbow.predict(features_test)
accuracy_score(labels_test, prediction)

0.68

## SG

In [54]:
prediction = etree_w2v_sg.predict(features_test)
accuracy_score(labels_test, prediction)

0.68