## Fix mistakes in the feature_extraction to approach real case, but still will change vocabulary building procedure once we obtain larger dataset.
## Also will apply new minority adjustment method using SMOTE.


In this version, it fixes some training strategy and apply multiple classifiers for the future ensemble classifier. 
Here are some prior assumptions I made:
- Email signatures and bodies class are balanced in the overall dataset. (not in a real case)
- Training set are majority and the split ratio is between 0.6 and 0.9. 
- In my case, I convert the whole dataset at once due to the limited quantity of dataset size, while in real case it should use the bag-of-words from training set to express test set.

In [1]:
import logging # record log event
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Load positive(email signatures) and negative(email body contents) data respectively.

In [2]:
import re
import math
import collections
import nltk
import random
random.seed(0)

def load_data(path_to_data):
    train_pos = []
    train_neg = []
    sigwords = []
    with open(path_to_data+"TotalSignatures.txt",'r') as fes:
        for line in fes:
            sigwords = line.lower().split()#
            train_pos.append((' '.join(sigwords)).split()) #
    with open(path_to_data+"train_content.txt",'r') as ftc:
        for line in ftc:
            words = [w.lower() for w in line.strip().split()]
            if len(words) < 2:
                continue
            train_neg.append(words)
    return train_pos, train_neg

def data_filter(raw_text_data):
    clean_text_data = []
    for entry in raw_text_data:
        clean_entry = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#()_+|~\-={}\[\]:\";'<>?,.\/\\]|[0-9]|[@]", ' ', ' '.join(entry))
        #clean_entry = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#_+|\={}\[\]\";'<>?,.\/]|[0-9]", ' ', ' '.join(entry))
        #clean_entry = ' '.join(entry)
        clean_entry = re.sub('\s+', ' ', clean_entry)
        clean_text_data.append(clean_entry.split())
    return clean_text_data

data_pos, data_neg = load_data('./')
data_pos = data_filter(data_pos)
data_neg = data_filter(data_neg)
data_size = min(len(data_pos), len(data_neg))
data_pos = data_pos[:data_size]
data_neg = data_neg[:data_size]
print data_pos[:5]
print data_neg[:5]
print data_size, len(data_pos), len(data_pos)

[['sincerely'], ['thaleia', 'zariphopoulou'], ['chair', 'of', 'the', 'scientific', 'committee'], ['v', 'n', 'neuhaus', 'professor'], ['dpts', 'of', 'mathematics', 'and', 'msis']]
[['please', 'register', 'wincenty', 'j', 'vince', 'kaminski', 'managing', 'director', 'research'], ['enron', 'wholesale', 'services', 'to', 'the', 'subject', 'conference', 'to', 'be', 'held', 'in', 'houston'], ['on', 'june'], ['if', 'you', 'need', 'more', 'information', 'please', 'contact', 'me', 'at'], ['dear', 'mr', 'ray']]
338 338 338


Extract features using gensim models based on word vectors, and split the transformed dataset into training and test sets. 

In [3]:
from gensim.models.doc2vec import LabeledSentence, Doc2Vec
def feature_extraction_Doc2Vec(data_pos, data_neg): # use the word2vec under the hood
    labeled_data_pos = []
    for index, words in enumerate(data_pos):
        sentence = LabeledSentence(words, ["DATA_POS_%s"%index])
        labeled_data_pos.append(sentence)
    labeled_data_neg = []
    for index, words in enumerate(data_neg):
        sentence = LabeledSentence(words, ["DATA_NEG_%s"%index])
        labeled_data_neg.append(sentence)
    model = Doc2Vec(min_count=1, window=20, size=4000, sample=1e-4, negative=5, workers=4)
    sentences = labeled_data_pos + labeled_data_neg
    model.build_vocab(sentences)
    for i in range(5):
        print "Training iteration %d" %(i)
        random.shuffle(sentences)
        model.train(sentences)
    data_pos_vec, data_neg_vec = [], []
    for index in range(len(labeled_data_pos)):
        doc_vec = model.docvecs["DATA_POS_%s"%index]
        data_pos_vec.append(doc_vec)
    for index in range(len(labeled_data_neg)):
        doc_vec = model.docvecs["DATA_NEG_%s"%index]
        data_neg_vec.append(doc_vec)
    return data_pos_vec, data_neg_vec

data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos, data_neg)
split_ratio = 0.8
cutoff = int(math.floor(data_size*split_ratio))
random.shuffle(data_pos_vec)
random.shuffle(data_neg_vec)
train_pos_vec = data_pos_vec[:cutoff]
train_neg_vec = data_neg_vec[:cutoff]
test_pos_vec = data_pos_vec[cutoff:]
test_neg_vec = data_neg_vec[cutoff:]
#test_pos_vec, test_neg_vec = feature_extraction_Doc2Vec(test_pos, test_neg)
print "data_size = %d" % data_size
print "training_size = %d" % cutoff
print(len(train_pos_vec[0]))
#print(map(list,test_pos_vec[:2]))

Training iteration 0
Training iteration 1
Training iteration 2
Training iteration 3
Training iteration 4
data_size = 338
training_size = 270
4000


Apply binary classifiers to find signature lines.

In [4]:
import sklearn.naive_bayes
import sklearn.linear_model
import sklearn.svm
import sklearn.neighbors.nearest_centroid
import sklearn.tree
def build_model(train_pos_vec, train_neg_vec):
    Y = ["pos"]*len(train_pos_vec) + ["neg"]*len(train_neg_vec)
    X = train_pos_vec + train_neg_vec
    # use multiple classification methods
    svm_model = sklearn.svm.SVC() # SVM
    svm_model.fit(X,Y)
    nnc_model = sklearn.neighbors.nearest_centroid.NearestCentroid() # Nearest Neighbor
    nnc_model.fit(X,Y)
    lr_model = sklearn.linear_model.LogisticRegression() # Logistic Regression
    lr_model.fit(X,Y)
    nb_model = sklearn.naive_bayes.GaussianNB() # Naive Bayes
    nb_model.fit(X,Y)
    dt_model = sklearn.tree.DecisionTreeClassifier()
    dt_model.fit(X,Y)
    return svm_model, nnc_model, lr_model, nb_model, dt_model

svm_model, nnc_model, lr_model, nb_model, dt_model = build_model(train_pos_vec, train_neg_vec)

Evaluate all the models listed by confusion matrix.

In [5]:
def evaluate_model(model, test_pos_vec, test_neg_vec, print_confusion=False):
    test_pos_predict = model.predict(test_pos_vec)
    test_neg_predict = model.predict(test_neg_vec)
    test_pos_Y = ["pos"]*len(test_pos_vec)
    test_neg_Y = ["neg"]*len(test_neg_vec)
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(test_pos_predict)):
        if test_pos_predict[i] == test_pos_Y[i]:
            tp += 1
        else:
            fn += 1
    for i in range(len(test_neg_predict)):
        if test_neg_predict[i] == test_neg_Y[i]:
            tn += 1
        else:
            fp += 1
    accuracy = float(tp+tn) / float(tp+tn+fp+fn+1)
    precision = float(tp) / float(tp+fp+1)
    recall = float(tp) / float(tp+fn+1)
    Fscore = (2*recall*precision) / (recall + precision)
    print str(model)
    if print_confusion:
        print "predicted:\tpos\tneg"
        print "actual:"
        print "pos\t\t%d\t%d" % (tp, fn)
        print "neg\t\t%d\t%d" % (fp, tn)
    print "accuracy: %f" % (accuracy)
    print "precision: %f" % (precision)
    print "recall: %f" % (recall)
    print "Fscore: %f" % (Fscore)
    print '\n'

evaluate_model(svm_model, test_pos_vec, test_neg_vec, True)
evaluate_model(nnc_model, test_pos_vec, test_neg_vec, True)
evaluate_model(lr_model, test_pos_vec, test_neg_vec, True)
evaluate_model(nb_model, test_pos_vec, test_neg_vec, True)
evaluate_model(dt_model, test_pos_vec, test_neg_vec, True)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
predicted:	pos	neg
actual:
pos		49	19
neg		38	30
accuracy: 0.576642
precision: 0.556818
recall: 0.710145
Fscore: 0.624204


NearestCentroid(metric='euclidean', shrink_threshold=None)
predicted:	pos	neg
actual:
pos		41	27
neg		32	36
accuracy: 0.562044
precision: 0.554054
recall: 0.594203
Fscore: 0.573427


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)
predicted:	pos	neg
actual:
pos		40	28
neg		32	36
accuracy: 0.554745
precision: 0.547945
recall: 0.579710
Fscore: 0.563380


GaussianNB()
predicted:	pos	neg
actual:
pos		40	28
neg		33	35
accuracy: 0.547445
precision: 0.540541
recall: 0.579710
Fscore: 0.559441


Decisi

### Reference site:
- [gensim Doc2Vec API](https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.TaggedDocument)
- [Doc2Vec Tutorial](http://rare-technologies.com/doc2vec-tutorial/)
- [Scikit-learn Classifier](http://scikit-learn.org/stable/supervised_learning.html#supervised-learning)