Continue with v1, but filter the characters by removing some symbols and make positive-negative training set balanced.

In [1]:
import logging # record log event
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Load positive(email signatures) and negative(email body contents) data respectively.

In [2]:
import re
import collections
import sklearn.naive_bayes
import sklearn.linear_model
import nltk
import random
random.seed(0)
from gensim.models.doc2vec import LabeledSentence, Doc2Vec

def load_data(path_to_data):
    train_pos = []
    train_neg = []
    sigwords = []
    with open(path_to_data+"EnronSignatures.txt",'r') as fes:
        for line in fes:
            if len(line) <= 3:
                train_pos.append((' '.join(sigwords)).split()) # combine all the texts of each signature
                sigwords = []
            else:
                sigwords.append(' '.join(line.lower().strip().split()))
    with open(path_to_data+"train_content.txt",'r') as ftc:
        for line in ftc:
            words = [w.lower() for w in line.strip().split()]
            if len(words) < 2:
                continue
            train_neg.append(words)
    return train_pos, train_neg

def data_filter(raw_text_data):
    clean_text_data = []
    for entry in raw_text_data:
        clean_entry = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#()_+|~\-={}\[\]:\";'<>?,.\/\\]|[0-9]|[@]", ' ', ' '.join(entry))
        #clean_entry = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#_+|\={}\[\]\";'<>?,.\/]|[0-9]", ' ', ' '.join(entry))
        clean_entry = re.sub('\s+', ' ', clean_entry)
        clean_text_data.append(clean_entry.split())
    return clean_text_data

train_pos, train_neg = load_data('./')
train_pos = data_filter(train_pos)
train_neg = data_filter(train_neg)
train_size = min(len(train_pos), len(train_neg))
random.shuffle(train_pos)
random.shuffle(train_neg)
train_pos = train_pos[:train_size]
train_neg = train_neg[:train_size]
print train_pos[:10]
print train_neg[:10]

[['helyette', 'geman', 'universite', 'de', 'paris', 'dauphine', 'finance', 'department', 'au', 'de', 'ka', 'grand', 'ecole', 'corgy', 'pontois', 'paris', 'france', 't:', '-', '-'], ['kind', 'regards', 'nia', 'mansell', 'infocast', 'conference', 'manager', '(', ')', '-', 'ext', '(', ')', '-', 'fax', 'niam@informationforecast', 'com'], ['sincerely', 'daniel', 'yergin', 'chairman', 'cambridge', 'energy', 'research', 'associates'], ['thank', 'you', 'clare', 'fitzgerald', 'director', 'training', 'courses', 'marcus', 'evans', '-', '-', 'x'], ['westin', 'galleria', 'hotel', 'west', 'alabama', 'houston', 'texas'], ['helyette', 'geman', 'universite', 'de', 'paris', 'dauphine', 'finance', 'department', 'au', 'de', 'ka', 'grand', 'ecole', 'corgy', 'pontois', 'paris', 'france', 't:', '-', '-'], ['vincent', 'kaminski', 'enron', 'credit', 'smith', 'street', 'room', 'eb', 'houston', 'tx', '-', 't:', '-', '-', 'f:', '-', '-'], ['duane', 'seppi', 'carnegie', 'mellon', 'university', 'graduate', 'school'

Extract features using gensim models.

In [3]:
def feature_extraction(train_pos, train_neg):
    labeled_train_pos = []
    for index, words in enumerate(train_pos):
        sentence = LabeledSentence(words, ["TRAIN_POS_%s"%index])
        labeled_train_pos.append(sentence)
    labeled_train_neg = []
    for index, words in enumerate(train_neg):
        sentence = LabeledSentence(words, ["TRAIN_NEG_%s"%index])
        labeled_train_neg.append(sentence)
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)
    sentences = labeled_train_pos + labeled_train_neg
    model.build_vocab(sentences)
    for i in range(5):
        print "Training iteration %d" %(i)
        random.shuffle(sentences)
        model.train(sentences)
    train_pos_vec, train_neg_vec = [], []
    for index in range(len(labeled_train_pos)):
        doc_vec = model.docvecs["TRAIN_POS_%s"%index]
        train_pos_vec.append(doc_vec)
    for index in range(len(labeled_train_neg)):
        doc_vec = model.docvecs["TRAIN_NEG_%s"%index]
        train_neg_vec.append(doc_vec)
    return train_pos_vec, train_neg_vec

train_pos_vec, train_neg_vec = feature_extraction(train_pos, train_neg)

Training iteration 0
Training iteration 1
Training iteration 2
Training iteration 3
Training iteration 4


Use featured training set to build model for binary classification.

In [4]:
def build_model(train_pos_vec, train_neg_vec):
    Y = ["pos"]*len(train_pos_vec) + ["neg"]*len(train_neg_vec)
    X = train_pos_vec + train_neg_vec
    lr_model = sklearn.linear_model.LogisticRegression()
    lr_model.fit(X,Y)
    return lr_model

lr_model = build_model(train_pos_vec, train_neg_vec)

Evaluate the model by confusion matrix.

In [5]:
def evaluate_model(model, test_pos_vec, test_neg_vec, print_confusion=False):
    test_pos_predict = model.predict(test_pos_vec)
    test_neg_predict = model.predict(test_neg_vec)
    test_pos_Y = ["pos"]*len(test_pos_vec)
    test_neg_Y = ["neg"]*len(test_neg_vec)
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(test_pos_predict)):
        if test_pos_predict[i] == test_pos_Y[i]:
            tp += 1
        else:
            fn += 1
    for i in range(len(test_neg_predict)):
        if test_neg_predict[i] == test_neg_Y[i]:
            tn += 1
        else:
            fp += 1
    accuracy = float(tp+tn) / float(tp+tn+fp+fn+1)
    precision = float(tp) / float(tp+fp+1)
    recall = float(tp) / float(tp+fn+1)
    if print_confusion:
        print "predicted:\tpos\tneg"
        print "actual:"
        print "pos\t\t%d\t%d" % (tp, fn)
        print "neg\t\t%d\t%d" % (fp, tn)
    print "accuracy: %f" % (accuracy)
    print "precision: %f" % (precision)
    print "recall: %f" % (recall)
    
evaluate_model(lr_model, train_pos_vec, train_neg_vec, True)

predicted:	pos	neg
actual:
pos		38	7
neg		8	37
accuracy: 0.824176
precision: 0.808511
recall: 0.826087
