Continue with v2, split the dataset into training and testing datasets by ratio of 60:40.

In [1]:
import logging # record log event
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Load positive(email signatures) and negative(email body contents) data respectively.

In [2]:
import re
import math
import collections
import sklearn.naive_bayes
import sklearn.linear_model
import nltk
import random
random.seed(0)
from gensim.models.doc2vec import LabeledSentence, Doc2Vec

def load_data(path_to_data):
    train_pos = []
    train_neg = []
    sigwords = []
    with open(path_to_data+"EnronSignatures.txt",'r') as fes:
        for line in fes:
            if len(line) <= 3:
                train_pos.append((' '.join(sigwords)).split()) # combine all the texts of each signature
                sigwords = []
            else:
                sigwords.append(' '.join(line.lower().strip().split()))
    with open(path_to_data+"train_content.txt",'r') as ftc:
        for line in ftc:
            words = [w.lower() for w in line.strip().split()]
            if len(words) < 2:
                continue
            train_neg.append(words)
    return train_pos, train_neg

def data_filter(raw_text_data):
    clean_text_data = []
    for entry in raw_text_data:
        clean_entry = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#()_+|~\-={}\[\]:\";'<>?,.\/\\]|[0-9]|[@]", ' ', ' '.join(entry))
        #clean_entry = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#_+|\={}\[\]\";'<>?,.\/]|[0-9]", ' ', ' '.join(entry))
        clean_entry = re.sub('\s+', ' ', clean_entry)
        clean_text_data.append(clean_entry.split())
    return clean_text_data

data_pos, data_neg = load_data('./')
data_pos = data_filter(data_pos)
data_neg = data_filter(data_neg)
#data_size = min(len(data_pos), len(data_neg))
random.shuffle(data_pos)
random.shuffle(data_neg)
#data_pos = data_pos[:data_size]
#data_neg = data_neg[:data_size]
data_size_p = len(data_pos) # --
data_size_n = len(data_neg) # --
cutoff_size_p = int(math.floor(data_size_p*0.6)) # ratio to divide dataset into train and test
cutoff_size_n = int(math.floor(data_size_n*0.6)) #--
train_pos = data_pos[:cutoff_size_p] #--
train_neg = data_neg[:cutoff_size_n] #--
test_pos = data_pos[cutoff_size_p:] #--
test_neg = data_neg[cutoff_size_n:] #--
print train_pos[:5]
print train_neg[:5]
print data_size_p, data_size_n, len(train_pos), len(test_pos) # --

[['helyette', 'geman', 'universite', 'de', 'paris', 'dauphine', 'finance', 'department', 'au', 'de', 'ka', 'grand', 'ecole', 'corgy', 'pontois', 'paris', 'france', 't'], ['kind', 'regards', 'nia', 'mansell', 'infocast', 'conference', 'manager', 'ext', 'fax', 'niam', 'informationforecast', 'com'], ['sincerely', 'daniel', 'yergin', 'chairman', 'cambridge', 'energy', 'research', 'associates'], ['thank', 'you', 'clare', 'fitzgerald', 'director', 'training', 'courses', 'marcus', 'evans', 'x'], ['westin', 'galleria', 'hotel', 'west', 'alabama', 'houston', 'texas']]
[['symposium', 'features', 'leaders', 'from', 'ferc', 'electric'], ['since', 'we', 'add', 'to', 'it', 'frequently'], ['your', 'registration', 'fee', 'is', 'inclusive', 'of', 'the', 'following'], ['and', 'decision', 'makers', 'of', 'the', 'electric', 'power', 'industry'], ['assets', 'management', 'is', 'also', 'very', 'informative', 'a', 'talk', 'about', 'using', 'real', 'option']]
45 338 27 18


Extract features using gensim models.

def feature_extraction(train_pos, train_neg):
    labeled_train_pos = []
    for index, words in enumerate(train_pos):
        sentence = LabeledSentence(words, ["TRAIN_POS_%s"%index])
        labeled_train_pos.append(sentence)
    labeled_train_neg = []
    for index, words in enumerate(train_neg):
        sentence = LabeledSentence(words, ["TRAIN_NEG_%s"%index])
        labeled_train_neg.append(sentence)
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)
    sentences = labeled_train_pos + labeled_train_neg
    model.build_vocab(sentences)
    for i in range(5):
        print "Training iteration %d" %(i)
        random.shuffle(sentences)
        model.train(sentences)
    train_pos_vec, train_neg_vec = [], []
    for index in range(len(labeled_train_pos)):
        doc_vec = model.docvecs["TRAIN_POS_%s"%index]
        train_pos_vec.append(doc_vec)
    for index in range(len(labeled_train_neg)):
        doc_vec = model.docvecs["TRAIN_NEG_%s"%index]
        train_neg_vec.append(doc_vec)
    return train_pos_vec, train_neg_vec

train_pos_vec, train_neg_vec = feature_extraction(train_pos, train_neg)
test_pos_vec, test_neg_vec = feature_extraction(test_pos, test_neg)

In [3]:
def feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Doc2Vec requires LabeledSentence objects as input.
    # Turn the datasets from lists of words to lists of LabeledSentence objects.
    # YOUR CODE HERE
    labeled_train_pos = []
    for index, words in enumerate(train_pos):
        sentence = LabeledSentence(words, ["TRAIN_POS_%s"%index])
        labeled_train_pos.append(sentence)
    labeled_train_neg = []
    for index, words in enumerate(train_neg):
        sentence = LabeledSentence(words, ["TRAIN_NEG_%s"%index])
        labeled_train_neg.append(sentence)
    labeled_test_pos = []
    for index, words in enumerate(test_pos):
        sentence = LabeledSentence(words, ["TEST_POS_%s"%index])
        labeled_test_pos.append(sentence)
    labeled_test_neg = []
    for index, words in enumerate(test_neg):
        sentence = LabeledSentence(words, ["TEST_NEG_%s"%index])
        labeled_test_neg.append(sentence)
    # Initialize model
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg
    model.build_vocab(sentences)
    # Train the model
    # This may take a bit to run 
    for i in range(5):
        print "Training iteration %d" % (i)
        random.shuffle(sentences)
        model.train(sentences)
    # Use the docvecs function to extract the feature vectors for the training and test data
    # YOUR CODE HERE
    train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec = [], [], [], []
    for index in range(len(labeled_train_pos)):
        doc_vec = model.docvecs["TRAIN_POS_%s"%index]
        train_pos_vec.append(doc_vec)
    for index in range(len(labeled_train_neg)):
        doc_vec = model.docvecs["TRAIN_NEG_%s"%index]
        train_neg_vec.append(doc_vec)
    for index in range(len(labeled_test_pos)):
        doc_vec = model.docvecs["TEST_POS_%s"%index]
        test_pos_vec.append(doc_vec)
    for index in range(len(labeled_test_neg)):
        doc_vec = model.docvecs["TEST_NEG_%s"%index]
        test_neg_vec.append(doc_vec)
    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec

train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec = feature_vecs_DOC(train_pos, train_neg, test_pos, test_neg)

Training iteration 0
Training iteration 1
Training iteration 2
Training iteration 3
Training iteration 4


Use featured training set to build model for binary classification.

def build_model(train_pos_vec, train_neg_vec):
    Y = ["pos"]*len(train_pos_vec) + ["neg"]*len(train_neg_vec)
    X = train_pos_vec + train_neg_vec
    lr_model = sklearn.linear_model.LogisticRegression()
    lr_model.fit(X,Y)
    return lr_model

lr_model = build_model(train_pos_vec, train_neg_vec)

In [4]:
import sklearn.naive_bayes
import sklearn.linear_model
import sklearn.svm
import sklearn.neighbors.nearest_centroid
import sklearn.tree
def build_model(train_pos_vec, train_neg_vec):
    Y = ["pos"]*len(train_pos_vec) + ["neg"]*len(train_neg_vec)
    X = train_pos_vec + train_neg_vec
    # use multiple classification methods
    svm_model = sklearn.svm.SVC() # SVM
    svm_model.fit(X,Y)
    nnc_model = sklearn.neighbors.nearest_centroid.NearestCentroid() # Nearest Neighbor
    nnc_model.fit(X,Y)
    lr_model = sklearn.linear_model.LogisticRegression() # Logistic Regression
    lr_model.fit(X,Y)
    nb_model = sklearn.naive_bayes.GaussianNB() # Naive Bayes
    nb_model.fit(X,Y)
    dt_model = sklearn.tree.DecisionTreeClassifier()
    dt_model.fit(X,Y)
    return svm_model, nnc_model, lr_model, nb_model, dt_model

svm_model, nnc_model, lr_model, nb_model, dt_model = build_model(train_pos_vec, train_neg_vec)

Evaluate the model by confusion matrix.

def evaluate_model(model, test_pos_vec, test_neg_vec, print_confusion=False):
    test_pos_predict = model.predict(test_pos_vec)
    test_neg_predict = model.predict(test_neg_vec)
    test_pos_Y = ["pos"]*len(test_pos_vec)
    test_neg_Y = ["neg"]*len(test_neg_vec)
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(test_pos_predict)):
        if test_pos_predict[i] == test_pos_Y[i]:
            tp += 1
        else:
            fn += 1
    for i in range(len(test_neg_predict)):
        if test_neg_predict[i] == test_neg_Y[i]:
            tn += 1
        else:
            fp += 1
    accuracy = float(tp+tn) / float(tp+tn+fp+fn+1)
    precision = float(tp) / float(tp+fp+1)
    recall = float(tp) / float(tp+fn+1)
    if print_confusion:
        print "predicted:\tpos\tneg"
        print "actual:"
        print "pos\t\t%d\t%d" % (tp, fn)
        print "neg\t\t%d\t%d" % (fp, tn)
    print "accuracy: %f" % (accuracy)
    print "precision: %f" % (precision)
    print "recall: %f" % (recall)
    
evaluate_model(lr_model, test_pos_vec, test_neg_vec, True)

In [5]:
def evaluate_model(model, test_pos_vec, test_neg_vec, print_confusion=False):
    test_pos_predict = model.predict(test_pos_vec)
    test_neg_predict = model.predict(test_neg_vec)
    test_pos_Y = ["pos"]*len(test_pos_vec)
    test_neg_Y = ["neg"]*len(test_neg_vec)
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(test_pos_predict)):
        if test_pos_predict[i] == test_pos_Y[i]:
            tp += 1
        else:
            fn += 1
    for i in range(len(test_neg_predict)):
        if test_neg_predict[i] == test_neg_Y[i]:
            tn += 1
        else:
            fp += 1
    accuracy = float(tp+tn) / float(tp+tn+fp+fn+1)
    precision = float(tp) / float(tp+fp+1)
    recall = float(tp) / float(tp+fn+1)
    Fscore = (2*recall*precision) / (recall + precision+0.00001)
    if print_confusion:
        print "predicted:\tpos\tneg"
        print "actual:"
        print "pos\t\t%d\t%d" % (tp, fn)
        print "neg\t\t%d\t%d" % (fp, tn)
    print "accuracy: %f" % (accuracy)
    print "precision: %f" % (precision)
    print "recall: %f" % (recall)
    print "Fscore: %f" % (Fscore)
    
evaluate_model(svm_model, test_pos_vec, test_neg_vec, True)
evaluate_model(nnc_model, test_pos_vec, test_neg_vec, True)
evaluate_model(lr_model, test_pos_vec, test_neg_vec, True)
evaluate_model(nb_model, test_pos_vec, test_neg_vec, True)
evaluate_model(dt_model, test_pos_vec, test_neg_vec, True)

predicted:	pos	neg
actual:
pos		0	18
neg		0	136
accuracy: 0.877419
precision: 0.000000
recall: 0.000000
Fscore: 0.000000
predicted:	pos	neg
actual:
pos		3	15
neg		33	103
accuracy: 0.683871
precision: 0.081081
recall: 0.157895
Fscore: 0.107138
predicted:	pos	neg
actual:
pos		0	18
neg		0	136
accuracy: 0.877419
precision: 0.000000
recall: 0.000000
Fscore: 0.000000
predicted:	pos	neg
actual:
pos		0	18
neg		5	131
accuracy: 0.845161
precision: 0.000000
recall: 0.000000
Fscore: 0.000000
predicted:	pos	neg
actual:
pos		2	16
neg		28	108
accuracy: 0.709677
precision: 0.064516
recall: 0.105263
Fscore: 0.079995
