### Update code with cross-fold validation and SMOTE minority data synthesis

In [1]:
import logging # record log event
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Load and filter raw text data with regular expressions.

In [2]:
import re
import math
import collections
import nltk
import random
random.seed(0)

def load_data(path_to_data):
    train_pos = []
    train_neg = []
    sigwords = []
    with open(path_to_data+"EnronSignatures.txt",'r') as fes:
        for line in fes:
            if len(line) <= 3:
                train_pos.append((' '.join(sigwords)).split()) # combine all the texts of each signature
                sigwords = []
            else:
                sigwords.append(' '.join(line.lower().strip().split()))
    with open(path_to_data+"train_content.txt",'r') as ftc:
        for line in ftc:
            words = [w.lower() for w in line.strip().split()]
            if len(words) < 2:
                continue
            train_neg.append(words)
    return train_pos, train_neg

def data_filter(raw_text_data):
    clean_text_data = []
    for entry in raw_text_data:
        clean_entry = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#()_+|~\-={}\[\]:\";'<>?,.\/\\]|[0-9]|[@]", ' ', ' '.join(entry))
        #clean_entry = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#_+|\={}\[\]\";'<>?,.\/]|[0-9]", ' ', ' '.join(entry))
        #clean_entry = ' '.join(entry)
        clean_entry = re.sub('\s+', ' ', clean_entry)
        clean_text_data.append(clean_entry.split())
    return clean_text_data

Extract features using Doc2Vec and convert text tokens into numerical vectors.

In [3]:
from gensim.models.doc2vec import LabeledSentence, Doc2Vec
def feature_extraction_Doc2Vec(data_pos, data_neg): # use the word2vec under the hood
    labeled_data_pos = []
    for index, words in enumerate(data_pos):
        sentence = LabeledSentence(words, ["DATA_POS_%s"%index])
        labeled_data_pos.append(sentence)
    labeled_data_neg = []
    for index, words in enumerate(data_neg):
        sentence = LabeledSentence(words, ["DATA_NEG_%s"%index])
        labeled_data_neg.append(sentence)
    model = Doc2Vec(min_count=1, window=20, size=4000, sample=1e-4, negative=5, workers=4)
    sentences = labeled_data_pos + labeled_data_neg
    model.build_vocab(sentences)
    for i in range(5):
        #print "Training iteration %d" %(i)
        random.shuffle(sentences)
        model.train(sentences)
    data_pos_vec, data_neg_vec = [], []
    for index in range(len(labeled_data_pos)):
        doc_vec = model.docvecs["DATA_POS_%s"%index]
        data_pos_vec.append(doc_vec)
    for index in range(len(labeled_data_neg)):
        doc_vec = model.docvecs["DATA_NEG_%s"%index]
        data_neg_vec.append(doc_vec)
    return data_pos_vec, data_neg_vec

Binary classifications using multiple methods

In [4]:
import sklearn.naive_bayes
import sklearn.linear_model
import sklearn.svm
import sklearn.neighbors.nearest_centroid
import sklearn.tree
def build_model(train_pos_vec, train_neg_vec):
    Y = ["pos"]*len(train_pos_vec) + ["neg"]*len(train_neg_vec)
    X = train_pos_vec + train_neg_vec
    # use multiple classification methods
    svm_model = sklearn.svm.SVC() # SVM
    svm_model.fit(X,Y)
    nnc_model = sklearn.neighbors.nearest_centroid.NearestCentroid() # Nearest Neighbor
    nnc_model.fit(X,Y)
    lr_model = sklearn.linear_model.LogisticRegression() # Logistic Regression
    lr_model.fit(X,Y)
    nb_model = sklearn.naive_bayes.GaussianNB() # Naive Bayes
    nb_model.fit(X,Y)
    dt_model = sklearn.tree.DecisionTreeClassifier() # Decision Tree
    dt_model.fit(X,Y)
    return svm_model, nnc_model, lr_model, nb_model, dt_model

Evaluate model using classification statistics.

In [5]:
def evaluate_model(model, test_pos_vec, test_neg_vec, print_confusion=False):
    test_pos_predict = model.predict(test_pos_vec)
    test_neg_predict = model.predict(test_neg_vec)
    test_pos_Y = ["pos"]*len(test_pos_vec)
    test_neg_Y = ["neg"]*len(test_neg_vec)
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(test_pos_predict)):
        if test_pos_predict[i] == test_pos_Y[i]:
            tp += 1
        else:
            fn += 1
    for i in range(len(test_neg_predict)):
        if test_neg_predict[i] == test_neg_Y[i]:
            tn += 1
        else:
            fp += 1
    accuracy = float(tp+tn) / float(tp+tn+fp+fn+1)
    precision = float(tp) / float(tp+fp+1)
    recall = float(tp) / float(tp+fn+1)
    Fscore = (2*recall*precision) / (recall + precision)
    if print_confusion:
        print str(model)
        print "predicted:\tpos\tneg"
        print "actual:"
        print "pos\t\t%d\t%d" % (tp, fn)
        print "neg\t\t%d\t%d" % (fp, tn)
        print "accuracy: %f" % (accuracy)
        print "precision: %f" % (precision)
        print "recall: %f" % (recall)
        print "Fscore: %f" % (Fscore)
        print '\n'
    return accuracy, precision, recall, Fscore

Implement cross-fold validation and SMOTE algorithm in main function. Present the statistics by Fscore

In [6]:
import numpy as np
import sklearn.neighbors
import matplotlib.pyplot as plt

NUM_OF_CROSSFOLD = 5
NUM_OF_ITERATION = 5

def main():
    stats_Fscore = list()
    stats_recall = list()
    stats_precision = list()
    data_pos, data_neg = load_data('./') # read in raw dataset
    data_pos = data_filter(data_pos) # filter use regular expression
    data_neg = data_filter(data_neg)
    data_size = max(len(data_pos), len(data_neg))
    minority_size = min(len(data_pos), len(data_neg))
    for iteration in range(NUM_OF_ITERATION): # training and validatingn for each iteration
        random.seed(iteration)
        random.shuffle(data_pos)
        random.shuffle(data_neg)
        data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos, data_neg) # convert to doc vectors
        data_pos_vec_new = [] # use SMOTE method to synthesize minority data  
        nbrs = sklearn.neighbors.NearestNeighbors(n_neighbors=3, algorithm="brute").fit(data_pos_vec)
        distances, indices = nbrs.kneighbors(data_pos_vec)
        while (minority_size+len(data_pos_vec_new) < data_size):
            indexA = random.randint(0, minority_size-1)
            data_vec_A = data_pos_vec[indexA]
            indexB = indices[indexA][random.randint(0, len(indices[indexA])-1)]
            data_vec_B = data_pos_vec[indexB]
            alpha = [random.random() for i in range(len(data_pos_vec[0]))] # blending ratio
            new_vec = [alpha[i]*data_vec_A[i] + (1-alpha[i])*data_vec_B[i] for i in range(len(data_vec_A))]
            data_pos_vec_new.append(new_vec)
        data_pos_vec.extend(data_pos_vec_new) # add the synthesis dataset
        unit_size = int(data_size*1.0 / NUM_OF_CROSSFOLD)
        crossfold_vec_pos = []
        crossfold_vec_neg = []
        pos_index_pool = [i for i in range(data_size)] # randomly divide the dataset using index
        random.shuffle(pos_index_pool)
        neg_index_pool = [i for i in range(data_size)]
        random.shuffle(neg_index_pool)
        start = 0
        for crossfold in range(NUM_OF_CROSSFOLD): # divide total dataset into smaller datasets
            pos_unit_dataset = [data_pos_vec[index] for index in pos_index_pool[start:(start+unit_size)]]
            crossfold_vec_pos.append(pos_unit_dataset)
            neg_unit_dataset = [data_neg_vec[index] for index in neg_index_pool[start:(start+unit_size)]]
            crossfold_vec_neg.append(neg_unit_dataset)
            start += unit_size
        Fscores_list = list()
        recall_list = list()
        precision_list = list()
        for crossfold in range(NUM_OF_CROSSFOLD): # cross-fold validation
            train_pos_vec = []
            train_neg_vec = []
            test_pos_vec = []
            test_neg_vec = []
            for i in range(NUM_OF_CROSSFOLD): # divide into training and validation set
                if i == crossfold:
                    test_pos_vec.extend(crossfold_vec_pos[i])
                    test_neg_vec.extend(crossfold_vec_neg[i])
                else:
                    train_pos_vec.extend(crossfold_vec_pos[i])
                    train_neg_vec.extend(crossfold_vec_neg[i])
            svm_model, nnc_model, lr_model, nb_model, dt_model = build_model(train_pos_vec, train_neg_vec) # training
            accuracy, precision, recall, Fscore = evaluate_model(lr_model, test_pos_vec, test_neg_vec) # validation
            Fscores_list.append(Fscore)
            #print Fscore
            recall_list.append(recall)
            precision_list.append(precision)
        stats_Fscore.append(np.mean(Fscores_list))
        stats_recall.append(np.mean(recall_list))
        stats_precision.append(np.mean(precision_list))
    print "Average Fscores over all iterations:\t", np.mean(stats_Fscore)

    plt.plot(stats_Fscore, marker='o')
    plt.ylim([0, 1.0])
    plt.ylabel("Fscore")
    plt.xlabel("Iteration")
    plt.savefig("Fscores.png")



In [7]:
if __name__ == "__main__":
    main()

Average Fscores over all iterations:	0.933125891256
