In [1]:
import logging # standard-library logging; configured on the next line
# Configure the root logger: emit records at INFO level and above, each
# formatted as "<timestamp> : <level name> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import re
import math
import collections
import nltk
import random
# Seed Python's global RNG so that the random.shuffle() calls made later
# (in feature_extraction_Doc2Vec) yield the same ordering on every run.
random.seed(0)

def load_data(path_to_data):
    """Load the positive (signature) and negative (content) training texts.

    Two files are read; their names are appended directly to
    ``path_to_data``, so the argument must already end with a path
    separator (or be an empty string for the current directory):

    * ``EnronSignatures.txt`` -- multi-line signatures. Any line of at most
      three characters (newline included) is treated as a separator between
      signatures; the text of such a line is discarded.
    * ``train_content.txt`` -- one example per line. Lines with fewer than
      two whitespace-separated tokens are skipped.

    Args:
        path_to_data: directory prefix prepended to both file names.

    Returns:
        ``(train_pos, train_neg)``: two lists of token lists. All tokens are
        lower-cased and split on whitespace.
    """
    train_pos = []
    train_neg = []

    sig_tokens = []  # tokens of the signature currently being accumulated
    with open(path_to_data + "EnronSignatures.txt", 'r') as fes:
        for line in fes:
            if len(line) <= 3:
                # Separator line: close the current signature. Empty
                # signatures (e.g. from consecutive separators) are skipped
                # so no empty document reaches the training set.
                if sig_tokens:
                    train_pos.append(sig_tokens)
                    sig_tokens = []
            else:
                sig_tokens.extend(line.lower().split())
    # The file may not end with a separator line; without this flush the
    # last signature would be silently lost.
    if sig_tokens:
        train_pos.append(sig_tokens)

    with open(path_to_data + "train_content.txt", 'r') as ftc:
        for line in ftc:
            words = [w.lower() for w in line.strip().split()]
            if len(words) < 2:
                continue
            train_neg.append(words)

    return train_pos, train_neg

def data_filter(raw_text_data):
    """Strip markup, punctuation and digits from tokenised text entries.

    Each entry (a sequence of string tokens) is joined with single spaces,
    then every match of the noise pattern is replaced by a space. The noise
    pattern covers: newlines, a backslash followed by any text up to the
    next ``{``, closing braces, the listed punctuation characters, ASCII
    digits and ``@``. The result is re-split on whitespace.

    Args:
        raw_text_data: iterable of token sequences.

    Returns:
        A list with one cleaned token list per input entry, in input order.
        An entry that contains only noise yields an empty list.
    """
    # Compiled once per call instead of being looked up on every iteration.
    noise_pattern = re.compile(r"\n|(\\(.*?){)|}|[!$%^&*#()_+|~\-={}\[\]:\";'<>?,.\/\\]|[0-9]|[@]")
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    whitespace_pattern = re.compile(r"\s+")

    clean_text_data = []
    for entry in raw_text_data:
        clean_entry = noise_pattern.sub(' ', ' '.join(entry))
        clean_entry = whitespace_pattern.sub(' ', clean_entry)
        clean_text_data.append(clean_entry.split())
    return clean_text_data

In [3]:
from gensim.models.doc2vec import LabeledSentence, Doc2Vec
def feature_extraction_Doc2Vec(data_pos, data_neg):
    """Learn one Doc2Vec vector per document for both classes.

    Every token list in ``data_pos`` / ``data_neg`` is wrapped in a
    ``LabeledSentence`` tagged ``DATA_POS_<i>`` / ``DATA_NEG_<i>`` (``i`` is
    the position in its input list). A single ``Doc2Vec`` model is built on
    the union of both sets and trained with five ``train`` calls, reshuffling
    the combined list (via the global ``random`` module) before each call.

    Args:
        data_pos: sequence of token lists for the positive class.
        data_neg: sequence of token lists for the negative class.

    Returns:
        ``(data_pos_vec, data_neg_vec)``: two lists holding the vector looked
        up in ``model.docvecs`` for each document, in input order.
    """
    pos_tag_fmt = "DATA_POS_%s"
    neg_tag_fmt = "DATA_NEG_%s"

    labeled_data_pos = [
        LabeledSentence(tokens, [pos_tag_fmt % position])
        for position, tokens in enumerate(data_pos)
    ]
    labeled_data_neg = [
        LabeledSentence(tokens, [neg_tag_fmt % position])
        for position, tokens in enumerate(data_neg)
    ]

    model = Doc2Vec(min_count=1, window=20, size=4000, sample=1e-4, negative=5, workers=4)
    # A new list, so shuffling it leaves the two per-class lists untouched.
    sentences = labeled_data_pos + labeled_data_neg
    model.build_vocab(sentences)

    for _ in range(5):
        random.shuffle(sentences)
        model.train(sentences)

    # Vectors are fetched by tag, so the output order follows the inputs
    # regardless of how the training list was shuffled.
    data_pos_vec = [
        model.docvecs[pos_tag_fmt % position]
        for position in range(len(labeled_data_pos))
    ]
    data_neg_vec = [
        model.docvecs[neg_tag_fmt % position]
        for position in range(len(labeled_data_neg))
    ]
    return data_pos_vec, data_neg_vec

In [4]:
import sklearn.naive_bayes
import sklearn.linear_model
import sklearn.svm
import sklearn.neighbors.nearest_centroid
import sklearn.tree
def build_model(train_pos_vec, train_neg_vec):
    """Fit five scikit-learn classifiers on the same labelled vectors.

    The positive vectors are labelled ``"pos"`` and the negative vectors
    ``"neg"``; the two inputs are concatenated with ``+`` (positives first).
    All estimators are constructed with no arguments.

    Args:
        train_pos_vec: list of feature vectors for the positive class.
        train_neg_vec: list of feature vectors for the negative class.
            NOTE(review): ``+`` is used to combine the inputs, which
            concatenates lists; presumably both are plain lists -- confirm
            against callers.

    Returns:
        A 5-tuple of fitted models, in this order: ``SVC``,
        ``NearestCentroid``, ``LogisticRegression``, ``GaussianNB``,
        ``DecisionTreeClassifier``.
    """
    features = train_pos_vec + train_neg_vec
    labels = ["pos"] * len(train_pos_vec) + ["neg"] * len(train_neg_vec)

    classifiers = (
        sklearn.svm.SVC(),                                     # support vector machine
        sklearn.neighbors.nearest_centroid.NearestCentroid(),  # nearest centroid
        sklearn.linear_model.LogisticRegression(),             # logistic regression
        sklearn.naive_bayes.GaussianNB(),                      # Gaussian naive Bayes
        sklearn.tree.DecisionTreeClassifier(),                 # decision tree
    )
    for classifier in classifiers:
        classifier.fit(features, labels)
    return classifiers

In [5]:
def evaluate_model(model, test_pos_vec, test_neg_vec, print_confusion=False):
    """Score a fitted binary classifier on held-out positive/negative vectors.

    ``model.predict`` is called once per class. A prediction equal to
    ``"pos"`` on a positive vector is a true positive; a prediction equal to
    ``"neg"`` on a negative vector is a true negative; anything else is a
    false negative / false positive respectively.

    Metrics are computed exactly from the confusion counts. A ratio whose
    denominator is zero (e.g. precision when nothing was predicted positive)
    is reported as ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        model: object exposing ``predict(vectors)`` that returns one label
            per input vector.
        test_pos_vec: sized collection of vectors whose true label is "pos".
        test_neg_vec: sized collection of vectors whose true label is "neg".
        print_confusion: when true, print the model, the confusion matrix
            and the four metrics.

    Returns:
        ``(accuracy, precision, recall, Fscore)`` as floats.
    """
    def _predict(vectors):
        # Skip the call for an empty class so predict never sees an empty batch.
        return model.predict(vectors) if len(vectors) else []

    def _ratio(numerator, denominator):
        # Exact ratio; 0.0 when undefined (zero denominator).
        return float(numerator) / float(denominator) if denominator else 0.0

    test_pos_predict = _predict(test_pos_vec)
    test_neg_predict = _predict(test_neg_vec)

    tp = sum(1 for label in test_pos_predict if label == "pos")
    fn = len(test_pos_predict) - tp
    tn = sum(1 for label in test_neg_predict if label == "neg")
    fp = len(test_neg_predict) - tn

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    Fscore = _ratio(2 * recall * precision, recall + precision)

    if print_confusion:
        # Single-argument print(...) gives identical output whether print is
        # the Python 2 statement or the Python 3 function.
        print(str(model))
        print("predicted:\tpos\tneg")
        print("actual:")
        print("pos\t\t%d\t%d" % (tp, fn))
        print("neg\t\t%d\t%d" % (fp, tn))
        print("accuracy: %f" % (accuracy))
        print("precision: %f" % (precision))
        print("recall: %f" % (recall))
        print("Fscore: %f" % (Fscore))
        print('\n')
    return accuracy, precision, recall, Fscore

In [None]:
def SMOTE()