In [44]:
# import libraries for pre-processing
import numpy as np
import pandas as pd
from sklearn import svm
import random
import math
import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, utils
from sklearn.linear_model import LogisticRegression 
from hazem_doc2vec.helper_functions import in_pickle, out_pickle, shuffle_corpus_labels
from data_analysis.preprocessor_end import preprocess_file
from sklearn.metrics import confusion_matrix
import data_analysis.preprocessor_end as pre
import os

In [24]:
def divide_corpus(p_corpus, n_corpus, factor):
    """Split the positive and negative corpora into train and test portions.

    For each corpus the first ``ceil(factor * len(corpus))`` documents go to
    the training set and the remaining ones to the test set. Positive
    documents are processed before negative ones, so in every returned list
    the positive entries precede the negative entries.

    Args:
        p_corpus: Sequence of positive documents.
        n_corpus: Sequence of negative documents.
        factor: Fraction of each corpus that is assigned to training.

    Returns:
        A 4-tuple ``(train_corpus, train_labels, test_corpus, test_labels)``:
        * ``train_corpus`` - ``TaggedDocument`` objects, each tagged with a
          single integer; positives are tagged ``0 .. n_pos - 1`` and
          negatives continue from ``n_pos``.
        * ``train_labels`` - one-element lists, ``[1]`` for positive and
          ``[0]`` for negative, parallel to ``train_corpus``.
        * ``test_corpus`` - the held-out documents, untagged.
        * ``test_labels`` - one-element lists parallel to ``test_corpus``.
    """
    train_corpus, train_labels = [], []
    test_corpus, test_labels = [], []

    def _distribute(corpus, label, tag_offset):
        # Route every document of `corpus` to train or test and report how
        # many documents were assigned to training.
        cutoff = int(math.ceil(factor * len(corpus)))
        for position, words in enumerate(corpus):
            if position >= cutoff:
                test_corpus.append(words)
                test_labels.append([label])
                continue
            train_corpus.append(TaggedDocument(words, [tag_offset + position]))
            train_labels.append([label])
        return cutoff

    # Positive documents first; their training count offsets the negative tags
    # so that every training document gets a distinct integer tag.
    n_pos = _distribute(p_corpus, 1, 0)
    _distribute(n_corpus, 0, n_pos)

    return train_corpus, train_labels, test_corpus, test_labels


def prepare_classifier_train_arrays(model, labels_arr):
    """Build the classifier's training matrix and label vector from a model.

    Args:
        model: Trained Doc2Vec-like model exposing ``docvecs.vectors_docs``
            (the learned document vectors) and ``docvecs.count``.
        labels_arr: Sequence of one-element sequences; ``labels_arr[i][0]`` is
            used as the label of document vector ``i``. The caller must
            therefore pass labels in the same order as the rows of
            ``model.docvecs.vectors_docs``.

    Returns:
        ``(train_arrays, train_labels_arrays)``: a copy of the document
        vectors as a NumPy array, and an integer array of ``docvecs.count``
        labels.

    Raises:
        IndexError: If ``labels_arr`` has fewer than ``docvecs.count`` entries.
    """
    doc_count = model.docvecs.count
    train_arrays = np.array(model.docvecs.vectors_docs)

    # `np.int` was only an alias of the builtin `int`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so use the builtin directly.
    train_labels_arrays = np.zeros(doc_count, dtype=int)

    for i in range(doc_count):
        train_labels_arrays[i] = labels_arr[i][0]
    return train_arrays, train_labels_arrays


def prepare_classifier_test_arrays(model, test_corpus, labels_arr):
    """Infer vectors for the held-out documents and pair them with labels.

    Args:
        model: Trained Doc2Vec-like model exposing ``vector_size`` and
            ``infer_vector(words)``.
        test_corpus: Sequence of documents passed one by one to
            ``model.infer_vector``.
        labels_arr: Sequence of one-element sequences parallel to
            ``test_corpus``; ``labels_arr[i][0]`` is the label of document
            ``i``.

    Returns:
        ``(test_arrays, test_labels_arrays)``: a float array of shape
        ``(len(test_corpus), model.vector_size)`` holding the inferred
        vectors, and an integer array with the matching labels. Rows follow
        the order returned by ``shuffle_corpus_labels``.
    """
    test_arrays = np.zeros([len(test_corpus), model.vector_size])
    # `np.int` was only an alias of the builtin `int`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so use the builtin directly.
    test_labels_arrays = np.zeros(len(test_corpus), dtype=int)

    # Shuffle test data; documents and labels go through the helper together
    # (presumably so they stay paired - verify against shuffle_corpus_labels).
    test_corpus, labels_arr = shuffle_corpus_labels(test_corpus, labels_arr)

    for i in range(len(test_corpus)):
        test_arrays[i] = model.infer_vector(test_corpus[i])
        test_labels_arrays[i] = labels_arr[i][0]
    return test_arrays, test_labels_arrays

In [46]:
%%time
pos_corpus = in_pickle('data/pos_corpus')
neg_corpus = in_pickle('data/neg_corpus')


CPU times: user 4.06 s, sys: 865 ms, total: 4.93 s
Wall time: 5.55 s


In [4]:
%%time
train_corpus = in_pickle('data/train_corpus')
train_labels = in_pickle('data/train_labels')
test_corpus = in_pickle('data/test_corpus')
test_labels = in_pickle('data/test_labels')

CPU times: user 4.45 s, sys: 1.02 s, total: 5.47 s
Wall time: 5.73 s


In [48]:
# Sizes of the four split lists, followed by the combined size of the two
# raw corpora.
(
    *map(len, (train_corpus, train_labels, test_corpus, test_labels)),
    len(pos_corpus) + len(neg_corpus),
)

(5036, 5036, 1258, 1258, 6294)

In [52]:
%%time
# Model's parameter
max_epochs = 10
vec_size = 300
alpha = 0.025

# Note: defining 'dm=1' is important here. It means that we have selected 
# distributed memory’ (PV-DM) over ‘distributed bag of words’ (PV-DBOW) 'dm =0'
# Which doesn't preserve teh order of the words.
model = Doc2Vec(min_count=1, dm=1, workers=16, window=10, vector_size=vec_size, 
                alpha=alpha, min_alpha=0.00025)

# Setting up the vocabulary 
model.build_vocab(train_corpus)

for epoch in range(max_epochs):
    train_corpus, train_labels = shuffle_corpus_labels(train_corpus, train_labels)

    print('iteration_{0}'.format(epoch), end='\t')

    model.train(train_corpus, total_examples=len(train_corpus), epochs=model.epochs)
        
    # decrease the learning rate
    model.alpha -= 0.0002
    
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("\nModel Saved\n")


iteration_0	

iteration_1	

iteration_2	

iteration_3	

iteration_4	

iteration_5	

iteration_6	

iteration_7	

iteration_8	

iteration_9	


Model Saved

CPU times: user 1h 46min 18s, sys: 35 s, total: 1h 46min 53s
Wall time: 31min 8s


In [53]:
%%time
# Loading the saved doc2vec model
model = Doc2Vec.load('d2v.model')

train_x, train_y = prepare_classifier_train_arrays(model, train_labels)
test_x, test_y = prepare_classifier_test_arrays(model, test_corpus, test_labels)

CPU times: user 1min 13s, sys: 182 ms, total: 1min 14s
Wall time: 1min 14s


In [54]:
%%time
out_pickle('data/train_x', train_x)
out_pickle('data/train_y', train_y)
out_pickle('data/test_x', test_x)
out_pickle('data/test_y', test_y)

CPU times: user 11.9 ms, sys: 3.98 ms, total: 15.9 ms
Wall time: 30.4 ms


In [55]:
%%time
train_x = in_pickle('data/train_x')
train_y = in_pickle('data/train_y')
test_x = in_pickle('data/test_x')
test_y = in_pickle('data/test_y')

CPU times: user 7.03 ms, sys: 0 ns, total: 7.03 ms
Wall time: 6.2 ms


In [56]:
%%time
classifier = LogisticRegression(solver='lbfgs')
classifier.fit(train_x, train_y)


CPU times: user 598 ms, sys: 549 ms, total: 1.15 s
Wall time: 507 ms


In [57]:
# Score of the logistic-regression model on the held-out vectors.
classifier.score(X=test_x, y=test_y)

0.7519872813990461

In [65]:
%%time
clf = svm.SVC(C=505, gamma=0.00055)
clf.fit(train_x, train_y)   
out_pickle('data/svm', clf)

CPU times: user 13.3 s, sys: 104 ms, total: 13.4 s
Wall time: 13.4 s


In [66]:
# Score of the SVM on the held-out vectors.
clf.score(X=test_x, y=test_y)


0.7170111287758346