In [1]:
# import libraries for pre-processing
import numpy as np
import pandas as pd
from sklearn import  svm
import os
import random
import pickle
import re
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, utils
from sklearn.linear_model import LogisticRegression
import sklearn
from gensim.parsing.porter import PorterStemmer

In [2]:
# English stop words, kept in a set so that membership tests are O(1)
stop_words = set(stopwords.words('english'))


def remove_stop_words(text):
    """Return the tokens of `text` that are not English stop words.

    Parameters
    ----------
    text : list of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The tokens of `text`, in their original order, without any member
        of the module-level `stop_words` set.
    """
    # One pass with a set lookup, instead of rebuilding the whole token list
    # once for every stop word.
    return [token for token in text if token not in stop_words]

# Tokenize one document, drop stop words and digits, and stem what is left
def pre_process(doc, doc_id, is_train):
    """Turn a raw document string into a list of stemmed tokens.

    Parameters
    ----------
    doc : str
        Raw text of the document.
    doc_id : int
        Tag attached to the document when a TaggedDocument is returned.
    is_train : int
        0 returns the plain token list; any other value returns a
        TaggedDocument wrapping the tokens.

    Returns
    -------
    list of str or TaggedDocument
        The stemmed tokens, wrapped as TaggedDocument(tokens, [doc_id])
        unless `is_train` is 0.
    """
    # clean + tokenize with gensim, then filter the stop words out
    words = remove_stop_words(utils.simple_preprocess(doc))

    # blank out every digit, then re-split on whitespace so that tokens cut
    # apart by a digit become separate tokens and empty tokens disappear
    digit_free = ' '.join(re.sub(r'[\d]', ' ', word) for word in words).split()

    # stem each remaining token
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in digit_free]

    if is_train == 0:
        return stems
    return TaggedDocument(stems, [doc_id])
            

# Split the positive and negative reviews into a train part (first 3/4 of each
# class) and a test part (the rest), pre-processing every document on the way.
def divide_corpus(p_corpus, n_corpus):
    """Divide both corpora into pre-processed train and test sets.

    Parameters
    ----------
    p_corpus : list of str
        Raw positive reviews (labelled 1).
    n_corpus : list of str
        Raw negative reviews (labelled 0).

    Returns
    -------
    train_corpus : list of TaggedDocument
        Training documents; each one is tagged with its own position in this
        list, so tags are unique and contiguous.
    train_labels : list of [int]
        One single-element list per training document.
    test_corpus : list of list of str
        Token lists of the test documents.
    test_labels : list of [int]
        One single-element list per test document.
    """
    train_corpus, train_labels = [], []
    test_corpus, test_labels = [], []

    # Share of each class that goes to the training set
    factor = 3 / 4

    for label, corpus in ((1, p_corpus), (0, n_corpus)):
        n_train = factor * len(corpus)
        for i, doc in enumerate(corpus):
            if i < n_train:
                # Use the running position as tag. The previous
                # int(n_pos + i) scheme reused a tag whenever 3/4 of the
                # positive count was not an integer.
                train_corpus.append(pre_process(doc, len(train_corpus), 1))
                train_labels.append([label])
            else:
                # pre_process ignores doc_id when is_train is 0
                test_corpus.append(pre_process(doc, i, 0))
                test_labels.append([label])
    return train_corpus, train_labels, test_corpus, test_labels

# Shuffle corpus & labels in the same random order
def shuffle_corpus_labels(corpus, labels):
    """Shuffle `corpus` and `labels` together so that pairs stay aligned.

    Parameters
    ----------
    corpus : iterable
        Documents.
    labels : iterable
        Labels, in the same order as `corpus`.

    Returns
    -------
    (tuple, tuple)
        The shuffled documents and their labels. Two empty tuples are
        returned when there is nothing to shuffle.
    """
    pairs = list(zip(corpus, labels))
    if not pairs:
        # zip(*[]) yields nothing, so the two-name unpacking below would
        # raise ValueError on empty input.
        return (), ()
    random.shuffle(pairs)
    shuffled_corpus, shuffled_labels = zip(*pairs)
    return shuffled_corpus, shuffled_labels

def prepare_classifier_train_arrays(model, labels_arr, train_corpus=None):
    """Build the feature matrix and label vector used to fit a classifier.

    Parameters
    ----------
    model : trained Doc2Vec model
        Provides `corpus_count`, `vector_size` and `docvecs`.
    labels_arr : sequence of [int]
        One single-element label list per training document.
    train_corpus : sequence of TaggedDocument, optional
        The training documents in the SAME order as `labels_arr`. When given,
        each vector is looked up through the document's first tag, so the
        rows stay paired with their labels even if corpus and labels were
        shuffled. When omitted, row i is `model.docvecs[i]`, which is only
        correct if `labels_arr` is ordered by document tag.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        A (corpus_count, vector_size) float array of document vectors and a
        (corpus_count,) int array of labels.
    """
    n_docs = model.corpus_count
    train_arrays = np.zeros((n_docs, model.vector_size))
    train_labels_arrays = np.zeros(n_docs, dtype=int)
    for i in range(n_docs):
        # docvecs is keyed by tag, which differs from the list position once
        # the corpus has been shuffled
        tag = i if train_corpus is None else train_corpus[i].tags[0]
        train_arrays[i] = model.docvecs[tag]
        train_labels_arrays[i] = labels_arr[i][0]
    return train_arrays, train_labels_arrays

def prepare_classifier_test_arrays(model, test_corpus, labels_arr):
    """Infer a vector for every test document and collect the labels.

    The test documents and labels are shuffled together first, and the
    shuffled versions are written to 'test_corpus.pickle' and
    'test_labels.pickle' (overwriting any existing files).

    Parameters
    ----------
    model : trained Doc2Vec model
        Provides `vector_size` and `infer_vector`.
    test_corpus : sequence of list of str
        Token lists of the test documents.
    labels_arr : sequence of [int]
        One single-element label list per test document.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        A (len(test_corpus), vector_size) float array of inferred vectors and
        a (len(test_corpus),) int array of labels, both in shuffled order.
    """
    # Output buffers are sized from the corpus as it was passed in
    n_docs = len(test_corpus)
    test_arrays = np.zeros((n_docs, model.vector_size))
    test_labels_arrays = np.zeros(n_docs, dtype=int)

    # Shuffle documents and labels in the same order
    test_corpus, labels_arr = shuffle_corpus_labels(test_corpus, labels_arr)

    # Persist the shuffled corpus & labels as pickle objects
    for path, payload in (('test_corpus.pickle', test_corpus),
                          ('test_labels.pickle', labels_arr)):
        with open(path, 'wb') as pkl:
            pickle.dump(payload, pkl)

    for row, (tokens, label) in enumerate(zip(test_corpus, labels_arr)):
        test_arrays[row] = model.infer_vector(tokens)
        test_labels_arrays[row] = label[0]
    return test_arrays, test_labels_arrays

In [3]:
%%time
# First, download the data set from NLTK dataset website

# The local path to the positively reviewed movies
pos_rev_folder_path = "/home/hazem/Downloads/movie_reviews/pos/"

# The local path to the negatively reviewed movies
neg_rev_folder_path = "/home/hazem/Downloads/movie_reviews/neg/"

# TaggedDocument: a Gensim object which has the form:
# [TaggedDocument(words = ['w_1', 'w_2' ...], tags = [0]) 'doc_1', ...)

# Positive text file as a list of lists
pos_corpus = []

# Negative text file as a list of lists
neg_corpus = []

# Iterator
i = 0
# Now, read the positive text files, filter out the stop words and convert all 
# letters to lowercase 
for file_name in os.listdir(pos_rev_folder_path):
    with open(pos_rev_folder_path + file_name, 'r') as file:
        doc = file.read().replace('\n', '')
        pos_corpus.append(doc)
        i += 1
        
# Iterator 
i = 0
# Now, read the negative text files, filter out the stop words and convert all 
# # letters to lowercase 
for file_name in os.listdir(neg_rev_folder_path):
    with open(neg_rev_folder_path + file_name, 'r') as file:
        doc = file.read().replace('\n', '')
        neg_corpus.append(doc)
        i += 1

# Write both pos and neg corpus & labels into pickle objects 
with open('pos_corpus.pickle', 'wb') as pkl:
    pickle.dump(pos_corpus, pkl)

with open('neg_corpus.pickle', 'wb') as pkl:
    pickle.dump(neg_corpus, pkl)

CPU times: user 138 ms, sys: 81.9 ms, total: 220 ms
Wall time: 727 ms


In [4]:
%%time
# Read from pickle objects
with open('pos_corpus.pickle', 'rb') as pkl:
    pos_corpus = pickle.load(pkl)

with open('neg_corpus.pickle', 'rb') as pkl:
    neg_corpus = pickle.load(pkl)

train_corpus, train_labels, test_corpus, test_labels = divide_corpus(pos_corpus, 
                                                                     neg_corpus)

# Write both train and test corpus and labels into pickle objects 
with open('train_corpus.pickle', 'wb') as pkl:
    pickle.dump(train_corpus, pkl)

with open('train_labels.pickle', 'wb') as pkl:
    pickle.dump(train_labels, pkl)

with open('test_corpus.pickle', 'wb') as pkl:
    pickle.dump(test_corpus, pkl)

with open('test_labels.pickle', 'wb') as pkl:
    pickle.dump(test_labels, pkl)

CPU times: user 34.7 s, sys: 78.7 ms, total: 34.8 s
Wall time: 34.8 s


In [5]:
# Restore the train / test corpus and labels from their pickle files
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


train_corpus = _load_pickle('train_corpus.pickle')
train_labels = _load_pickle('train_labels.pickle')
test_corpus = _load_pickle('test_corpus.pickle')
test_labels = _load_pickle('test_labels.pickle')


In [6]:
%%time
# Load from pickle objects
with open('train_corpus.pickle', 'rb') as pkl:
    train_corpus = pickle.load(pkl)

with open('train_labels.pickle', 'rb') as pkl:
    train_labels = pickle.load(pkl)
    
# Model's parameter
max_epochs = 200
vec_size = 100
alpha = 0.025

# Note: defining 'dm=1' is important here. It means that we have selected 
# distributed memory’ (PV-DM) over ‘distributed bag of words’ (PV-DBOW) 'dm =0'
# Which doesn't preserve teh order of the words.
model = Doc2Vec(min_count=1, dm=1, sample=1e-4, negative=5, workers=16, 
                window=10, vector_size=vec_size, alpha=alpha, min_alpha=0.00025)

train_corpus, train_labels = shuffle_corpus_labels(train_corpus, train_labels)

# Setting up the vocabulary 
model.build_vocab(train_corpus)

for epoch in range(max_epochs):
    
    print('iteration_{0}'.format(epoch), end='\t')
        
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
        
    # decrease the learning rate
    model.alpha -= 0.0002
    
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# Write both train corpus and labels into pickle objects 
with open('train_corpus.pickle', 'wb') as pkl:
    pickle.dump(train_corpus, pkl)

with open('train_labels.pickle', 'wb') as pkl:
    pickle.dump(train_labels, pkl)

model.save("d2v.model")
print("\nModel Saved\n")


iteration_0	

iteration_1	

iteration_2	

iteration_3	

iteration_4	

iteration_5	

iteration_6	

iteration_7	

iteration_8	

iteration_9	

iteration_10	

iteration_11	

iteration_12	

iteration_13	

iteration_14	

iteration_15	

iteration_16	

iteration_17	

iteration_18	

iteration_19	

iteration_20	

iteration_21	

iteration_22	

iteration_23	

iteration_24	

iteration_25	

iteration_26	

iteration_27	

iteration_28	

iteration_29	

iteration_30	

iteration_31	

iteration_32	

iteration_33	

iteration_34	

iteration_35	

iteration_36	

iteration_37	

iteration_38	

iteration_39	

iteration_40	

iteration_41	

iteration_42	

iteration_43	

iteration_44	

iteration_45	

iteration_46	

iteration_47	

iteration_48	

iteration_49	

iteration_50	

iteration_51	

iteration_52	

iteration_53	

iteration_54	

iteration_55	

iteration_56	

iteration_57	

iteration_58	

iteration_59	

iteration_60	

iteration_61	

iteration_62	

iteration_63	

iteration_64	

iteration_65	

iteration_66	

iteration_67	

iteration_68	

iteration_69	

iteration_70	

iteration_71	

iteration_72	

iteration_73	

iteration_74	

iteration_75	

iteration_76	

iteration_77	

iteration_78	

iteration_79	

iteration_80	

iteration_81	

iteration_82	

iteration_83	

iteration_84	

iteration_85	

iteration_86	

iteration_87	

iteration_88	

iteration_89	

iteration_90	

iteration_91	

iteration_92	

iteration_93	

iteration_94	

iteration_95	

iteration_96	

iteration_97	

iteration_98	

iteration_99	

iteration_100	

iteration_101	

iteration_102	

iteration_103	

iteration_104	

iteration_105	

iteration_106	

iteration_107	

iteration_108	

iteration_109	

iteration_110	

iteration_111	

iteration_112	

iteration_113	

iteration_114	

iteration_115	

iteration_116	

iteration_117	

iteration_118	

iteration_119	

iteration_120	

iteration_121	

iteration_122	

iteration_123	

iteration_124	

iteration_125	

iteration_126	

iteration_127	

iteration_128	

iteration_129	

iteration_130	

iteration_131	

iteration_132	

iteration_133	

iteration_134	

iteration_135	

iteration_136	

iteration_137	

iteration_138	

iteration_139	

iteration_140	

iteration_141	

iteration_142	

iteration_143	

iteration_144	

iteration_145	

iteration_146	

iteration_147	

iteration_148	

iteration_149	

iteration_150	

iteration_151	

iteration_152	

iteration_153	

iteration_154	

iteration_155	

iteration_156	

iteration_157	

iteration_158	

iteration_159	

iteration_160	

iteration_161	

iteration_162	

iteration_163	

iteration_164	

iteration_165	

iteration_166	

iteration_167	

iteration_168	

iteration_169	

iteration_170	

iteration_171	

iteration_172	

iteration_173	

iteration_174	

iteration_175	

iteration_176	

iteration_177	

iteration_178	

iteration_179	

iteration_180	

iteration_181	

iteration_182	

iteration_183	

iteration_184	

iteration_185	

iteration_186	

iteration_187	

iteration_188	

iteration_189	

iteration_190	

iteration_191	

iteration_192	

iteration_193	

In [411]:
%%time
# Loading the saved doc2vec model
model = Doc2Vec.load('d2v.model')

# Load from pickle objects
with open('train_corpus.pickle', 'rb') as pkl:
    train_corpus = pickle.load(pkl)

with open('train_labels.pickle', 'rb') as pkl:
    train_labels = pickle.load(pkl)

with open('test_corpus.pickle', 'rb') as pkl:
    test_corpus = pickle.load(pkl)

with open('test_labels.pickle', 'rb') as pkl:
    test_labels = pickle.load(pkl)
    
    '''
    print(train_corpus[:2])
    print(train_labels[427], '\t', train_labels[977])
    '''
    
train_arrays, train_labels_arrays = \
    prepare_classifier_train_arrays(model, train_labels)

test_arrays, test_labels_arrays = \
    prepare_classifier_test_arrays(model, test_corpus, test_labels)

# Write both train and test arrays with labels into pi1ckle objects 
with open('train_arrays.pickle', 'wb') as pkl:
    pickle.dump(train_arrays, pkl)
with open('train_labels_arrays.pickle', 'wb') as pkl:
    pickle.dump(train_labels_arrays, pkl)

with open('test_arrays.pickle', 'wb') as pkl:
    pickle.dump(test_arrays, pkl)
with open('test_labels_arrays.pickle', 'wb') as pkl:
    pickle.dump(test_labels_arrays, pkl)

CPU times: user 4.1 s, sys: 28 ms, total: 4.13 s
Wall time: 4.13 s


In [422]:
# Restore the classifier inputs (features and labels for train and test)
pickle_paths = ('train_arrays.pickle', 'train_labels_arrays.pickle',
                'test_arrays.pickle', 'test_labels_arrays.pickle')
loaded = []
for path in pickle_paths:
    with open(path, 'rb') as pkl:
        loaded.append(pickle.load(pkl))
train_x, train_y, test_x, test_y = loaded

# Fit a logistic regression classifier on the training vectors
classifier = LogisticRegression(solver='lbfgs')
classifier.fit(train_x, train_y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [426]:
# Score the fitted logistic-regression classifier on the held-out test arrays
# (scikit-learn's `score` reports mean accuracy for classifiers)
classifier.score(test_x, test_y)


0.464

In [424]:
# Second classifier for comparison: a support vector machine fitted on the same
# training arrays. No kernel is passed, so the library default is used.
# NOTE(review): gamma=1 and C=0.01 are hand-picked; there is no tuning step in
# this notebook - confirm they suit the doc-vector scale.
clf = svm.SVC(gamma=1,C=0.01)
clf.fit(train_x, train_y)



SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [425]:
# Score the fitted SVM on the same held-out test arrays
# (scikit-learn's `score` reports mean accuracy for classifiers)
clf.score(test_x, test_y)


0.524