In [224]:
import util
reload(util)

import gc
import os
import time
    
import numpy as np
import pandas as pd
import scipy

from datetime import timedelta
from gensim.models import word2vec

from sklearn import metrics
from sklearn.externals import joblib
from sklearn.feature_extraction.text import *

In [225]:
samp = True
samp = '_samp' if samp else ''

## I. Load

In [226]:
# Load feature names
fnames = np.array(pd.read_pickle('../interim/028_preproc_heavy_shows_concat' + samp + '.p'))

# load concatenated descriptions
desc = np.array(pd.read_pickle('../interim/028_preproc_heavy_show_description_concat' + samp + '.p'))

print("Feature Names Shape:", fnames.shape)
print("Descriptions Shape:", desc.shape)
assert fnames.shape[0] == desc.shape[0]

('Feature Names Shape:', (1192, 2))
('Descriptions Shape:', (1192, 1))


In [227]:
# Remove duplicates from full show list 
dups = []
copy = []
copy_desc=[]
for i, j in zip(fnames.tolist(), desc.tolist()):
    if i not in copy:
        copy.append(i)
        copy_desc.append(j)
    else:
        dups.append(i)
fnames = np.asarray(copy)
desc = np.asarray(desc)
print "Found %d duplicates" % len(dups)

# split data into reserve, dev, train
x_reserve, y_reserve, x_train, y_train, x_dev, y_dev = util.random_data_split( desc, fnames)
print "\nx_reserve shape:", x_reserve.shape
print "y_reserve shape:", y_reserve.shape
print "\nx_dev shape:", x_dev.shape
print "y_dev shape:", y_dev.shape
print "\nx_train shape:", x_train.shape
print "y_train shape:", y_train.shape

Found 0 duplicates

x_reserve shape: (120,)
y_reserve shape: (120,)

x_dev shape: (268,)
y_dev shape: (268,)

x_train shape: (804,)
y_train shape: (804,)


## II. Bag Of Words & KNN

In [176]:
# function operates against dev and train data using a passed in vectorizer 
def vectorize(vec):
    vectorizer_name = vec.__class__.__name__
    x_train_counts = vec.fit_transform(x_train)
    x_dev_vectors = vec.transform(x_dev)

    best_score, best_param, f1_score = util.knn_test(x_train_counts, y_train, x_dev_vectors , y_dev)
    print 'Accuracy with using %s sparse array and KNN: %3.2f%% with k_neighbors = %d, F1 score: %3.2f%%' % (vectorizer_name, best_score, best_param, f1_score )

start = time.time()
vectorize(CountVectorizer())
vectorize(TfidfVectorizer())

print "\n", util.elapsed_time(start, time.time())

Accuracy with using CountVectorizer sparse array and KNN: 8.46% with k_neighbors = 1, F1 score: 9.73%
Accuracy with using TfidfVectorizer sparse array and KNN: 12.94% with k_neighbors = 1, F1 score: 7.99%

Time elapsed: 37.698047 second(s)


## III. Word2Vec 

In [228]:
start = time.time()

# treat each description as a document and generate a single vector for each
def generate_document_vector_array( document ):
    min_count = 2
    size = 50
    window = 4
    document_vector_list = []

    for i in range(document.shape[0]):
        word_vecs = []
        sentences = [sentence + '.' for sentence in  document[i].split('.')]
        model = word2vec.Word2Vec(sentences, min_count=min_count, size=size, window=window)
        for key in model.wv.vocab.keys():
            word_vecs += model.wv[key].tolist()
        data = np.asarray(word_vecs).reshape(-1, size)
        document_vector = np.average(data, axis=0)
        document_vector_list += document_vector.tolist()

    return np.asarray(document_vector_list).reshape(-1, size)

x_reserve_vectors = generate_document_vector_array(x_reserve)
x_train_vectors = generate_document_vector_array(x_train)
x_dev_vectors = generate_document_vector_array(x_dev)

print 'x_reserve_vectors.shape', x_reserve_vectors.shape
print 'x_train_vectors.shape', x_train_vectors.shape
print 'x_dev_vectors.shape', x_dev_vectors.shape
print elapsed_time(start, time.time())

(804, 50)
(268, 50)
Time elapsed: 02:27.581614 minute(s)


In [None]:
# Evaluate accuracy with KNNClassifier:
start = time.time()
best_score, best_param, f1_score = util.knn_test(x_train_vectors, y_train, x_dev_vectors, y_dev)
print '\nAccuracy with Word2Vec using using KNN: %3.2f%% with k_neighbors = %d, F1 score: %3.2f%%' % (best_score, best_param, f1_score )
print elapsed_time(start, time.time())

In [220]:
# Evaluate accuracy with LinearRegression classiifier:
reload(util)
start = time.time()
best_score, best_param, f1_score = util.lr_test(x_train_vectors, y_train, x_dev_vectors, y_dev)
print '\nAccuracy with Word2Vec using using LR: %3.2f%% with Cs = %d, F1 score: %3.2f%%' % (best_score, best_param, f1_score )
print util.elapsed_time(start, time.time())


Accuracy with Word2Vec using using LR: 2.35% with Cs = 1, F1 score: 0.09%
Time elapsed: 03:53.483568 minute(s)


## IV. Save To File

In [229]:
x_reserve_vectors.dump('../interim/x_reserve_vectors' + samp + '.p')
y_reserve.dump('../interim/y_reserve' + samp + '.p')
x_train_vectors.dump('../interim/x_train_vectors' + samp + '.p')
y_train.dump('../interim/y_train' + samp + '.p')
x_dev_vectors.dump('../interim/x_dev_vectors' + samp + '.p')
y_dev.dump('../interim/y_dev' + samp + '.p')
print "Saved arrays to file"

Saved arrays to file
