In [129]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import os
import random
import tarfile
import re
from six.moves import urllib
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

# Export the proxy address through the lowercase `https_proxy` environment
# variable (presumably so the urllib download below goes through it -- confirm).
# NOTE(review): the host is hard-coded and only the lowercase variable is set.
proxy = 'gw-proxy-la03p.corp.tcw.com:80'
os.environ['https_proxy'] = proxy

# Local file name under which download_file() stores the fetched archive.
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    """Fetch ``url_path`` into ``DOWNLOADED_FILENAME`` unless that file exists.

    Args:
        url_path: URL of the archive to retrieve.

    Returns:
        None. Both status lines are printed whether or not a download
        actually happened on this call.
    """
    if not os.path.exists(DOWNLOADED_FILENAME):
        # Only the file written to disk matters here; the (path, headers)
        # tuple returned by urlretrieve is intentionally discarded.
        urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)

    print('Found and verified file from this path: ', url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

# Matches each run of characters that is not an ASCII letter, digit, or space.
# NOTE(review): not referenced by any of the cells visible here.
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    """Read every ``.txt`` file in ``dirname`` as one lower-cased document.

    Args:
        dirname: Directory that holds the review files. A trailing path
            separator is accepted but no longer required.
        positive: When true each document is labelled 1, otherwise 0.

    Returns:
        A ``(reviews, labels)`` pair of equal-length lists, ordered by
        file name.
    """
    label = 1 if positive else 0

    reviews = []
    # Sorted so the result does not depend on the file system's listing
    # order; the seeded shuffle downstream is only reproducible if the
    # input order is.
    for filename in sorted(os.listdir(dirname)):
        if not filename.endswith(".txt"):
            continue
        # os.path.join works with or without a trailing separator on dirname.
        # Plain 'r': the files are only read, and 'r+' fails on read-only files.
        # 'utf-8-sig' strips a leading byte-order mark when one is present.
        with open(os.path.join(dirname, filename), 'r', encoding='utf-8-sig') as f:
            reviews.append(f.read().lower())

    labels = [label] * len(reviews)
    return reviews, labels

def extract_labels_data(positive_dir="G:/Mohit/PortClass/CP/",
                        negative_dir="G:/Mohit/PortClass/CR/"):
    """Load both review folders and merge them into one labelled corpus.

    Args:
        positive_dir: Folder whose ``.txt`` files are labelled 1.
        negative_dir: Folder whose ``.txt`` files are labelled 0.
            Both default to the locations this notebook was written against,
            so calling with no arguments behaves as before.

    Returns:
        ``(labels, data)``: the list of 1/0 labels and the list of document
        texts, positives first, in matching order.
    """
    positive_reviews, positive_labels = get_reviews(positive_dir, positive=True)
    negative_reviews, negative_labels = get_reviews(negative_dir, positive=False)

    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels

    return labels, data

# URL of the aclImdb archive. NOTE(review): not used by the visible cells;
# the documents are read from local folders by extract_labels_data().
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
labels, data = extract_labels_data()
# Largest number of space-separated tokens in any document.
# NOTE(review): computed but not used below; MAX_SEQUENCE_LENGTH is hard-coded.
max_document_length = max([len(x.split(" ")) for x in data])

# NOTE(review): presumably chosen to match max_document_length for this corpus -- confirm.
MAX_SEQUENCE_LENGTH = 717

# Vocabulary processor constructed with MAX_SEQUENCE_LENGTH as its only argument.
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_SEQUENCE_LENGTH)

# Fit the vocabulary on the corpus and materialise the transformed documents as
# one array (the displayed test_data rows are integer ids with trailing zeros).
x_data = np.array(list(vocab_processor.fit_transform(data)))
y_output = np.array(labels)

vocabulary_size = len(vocab_processor.vocabulary_)
# Reads a private attribute of the vocabulary object
# (presumably the token -> id mapping -- confirm).
vocab_dict = vocab_processor.vocabulary_._mapping

# Fixed seed so the permutation is the same on every run for the same input order.
np.random.seed(22)
shuffle_indices = np.random.permutation(np.arange(len(x_data)))

# Apply the same permutation to inputs and labels so they stay aligned.
x_shuffled = x_data[shuffle_indices]
y_shuffled = y_output[shuffle_indices]

# Split points into the shuffled arrays: rows [0, 60) train, rows [60, 69) test.
# NOTE(review): hard-coded; any rows past index 68 are left unused.
TRAIN_DATA = 60
TOTAL_DATA = 69

train_data = x_shuffled[:TRAIN_DATA]
train_target = y_shuffled[:TRAIN_DATA]

test_data = x_shuffled[TRAIN_DATA:TOTAL_DATA]
test_target = y_shuffled[TRAIN_DATA:TOTAL_DATA]

In [130]:
# Write every held-out example to its own pair of CSV files:
# "<i>_test_target.csv" holds the label, "<i>_test_data.csv" the id sequence.
# Every backslash is escaped explicitly. The previous literals relied on "\C"
# and "\s" being passed through unchanged, which Python reports as invalid
# escape sequences. The resulting path text is identical.
RESULTS_PREFIX = "G:\\Chen\\sentiment\\results\\"

for i in range(test_data.shape[0]):
    # The label is wrapped in a 1-element array because savetxt expects an array.
    np.savetxt(RESULTS_PREFIX + str(i) + "_test_target.csv", np.asarray([test_target[i]]), delimiter=",")
    np.savetxt(RESULTS_PREFIX + str(i) + "_test_data.csv", test_data[i, :], delimiter=",")

In [131]:
# from numpy import genfromtxt
# genfromtxt('G:\Chen\sentiment\\results\\0_test_data.csv', delimiter=',')

In [132]:
# Shape of a single label once wrapped in a 1-element array, the same wrapping
# used when the labels are written with np.savetxt.
np.asarray([test_target[0]]).shape

(1,)

In [133]:
# Display the held-out id matrix (NumPy abbreviates the repr with "...").
test_data

array([[153, 159, 160, ...,   0,   0,   0],
       [ 30,  31,  32, ...,   0,   0,   0],
       [  1,   2,   3, ...,   0,   0,   0],
       ...,
       [  1,   2,   3, ...,   0,   0,   0],
       [  1,   2,   3, ...,   0,   0,   0],
       [  1,   2,   3, ...,   0,   0,   0]], dtype=int64)

In [134]:
def extract_labels_data(positive_dir="G:/Mohit/PortClass/CP/",
                        negative_dir="G:/Mohit/PortClass/CR/"):
    """Load the two review folders and return them without merging.

    NOTE: running this cell replaces the ``extract_labels_data`` defined in
    the first cell. That version returns ``(labels, data)``; this one returns
    a 4-tuple, so code written against the earlier shape breaks until the
    first cell is run again.

    Args:
        positive_dir: Folder whose ``.txt`` files are labelled 1.
        negative_dir: Folder whose ``.txt`` files are labelled 0.
            Both default to the locations this notebook was written against.

    Returns:
        ``(positive_reviews, negative_reviews, positive_labels, negative_labels)``.
    """
    positive_reviews, positive_labels = get_reviews(positive_dir, positive=True)
    negative_reviews, negative_labels = get_reviews(negative_dir, positive=False)

    # The merged `data` / `labels` lists were built here but never returned,
    # so they have been dropped.
    return positive_reviews, negative_reviews, positive_labels, negative_labels

positive_reviews, negative_reviews, positive_labels, negative_labels = extract_labels_data()

In [135]:
# Dump the raw positive-class documents for inspection.
# NOTE(review): this prints the entire list; consider a truncated preview
# (e.g. a slice) if the texts are long or should not be stored in the notebook.
print(positive_reviews)

["do not invest in cash equivalents/short-term investments rated below a1/p1 by m/s&p/f using highest (any nrsro, manager rating allowed). do not invest in non-usd denominated securities. do not invest in mortgage derivatives. no more than 10% of total assets in private placements (including 144a's and privately issued cp). no more than 15% in abs - excluding abs home equity & manufactured housing. do not invest in debt rated below b- by m/s&p/f (any nrsro) using highest. repos - collateralized by ust and no less than 102%. no more than 5% of total assets in any one issuer - excl us govt/agcy/mf. do not invest in z tranche (also known as accrual) or support tranches of cmos. do not invest in structured notes. no more than 15% of total assets in foreign securities (including yankees). do not invest in abortion/abortifacients, alcohol, contraceptives, firearms, weapons, gambling, environmental degradation, pornography/violent forms of entertainment, tobacco, stem cell rsch. no more than 

In [126]:
# Read the first exported test row back from disk and print its shape.
# NOTE(review): numpy is already imported as `np` in the first cell, so this
# late import could be replaced by `np.genfromtxt`.
from numpy import genfromtxt
print('shape: ', genfromtxt("G:\\Chen\\sentiment\\results\\0_test_data.csv", delimiter=',').shape)

shape:  (717,)


In [50]:
# Scalar training objective: the mean of the values returned by tf.nn.nce_loss.
# weight, bias, y, embeddings and num_samples come from cells not shown here.
loss = tf.reduce_mean(
    tf.nn.nce_loss(
        # Output-layer parameters.
        weights=weight,  # [vocab_size, embed_size]
        biases=bias,
        # Model inputs and their target ids.
        inputs=embeddings,
        labels=y,
        # Sampling configuration.
        num_classes=vocabulary_size,
        num_sampled=num_samples,
    )
)

In [53]:
# Euclidean norm taken along axis 1 of `embeddings`; keepdims=True keeps the
# reduced axis so the division below broadcasts against the original tensor.
# (Assumes `embeddings` is 2-D, one vector per row -- TODO confirm.)
l2_norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
# Each vector divided by its own norm.
# NOTE(review): an all-zero vector would give a zero norm and a division by zero here.
normalized_embeddings = embeddings / l2_norm

In [62]:
# Inspect the feed dict left in the kernel by the training loop's last batch.
# NOTE(review): depends on leftover kernel state; fails on a fresh kernel
# until the training cell has run.
train_dict

{<tf.Tensor 'Placeholder:0' shape=(63,) dtype=int32>: array([[281, 256,  30, ...,   0,   0,   0],
        [  1,   2,   3, ...,   0,   0,   0],
        [281, 256,  30, ...,   0,   0,   0],
        ...,
        [153, 159, 160, ...,   0,   0,   0],
        [ 30,  31,  32, ...,   0,   0,   0],
        [  1,   2,   3, ...,   0,   0,   0]], dtype=int64),
 <tf.Tensor 'Placeholder_1:0' shape=(63, 1) dtype=int32>: array([1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0])}

In [56]:
#optimizer = tf.train.AdamOptimizer(0.01)
train_step = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

init = tf.global_variables_initializer()

# Number of mini-batches per epoch, computed once with ceiling division.
# The previous `len // batch_size + 1` yielded an extra, empty batch whenever
# the training set size was an exact multiple of batch_size.
num_train = len(train_data)
num_batches = (num_train + batch_size - 1) // batch_size

# NOTE(review): the recorded run of this cell stops with
#   "Cannot feed value of shape (63, 717) for Tensor 'Placeholder:0', which has shape '(63,)'"
# i.e. placeholder `x` was declared 1-D while the batches fed here are 2-D.
# That has to be fixed where `x` (and `y`) are created, outside this cell.
with tf.Session() as session:
    # Inside the `with` block the session is the default one, so init.run() uses it.
    init.run()

    for epoch in range(num_epochs):

        for i in range(num_batches):
            # Slice bounds of the current mini-batch; the last one may be shorter.
            min_ix = i * batch_size
            max_ix = min(num_train, min_ix + batch_size)

            x_train_batch = train_data[min_ix:max_ix]
            y_train_batch = train_target[min_ix:max_ix]

            train_dict = {x: x_train_batch, y: y_train_batch}
            session.run(train_step, feed_dict=train_dict)

            # Metrics are evaluated in a second run, i.e. after the update step.
            train_loss, train_acc, logits_val, emb_mat = session.run(
                [loss, accuracy, logits, embedding_matrix], feed_dict=train_dict)

        # Evaluate on the held-out rows once per epoch.
        test_dict = {x: test_data, y: test_target}
        test_loss, test_acc = session.run([loss, accuracy], feed_dict=test_dict)
        print('Epoch: {}, Test Loss: {:.2}, Test Acc: {:.5}'.format(epoch + 1, test_loss, test_acc))

ValueError: Cannot feed value of shape (63, 717) for Tensor 'Placeholder:0', which has shape '(63,)'