In [30]:
import numpy as np
import collections
import re
import os
import tarfile
import random
from six.moves import urllib

import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

In [31]:
print(np.__version__)
print(mp.__version__)
print(tf.__version__)

1.13.3
2.1.1
1.4.1


In [32]:
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)
    print('Found and verified file from this path', url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [33]:
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    label = 1 if positive else 0
    
    reviews = []
    labels = []
    for filename in os.listdir(dirname):
        if filename.endswith('.txt'):
            
            for review in open(dirname + filename, 'r+', encoding='utf-8'):
                review = review.lower().replace('<br />', ' ')
                review = re.sub(TOKEN_REGEX, '', review)
                
                reviews.append(review)
                labels.append(label)
                
    return reviews, labels

In [34]:
def extract_labels_data():
    # If the file has not already been extracted
    if not os.path.exists('aclImdb'):
        with tarfile.open(DOWNLOADED_FILENAME) as tar:
            tar.extractall()
            tar.close()
            
    positive_reviews, positive_labels = get_reviews('aclImdb/train/pos/', positive=True)
    negative_reviews, negative_labels = get_reviews('aclImdb/train/neg/', positive=False)
    
    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels
    
    return labels, data
            

In [35]:
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_file(URL_PATH)

Found and verified file from this path http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Downloaded file:  ImdbReviews.tar.gz


In [36]:
labels, data = extract_labels_data()

In [37]:
labels[:5]

[1, 1, 1, 1, 1]

In [38]:
data[:5]

['for a movie that gets no respect there sure are a lot of memorable quotes listed for this gem imagine a movie where joe piscopo is actually funny maureen stapleton is a scene stealer the moroni character is an absolute scream watch for alan the skipper hale jr as a police sgt',
 'bizarre horror movie filled with famous faces but stolen by cristina raines later of tvs flamingo road as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the gateway to hell the scenes with raines modeling are very well captured the mood music is perfect deborah raffin is charming as cristinas pal but when raines moves into a creepy brooklyn heights brownstone inhabited by a blind priest on the top floor things really start cooking the neighbors including a fantastically wicked burgess meredith and kinky couple sylvia miles  beverly dangelo are a diabolical lot and eli wallach is great fun as a wily police detective the movie is nearly a cro

In [39]:
len(labels), len(data)

(25000, 25000)

In [40]:
max_document_length = max([len(x.split(' ')) for x in data])
print(max_document_length)

2470


In [103]:
# The number of words to consider in each review - pad shorter reviews, truncate longer reviews.
MAX_SEQUENCE_LENGTH = 250

# SPECIFIC TO IMDB, average lenght of a review, plot it on graph and find the lenght, which include a majority of files.

In [104]:
words = np.load('wordsList.npy', encoding = 'latin1')

In [105]:
words[:5], len(words)

(array([b'0', b',', b'.', b'of', b'to'],
       dtype='|S68'), 400000)

In [106]:
def get_word_index_dictionary(words):
    dictionary = {}
    
    index = 0
    for word in words:
        dictionary[word.decode('utf-8')] = index
        index += 1
    return dictionary

In [107]:
dictionary = get_word_index_dictionary(words)

In [108]:
dictionary["and"]

5

In [111]:
review_ids = []

def convert_reviews_to_ids(data, words):
    words_list = words.tolist()
    
    progress = 0
    for review in data:
        
        review_id = []
        index = 0

        for word in review:
            if index >= MAX_SEQUENCE_LENGTH:
                break;

            try: 
                review_id.append(dictionary[word])
            except KeyError:
                review_id.append(0)

            index += 1

        if len(review_id) < MAX_SEQUENCE_LENGTH:
            review_id = np.pad(review_id, (0, MAX_SEQUENCE_LENGTH - index), 'constant')

        review_ids.append(np.array(review_id))
        progress += 1

        if progress % 1000 == 0:
            print("Completed: ", progress)


In [112]:
convert_reviews_to_ids(data, words)

Completed:  1000
Completed:  2000
Completed:  3000
Completed:  4000
Completed:  5000
Completed:  6000
Completed:  7000
Completed:  8000
Completed:  9000
Completed:  10000
Completed:  11000
Completed:  12000
Completed:  13000
Completed:  14000
Completed:  15000
Completed:  16000
Completed:  17000
Completed:  18000
Completed:  19000
Completed:  20000
Completed:  21000
Completed:  22000
Completed:  23000
Completed:  24000
Completed:  25000


In [113]:
review_ids[19825]

array([2159, 5918,   41, 1534,    0, 2159, 2404,    0, 3880,   41, 5025,
       1993,    0, 2159, 1110, 5025, 5025, 1534,    0, 2159, 5918, 1110,
          0, 1534, 2159, 4868, 1911, 3524,    0, 4868, 3880,    0, 1110,
       1585, 2159, 1911, 4868, 2404, 1110, 1911, 2159,    0, 3880, 1911,
          7, 3814, 3814,   41, 1110,    0, 1534, 6479, 1968, 1968, 1110,
       3814, 5025, 3524,    0, 1911, 1110, 2159, 6479, 1911, 3814,   41,
       3814, 3410,    0, 2159, 4868,    0, 1534,   41, 5025, 4652,    0,
       5918, 4868, 3420, 1110,    0, 2159, 4868,    0, 2404,   41, 1534,
         41, 2159,    0, 3880, 1911,   41, 1110, 3814, 1968, 1534,    0,
          7, 3814, 1968,    0, 3880,    7, 1993,   41, 5025, 3524,    0,
       1556, 6479, 2159,    0, 6479, 3814,    7, 5140,    7, 1911, 1110,
          0, 4868, 3880,    0, 5918, 1110, 1911,    0, 1993, 4868, 2159,
       5918, 1110, 1911, 1534,    0, 1968, 1110,    7, 2159, 5918,    0,
       5918, 1110, 1911,    0, 1534,   41, 1534, 21

In [114]:
review_ids = np.load('idsMatrix.npy')

In [115]:
review_ids.shape

(25000, 250)

In [117]:
x_data = review_ids
y_output = np.array(labels)

In [119]:
vocabulary_size = len(words)
print(vocabulary_size)

400000


In [126]:
np.random.seed(22)
shuffle_indices = np.random.permutation(np.arange(len(x_data)))
x_shuffled = x_data[shuffle_indices]
y_shuffled = y_output[shuffle_indices]

In [127]:
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_data = x_shuffled[:TRAIN_DATA]
train_target = y_shuffled[:TRAIN_DATA]

test_data = x_shuffled[TRAIN_DATA:TOTAL_DATA]
test_target = y_shuffled[TRAIN_DATA:TOTAL_DATA]

In [128]:
tf.reset_default_graph()

x = tf.placeholder(tf.int32, [None, MAX_SEQUENCE_LENGTH])
y = tf.placeholder(tf.int32, [None])

In [130]:
batch_size = 25
embedding_size = 50
max_label = 2

In [133]:
saved_embedings = np.load('wordVectors.npy')

embeddings = tf.nn.embedding_lookup(saved_embedings, x)