In [2]:
import numpy as np
import collections
import re
import os
import tarfile
import random
from six.moves import urllib

import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

In [4]:
# Record the library versions this notebook was run with, one per line
# (numpy, matplotlib, tensorflow).
print(np.__version__, mp.__version__, tf.__version__, sep='\n')

1.13.3
2.1.1
1.4.0


In [39]:
# Local name under which the downloaded archive is stored (relative to the
# current working directory).
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    """Download ``url_path`` to ``DOWNLOADED_FILENAME`` unless it already exists.

    The payload is first written to a temporary ``.part`` file and only moved
    to its final name once the transfer has completed. That way an interrupted
    download can never leave a truncated archive that later runs would mistake
    for a complete one.

    Note that no checksum or size check is performed: an existing file is
    trusted as-is.

    Args:
        url_path: URL of the archive to fetch.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the transfer
        fails; in that case no file is left behind.
    """
    if not os.path.exists(DOWNLOADED_FILENAME):
        partial_path = DOWNLOADED_FILENAME + '.part'
        try:
            urllib.request.urlretrieve(url_path, partial_path)
            # Atomic rename: the final name only ever refers to a full file.
            os.replace(partial_path, DOWNLOADED_FILENAME)
        finally:
            # Clean up the partial file if the transfer or rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print('Found and verified file from this path', url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [107]:
# Matches every run of characters that is NOT an ASCII letter, digit or space;
# used to strip punctuation and line breaks from the review text.
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    """Read and clean every ``.txt`` review found directly inside ``dirname``.

    Each line of each file becomes one review: it is lower-cased, the HTML
    ``<br />`` tag is replaced by a space and every character matched by
    ``TOKEN_REGEX`` is removed.

    Args:
        dirname: Directory containing the review files. A trailing path
            separator is accepted but not required.
        positive: When true every review is labelled 1, otherwise 0.

    Returns:
        A ``(reviews, labels)`` tuple of two equally long lists: the cleaned
        review strings and their integer labels.
    """
    label = 1 if positive else 0

    reviews = []
    labels = []
    # os.listdir() order is filesystem dependent; sort so that the result
    # (and any seeded shuffle applied to it afterwards) is reproducible.
    for filename in sorted(os.listdir(dirname)):
        if not filename.endswith('.txt'):
            continue

        # Read-only mode is sufficient, and the context manager guarantees
        # the handle is closed instead of leaking one per review file.
        with open(os.path.join(dirname, filename), 'r', encoding='utf-8') as review_file:
            for review in review_file:
                review = review.lower().replace('<br />', ' ')
                review = TOKEN_REGEX.sub('', review)

                reviews.append(review)
                labels.append(label)

    return reviews, labels

In [108]:
def extract_labels_data():
    """Unpack the downloaded archive if needed and load the training reviews.

    The archive ``DOWNLOADED_FILENAME`` is extracted into the current working
    directory only when the ``aclImdb`` directory does not exist yet. The
    positive and negative training reviews are then read with
    ``get_reviews`` and concatenated, positives first.

    Returns:
        A ``(labels, data)`` tuple: ``labels`` is the list of 1/0 labels and
        ``data`` the list of cleaned review strings, in matching order.
    """
    # If the file has not already been extracted
    if not os.path.exists('aclImdb'):
        # NOTE(security): extractall() writes whatever member paths the
        # archive contains, so it must only be used on a trusted download.
        # On Python versions that support it, passing filter='data' guards
        # against path traversal.
        # The context manager closes the archive; no explicit close() needed.
        with tarfile.open(DOWNLOADED_FILENAME) as tar:
            tar.extractall()

    positive_reviews, positive_labels = get_reviews('aclImdb/train/pos/', positive=True)
    negative_reviews, negative_labels = get_reviews('aclImdb/train/neg/', positive=False)

    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels

    return labels, data
            

In [109]:
# Location of the IMDB review archive. Note that it is fetched over plain
# HTTP, and download_file() skips the transfer when the archive is already
# present locally.
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_file(URL_PATH)

Found and verified file from this path http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Downloaded file:  ImdbReviews.tar.gz


In [110]:
# Extract the archive (only if 'aclImdb' does not exist yet) and load the
# training reviews together with their 1 (positive) / 0 (negative) labels.
labels, data = extract_labels_data()

In [111]:
# Peek at the first labels; positive reviews (label 1) come before negatives.
labels[:5]

[1, 1, 1, 1, 1]

In [112]:
# Peek at the first cleaned reviews (lower-cased, punctuation removed).
data[:5]

['bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt',
 'homelessness or houselessness as george carlin stated has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school work or vote for the matter most

In [113]:
# Sanity check: there must be exactly one label per review.
len(labels), len(data)

(25000, 25000)

In [115]:
# Length of the longest review, counted in pieces obtained by splitting on a
# single space (consecutive spaces therefore contribute empty pieces).
max_document_length = max(len(review.split(' ')) for review in data)
print(max_document_length)

2470


In [117]:
# The number of words to consider in each review: shorter reviews are padded,
# longer reviews are truncated.
MAX_SEQUENCE_LENGTH = 250

# This value is specific to the IMDB data set. To choose it, plot the
# distribution of review lengths and pick a length that covers the majority
# of the reviews.

In [118]:
# Text-to-ids preprocessor configured with MAX_SEQUENCE_LENGTH as the
# per-document length.
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_SEQUENCE_LENGTH)

In [120]:
# Fit the vocabulary on all reviews and encode each review as a row of
# integer word ids; the rows are collected into a single array.
x_data = np.array(list(vocab_processor.fit_transform(data)))

# Labels as an array so they can be indexed in the same way as x_data.
y_output = np.array(labels)

In [121]:
# Number of entries in the vocabulary learned from the reviews.
vocabulary_size = len(vocab_processor.vocabulary_)
print(vocabulary_size)

111526


In [122]:
# Text of the reviews at positions 3 and 4, for comparison with their
# integer encoding in x_data.
data[3:5]

['this is easily the most underrated film inn the brooks cannon sure its flawed it does not give a realistic view of homelessness unlike say how citizen kane gave a realistic view of lounge singers or titanic gave a realistic view of italians you idiots many of the jokes fall flat but still this film is very lovable in a way many comedies are not and to pull that off in a story about some of the most traditionally reviled members of society is truly impressive its not the fisher king but its not crap either my only complaint is that brooks should have cast someone else in the lead i love mel as a director and writer not so much as a lead',
 'this is not the typical mel brooks film it was much less slapstick than most of his movies and actually had a plot that was followable leslie ann warren made the movie she is such a fantastic underrated actress there were some moments that could have been fleshed out a bit more and some scenes that could probably have been cut to make the room to d

In [123]:
# Integer encoding of the reviews at positions 3 and 4. Trailing 0s fill the
# rest of each row after the review's last word.
# NOTE(review): the ids look like they are assigned in order of first
# occurrence in the corpus rather than by frequency (e.g. 'is' -> 3 but
# 'this' -> 290), so frequent words get low ids only because they tend to
# appear early. Confirm against the VocabularyProcessor documentation.
x_data[3:5]

array([[290,   3, 364,  10, 121, 365, 291, 366,  10, 168, 367, 368, 162,
        369,   7, 370, 243, 286,   4, 371, 372,  53,  92, 373, 374, 375,
        376, 377, 378,   4, 371, 372,  53, 379, 380,  93, 381, 378,   4,
        371, 372,  53, 382, 146, 383,  83,  53,  10, 384, 385, 386, 103,
        387, 290, 291,   3, 388, 389,  25,   4, 390,  83, 391, 238, 243,
         61,  30, 392,  32, 206,  25,   4, 393,  17,  14,  53,  10, 121,
        394, 395, 396,  53, 397,   3, 398, 399, 162, 243,  10, 400, 401,
        103, 162, 243, 402, 403,  22, 404, 405,   3,  32, 168, 285, 301,
        406, 407, 408,  25,  10,  28,  59, 252, 167,  13,   4, 409,  61,
        410, 243, 411,  35,  13,   4,  28,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [124]:
# Still in the original, unshuffled order: positive labels come first.
y_output[:5]

array([1, 1, 1, 1, 1])

In [128]:
# Fix the global NumPy seed so the permutation is the same on every run.
np.random.seed(22)
# Random ordering of all row positions of x_data.
shuffle_indicies = np.random.permutation(np.arange(len(x_data)))

# Apply the same permutation to inputs and labels so the pairs stay aligned;
# without shuffling, all positive reviews would precede all negative ones.
x_shuffled = x_data[shuffle_indicies]
y_shuffled = y_output[shuffle_indicies]

In [132]:
# Only the first TOTAL_DATA shuffled rows are used here: rows
# [0, TRAIN_DATA) form the training set and rows [TRAIN_DATA, TOTAL_DATA)
# the test set. Rows beyond TOTAL_DATA are not used in this cell.
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_data = x_shuffled[:TRAIN_DATA]
train_target = y_shuffled[:TRAIN_DATA]

test_data = x_shuffled[TRAIN_DATA:TOTAL_DATA]
test_target = y_shuffled[TRAIN_DATA:TOTAL_DATA]