In [21]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import os
import re
import json
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [2]:
# Variables 
train_positive_reviews = []
train_negative_reviews = []

test_positive_reviews = []
test_negative_reviews = []

In [3]:
for filename_ in os.listdir('../data/aclImdb/train/pos'):
    with open(os.path.join('../data/aclImdb/train/pos',filename_),'r') as f:
        sentiment = f.read()
        train_positive_reviews.append(sentiment)
    
for filename_ in os.listdir('../data/aclImdb/train/neg'):
    with open(os.path.join('../data/aclImdb/train/neg',filename_),'r') as f:
        sentiment = f.read()
        train_negative_reviews.append(sentiment)
    
for filename_ in os.listdir('../data/aclImdb/test/pos'):
    with open(os.path.join('../data/aclImdb/test/pos',filename_),'r') as f:
        sentiment = f.read()
        test_positive_reviews.append(sentiment)
    
for filename_ in os.listdir('../data/aclImdb/test/neg'):
    with open(os.path.join('../data/aclImdb/test/neg',filename_),'r') as f:
        sentiment = f.read()
        test_negative_reviews.append(sentiment)

In [4]:
train_reviews = train_positive_reviews + train_negative_reviews
test_reviews = test_positive_reviews + test_negative_reviews

train_labels = [0]*len(train_positive_reviews) + [1]*len(train_negative_reviews)
test_labels = [0]*len(test_positive_reviews) + [1]*len(test_negative_reviews)

print 'Number of training reviews: ', len(train_reviews)
print 'Number of testing reviews: ', len(test_reviews)
print 'Number of train labels: ', len(train_labels)
print 'Number of test labels: ', len(test_labels)

Number of training reviews:  25000
Number of testing reviews:  25000
Number of train labels:  25000
Number of test labels:  25000


In [5]:
reviews_train_df = pd.DataFrame({'reviews': train_reviews, 'sentiment':train_labels})
reviews_test_df = pd.DataFrame({'reviews': test_reviews, 'sentiment':test_labels})

reviews_train_df = reviews_train_df.sample(frac=1).reset_index(drop=True)
reviews_test_df = reviews_test_df.sample(frac=1).reset_index(drop=True)

# 1 for negative and 0 for positive
print reviews_train_df.head()
print reviews_test_df.head()

                                             reviews  sentiment
0  I may very well be one of the few who really s...          0
1  so yes it is quite nostalgic watching the 1st ...          0
2  And a self-admitted one to boot. At one point ...          1
3  MAJOR LEAGUE: BACK TO THE MINORS (1998) ½* <br...          1
4  After some of the negative reviews i heard on ...          0
                                             reviews  sentiment
0  'Leatherheads' tries so hard. Tries to be ligh...          1
1  I absolutely positively can't believe my fello...          1
2  Just reading why this show got canceled makes ...          0
3  A hilarious comedy by the best director ever, ...          0
4  I feel conflicted about this film - it is one ...          1


In [6]:
#def preprocess(sentence):
#    list_of_word = text_to_word_sequence(sentence,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
#                                               lower=True,
#                                               split=" ")
#    return list_of_word

#reviews_train_df['words_reviews'] = reviews_train_df['reviews'].map(preprocess)
#reviews_test_df['words_reviews'] = reviews_test_df['reviews'].map(preprocess)

#reviews_train_df.head()
#reviews_test_df.head()

In [7]:
reviews = train_reviews + test_reviews
labels = train_labels + test_labels

MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 500
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# one hot encoding
labels = np_utils.to_categorical(np.asarray(labels))



Found 124259 unique tokens.


In [8]:
print data.shape
print labels.shape

(50000, 500)
(50000, 2)


In [9]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:25000]
y_train = labels[:25000]
x_val = data[25000:]
y_val = labels[25000:]

In [10]:
embeddings_index = {}
f = open('../data/glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [11]:
embeddings_index['verplank'].shape

(100,)

In [24]:
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word,i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [27]:
model = Sequential()
model.add(Embedding(len(word_index) + 1,EMBEDDING_DIM,weights=[embedding_matrix],input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 100)          12426000  
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 500, 32)           9632      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               2000250   
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 502       
Total params: 14,436,384
Trainable params: 2,010,384
Non-trainable params: 12,426,000
________________________________________________________

In [28]:
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128, verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
50s - loss: 0.6352 - acc: 0.6345 - val_loss: 0.4585 - val_acc: 0.7886
Epoch 2/2
49s - loss: 0.4091 - acc: 0.8173 - val_loss: 0.3976 - val_acc: 0.8195


<keras.callbacks.History at 0x13dc3a490>