In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd # used for managing the dataframes
import numpy as np # used for managing vectors and matrices
from wordcloud import WordCloud,STOPWORDS # for generating word clouds --> to show the most occuring words
from sklearn.model_selection import train_test_split

# graphing library
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)
# NLTK libraries
import nltk
from nltk.corpus import stopwords

In [2]:
import string
from string import punctuation
from os import listdir
from collections import Counter

In [3]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [4]:
train_df_pos=pd.DataFrame(columns=['Filename','Text','Polarity'])
test_df_pos=pd.DataFrame(columns=['Filename','Text','Polarity'])
train_df_neg=pd.DataFrame(columns=['Filename','Text','Polarity'])
test_df_neg=pd.DataFrame(columns=['Filename','Text','Polarity'])

In [5]:
def doc_dataframe(folder_path):
    train_data,train_files=[],[]
    test_data,test_files=[],[]
    for filename in listdir(folder_path):
        if filename.startswith('cv9'):
            test_files.append(filename)
            test_data.append(load_doc(folder_path+filename))
        else:
            train_files.append(filename)
            train_data.append(load_doc(folder_path+filename))
    return train_data,test_data,train_files,test_files

In [6]:
folder_path = 'movie-polarity-data/pos/'
train_data,test_data,train_files,test_files=doc_dataframe(folder_path)

train_df_pos['Filename']=train_files
train_df_pos['Text']=train_data
train_df_pos['Polarity']=1

test_df_pos['Filename']=test_files
test_df_pos['Text']=test_data
test_df_pos['Polarity']=1

In [7]:
folder_path = 'movie-polarity-data/neg/'
train_data,test_data,train_files,test_files=doc_dataframe(folder_path)

train_df_neg['Filename']=train_files
train_df_neg['Text']=train_data
train_df_neg['Polarity']=0

test_df_neg['Filename']=test_files
test_df_neg['Text']=test_data
test_df_neg['Polarity']=0

In [8]:
train_df=pd.concat([train_df_pos,train_df_neg])
test_df=pd.concat([test_df_pos,test_df_neg])

In [9]:
# This function cleans the data and replace numbers by a common token: NUM. This function is inspired in kim's work
# at https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
import re
def clean_str(string):
    """
    Tokenization/string cleaning.
    """
    string = re.sub(r"[^A-Za-z0-9(),:!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r":", " : ", string)
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)
    string = string.strip().lower()
    string = re.sub(r"[0-9]{2,}", "NUM", string)
    return string

In [10]:
train_df["Cleaned_Text"]=train_df["Text"].apply(clean_str)
test_df["Cleaned_Text"]=test_df["Text"].apply(clean_str)

In [11]:
train_df.head()

Unnamed: 0,Filename,Text,Polarity,Cleaned_Text
0,cv087_1989.txt,many people dislike french films for their lac...,1,many people dislike french films for their lac...
1,cv525_16122.txt,""" take a number , fill out a form , and wait ...",1,"take a number , fill out a form , and wait you..."
2,cv116_28942.txt,capsule : a short punchy action sequel to the ...,1,capsule : a short punchy action sequel to the ...
3,cv130_17083.txt,"while watching wes anderson's rushmore , it ma...",1,"while watching wes anderson 's rushmore , it m..."
4,cv567_29611.txt,plot : a peculiar french girl grows up lonely ...,1,plot : a peculiar french girl grows up lonely ...


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# this function will help in creating the tokenized Input Tensor 
vectorizer = CountVectorizer(stop_words='english',lowercase=False)

# fit the data
cleaned_text = vectorizer.fit(train_df["Cleaned_Text"])

# Get the vocabulary
vocabulary = cleaned_text.vocabulary_

# vocabulary is a dictionary with keys as the words and the values as frequencies
words = set(vocabulary.keys())

# Size of vocabulary
size_vocab = len(vocabulary)
print ("The vocabulary has {} words".format(size_vocab))

The vocabulary has 37054 words


In [13]:
# Get the data cleaned and transform words in numerical tokens.
text = train_df['Cleaned_Text'].tolist()
text = [string.split() for string in text]
# tokens is a list of lists. Each list inside tokens represents a sentence
tokens = []
for sentence in text:
    dummy = []
    for word in sentence:
        if word in words:
            dummy.append(vocabulary[word] + 1) # Sum 1 cause I want to leave the token 0 for padding in LSTM.
    tokens.append(dummy)
    
# Maximum length of a sentence in the data
max_len = max([len(sentence) for sentence in tokens])
print ("The largest sentence has {} tokens".format(max_len))

The largest sentence has 1292 tokens


In [14]:
X=tokens
Y=train_df[['Polarity']]

In [15]:
X_train,X_val,Y_train,Y_val=train_test_split(X,Y,test_size=0.1,random_state=0,stratify=Y.Polarity)

In [16]:
from keras.preprocessing.sequence import pad_sequences


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Using TensorFlow backend.


In [17]:
# Sequence padding
trainX = pad_sequences(X_train, maxlen=max_len, padding='post') # max_len was found some cells above
validationX = pad_sequences(X_val, maxlen=max_len, padding='post')

In [18]:
trainX.shape

(1620, 1292)

In [19]:
from keras.preprocessing import sequence
from keras.models import Sequential,load_model
from keras.layers.core import Activation,Dense,Dropout,SpatialDropout1D
from keras.layers.wrappers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM,GRU
from keras import regularizers
from keras.callbacks import ModelCheckpoint,EarlyStopping

In [20]:
EMBEDDING_SIZE=100
HIDDEN_LAYER_SIZE=64
BATCH_SIZE=32
NUM_EPOCHS=10
DROPOUT=0.3

In [21]:
model=Sequential()
model.add(Embedding(size_vocab+1,EMBEDDING_SIZE,input_length=max_len))
model.add(SpatialDropout1D(DROPOUT))
model.add(Bidirectional(LSTM(HIDDEN_LAYER_SIZE,dropout=DROPOUT,recurrent_dropout=DROPOUT)))
model.add(Dense(2))
model.add(Activation("sigmoid"))
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=["accuracy"])

In [22]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1292, 100)         3705500   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 1292, 100)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               84480     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
Total params: 3,790,238
Trainable params: 3,790,238
Non-trainable params: 0
_________________________________________________________________


In [23]:
model_filename="model_movie_reviews.h5"

In [24]:
train_Y=pd.get_dummies(Y_train,columns=['Polarity'])
val_Y=pd.get_dummies(Y_val,columns=['Polarity'])
train_Y=train_Y.values
val_Y=val_Y.values

In [25]:
checkpoint=ModelCheckpoint(model_filename, monitor='val_acc', verbose=0, save_best_only=False, mode='auto', period=1)
early=EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
history=model.fit(trainX, train_Y,batch_size=32,epochs=NUM_EPOCHS,validation_data=(validationX, val_Y),callbacks=[checkpoint,early])

Train on 1620 samples, validate on 180 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [26]:
# Get the data cleaned and transform words in numerical tokens.
text = test_df['Cleaned_Text'].tolist()
text = [string.split() for string in text]
# tokens is a list of lists. Each list inside tokens represents a sentence
tokens = []
for sentence in text:
    dummy = []
    for word in sentence:
        if word in words:
            dummy.append(vocabulary[word] + 1) # Sum 1 cause I want to leave the token 0 for padding in LSTM.
    tokens.append(dummy)

In [27]:
X_test=tokens
Xtest = pad_sequences(X_test, maxlen=1292, padding='post') # max_len was found some cells above
Ytest=test_df[['Polarity']]
Ytest=pd.get_dummies(Ytest,columns=['Polarity'])

In [28]:
loss, acc = model.evaluate(Xtest, Ytest.values, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 64.500000
