In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import pickle

In [2]:
data = pd.read_csv("../Data/moviereviews2.tsv", sep='\t')

In [3]:
data.head()

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


In [4]:
data.shape

(6000, 2)

## Preprocessing

In [5]:
def cleaning(text):
    text_clean = str(text)
    text_clean = text_clean.replace('\r',' ')
    text_clean = text_clean.replace('\n',' ')
    text_clean = text_clean.replace('&amp',' ')
    text_clean = text_clean.replace('&gt',' ')
    text_clean = text_clean.replace('&lt',' ')
    text_clean = text_clean.replace('[^a-zA-Z]+',' ')
    return text_clean

def case_folding(text):
    text_cf = text
    text_cf = text_cf.lower()
    return text_cf

def lemmatization(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    text_split = text.split(" ")
    lemma_list = []
    for i in text_split:
        lemma_text = wordnet_lemmatizer.lemmatize(i, pos="v")
        lemma_list.append(lemma_text)
    lemmatized = ' '.join(map(str,lemma_list))
    return lemmatized

def stopword_removal(text):
    stopword_list = list(stopwords.words('english'))
    text_stopword = text
    text_stopword = ' '.join([i for i in text_stopword.split() if i not in stopword_list])
    return text_stopword

In [6]:
def preprocessing(text):
    text_preprocess = text
    text_preprocess = cleaning(text_preprocess)
    text_preprocess = case_folding(text_preprocess)
    text_preprocess = lemmatization(text_preprocess)
    text_preprocess = stopword_removal(text_preprocess)
    return text_preprocess

In [7]:
data['review_parsed'] = data['review'].map(lambda com : preprocessing(com))

In [8]:
import tensorflow as tf

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [9]:
data.loc[data['label'] == 'neg', ['label']] = 0
data.loc[data['label'] == 'pos', ['label']] = 1

In [10]:
X = data['review_parsed']
y = data['label']

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=8)

In [12]:
vocab_size = 5000

In [34]:
vect = Tokenizer(num_words = vocab_size)
vect.fit_on_texts(X_train)

print(vocab_size)

5000


In [35]:
filename = '../Data/tokenize.pkl'
pickle.dump(vect, open(filename, 'wb'), protocol = pickle.HIGHEST_PROTOCOL)

In [36]:
def tokenize(list_text):
    vects = pickle.load(open('../Data/tokenize.pkl', 'rb'))
    encoded_docs = vects.texts_to_sequences(list_text)
    padded_docs = sequence.pad_sequences(encoded_docs,maxlen=200,padding='post')
    return padded_docs

In [37]:
padded_docs_train = tokenize(X_train)
padded_docs_val = tokenize(X_val)

## Create a Model

In [17]:
from tensorflow.keras.models import Sequential

In [18]:
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Embedding

In [19]:
model = Sequential()

model.add(Embedding(vocab_size, output_dim=64))
# model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(optimizer = 'adam', loss='binary_crossentropy',metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.


In [20]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 402,689
Trainable params: 402,689
Non-trainable params: 0
_________________________________________________________________


In [21]:
history = model.fit(padded_docs_train, y_train.values, 
                    validation_data=(padded_docs_val, y_val.values),
                    epochs=2, 
                    verbose=1)

Train on 4800 samples, validate on 1200 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/2
Epoch 2/2


In [24]:
model.save('../Model/bilstm_2epochs.h5')

In [25]:
from tensorflow.keras.models import load_model

In [26]:
model = load_model('../Model/bilstm_2epochs.h5')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 402,689
Trainable params: 402,689
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.predict_classes(padded_docs_val)

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [40]:
X_val.tolist()

['movie plain terrible!!!! slow acting, slow get point wooden character there. best part show iron maiden sing video theater thats it. end worth watch wait it!! character movie put sleep almost. avoid it!!!',
 "movie mess. i'm surprise even theatrical release. without robin williams would go straight video. poorly written. poorly directed. worse offense take interest topic reduce ridiculous bore thriller thrill suspense inner emotional logic.especially first half hour movie dovetail series ridiculous set piece top audience saw laugh it. save money. trailer totally mislead - suspenseful thrill - fact movie's truly worst offense simply boring.",
 "least able enjoy mock movie surprise since barely able sit it. honesty, guess cover dvd case cost entire movie. say director boogeyman, new version come out...nice touch guys, mislead enough rope in. thing frustrate insufferable act copycat haircut. usually see kind hair ten year old boy character act like it. film look like shoot d+ grad stude

In [33]:
padded_docs_val[0]

array([   2,  832,  189,  500,  201,  500,    9,  112, 1446,   22,  174,
         47,   81,   15, 2527, 4688,  586,  236,  562,  944,   10,   39,
        222,    8,  352,   10,   22,    2,  130,  878,  155,  454,   10,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   