In [1]:
from numpy.random import seed
seed(1)
import tensorflow
tensorflow.random.set_seed(2)


from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from nltk.corpus import stopwords
import pandas as pd
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
from keras.preprocessing.text import Tokenizer 

In [2]:
stops = set(stopwords.words('english'))

def load_dataset(df):
    
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return (x_data, y_data)

In [3]:
stops = set(stopwords.words('english'))
df = pd.read_csv('D:/PESU/NLP/final/dataset/wordnetIMDB.csv')
df=df.sample(frac=1)
print(df.head())


print('Loading data...')
x_data, y_data = load_dataset(df)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

                                                   review sentiment
43846   While watching a mundane modern movie (The Run...  positive
139009  Joe Don Baker is. .. Thomas Jefferson Geronimo...  negative
121123  I realize several Ben Stiller movies are out o...  negative
60636   Being an admitted chess addict, I was excited ...  negative
26945   I originally saw this several years ago while ...  positive
Loading data...


In [4]:
df = pd.read_csv('D:/PESU/NLP/final/dataset/IMDB Dataset.csv')
x_val_data, y_val_data = load_dataset(df)
x_val, x_val_test, y_val, y_val_test = train_test_split(x_val_data, y_val_data, test_size = 0.2)

In [5]:
# ENCODE REVIEW
token = Tokenizer(lower=False) 
token.fit_on_texts(x_train)

x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)
x_val = token.texts_to_sequences(x_val)
x_val_test = token.texts_to_sequences(x_val_test)



x_train = pad_sequences(x_train, maxlen=131, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=131, padding='post', truncating='post')
x_val = pad_sequences(x_val, maxlen=131, padding='post', truncating='post')
x_val_test = pad_sequences(x_val_test, maxlen=131   , padding='post', truncating='post')


total_words = len(token.word_index) + 1
print(total_words)

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

107808
119980 train sequences
29996 test sequences
Pad sequences (samples x time)
x_train shape: (119980, 131)
x_test shape: (29996, 131)


In [6]:
import keras
vanillaLSTM_model = keras.models.load_model("vanillaLSTM")
score, acc = vanillaLSTM_model.evaluate(x_test, y_test, batch_size=128)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 2.4657058715820312
Test accuracy: 0.4918655753135681


In [7]:
print('Build model...')

# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = 131))
model.add(LSTM(LSTM_OUT, return_sequences=True))  # returns a sequence of vectors of dimension 32
model.add(LSTM(128, return_sequences=True))  # returns a sequence of vectors of dimension 32
model.add(LSTM(128, return_sequences=True)) 
model.add(LSTM(LSTM_OUT))  # return a single vector of dimension 32
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())


Build model...
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 131, 32)           3449856   
                                                                 
 lstm (LSTM)                 (None, 131, 64)           24832     
                                                                 
 lstm_1 (LSTM)               (None, 131, 128)          98816     
                                                                 
 lstm_2 (LSTM)               (None, 131, 128)          131584    
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 3,754,561
Trainable params:

In [8]:
model.fit(x_train, y_train, batch_size = 128, epochs = 15, validation_data=(x_val, y_val))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x290c2fd8b50>

In [9]:
score, acc = model.evaluate(x_test, y_test, batch_size=128)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.09775712341070175
Test accuracy: 0.9744299054145813


In [10]:

score, acc = model.evaluate(x_val_test, y_val_test, batch_size=128)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.027724862098693848
Test accuracy: 0.9926999807357788
