In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/imdb-review-dataset/imdb_master.csv


In [2]:
df = pd.read_csv('/kaggle/input/imdb-review-dataset/imdb_master.csv', encoding="latin-1")
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [3]:
df.type.unique()

array(['test', 'train'], dtype=object)

In [4]:
df.type.value_counts()

train    75000
test     25000
Name: type, dtype: int64

In [5]:
tran_df = df[df.type=='train']
test_df = df[df.type=='test']

In [6]:
tran_df = tran_df.drop(['Unnamed: 0','type','file'],axis=1)
tran_df.columns = ["review","sentiment"]
tran_df.head()

Unnamed: 0,review,sentiment
25000,Story of a man who has unnatural feelings for ...,neg
25001,Airport '77 starts as a brand new luxury 747 p...,neg
25002,This film lacked something I couldn't put my f...,neg
25003,"Sorry everyone,,, I know this is supposed to b...",neg
25004,When I was little my parents took me along to ...,neg


In [7]:
tran_df.sentiment.value_counts()

unsup    50000
pos      12500
neg      12500
Name: sentiment, dtype: int64

In [8]:
tran_df = tran_df[tran_df.sentiment != 'unsup']
tran_df['sentiment'] = tran_df['sentiment'].map({'pos': 1, 'neg': 0})

In [9]:
tran_df = tran_df.reset_index().drop(['index'],axis=1)
tran_df.head()

Unnamed: 0,review,sentiment
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0


In [10]:
# Testing data
test_df = test_df.drop(['Unnamed: 0','type','file'],axis=1)
test_df.columns = ["review","sentiment"]
test_df = test_df[test_df.sentiment != 'unsup']
test_df['sentiment'] = test_df['sentiment'].map({'pos': 1, 'neg': 0})
test_df.head()

Unnamed: 0,review,sentiment
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0


In [11]:
X_train = tran_df.review.values
y_train = tran_df.sentiment.values

X_test = test_df.review.values
y_test = test_df.sentiment.values

### Learn Word Embedding
The word embeddings of our dataset can be learned while training a neural network on the classification problem.

Before put it into a network, the text data is first encoded so that each word is represented by a unique integer.

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
total_reviews = X_train + X_test
tokenizer.fit_on_texts(total_reviews)

Using TensorFlow backend.


In [13]:
text_len = [len(i.split()) for i in total_reviews]
print(" Average length of review in training dataset", np.mean(text_len))

 Average length of review in training dataset 461.29888


In [14]:
# Max length
#max_len = 460 the acc will stuck at 50%, try to use a smaller one
max_len = 260

# Define vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_len, padding='post')

The Embedding layer requires the specification of the vocabulary size (vocab_size), the size of the real-valued vector space EMBEDDING_DIM = 100, and the maximum length of input documents max_length .

### Build Model
The model will use an Embedding layer as the first hidden layer. The Embedding layer is initialized with random weights and will learn an embedding for all of the words in the training dataset during training of the model.

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense

embedding_dim = 100
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

In [16]:
# Complie model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 260, 100)          12500400  
_________________________________________________________________
gru (GRU)                    (None, 32)                12864     
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 12,513,297
Trainable params: 12,513,297
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Fix model
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/25
25000/25000 - 176s - loss: 0.6835 - acc: 0.5316 - val_loss: 0.6697 - val_acc: 0.5469
Epoch 2/25
25000/25000 - 170s - loss: 0.6243 - acc: 0.5913 - val_loss: 0.6438 - val_acc: 0.5756
Epoch 3/25
25000/25000 - 170s - loss: 0.5615 - acc: 0.6642 - val_loss: 0.5511 - val_acc: 0.7331
Epoch 4/25
25000/25000 - 169s - loss: 0.4379 - acc: 0.8158 - val_loss: 0.4206 - val_acc: 0.8368
Epoch 5/25
25000/25000 - 170s - loss: 0.3813 - acc: 0.8574 - val_loss: 0.4317 - val_acc: 0.8339
Epoch 6/25
25000/25000 - 171s - loss: 0.3919 - acc: 0.8489 - val_loss: 0.4207 - val_acc: 0.8352
Epoch 7/25
25000/25000 - 171s - loss: 0.3314 - acc: 0.8837 - val_loss: 0.4235 - val_acc: 0.8365
Epoch 8/25
25000/25000 - 172s - loss: 0.2927 - acc: 0.8956 - val_loss: 0.4348 - val_acc: 0.8252
Epoch 9/25
25000/25000 - 170s - loss: 0.3099 - acc: 0.8800 - val_loss: 0.4573 - val_acc: 0.8057
Epoch 10/25
25000/25000 - 171s - loss: 0.2262 - acc: 0.9217 - val_loss: 0.4678 - val_a

<tensorflow.python.keras.callbacks.History at 0x7f0978d4a978>

In [19]:
score, acc = model.evaluate(X_test_pad, y_test, batch_size=32)



### LSTM

In [20]:
embedding_dim = 100
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

In [21]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 260, 100)          12500400  
_________________________________________________________________
lstm (LSTM)                  (None, 32)                17024     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 12,517,457
Trainable params: 12,517,457
Non-trainable params: 0
_________________________________________________________________


In [22]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/25
25000/25000 - 177s - loss: 0.6715 - accuracy: 0.5669 - val_loss: 0.5747 - val_accuracy: 0.7717
Epoch 2/25
25000/25000 - 173s - loss: 0.5461 - accuracy: 0.7570 - val_loss: 0.4958 - val_accuracy: 0.7907
Epoch 3/25
25000/25000 - 172s - loss: 0.5432 - accuracy: 0.7304 - val_loss: 0.5212 - val_accuracy: 0.7744
Epoch 4/25
25000/25000 - 173s - loss: 0.4572 - accuracy: 0.8143 - val_loss: 0.4757 - val_accuracy: 0.8014
Epoch 5/25
25000/25000 - 172s - loss: 0.4492 - accuracy: 0.8099 - val_loss: 0.5727 - val_accuracy: 0.7100
Epoch 6/25
25000/25000 - 174s - loss: 0.4308 - accuracy: 0.8252 - val_loss: 0.4895 - val_accuracy: 0.8041
Epoch 7/25
25000/25000 - 174s - loss: 0.3776 - accuracy: 0.8552 - val_loss: 0.4973 - val_accuracy: 0.8088
Epoch 8/25
25000/25000 - 173s - loss: 0.3587 - accuracy: 0.8662 - val_loss: 0.5067 - val_accuracy: 0.8051
Epoch 9/25
25000/25000 - 174s - loss: 0.3525 - accuracy: 0.8679 - val_loss: 0.5158 - val_accuracy: 0.8

<tensorflow.python.keras.callbacks.History at 0x7f08e44fb5c0>

In [24]:
score, acc = model.evaluate(X_test_pad, y_test, batch_size=32)

