In [1]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [2]:
import pandas as pd

In [4]:
data = pd.read_csv("IMDB_dataset.csv")

In [5]:
data = pd.DataFrame({"review": data["review"], "sentiment": data["sentiment"].map({"positive": 1, "negative": 0})})

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(["sentiment"], axis=1), data["sentiment"], test_size=0.3, random_state=7)

In [8]:
import re
import string
from spacy.lang.en.stop_words import STOP_WORDS as stop_words
from spacy.lang.en import English

In [9]:
parser = English()
punctuations = string.punctuation

In [10]:
def spacy_text_normalizer(text):
    text = re.sub(r"<.*>", "", text) #Remove all tags
    tokens = parser(text) #Get doc from text
    tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens ] #Normalize words
    tokens = [ word for word in tokens if word not in stop_words and word not in punctuations ] #Remove stop words and punctuation
    return " ".join(tokens)

In [12]:
%%time
X_train["review"] = X_train["review"].apply(spacy_text_normalizer)
X_test["review"] = X_test["review"].apply(spacy_text_normalizer)

CPU times: user 51.4 s, sys: 142 ms, total: 51.5 s
Wall time: 51.5 s


In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [19]:
def make_sequences_from_text(text_series, max_features=5000, maxlen=300):
  tokenizer = Tokenizer(num_words=max_features)
  tokenizer.fit_on_texts(text_series)
  list_tokenized = tokenizer.texts_to_sequences(text_series)
  return pad_sequences(list_tokenized, maxlen=maxlen)

In [22]:
%%time
sequences = make_sequences_from_text(X_train["review"])

CPU times: user 3.32 s, sys: 19.9 ms, total: 3.34 s
Wall time: 3.34 s


In [24]:
sequences

array([[   0,    0,    0, ...,  187,   25,   25],
       [   0,    0,    0, ..., 2105,  888,  206],
       [   0,    0,    0, ..., 1136,    2,  111],
       ...,
       [   0,    0,    0, ..., 1519,  358,   25],
       [   0,    0,    0, ..., 1684, 1157,  871],
       [   0,    0,    0, ..., 1163,   23,   39]], dtype=int32)

In [25]:
sequences.shape

(35000, 300)

In [26]:
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Sequential

In [30]:
embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [31]:
batch_size = 128
epochs = 3
model.fit(make_sequences_from_text(X_train["review"]), y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fa0735f0780>

<h4>Model overfits. Need to make model weaker</h4>

In [39]:
def tune_model(max_features_list, maxlen_list, embed_size_list, batch_size_list, dropout_list):
  best_model = None
  best_params = None
  best_val_accuracy = 0
  for max_features in max_features_list:
    for maxlen in maxlen_list:
      for embed_size in embed_size_list:
        for batch_size in batch_size_list:
          for dropout in dropout_list:
            print({"max_features": max_features,
                    "maxlen": maxlen,
                    "embed_size": embed_size,
                    "batch_size": batch_size,
                    "dropout": dropout})
            model = Sequential()
            model.add(Embedding(max_features, embed_size))
            model.add(Bidirectional(LSTM(32, return_sequences = True)))
            model.add(GlobalMaxPool1D())
            model.add(Dense(20, activation="relu"))
            model.add(Dropout(dropout))
            model.add(Dense(1, activation="sigmoid"))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            history = model.fit(make_sequences_from_text(X_train["review"], max_features, maxlen), y_train, batch_size=batch_size, epochs=3, validation_split=0.2)
            if model.history.history['val_accuracy'][-1] > best_val_accuracy:
              best_val_accuracy = model.history.history['val_accuracy'][-1]
              best_model = model
              best_params = {"max_features": max_features,
                             "maxlen": maxlen,
                             "embed_size": embed_size,
                             "batch_size": batch_size,
                             "dropout": dropout}
  return (best_model, best_params)

In [40]:
tune_model([5000, 10000, 20000], [300], [64, 128, 256], [128, 256], [0.1, 0.5])

{'max_features': 5000, 'maxlen': 300, 'embed_size': 64, 'batch_size': 128, 'dropout': 0.1}
Epoch 1/3
Epoch 2/3
Epoch 3/3
{'max_features': 5000, 'maxlen': 300, 'embed_size': 64, 'batch_size': 128, 'dropout': 0.5}
Epoch 1/3
Epoch 2/3
Epoch 3/3
{'max_features': 5000, 'maxlen': 300, 'embed_size': 64, 'batch_size': 256, 'dropout': 0.1}
Epoch 1/3
Epoch 2/3
Epoch 3/3
{'max_features': 5000, 'maxlen': 300, 'embed_size': 64, 'batch_size': 256, 'dropout': 0.5}
Epoch 1/3
Epoch 2/3
Epoch 3/3
{'max_features': 5000, 'maxlen': 300, 'embed_size': 128, 'batch_size': 128, 'dropout': 0.1}
Epoch 1/3
Epoch 2/3
Epoch 3/3
{'max_features': 5000, 'maxlen': 300, 'embed_size': 128, 'batch_size': 128, 'dropout': 0.5}
Epoch 1/3
Epoch 2/3
Epoch 3/3
{'max_features': 5000, 'maxlen': 300, 'embed_size': 128, 'batch_size': 256, 'dropout': 0.1}
Epoch 1/3
Epoch 2/3
Epoch 3/3
{'max_features': 5000, 'maxlen': 300, 'embed_size': 128, 'batch_size': 256, 'dropout': 0.5}
Epoch 1/3
Epoch 2/3
Epoch 3/3
{'max_features': 5000, 'maxl

(<tensorflow.python.keras.engine.sequential.Sequential at 0x7f9feec4bda0>,
 {'batch_size': 256,
  'dropout': 0.5,
  'embed_size': 256,
  'max_features': 20000,
  'maxlen': 300})

<h4>All models overfits. I think there is some problem with data. More thorough data analysis needs to be done.</h4>