https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456

In [1]:
import pandas as pd
import numpy as np

fullframe = pd.DataFrame()
fullframe = pd.read_csv('../movie_data_shuffled.csv', encoding='utf-8')
df = fullframe[:25000]
df.shape

(25000, 2)

In [2]:
x_train = df.loc[:12499, 'review'].values
y_train = df.loc[:12499, 'sentiment'].values

x_test = df.loc[12500:, 'review'].values
y_test = df.loc[12500:, 'sentiment'].values

In [3]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(12500,)
(12500,)
(12500,)
(12500,)


In [4]:
from tensorflow.python.keras.preprocessing.text import Tokenizer as tokes
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
tokenizer_obj = tokes(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

total_reviews = x_train + x_test
tokenizer_obj.fit_on_texts(total_reviews)

print(total_reviews.shape)

#pad sequences
max_length = max([len(s.split()) for s in total_reviews])

print(max_length)

#define vocabulary size
vocab_size = len(tokenizer_obj.word_index) + 1

print(vocab_size)

x_train_tokens = tokenizer_obj.texts_to_sequences(x_train)
x_test_tokens = tokenizer_obj.texts_to_sequences(x_test)

x_train_pad = pad_sequences(x_train_tokens, maxlen=max_length, padding='post')
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_length, padding='post')

(12500,)
2421
91282


In [5]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Embedding, LSTM, GRU
from tensorflow.python.keras.layers.embeddings import Embedding
from tensorflow.python.keras import losses

EMBEDDING_DIM = 100

print("Build Model...")

### For whatever reason this is needed to make tensorflow work with python 3.6 on mac. It must come before Sequential()
import tensorflow as tf
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.keras.backend import set_session
tf.keras.backend.clear_session()  # For easy reset of notebook state.

config_proto = tf.ConfigProto()
off = rewriter_config_pb2.RewriterConfig.OFF
config_proto.graph_options.rewrite_options.arithmetic_optimization = off
session = tf.Session(config=config_proto)
set_session(session)
### end whatever reason

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss=losses.binary_crossentropy, optimizer='adam', metrics=['accuracy'])

Build Model...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [6]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2421, 100)         9128200   
_________________________________________________________________
gru (GRU)                    (None, 32)                12768     
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 9,141,001
Trainable params: 9,141,001
Non-trainable params: 0
_________________________________________________________________


In [7]:
for layer in model.layers:
    print(layer, layer.trainable)

<tensorflow.python.keras.layers.embeddings.Embedding object at 0xb27208c50> True
<tensorflow.python.keras.layers.recurrent.GRU object at 0xb272ad048> True
<tensorflow.python.keras.layers.core.Dense object at 0x1145168d0> True


In [8]:
print ("training model...")

model.fit(x_train_pad, y_train, batch_size=64, epochs=2, validation_data=(x_test_pad, y_test), shuffle=True, verbose=1)



training model...
Train on 12500 samples, validate on 12500 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/2
Epoch 2/2
 1344/12500 [==>...........................] - ETA: 5:45 - loss: 0.6939 - acc: 0.4836

KeyboardInterrupt: 

In [None]:
from sklearn import model_selection
import pickle

filename = "imdb_model_v0.sav"
pickle.dump(model, open(filename, 'wb'))