### Sentiment classification from movie reviews (IMBD dataset).

In [1]:
import tensorflow as tf

In [2]:
tf.__version__

'2.2.0'

Let's do sentiment classification from movie reviews (IMBD dataset).

Data: 25000 movie reviews from IMDB, labelled good (1) or bad (0).

Strategy: Embed sentences in reviews, them learn structure of reviews with LSTM

In [3]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.datasets import imdb

In [4]:
max_features = 20000 # select only the 20000 most common items (words) from our vocabulary
maxlen = 80 # only allow sequence of length 80

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(x_train.shape, x_test.shape)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])


(25000,) (25000,)


  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [5]:
# sequences shorter than maxlen items are padded with zeros at the end
# sequences longer than maxlen are cut
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

In [6]:
# define model
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid')) # output probability (between 0 and 1) of reviews being good or bad 

In [7]:
# compile model
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [8]:
# train model
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f8312b8b400>

In [9]:
score = model.evaluate(x_test, y_test)
print("Test loss: ", score[0])
print("Test accuracy: ", score[1])

Test loss:  0.690742015838623
Test accuracy:  0.5415599942207336


In [10]:
# visualize model architecture
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         2560000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


In [11]:
# save model as json and weights as hdf5
json_string = model.to_json() # alternative: model.to_yaml()
model.save_weights("model_weights_sentiment_imdb.h5")

# save full model (architecture + weights)
model.save("model_sentiment_imdb.h5")

In [14]:
from tensorflow.keras.models import model_from_json, load_model

# load model from json and set weights
model_json = model_from_json(json_string)
model_json.load_weights("model_weights_sentiment_imdb.h5")

# load full model
model_full = load_model("model_sentiment_imdb.h5")


In [15]:
model_json.summary()
model_full.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         2560000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         2560000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
__________________________