In [39]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import scikitplot as skplt
import matplotlib.pyplot as plt


In [40]:
text = []
label = []
for i in open('/content/sample_data/imdb_labelled.txt'):
  i = i.rstrip('\n').split('\t')
  text.append(i[0].strip(' '))
  label.append(i[1])

text = np.array(text)
label = np.array(label)

print (text.shape)
print (label.shape)

(1000,)
(1000,)


In [41]:
df_text = pd.DataFrame({'text': text , 'label':label })

In [42]:
df_text.head(5)

Unnamed: 0,text,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [43]:
# Now we will do preprocessing on the text data using the following
vectorize_layer = keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens = None,
    standardize = 'lower_and_strip_punctuation',
    split = 'whitespace',
    ngrams = None,
    output_mode = 'int',
    output_sequence_length = None
)

In [44]:
vectorize_layer.adapt(text)

In [45]:
print (vectorize_layer.get_vocabulary())



In [46]:
a =  vectorize_layer.get_vocabulary()
print (a)
print (a.index('movie'))

13


In [47]:
# now use it to process some text
# get_vocabulary is used to get the lsit of all the words sorted with descending
# number of times a word is repeated and to encode the input text according to the index
input_text = [['good movie'], ['Mochen Yang']]
vectorize_layer(input_text)

<tf.Tensor: shape=(2, 2), dtype=int64, numpy=
array([[31, 13],
       [ 1,  1]])>

**Pre processing**

In [48]:
df_text = df_text.astype({'label': 'int32'})
df_text.dtypes

text     object
label     int32
dtype: object

In [49]:
X_train, X_test, y_train, y_test = train_test_split(df_text['text'], df_text['label'], test_size=0.20, random_state=42)

print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)


(800,)
(800,)
(200,)
(200,)


**RNN model**

Now, let's actually build a basic RNN model, by stacking together the text processing layer, an embedding layer, and an RNN layer

In [50]:
model_rnn = keras.Sequential()

# Vectorize layer
model_rnn.add(vectorize_layer)

# Embedding layer
model_rnn.add(keras.layers.Embedding(
    input_dim = len(vectorize_layer.get_vocabulary()),
    output_dim = 64,
    mask_zero = True
))

# RNN layer
model_rnn.add(keras.layers.SimpleRNN(128)) # see note below

# Output layer
model_rnn.add(keras.layers.Dense(1, activation = 'sigmoid'))

In [51]:
# configure training / optimization
model_rnn.compile(loss = keras.losses.BinaryCrossentropy(),
                  optimizer='adam',
                  metrics=['accuracy'])

In [52]:
# training with 20% validation and 10 epochs.
model_rnn.fit(x = X_train, y = y_train, validation_split = 0.2,
              epochs=10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff2091a6ad0>

In [53]:
model_rnn.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_4 (Embedding)     (None, None, 64)          203264    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               24704     
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 228,097
Trainable params: 228,097
Non-trainable params: 0
_________________________________________________________________


In [54]:
# Lets use this RNN to make predictions on some sentences 
rnn_prediction = (model_rnn.predict(X_test)[:,0] >= 0.5).astype(bool)

print(classification_report(y_test, rnn_prediction, labels=[1, 0]))

              precision    recall  f1-score   support

           1       0.69      0.54      0.61       109
           0       0.56      0.70      0.62        91

    accuracy                           0.61       200
   macro avg       0.62      0.62      0.61       200
weighted avg       0.63      0.61      0.61       200



In [55]:
a = (model_rnn.predict(X_test)[::,0])
print (a.shape)
print (y_test.shape)

(200,)
(200,)


In [56]:
y_test = y_test.reset_index(drop = True)

In [57]:
a = pd.Series(a)
print ((a).dtype)
df_output = pd.DataFrame ({'real_label': y_test , 'predict_prob': a})

float32


In [58]:
#LR curves
#skplt.metrics.plot_roc(df_output['real_label'], df_output['predict_prob'])

# skplt.metrics.plot_cumulative_gain(y_test, model_rnn.predict(X_test))

# skplt.metrics.plot_lift_curve(y_test, model_rnn.predict(X_test))

LSTM network

Lets now try a LSTM network 

In [59]:
model_lstm = keras.Sequential()

model_lstm.add(vectorize_layer)

model_lstm.add(keras.layers.Embedding(
    input_dim = len(vectorize_layer.get_vocabulary()),
    output_dim = 64,
    mask_zero = True
))

model_lstm.add(keras.layers.LSTM(128))

model_lstm.add(keras.layers.Dense(1, activation = 'sigmoid'))



In [60]:
# configure training / optimization
model_lstm.compile(loss = keras.losses.BinaryCrossentropy(),
                   optimizer='adam',
                   metrics=['accuracy'])

In [61]:
# training with 20% validation and 10 epochs.
model_lstm.fit(x = X_train, y = y_train, validation_split = 0.2,
               epochs=10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff20a03af50>

In [62]:
# Lets use this RNN to make predictions on some sentences 
lstm_prediction = (model_lstm.predict(X_test)[:,0] >= 0.5).astype(bool)

print(classification_report(y_test, lstm_prediction, labels=[1, 0]))

              precision    recall  f1-score   support

           1       0.74      0.80      0.77       109
           0       0.73      0.66      0.69        91

    accuracy                           0.73       200
   macro avg       0.73      0.73      0.73       200
weighted avg       0.73      0.73      0.73       200



Lets build GRU 

In [63]:
model_gru = keras.Sequential()

model_gru.add(vectorize_layer)

model_gru.add(keras.layers.Embedding(
    input_dim = len(vectorize_layer.get_vocabulary()),
    output_dim = 64,
    mask_zero = True
))

model_gru.add(keras.layers.GRU(128))

model_gru.add(keras.layers.Dense(1, activation = 'sigmoid'))

In [64]:
# configure training / optimization
model_gru.compile(loss = keras.losses.BinaryCrossentropy(),
                  optimizer='adam',
                  metrics=['accuracy'])

In [65]:
# training with 20% validation and 10 epochs.
model_gru.fit(x = X_train, y = y_train, validation_split = 0.2,
              epochs=10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff205523790>

In [66]:
# Lets use this RNN to make predictions on some sentences 
lstm_prediction = (model_lstm.predict(X_test)[:,0] >= 0.5).astype(bool)

print(classification_report(y_test, lstm_prediction, labels=[1, 0]))

              precision    recall  f1-score   support

           1       0.74      0.80      0.77       109
           0       0.73      0.66      0.69        91

    accuracy                           0.73       200
   macro avg       0.73      0.73      0.73       200
weighted avg       0.73      0.73      0.73       200



Lets also try to make a bidirectional RNN

In [67]:
model_bilstm = keras.Sequential()

model_bilstm.add(vectorize_layer)

model_bilstm.add(keras.layers.Embedding(
    input_dim = len(vectorize_layer.get_vocabulary()),
    output_dim = 64,
    mask_zero = True
))

model_bilstm.add(keras.layers.Bidirectional(keras.layers.LSTM(128)))

model_bilstm.add(keras.layers.Dense(1, activation = 'sigmoid'))

In [68]:
# configure training / optimization
model_bilstm.compile(loss = keras.losses.BinaryCrossentropy(),
                     optimizer='adam',
                     metrics=['accuracy'])

In [69]:
# training with 20% validation and 10 epochs.
model_bilstm.fit(x = X_train, y = y_train, validation_split = 0.2,
                 epochs = 10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff20370bd90>

In [70]:
# Lets use this RNN to make predictions on some sentences 
lstm_prediction = (model_lstm.predict(X_test)[:,0] >= 0.5).astype(bool)

print(classification_report(y_test, lstm_prediction, labels=[1, 0]))

              precision    recall  f1-score   support

           1       0.74      0.80      0.77       109
           0       0.73      0.66      0.69        91

    accuracy                           0.73       200
   macro avg       0.73      0.73      0.73       200
weighted avg       0.73      0.73      0.73       200

