In [1]:
import json
import os
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Accumulator for the cleaned rows; analyze_review appends one dict per row.
review_list = list()
# Raw CSV, read with pandas' default comma delimiter.
df = pd.read_csv("data.csv")

def analyze_review(row,review_list):
    """Append one cleaned ``{"review", "sentiment"}`` record for a CSV row.

    Rows whose ``sentiment`` cell already holds a label are copied through.
    Otherwise the review text is assumed to have been split over several
    columns by unquoted commas (presumably the origin of the ``Unnamed: N``
    columns -- confirm against data.csv). The pieces are re-joined in column
    order (``review``, ``sentiment``, ``Unnamed: 2``, ...) with the comma that
    separated them, up to the first cell that holds a label.

    The stored label is always stripped and lower-cased so that variants such
    as ``" Positive "`` do not become separate classes downstream.

    Args:
        row: Mapping-like row (e.g. the pandas Series passed by
            ``df.apply(..., axis=1)``) with ``review``, ``sentiment`` and
            optional ``Unnamed: 2``, ``Unnamed: 3``, ... entries.
        review_list: List that receives the record; mutated in place.

    Returns:
        None.

    Raises:
        ValueError: If no cell of the row contains ``positive``/``negative``.
    """
    valid_labels = ('positive', 'negative')

    def _as_label(value):
        # Same normalisation for the membership check and the stored value.
        return str(value).strip().lower()

    if _as_label(row['sentiment']) in valid_labels:
        review_list.append({"review": row["review"], "sentiment": _as_label(row['sentiment'])})
        return None

    # The text starts in `review` and continues in `sentiment`, so the pieces
    # must be collected in that order.
    fragments = [str(row['review']), str(row['sentiment'])]
    i = 2
    while True:
        column = 'Unnamed: ' + str(i)
        # Running out of columns or hitting an empty cell means the row has no
        # label at all; fail with a readable message.
        if column not in row or pd.isna(row[column]):
            raise ValueError("No 'positive'/'negative' label found in row starting with: " + fragments[0][:50])
        if _as_label(row[column]) in valid_labels:
            break
        fragments.append(str(row[column]))
        i += 1

    review_list.append({"review": ",".join(fragments), "sentiment": _as_label(row[column])})
    return None

# Run the row repair over every CSV row; records accumulate in review_list.
df.apply(analyze_review, axis=1, review_list=review_list)
# One column per record key ("review", "sentiment").
df_processed = pd.DataFrame(review_list)

# Data Preprocessing

### Label Encoding the binary target variable

In [6]:
# Hold out 20% of the reviews for evaluation.
training_reviews, testing_reviews, training_labels, testing_labels = train_test_split(
    df_processed['review'].values,
    df_processed['sentiment'].values,
    test_size=0.2,
)

# Fit the encoder on the training labels only and reuse that mapping for the
# test labels. Re-fitting on the test split could assign different integers to
# the same class if the two splits do not contain the same set of labels.
le = LabelEncoder()
training_labels = le.fit_transform(training_labels)
testing_labels = le.transform(testing_labels)

### Tokenizing the sentences and converting them to padded integer sequences of length 200

In [7]:
# Build the vocabulary from the training reviews only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_reviews)
word_index = tokenizer.word_index

# Replace every review by its list of vocabulary indices.
training_sequence = tokenizer.texts_to_sequences(training_reviews)
testing_sequence = tokenizer.texts_to_sequences(testing_reviews)

# truncating='post' drops tokens from the end of reviews longer than maxlen;
# padding='pre' fills shorter reviews with zeros at the beginning.
padding_options = {"maxlen": 200, "truncating": 'post', "padding": 'pre'}
train_pad_sequence = pad_sequences(training_sequence, **padding_options)
test_pad_sequence = pad_sequences(testing_sequence, **padding_options)

In [52]:
# Human-readable name for each class index; note that the keys are strings.
result = dict([("0", "Negative"), ("1", "Positive")])

In [47]:
# Sample JSON payload as raw bytes (presumably a response body captured from a
# prediction endpoint -- confirm), used to try out JSON parsing.
s = (
    b'{\n'
    b'    "predictions": [[0.808536172]\n'
    b'    ]\n'
    b'}'
)

In [49]:
# Decode the sample payload bytes into Python objects and display them.
parsed_response = json.loads(s)
parsed_response

{'predictions': [[0.808536172]]}

In [43]:
# Pick one training review as the example sentence used by the inference cells.
index_no = 5
sentence = training_reviews[index_no]
print(sentence)

I could not agree less with the rating that was given to this movie, and I believe this is a sample of how short minded most of spectators are all over the world. Really... Are you forgetting that Cinema used to be a kind of art before some tycoons tried to make it only entertainment? This movie is not entertainment, at least not that easy entertainment you get on movies like Titanic or Gladiator. It has style, it is different, it is shocking... That's why most of you have hated it so much: because it does not try to be pleasing to you. It's just a story, a very weird one I admit, but after all, only a weird story. It is not a great story, not even a great cinema work, but I believe it is worth a 7-stars rating only for the courage of both author and director to shot a story that is not made to please the audience, thus selling billions of copies and making the big studios even richer. This movie is, for me, European-artistic-like movie made in the US, and everyone involved in the maki

In [8]:
import json

In [9]:
# tokenizer.to_json() already returns a JSON string; it is JSON-encoded once
# more on the way to disk, and the loading cell undoes exactly that one layer.
tokenizer_conf = tokenizer.to_json()
with open("tokenizer.json", mode="w") as f:
    f.write(json.dumps(tokenizer_conf))

In [10]:
# Read the saved file back; the decoded value is passed on to
# tokenizer_from_json.
with open("tokenizer.json", mode="r") as f:
    tokenizer_json = json.loads(f.read())

In [11]:
# Rebuild the tokenizer from its saved JSON configuration, replacing the
# in-memory one.
restore_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json
tokenizer = restore_tokenizer(tokenizer_json)

In [42]:
# Classify the example sentence: encode and pad it exactly like the training
# data, take the single sigmoid output and round it to a class index.
sentence_batch = pad_sequences(tokenizer.texts_to_sequences([sentence]),maxlen=200,truncating='post',padding='pre')
score = model.predict(sentence_batch).ravel()[0]
# `result` is keyed by the class index as a *string* ("0"/"1"), so the rounded
# integer must be converted before the lookup; an int key raises KeyError.
result[str(int(round(score)))]

'Positive'

In [51]:
# Raw model output for the example sentence, flattened to a 1-D array.
encoded_sentence = tokenizer.texts_to_sequences([sentence])
padded_sentence = pad_sequences(encoded_sentence, maxlen=200, truncating='post', padding='pre')
model.predict(padded_sentence).ravel()

array([0.8085362], dtype=float32)

In [46]:
# Show the padded integer sequence that is fed to the model for the example
# sentence.
encoded_sentence = tokenizer.texts_to_sequences([sentence])
pad_sequences(encoded_sentence, maxlen=200, truncating='post', padding='pre')

array([[   10,    98,    21,  1020,   341,    16,     1,   657,    12,
           13,   353,     5,    11,    17,     2,    10,   259,    11,
            6,     3,  9661,     4,    86,   338,  2846,    88,     4,
        13135,    23,    29,   121,     1,   181,    62,    23,    22,
         7540,    12,   436,   330,     5,    26,     3,   241,     4,
          506,   159,    47, 27745,   766,     5,    93,     9,    61,
          741,    11,    17,     6,    21,   741,    30,   222,    21,
           12,   753,   741,    22,    74,    20,    97,    37,  3643,
           38, 10235,     9,    45,   399,     9,     6,   278,     9,
            6,  1621,   196,   134,    88,     4,    22,    25,  1733,
            9,    34,    72,    84,     9,   124,    21,   365,     5,
           26,  5374,     5,    22,    44,    39,     3,    63,     3,
           52,   929,    27,    10,   994,    18,   100,    29,    61,
            3,   929,    63,     9,     6,    21,     3,    78,    63,
      

# Transfer Learning

### Loading the 200-dimensional GloVe vectors

In [16]:
#Using glove vectors for embedding
embedded_words = {}
with open("glove.6B.200d.txt") as f:
    for line in f:
        words, coeff = line.split(maxsplit=1)
        coeff = np.array(coeff.split(),dtype=float)
        embedded_words[words] = coeff

### Preparing an embedding matrix

In [17]:
# One row per vocabulary index plus one extra row, because word_index values
# are used directly as row numbers.
embedding_matrix = np.zeros((len(word_index)+1, 200))
for word, i in word_index.items():
    # Words without a GloVe vector keep their all-zero row.
    if word in embedded_words:
        embedding_matrix[i] = embedded_words[word]

# Modeling

### Defining the model structure

In [21]:
# Variable-length batches of integer token ids.
inputs = tf.keras.Input(shape=(None,),dtype='int64')

# Layers applied one after another between the input and the output unit.
feature_stack = [
    # Frozen embedding initialised from the pre-built embedding_matrix.
    layers.Embedding(len(word_index)+1,200,weights=[embedding_matrix],input_length=200,trainable=False),
    layers.Dropout(0.5),
    # Conv1D + global max pooling
    layers.Conv1D(128,7,padding='valid',activation='relu',strides=2),
    layers.Conv1D(128,7,padding='valid',activation='relu',strides=2),
    layers.GlobalMaxPooling1D(),
    # Hidden layer
    layers.Dense(128,activation='relu'),
    layers.Dropout(0.5),
]

x = inputs
for layer in feature_stack:
    x = layer(x)

# Single sigmoid unit for the binary target.
predictions = layers.Dense(1,activation='sigmoid',name='predictions')(x)
model = tf.keras.Model(inputs,predictions)

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 200)         22467200  
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 200)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         179328    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1651

### Defining the model callbacks and version

In [31]:
# Load the training configuration (version counter and model folder).
with open("config.json","r") as f:
    model_config = json.load(f)

# Derive every path first; the existence check below needs these names, and
# reading them before assignment raises NameError on a fresh kernel.
model_version = model_config['training_version']
MODEL_FOLDER = model_config['model_folder']
export_path = os.path.join(MODEL_FOLDER,str(model_version))
log_dir = os.path.join(export_path,"logs")

# Warn (but do not stop) when this version was already exported, because the
# training and saving cells write into the same folder.
if os.path.isdir(export_path):
    warnings.warn("Already a folder with a similar version in the config exists. Kindly check before proceeding!! Once the training starts, the data would be overwritten")

# Stop training once validation accuracy has stopped improving (patience 3).
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)

# Keep only the checkpoint with the best validation loss.
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(export_path,"checkpoints"),
    monitor='val_loss',
    save_best_only=True,
)

# Write TensorBoard logs; the folder is created automatically if not present.
tensorboard_logger = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    embeddings_freq=1,
)

# Multiply the learning rate by 0.1 when validation loss plateaus (patience 5).
# NOTE(review): early stopping watches val_accuracy with a shorter patience, so
# this may rarely get a chance to fire -- confirm that is intended.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=5)

callbacks_list = [early_stopping, best_checkpoint, tensorboard_logger, lr_scheduler]

  


In [23]:
# Train for at most 50 epochs; the callbacks may end the run earlier.
# NOTE(review): the held-out test split doubles as validation data here.
callbacks = callbacks_list
history = model.fit(
    train_pad_sequence,
    training_labels,
    epochs=50,
    validation_data=(test_pad_sequence, testing_labels),
    callbacks=callbacks,
)

Epoch 1/50
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: models/1/checkpoints/assets
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50


### Saving the model, model config and the tokenizer for future inferences

In [39]:
# Write every artifact first and update config.json last, so a failed save
# never leaves the config pointing at a version that does not exist on disk.

# 1) Model export directly under the version folder.
tf.keras.models.save_model(
    model,
    export_path,
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None
)

# 2) HDF5 copy of the model. Create the target folder up front instead of
#    retrying the save inside an exception-driven loop.
weight_dir = os.path.join(export_path,"weight_file")
os.makedirs(weight_dir,exist_ok=True)
model.save(os.path.join(weight_dir,"model.h5"))

# 3) Tokenizer configuration next to the model.
tokenizer_conf = tokenizer.to_json()
with open(os.path.join(export_path,"tokenizer.json"),"w") as f:
    json.dump(tokenizer_conf,f)

# 4) Only now record that this version is the serving one and bump the
#    counter used by the next training run.
model_config['serving_version'] = model_version
model_config['training_version'] = str(int(model_version) + 1)

with open("config.json","w") as f:
    json.dump(model_config,f)

INFO:tensorflow:Assets written to: models/1/assets


In [54]:
%load_ext tensorboard
%tensorboard --logdir logdir --host 0.0.0.0 #port 6006 for tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [96]:
from tensorflow.keras.models import load_model

In [77]:
# Reload a previously saved HDF5 model (the version folder is hard-coded),
# replacing the in-memory `model`.
saved_model_file = "models/3/weight_file/model.h5"
model = load_model(saved_model_file)

In [107]:
from tensorflow.keras.utils import plot_model
# Render the model graph to an image file (requires pydot and Graphviz).
plot_model(model=model, to_file="model_new.png")

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.


In [101]:
!pip uninstall pydot -y

Found existing installation: pydotplus 2.0.2
Uninstalling pydotplus-2.0.2:
  Successfully uninstalled pydotplus-2.0.2


In [103]:
!pip install pydotplus

Processing /root/.cache/pip/wheels/1f/5c/ba/f931f74fcac8f48b18ae597279203b1c1f92fc76249c2b6f66/pydotplus-2.0.2-py3-none-any.whl
Installing collected packages: pydotplus
Successfully installed pydotplus-2.0.2
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [106]:
!apt-get install python-pydot -y

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  graphviz libpython-stdlib libpython2.7-minimal libpython2.7-stdlib python
  python-minimal python-pyparsing python2.7 python2.7-minimal
Suggested packages:
  gsfonts graphviz-doc python-doc python-tk python-pyparsing-doc python2.7-doc
  binfmt-support
The following NEW packages will be installed:
  graphviz libpython-stdlib libpython2.7-minimal libpython2.7-stdlib python
  python-minimal python-pydot python-pyparsing python2.7 python2.7-minimal
0 upgraded, 10 newly installed, 0 to remove and 46 not upgraded.
Need to get 4640 kB of archives.
After this operation, 20.3 MB of additional disk space will be used.
Ign:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libpython2.7-minimal amd64 2.7.17-1~18.04ubuntu1
Ign:2 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 python2.7-minimal amd64 2.7.17-1~18.04ubuntu1
Err:1 ht