# MNIST / LSTM

In [1]:
# Credits: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
# LSTM for sequence classification in the IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)


Using TensorFlow backend.


In [0]:
#Refer: https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words)


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
    8192/17464789 [..............................] - ETA: 0s





In [0]:
print(X_train[1]) # representation of text/words in frequency form
print(type(X_train[1]))
print(len(X_train[1]))

[1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369, 2, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 2, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 2, 5, 163, 11, 3215, 2, 4, 1153, 9, 194, 775, 7, 2, 2, 349, 2637, 148, 605, 2, 2, 15, 123, 125, 68, 2, 2, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 2, 5, 2, 656, 245, 2350, 5, 4, 2, 131, 152, 491, 18, 2, 32, 2, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]
<class 'list'>
189


In [0]:
# truncate and/or pad input sequences
max_review_length = 600
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

print(X_train.shape)
print(X_train[1])

(25000, 600)
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    

In [0]:
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words+1, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
#Refer: https://datascience.stackexchange.com/questions/10615/number-of-parameters-in-an-lstm-model

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 600, 32)           160032    
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 213,333
Trainable params: 213,333
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
model.fit(X_train, y_train, nb_epoch=2, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/2
Epoch 2/2
Accuracy: 86.87%


# Amazon / LSTM

In [0]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle


from tqdm import tqdm
import os



In [3]:

!pip install pydrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
  
def get_file_names():

  # 1. Authenticate and create the PyDrive client.
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)
  file_list = drive.ListFile({'q': "'15InUdebAa5obIjKZQWBV_oBKHT3TL0h1' in parents and trashed=false"}).GetList()
  for file1 in file_list:
    print('title: %s, id: %s' % (file1['title'], file1['id']))
    
    
    
def get_file_into_colab(file_id,file_name):
  
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)
  amazon = drive.CreateFile({'id': file_id})
  amazon.GetContentFile(file_name)
  print(" Congrats ! Now You can import file into Pandas DataFrame !")



    




In [4]:
get_file_names()


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

title: Train_so.csv, id: 1QxkbsPaSQUazKHZRfTk1_I7M5a4gSf6T
title: train_HAR.csv, id: 1HgwNMzAxgNkYeLbdV-K0wNzMnR4jxY7V
title: test_HAR.csv, id: 1-EElNcN1mh-1YbqTZN922HipDSP8HlAS
title: total_acc_z_train.txt, id: 1zBX4nWZVsCYcZHZN9CWoGan2MUDfbGAQ
title: total_acc_y_train.txt, id: 1OJ0Pg989sYrILuKTZ_egZmm-8IYMC-2H
title: total_acc_x_train.txt, id: 1tRjnGtVLoPix04lrIdcHtP9Nr2TwT22w
title: body_gyro_z_train.txt, id: 1_JrBhkKTxLI2KJ-bdnOayMMtu0yO9pja
title: body_gyro_y_train.txt, id: 1-PiT6oXOhIXtzcj-igd2huqoRoXeZsPw
title: body_gyro_x_train.txt, id: 1QiLmTuudrNNx5gOXP2gXB2QCZKE-lbSq
title: body_acc_z_train.txt, id: 1lyaEVLnaDACqh9KI

In [5]:
get_file_into_colab('1HZ0ICAkv_IjrHHT2hAQtJ58RV61PG2cL','Amazon.csv')

 Congrats ! Now You can import file into Pandas DataFrame !


## Data Preprocessing 

In [0]:
df=pd.read_csv('Amazon.csv')
# read csv file into pandas DataFRame

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525773 entries, 0 to 525772
Data columns (total 11 columns):
Unnamed: 0                525773 non-null int64
Id                        525773 non-null int64
ProductId                 525773 non-null object
UserId                    525773 non-null object
ProfileName               525773 non-null object
HelpfulnessNumerator      525773 non-null int64
HelpfulnessDenominator    525773 non-null int64
Score                     525773 non-null int64
Time                      525773 non-null int64
Summary                   525773 non-null object
Text                      525773 non-null object
dtypes: int64(6), object(5)
memory usage: 44.1+ MB


## Pre-processing data to represent review as a +ve or -ve review

In [0]:
# geting first 20k Score values as train data
score_train=df.Score[:20000]
score_test=df.Score[20000:22000]


In [10]:
print(score_train[:5])
score_test[:5]


0    5
1    1
2    4
3    2
4    5
Name: Score, dtype: int64


20000    5
20001    5
20002    4
20003    5
20004    5
Name: Score, dtype: int64

In [11]:
# lets create y with binary values -> 1 for Score= 4 & 5 / 0 for Score= 1 & 2

y_train=[]

for i in tqdm(score_train):
  if i <3:
    y_train.append(0)
  else:
    y_train.append(1)


y_test=[]

for i in tqdm(score_test):
  if i <3:
    y_test.append(0)
  else:
    y_test.append(1)

100%|██████████| 20000/20000 [00:00<00:00, 1043865.56it/s]
100%|██████████| 2000/2000 [00:00<00:00, 429128.71it/s]


In [12]:
print(y_train[:5])
y_test[:5]

[1, 0, 1, 0, 1]


[1, 1, 1, 1, 1]

In [0]:
# getting only first 20k datapoints only for text feature (column) 

text_train=df.Text[:20000].values
text_test=df.Text[20000:22000].values

#Function to plot Train & Validation Loss

In [0]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import time
# https://gist.github.com/greydanus/f6eee59eaf1d90fcb3b534a25362cea4
# https://stackoverflow.com/a/14434334
# this function is used to update the plots for each epoch and error
def plt_dynamic(x, vy, ty, ax, colors=['b']):
    ax.plot(x, vy, 'b', label="Validation Loss")
    ax.plot(x, ty, 'r', label="Train Loss")
    plt.legend()
    plt.grid()
    fig.canvas.draw()

#Keras Tokenizer

In [0]:
from keras.preprocessing.text import Tokenizer

t=Tokenizer(num_words=5000)


In [0]:
# fitting tokenier on train_text data
t.fit_on_texts(text_train)
# generating sequence for text_train data based on previously trained/fit tokenizer
train_sequences=t.texts_to_sequences(text_train)
# generating sequence for test_text data based on trained/fit tokenizer on train_text data
test_sequences=t.texts_to_sequences(text_test)


In [17]:
print(train_sequences[:2])
print("*"*50)
len(train_sequences)

[[2, 17, 125, 315, 7, 1, 572, 81, 54, 199, 3, 17, 115, 28, 39, 5, 30, 7, 29, 170, 1, 38, 540, 51, 26, 4, 2737, 56, 4, 1220, 463, 3, 6, 714, 98, 13, 8, 1617, 3, 90, 9, 38, 98, 56, 149], [38, 346, 2011, 23, 3769, 1782, 823, 1, 823, 79, 258, 188, 990, 3572, 20, 205, 40, 9, 21, 70, 2873, 32, 40, 1, 1527, 3149, 5, 1, 38, 23, 3769]]
**************************************************


20000

In [0]:
from sklearn.model_selection import train_test_split

x_train,x_val,y_train,y_val=train_test_split(train_sequences,y_train,test_size=0.25)

x_test=test_sequences
y_test=y_test



In [19]:
# truncate and/or pad input sequences so as to make every input/review of length 600

from keras.preprocessing import sequence

max_review_length = 600
x_train = sequence.pad_sequences(x_train, maxlen=max_review_length)
x_val = sequence.pad_sequences(x_val, maxlen=max_review_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_review_length)

print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(15000, 600)
(5000, 600)
(2000, 600)


In [0]:
"""def best_model(n_hidden,dropout):
  
  top_words=5000 

  # we have changed this cz our frequency representation of words has max value of 5000 (Tokenizer(num_words=5000))

  embedding_vecor_length = 32

  model = Sequential()

  model.add(Embedding(top_words+1, embedding_vecor_length, input_length=max_review_length))
  model.add(BatchNormalization())

  model.add(LSTM(n_hidden))
  model.add(Dropout(dropout))
  model.add(BatchNormalization())

  model.add(Dense(1, activation='sigmoid')) # only one neuron with sigmoid activation, because its binary classification problem

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model
"""
from keras.layers import BatchNormalization

# create the model

#top_words = 5000

"""
Now notice that, we have increased the number of test datapoints so as to get the vocab size equal to vocab_train size...
but its lil bigger i.e 53912, so we will consider top_words=53912 rather than 50959...since 53912 > 50959
"""

top_words=5000 

# we have changed this cz our frequency representation of words has max value of 5000 (Tokenizer(num_words=5000))

embedding_vecor_length = 32



def best_model(n_hidden,dropout):

  model = Sequential()
  model.add(Embedding(top_words+1, embedding_vecor_length, input_length=max_review_length))
  #model.add(BatchNormalization())

  model.add(LSTM(n_hidden))
  model.add(Dropout(dropout))
  model.add(BatchNormalization())
  """
  model.add(LSTM(20))
  model.add(Dropout(0.3))
  model.add(BatchNormalization())"""

  model.add(Dense(50, activation='relu')) 
  model.add(BatchNormalization())

  model.add(Dense(1, activation='sigmoid')) # only one neuron with sigmoid activation, because its binary classification problem

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  #print(model.summary())

  return model

In [0]:
dropout=[0.2,0.3,0.5]
n_hidden=[20,30,50]

In [22]:

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import BatchNormalization

model=KerasClassifier(build_fn=best_model)
param_grid=dict(dropout=dropout,n_hidden=n_hidden)

print(param_grid)

{'dropout': [0.2, 0.3, 0.5], 'n_hidden': [20, 30, 50]}


In [0]:
grid=GridSearchCV(estimator=model,param_grid=param_grid)
grid_result=grid.fit(x_train,y_train)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [0]:
grid_result.best_params_

{'dropout': 0.5, 'n_hidden': 20}

# Model 1

In [0]:
from keras.layers import BatchNormalization

# create the model

#top_words = 5000

"""
Now notice that, we have increased the number of test datapoints so as to get the vocab size equal to vocab_train size...
but its lil bigger i.e 53912, so we will consider top_words=53912 rather than 50959...since 53912 > 50959
"""

top_words=5000 

# we have changed this cz our frequency representation of words has max value of 5000 (Tokenizer(num_words=5000))

embedding_vecor_length = 32

model = Sequential()

model.add(Embedding(top_words+1, embedding_vecor_length, input_length=max_review_length))
#model.add(BatchNormalization())

model.add(LSTM(20))
model.add(Dropout(0.5))
model.add(BatchNormalization())
"""
model.add(LSTM(20))
model.add(Dropout(0.3))
model.add(BatchNormalization())"""

model.add(Dense(50, activation='relu')) 
model.add(BatchNormalization())

model.add(Dense(1, activation='sigmoid')) # only one neuron with sigmoid activation, because its binary classification problem

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 600, 32)           160032    
_________________________________________________________________
lstm_14 (LSTM)               (None, 20)                4240      
_________________________________________________________________
dropout_14 (Dropout)         (None, 20)                0         
_________________________________________________________________
batch_normalization_29 (Batc (None, 20)                80        
_________________________________________________________________
dense_13 (Dense)             (None, 50)                1050      
_________________________________________________________________
batch_normalization_30 (Batc (None, 50)                200       
_________________________________________________________________
dense_14 (Dense)             (None, 1)               

In [0]:
# to make sure there is no any class imbalance 
# by this we are balancing the class (like what we do in LR i.e class_weight='balanced')

from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)


In [0]:

history=model.fit(x_train,y_train,nb_epoch=5,batch_size=128,class_weight=class_weights,validation_data=(x_val,y_val))
#it returns all info collected during training i.e history



Train on 15000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:

# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
print("Loss Score: %.2f%%" % (scores[0]))



Accuracy: 86.90%
Loss Score: 0.42%


1. Before introducing BN layer...val loss was 12 in first epoch and acc score was ~15%.
2. Now with BN layer, model has improved.
3. Also loss difference betwenn train & validation data has improved from previously built models which indicates we are succesfully able to avoid model overfitting to some extent

In [0]:

"""score = model_drop.evaluate(X_test, Y_test, verbose=0) 
print('Test score:', score[0]) 
print('Test accuracy:', score[1])
"""


fig,ax = plt.subplots(1,1)
ax.set_xlabel('epoch') ; 
ax.set_ylabel('Categorical Crossentropy Loss')

# list of epoch numbers
x = list(range(1,5+1))

# print(history.history.keys())
# dict_keys(['val_loss', 'val_acc', 'loss', 'acc'])
# history = model_drop.fit(X_train, Y_train, batch_size=batch_size, epochs=nb_epoch, verbose=1, validation_data=(X_test, Y_test))

# we will get val_loss and val_acc only when you pass the paramter validation_data
# val_loss : validation loss
# val_acc : validation accuracy

# loss : training loss
# acc : train accuracy
# for each key in histrory.histrory we will have a list of length equal to number of epochs

vy = history.history['val_loss']
ty = history.history['loss']
plt_dynamic(x, vy, ty, ax)
#plt.plot(x,ty)
#plt.show()

<IPython.core.display.Javascript object>

##Model 2

In [0]:
from keras.layers import BatchNormalization

# create the model

#top_words = 5000

"""
Now notice that, we have increased the number of test datapoints so as to get the vocab size equal to vocab_train size...
but its lil bigger i.e 53912, so we will consider top_words=53912 rather than 50959...since 53912 > 50959
"""

top_words=5000 

# we have changed this cz our frequency representation of words has max value of 5000 (Tokenizer(num_words=5000))

embedding_vecor_length = 32

model = Sequential()

model.add(Embedding(top_words+1, embedding_vecor_length, input_length=max_review_length))
model.add(BatchNormalization())

model.add(LSTM(70,return_sequences=True))
model.add(Dropout(0.5))
model.add(BatchNormalization())


model.add(LSTM(50))
model.add(Dropout(0.3))
model.add(BatchNormalization())


"""
model.add(LSTM(20))
model.add(Dropout(0.3))
model.add(BatchNormalization())
"""

model.add(Dense(50, activation='relu')) 
model.add(BatchNormalization())

model.add(Dense(1, activation='sigmoid')) # only one neuron with sigmoid activation, because its binary classification problem

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 600, 32)           160032    
_________________________________________________________________
batch_normalization_35 (Batc (None, 600, 32)           128       
_________________________________________________________________
lstm_17 (LSTM)               (None, 600, 70)           28840     
_________________________________________________________________
dropout_17 (Dropout)         (None, 600, 70)           0         
_________________________________________________________________
batch_normalization_36 (Batc (None, 600, 70)           280       
_________________________________________________________________
lstm_18 (LSTM)               (None, 50)                24200     
_________________________________________________________________
dropout_18 (Dropout)         (None, 50)              

In [0]:

history=model.fit(x_train,y_train,nb_epoch=5,batch_size=128,class_weight=class_weights,validation_data=(x_val,y_val))
#it returns all info collected during training i.e history



Train on 15000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:

# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
print("Loss Score: %.2f%%" % (scores[0]))


Accuracy: 86.35%
Loss Score: 0.35%


## Model 3

In [29]:
from keras.layers import BatchNormalization, Conv1D, MaxPool1D

# create the model

#top_words = 5000

"""
Now notice that, we have increased the number of test datapoints so as to get the vocab size equal to vocab_train size...
but its lil bigger i.e 53912, so we will consider top_words=53912 rather than 50959...since 53912 > 50959
"""

top_words=5000 

# we have changed this cz our frequency representation of words has max value of 5000 (Tokenizer(num_words=5000))

embedding_vecor_length = 32

model = Sequential()

model.add(Embedding(top_words+1, embedding_vecor_length, input_length=max_review_length))
model.add(BatchNormalization())

model.add(Conv1D(filters=100,kernel_size=(3)))

# model.add(Conv1D(filters=100,kernel_size=(3))) / using more than 1 COnv1D layer seems to be affecting model performance as acc is decreasing and loss is increasing very much
#model.add(BatchNormalization())  / was reducing acc and increasing loss
#model.add(MaxPool1D()) / not much effective , as performance was remaining almost same

"""model.add(LSTM(70,return_sequences=True))
model.add(Dropout(0.5))
model.add(BatchNormalization())
"""
model.add(LSTM(20))
model.add(Dropout(0.5))
model.add(BatchNormalization())


"""
model.add(LSTM(20))
model.add(Dropout(0.3))
model.add(BatchNormalization())
"""

model.add(Dense(50, activation='relu')) 
model.add(BatchNormalization())

model.add(Dense(1, activation='sigmoid')) # only one neuron with sigmoid activation, because its binary classification problem

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 600, 32)           160032    
_________________________________________________________________
batch_normalization_7 (Batch (None, 600, 32)           128       
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 598, 100)          9700      
_________________________________________________________________
lstm_3 (LSTM)                (None, 20)                9680      
_________________________________________________________________
dropout_3 (Dropout)          (None, 20)                0         
_________________________________________________________________
batch_normalization_8 (Batch (None, 20)                80        
_________________________________________________________________
dense_5 (Dense)              (None, 50)               

In [30]:

history=model.fit(x_train,y_train,nb_epoch=5,batch_size=128,class_weight=class_weights,validation_data=(x_val,y_val))
#it returns all info collected during training i.e history



Train on 15000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:

# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
print("Loss Score: %.2f%%" % (scores[0]))


Accuracy: 87.40%
Loss Score: 0.36%


# Analysis

In [32]:
from prettytable import PrettyTable

table=PrettyTable()

table.field_names=['-','Model 1','Model 2','Model 3']

table.add_row(['CNN',0,0,1])
table.add_row(['LSTM',1,2,2])
table.add_row(['Dense',2,2,2])
table.add_row(['Accuracy',86.90,86.35,87.40])
table.add_row(['Loss',0.42,0.35,0.36])

print(table)

+----------+---------+---------+---------+
|    -     | Model 1 | Model 2 | Model 3 |
+----------+---------+---------+---------+
|   CNN    |    0    |    0    |    1    |
|   LSTM   |    1    |    2    |    2    |
|  Dense   |    2    |    2    |    2    |
| Accuracy |   86.9  |  86.35  |   87.4  |
|   Loss   |   0.42  |   0.35  |   0.36  |
+----------+---------+---------+---------+


1. Changing architecture seems to be quite effective in terms of minimizing loss which we can notice, model is performing **slightly better in case 2 in terms of loss while accuracy is almost same**
2. Also **introducing Conv layer reduced loss as well as accuracy has increased slightly to 87.40%.**

---



# Conclusion:-

1. It was observed that, there is a small improvement in model performance when we introduce class_weight='balanced' parameter while training.

2. Again, with more data and huge corpus and also more number of epochs would improve model perofrmance.

3. But even with 20k datapoints, model is performing well with decent accuracy and loss score

4. Also, with increase in number of layers (LSTM) model didn't improved significantly, rather they were almost same.

5. And introducing Conv layer has improved model performance to some extent