<a href="https://www.kaggle.com/jashtailor/comparing-the-performance-of-neural-networks-with?scriptVersionId=84203346" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.50d.txt
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt
/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
# importing the necessary libraries
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential 
from keras.layers import Embedding, Flatten, Dense
os.environ["KMP_SETTINGS"] = "false"

from sklearn import preprocessing

import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings("ignore")

In [3]:
# importing the IMDB movie review dataset
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# preprocessing the textual data by tokenizing it 
maxlen = 100
training_samples = 25000
validation_samples = 15000
testing_samples = 10000
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])
word_index = tokenizer.word_index
print('Found ', len(word_index), ' unique tokens.')

# encoding the categorical variables 
label_encoder = preprocessing.LabelEncoder()
sentiment = label_encoder.fit_transform(df['sentiment'])

Found  124252  unique tokens.


In [5]:
# padding the tokenized data to make them all of equal length 
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(sentiment)
print('Shape of the tensor containing the reviews:', data.shape)
print('Shape of the tensor containing the sentiment labels:', labels.shape)

Shape of the tensor containing the reviews: (50000, 100)
Shape of the tensor containing the sentiment labels: (50000,)


In [6]:
# splitting the dataset into 3 parts for training, validation and testing
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
x_test = data[validation_samples: validation_samples + testing_samples]
y_test = labels[validation_samples: validation_samples + testing_samples]

In [7]:
# importing the GloVe word embeddings 
glove_dir = '/kaggle/input/glove-global-vectors-for-word-representation/'

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found', len(embeddings_index), 'word index')

Found 400000 word index


In [8]:
# creating a embedding matrix of size (max_words, embedding_dim) which can be loaded in the embedding layer 
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [9]:
# creating a model 
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          1000000   
_________________________________________________________________
flatten (Flatten)            (None, 10000)             0         
_________________________________________________________________
dense (Dense)                (None, 32)                320032    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


2022-01-02 06:59:50.193874: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [10]:
# loading the embedding matrix into the first layer i.e. Embedding layer of the model
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [11]:
# compiling, training and validating the model
model.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['acc'])
history_1 = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))

2022-01-02 06:59:50.461641: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# evaluating the performance of the model
values_1 = model.evaluate(x_test,y_test)
values_1



[0.6201045513153076, 0.7540000081062317]

In [13]:
df_1 = pd.DataFrame()
df_1['Training Accuracy'] = history_1.history['acc']
df_1['Validation Accuracy'] = history_1.history['val_acc']
df_1['Training Loss'] = history_1.history['loss']
df_1['Validation Loss'] = history_1.history['val_loss']
df_1['Epochs'] = range(1, len(df_1['Training Accuracy']) + 1)

# comparing the training and validation accuracy 
fig = px.line(df_1, x='Epochs', y=['Training Accuracy', 'Validation Accuracy'], title='Model with Pretrained Word Embeddings')
fig.show()

In [14]:
# comparing the training and validation loss
fig = px.line(df_1, x='Epochs', y=['Training Loss', 'Validation Loss'], title='Model without Pretrained Word Embeddings')
fig.show()

In [15]:
# creating a model without pretrained word embeddings 
network = Sequential()
network.add(Embedding(max_words, embedding_dim, input_length=maxlen))
network.add(Flatten())
network.add(Dense(32, activation='relu'))
network.add(Dense(1, activation='sigmoid'))
network.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [16]:
# compiling, training and validating the model
network.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history_2 = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val,y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
# evaluating the performance of the model
values_2 = network.evaluate(x_test,y_test)
values_2



[0.6931566596031189, 0.5016999840736389]

In [18]:
df_2 = pd.DataFrame()
df_2['Training Accuracy'] = history_2.history['acc']
df_2['Validation Accuracy'] = history_2.history['val_acc']
df_2['Training Loss'] = history_2.history['loss']
df_2['Validation Loss'] = history_2.history['val_loss']
df_2['Epochs'] = range(1, len(df_2['Training Accuracy']) + 1)

# comparing the training and validation accuracy
fig = px.line(df_2, x='Epochs', y=['Training Accuracy', 'Validation Accuracy'], title='Model without Pretrained Word Embeddings')
fig.show()

In [19]:
# comparing the training and validation loss
fig = px.line(df_2, x='Epochs', y=['Training Loss', 'Validation Loss'], title='Model without Pretrained Word Embeddings')
fig.show()

In [20]:
# comparing the evaluation performance of both the models
fig = go.Figure()
fig.add_trace(go.Bar(name='Loss', 
                     x=['Loss with Pretrained Word Embeddings', 'Loss without Pretrained Word Embeddings'], 
                     y=[values_1[0], values_2[0]]))
fig.add_trace(go.Bar(name='Accuracy', 
                     x=['Accuracy with PreTrained Word Embeddings', 'Accuracy without PreTrained Word Embeddings'], 
                     y=[values_1[1], values_2[1]]))
fig.show()


As we can see, the model with Pretrained Word Embeddings performed much better as compared to the model without them. This can be attributed to the fact that in case of small training datasets the model can't fully learn an appropriate task-specific embedding of the vocabulary. Pretrained Word Embeddings give us an advantage as they are already well-structured and systematic in nature and allows the model to capitalize on its capabilities. 