In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Sentiment analysis with a Recurrent Neural Network using integer encoding.
# The technique is first demonstrated on a few toy sentences and then applied
# to the IMDB dataset.
# Author: Muhammad Humayun Khan

# Toy corpus: ten hand-written sentences of four words each.
docs = [
    "sun rises every morning",
    "waves crash on shore",
    "trees sway in wind",
    "stars twinkle at night",
    "birds chirp at dawn",
    "rain falls on leaves",
    "snow blankets the ground",
    "fire crackles in fireplace",
    "wind whispers through trees",
    "clouds drift across sky",
]

In [5]:
# Tokenize the sentences, i.e. split them into individual words.
from tensorflow.keras.preprocessing.text import Tokenizer
# oov_token is a placeholder for out-of-vocabulary words: any word that was not
# seen while fitting is replaced by '<nothing>' when texts are converted to
# sequences. The placeholder itself receives index 1 in word_index.
tokenizer = Tokenizer(oov_token='<nothing>')

In [7]:
# Build the vocabulary from the toy sentences. After this call word_index,
# word_counts and document_count are populated.
tokenizer.fit_on_texts(docs)

In [8]:
tokenizer.word_index
# Mapping from each unique word to its integer index.
# Index 1 belongs to the OOV placeholder. More frequent words get lower indices
# (the five words that occur twice come first); words with the same count keep
# the order in which they first appear in the sentences.

{'<nothing>': 1,
 'on': 2,
 'trees': 3,
 'in': 4,
 'wind': 5,
 'at': 6,
 'sun': 7,
 'rises': 8,
 'every': 9,
 'morning': 10,
 'waves': 11,
 'crash': 12,
 'shore': 13,
 'sway': 14,
 'stars': 15,
 'twinkle': 16,
 'night': 17,
 'birds': 18,
 'chirp': 19,
 'dawn': 20,
 'rain': 21,
 'falls': 22,
 'leaves': 23,
 'snow': 24,
 'blankets': 25,
 'the': 26,
 'ground': 27,
 'fire': 28,
 'crackles': 29,
 'fireplace': 30,
 'whispers': 31,
 'through': 32,
 'clouds': 33,
 'drift': 34,
 'across': 35,
 'sky': 36}

In [9]:
tokenizer.word_counts
# Number of times each word occurs across all sentences, listed in order of
# first appearance.

OrderedDict([('sun', 1),
             ('rises', 1),
             ('every', 1),
             ('morning', 1),
             ('waves', 1),
             ('crash', 1),
             ('on', 2),
             ('shore', 1),
             ('trees', 2),
             ('sway', 1),
             ('in', 2),
             ('wind', 2),
             ('stars', 1),
             ('twinkle', 1),
             ('at', 2),
             ('night', 1),
             ('birds', 1),
             ('chirp', 1),
             ('dawn', 1),
             ('rain', 1),
             ('falls', 1),
             ('leaves', 1),
             ('snow', 1),
             ('blankets', 1),
             ('the', 1),
             ('ground', 1),
             ('fire', 1),
             ('crackles', 1),
             ('fireplace', 1),
             ('whispers', 1),
             ('through', 1),
             ('clouds', 1),
             ('drift', 1),
             ('across', 1),
             ('sky', 1)])

In [10]:
# Number of texts the tokenizer was fitted on (one per sentence in docs).
tokenizer.document_count

10

In [11]:
# Replace every word by its index from word_index (e.g. sun -> 7, rises -> 8),
# which yields one list of integers per sentence.
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[7, 8, 9, 10],
 [11, 12, 2, 13],
 [3, 14, 4, 5],
 [15, 16, 6, 17],
 [18, 19, 6, 20],
 [21, 22, 2, 23],
 [24, 25, 26, 27],
 [28, 29, 4, 30],
 [5, 31, 32, 3],
 [33, 34, 35, 36]]

In [12]:
# Padding brings all sequences to the same length.
# Imported from tensorflow.keras, like every other Keras import in this
# notebook, so that all Keras objects come from the same package.
from tensorflow.keras.utils import pad_sequences

In [13]:
# padding='post' appends zeros at the end of a sequence until it is as long as
# the longest one. Every toy sentence already has four words, so the values are
# unchanged here; only the container changes from a list of lists to a 2-D
# int32 array.
sequences = pad_sequences(sequences,padding='post')
sequences

# At this point the input data is prepared.

array([[ 7,  8,  9, 10],
       [11, 12,  2, 13],
       [ 3, 14,  4,  5],
       [15, 16,  6, 17],
       [18, 19,  6, 20],
       [21, 22,  2, 23],
       [24, 25, 26, 27],
       [28, 29,  4, 30],
       [ 5, 31, 32,  3],
       [33, 34, 35, 36]], dtype=int32)

# **Applying the Integer Encoding Technique to the Real-World IMDB Dataset**

In [21]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,SimpleRNN,Flatten,Input

In [24]:
# Load the IMDB dataset as integer-encoded reviews plus their labels, already
# split into a training and a test part.
# No vocabulary cap (num_words) is passed, so large word indices remain in the
# data (e.g. 31050 in the first training review).
(X_train,y_train),(X_test,y_test) = imdb.load_data()

In [26]:
X_train[0]
# The dataset comes already integer-encoded: the first review is a plain list
# of word indices, as the output below shows.

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [27]:
# The reviews differ in length (see the raw review above), so bring every review
# to exactly 50 tokens: shorter reviews get zeros appended (padding='post'),
# longer reviews keep only their last 50 tokens, because pad_sequences truncates
# from the front by default.
pad_options = {'padding': 'post', 'maxlen': 50}
X_train = pad_sequences(X_train, **pad_options)
X_test = pad_sequences(X_test, **pad_options)

In [28]:
# After padding/truncation the first review is a fixed-length int32 array of
# 50 word indices.
X_train[0]

array([2071,   56,   26,  141,    6,  194, 7486,   18,    4,  226,   22,
         21,  134,  476,   26,  480,    5,  144,   30, 5535,   18,   51,
         36,   28,  224,   92,   25,  104,    4,  226,   65,   16,   38,
       1334,   88,   12,   16,  283,    5,   16, 4472,  113,  103,   32,
         15,   16, 5345,   19,  178,   32], dtype=int32)

In [29]:
# Build the RNN model.
# Seed Python, NumPy and TensorFlow first so that weight initialisation and the
# shuffling done during training are repeatable on Restart & Run All.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

model = Sequential()

# Input: 50 time steps with one feature each (the integer word index).
model.add(Input(shape=(50, 1)))

# SimpleRNN with 32 units. return_sequences=False makes the layer emit only the
# output of the last time step; internally the output of each step is fed back
# in together with the input of the next step.
model.add(SimpleRNN(32, return_sequences=False))

# Single sigmoid unit: outputs a probability for the binary label.
model.add(Dense(1, activation='sigmoid'))

# Print the model summary
model.summary()


In [30]:
# Binary cross-entropy matches the single sigmoid output of the model.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# The model is declared with Input(shape=(50, 1)) while the padded data has
# shape (samples, 50). Add the trailing feature axis explicitly instead of
# relying on Keras to insert it silently.
# The returned History object is kept so the per-epoch metrics stay available
# and the cell does not end with a bare object repr.
# NOTE(review): in the recorded run accuracy stays at about 0.50 for all five
# epochs. Presumably raw word indices fed as plain magnitudes give the RNN too
# little signal - scaling the inputs or an Embedding layer is worth trying;
# TODO confirm.
history = model.fit(
    np.expand_dims(X_train, axis=-1),
    y_train,
    epochs=5,
    validation_data=(np.expand_dims(X_test, axis=-1), y_test),
)

Epoch 1/5
782/782 ━━━━━━━━━━━━━━━━━━━━ 11s 11ms/step - accuracy: 0.4931 - loss: 0.7550 - val_accuracy: 0.5037 - val_loss: 0.6936
Epoch 2/5
782/782 ━━━━━━━━━━━━━━━━━━━━ 9s 11ms/step - accuracy: 0.5068 - loss: 0.6930 - val_accuracy: 0.5056 - val_loss: 0.6935
Epoch 3/5
782/782 ━━━━━━━━━━━━━━━━━━━━ 9s 12ms/step - accuracy: 0.5124 - loss: 0.6925 - val_accuracy: 0.5070 - val_loss: 0.6944
Epoch 4/5
782/782 ━━━━━━━━━━━━━━━━━━━━ 9s 11ms/step - accuracy: 0.5116 - loss: 0.6927 - val_accuracy: 0.5049 - val_loss: 0.6937
Epoch 5/5
782/782 ━━━━━━━━━━━━━━━━━━━━ 9s 11ms/step - accuracy: 0.5025 - loss: 0.6928 - val_accuracy: 0.5048 - val_loss: 0.6943


<keras.src.callbacks.history.History at 0x7ab6384e8fa0>