In [7]:
import pandas as pd
import numpy as np
import json
import os

In [10]:
# Read the Kaggle API credentials from the local JSON file.
# The context manager closes the file handle as soon as parsing is done
# (a bare open() inside json.load() leaves it open until garbage collection).
with open('../kaggle/kaggle.json', encoding='utf-8') as credentials_file:
    kaggle_dict = json.load(credentials_file)

In [11]:
# Show which fields the credentials file provides (key names only, not their values)
kaggle_dict.keys()

dict_keys(['username', 'key'])

In [None]:
# Expose the Kaggle username and API key through the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables so they can be used to authenticate
os.environ.update(
    KAGGLE_USERNAME=kaggle_dict["username"],
    KAGGLE_KEY=kaggle_dict["key"],
)

In [None]:
# Downloading from kaggle

# The download only needs to run once; uncomment the line below when the dataset has to be fetched again
# !kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [19]:
from zipfile import ZipFile

# Extract every member of the downloaded archive into the project's data folder
with ZipFile('imdb-dataset-of-50k-movie-reviews.zip', mode='r') as archive:
    archive.extractall(path="../data/imdb_reviews")

In [20]:
# Load the extracted reviews CSV into a DataFrame
sentiment_df = pd.read_csv('../data/imdb_reviews/IMDB Dataset.csv')

In [22]:
# Summarise the columns: dtypes, non-null counts and memory usage
sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [24]:
# Encode the text labels as integers (positive -> 1, negative -> 0).
# .map plus an explicit astype avoids relying on Series.replace silently
# downcasting object -> int (deprecated in pandas 2.2), so the column is
# guaranteed to be int64 for the model. Values that are not in the mapping
# (e.g. the already-encoded 0/1 when this cell is re-run) pass through unchanged,
# and astype raises if an unexpected non-numeric label is present.
SENTIMENT_LABELS = {'positive': 1, 'negative': 0}
sentiment_df['sentiment'] = (
    sentiment_df['sentiment']
    .map(lambda label: SENTIMENT_LABELS.get(label, label))
    .astype('int64')
)

In [25]:
# Preview the first rows to confirm the sentiment labels are now numeric
sentiment_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [36]:
# Class balance of the encoded sentiment labels.
# NOTE(review): the recorded output totals 49,582 rows and carries execution count 36,
# so it appears to have been produced after the de-duplication cell (count 35) further
# down; a fresh top-to-bottom run would count all 50,000 rows here.
sentiment_df['sentiment'].value_counts()

sentiment
1    24884
0    24698
Name: count, dtype: int64

In [33]:
# Count the reviews whose text already appeared in an earlier row
sentiment_df['review'].duplicated().sum()

np.int64(418)

In [37]:
# Number of missing values per column
sentiment_df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [35]:
# Drop repeated reviews, keeping the first occurrence of each text.
# De-duplicating on the 'review' column matches the duplicate check on that column
# and ensures the same review text cannot end up in both the train and the test split.
# Rebinding (instead of inplace=True) keeps the cell a plain "new frame from old frame" step.
sentiment_df = sentiment_df.drop_duplicates(subset='review')

There are no null values, and the duplicate rows were dropped because they are a small number compared to the size of the dataset.

## Training and Modeling

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable
train_data, test_data = train_test_split(sentiment_df, test_size=0.2, random_state=42)

In [56]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Input

In [66]:
# Vocabulary is capped with num_words=5000: only the most frequent words are kept when
# encoding (Keras keeps the top num_words - 1 of them), rarer words are dropped
tokens = Tokenizer(num_words=5000)

# Learn the word -> integer index mapping from the training reviews only
tokens.fit_on_texts(train_data['review'])


def _encode_reviews(texts):
    """Turn raw review texts into fixed-length rows of word indices.

    Each text is converted to its sequence of word indices and then brought to a
    length of exactly 200 with zeros added at the front (padding='pre'), because
    the network needs every input row to have the same shape.
    """
    return pad_sequences(tokens.texts_to_sequences(texts), maxlen=200, padding='pre')


X_train = _encode_reviews(train_data['review'])
X_test = _encode_reviews(test_data["review"])

In [67]:
# Inspect the first encoded (and padded) training review
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   10,   64,  430,    1,
         15,  993,  167,   18,  148,   11,   13,   37,  561,  343,    2,
        315,  126,  323,  262, 4631,   17,    3,  601,   11,   15,   13,
         74,   44,   37, 1319,  100, 1183,   10,  236,  437,    5,  884,
        156,  407,   18,   10,   89,   56,  118,   48,  326,    5,  132,
         10, 1454,    3,  373,    4,  212,   18,   

In [68]:
# (number of training reviews, padded sequence length)
X_train.shape

(39665, 200)

- Since `maxlen` is set to 200, every row has 200 columns.
- `X_train[0]` is one encoded review: the leading 0s are padding (this review has fewer than 200 words), and every other number is the index of a word in the tokenizer's vocabulary.


In [69]:
# Target labels taken from the same split frames that X_train / X_test were built from
y_train, y_test = train_data['sentiment'], test_data['sentiment']

## LSTM

In [70]:
# Building the model as a single layer stack: Input -> Embedding -> LSTM -> Dense
model = Sequential([
    # Input: one padded review, i.e. 200 word indices (maxlen = 200)
    Input(shape=(200,)),

    # Embedding: input_dim * output_dim = 5000 * 128 = 640,000 parameters
    Embedding(input_dim=5000, output_dim=128),

    # LSTM: 4 gates * ((inputs + units) * units + units)
    #     = 4 * ((128 + 128) * 128 + 128) = 131,584 parameters
    # dropout=0.2 drops 20% of the inputs at each training step,
    # recurrent_dropout=0.2 drops 20% of the state carried over from the previous timestep;
    # both are there to reduce overfitting
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),

    # Dense: 128 * 1 + 1 = 129 parameters, sigmoid output for the binary label
    Dense(1, activation='sigmoid'),
])

In [71]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

In [72]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is the reported metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Training the model

In [73]:
# Train for 5 epochs in batches of 64, holding out 20% of the training data for validation
model.fit(X_train , y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 462ms/step - accuracy: 0.7821 - loss: 0.4623 - val_accuracy: 0.8571 - val_loss: 0.3461
Epoch 2/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 258ms/step - accuracy: 0.8493 - loss: 0.3539 - val_accuracy: 0.8568 - val_loss: 0.3634
Epoch 3/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 362ms/step - accuracy: 0.8835 - loss: 0.2892 - val_accuracy: 0.8620 - val_loss: 0.3481
Epoch 4/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 343ms/step - accuracy: 0.8972 - loss: 0.2603 - val_accuracy: 0.8489 - val_loss: 0.3530
Epoch 5/5
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 353ms/step - accuracy: 0.9093 - loss: 0.2296 - val_accuracy: 0.8674 - val_loss: 0.3398


<keras.src.callbacks.history.History at 0x1b85741ce00>

In [74]:
# Score the trained model on the held-out test set and report both metrics
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - accuracy: 0.8678 - loss: 0.3371
Loss: 0.3371107280254364
Accuracy: 0.8678027391433716


## Small Function to predict

In [77]:
def predict_sentiment(review, threshold=0.5, maxlen=200):
    """Classify a single raw review as 'Positive' or 'Negative'.

    The review is encoded with the notebook's fitted ``tokens`` tokenizer,
    pre-padded to ``maxlen`` word indices and scored by ``model``.

    Args:
        review: Raw review text.
        threshold: Cut-off applied to the model's output; a score strictly
            greater than this is reported as 'Positive'. Defaults to 0.5.
        maxlen: Sequence length fed to the model. It has to match the length
            the model was built and trained with (200 in this notebook).

    Returns:
        The string 'Positive' or 'Negative'.
    """
    seq = tokens.texts_to_sequences([review])
    # padding='pre' is spelled out so inference visibly mirrors the training-time padding
    padded_seq = pad_sequences(seq, maxlen=maxlen, padding='pre')
    prediction = model.predict(padded_seq)
    # One review in, one sigmoid unit out -> the score sits at row 0, column 0
    score = prediction[0][0]
    return 'Positive' if score > threshold else 'Negative'

In [78]:
# Sanity check with a clearly positive review
myreview = "The overall series is pretty good !!"
sentiment = predict_sentiment(myreview)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step
The sentiment of the review is: Positive


In [79]:
# Sanity check with a clearly negative review
myreview = "The series is was boring!!"
sentiment = predict_sentiment(myreview)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
The sentiment of the review is: Negative


In [80]:
# Sanity check with a negative review that uses no explicitly negative adjective
myreview = "The series is wasnt to my taste !!"
sentiment = predict_sentiment(myreview)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
The sentiment of the review is: Negative


In [86]:
# Sanity check with a lukewarm review
myreview = "The series is was fine !!"
sentiment = predict_sentiment(myreview)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
The sentiment of the review is: Positive
