In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


### Loading the Necessary Libraries

In [2]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Embedding , LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-07-28 08:48:13.222146: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-28 08:48:13.222286: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-28 08:48:13.359288: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Loading the Textual Data

In [3]:
# Read the IMDB reviews CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [4]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(50000, 2)

In [5]:
# Preview five randomly chosen rows. The fixed seed keeps the preview identical
# across "Restart & Run All", matching the seed used for the train/test split.
df.sample(n=5, random_state=42)

Unnamed: 0,review,sentiment
48149,I screamed my head off because seeing this mov...,negative
8666,I think Andrew Davies did an admirable job of ...,positive
47115,This film differentiates itself from the run-o...,negative
42958,"I wasn't expecting much, and, to be honest, I ...",positive
41592,"The film shows relations of the dying mother, ...",negative


In [6]:
# Count how many reviews carry each sentiment label to check class balance.
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [7]:
# Encode the string labels as integers: 'positive' -> 1, 'negative' -> 0.
# Values that are not one of the two label strings (for example the 0/1 left by
# a previous run of this cell) pass through unchanged, so re-running the cell is
# safe. A plain `1 if x == 'positive' else 0` would turn every label into 0 on a
# second run, because the integer 1 is not equal to the string 'positive'.
label_to_int = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(lambda label: label_to_int.get(label, label))

In [8]:
# Count missing values per column.
df.isna().sum()

review       0
sentiment    0
dtype: int64

### Splitting the Data into Train and Test

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Report the dimensions of both splits, one per line.
print(
    f"Shape of Train Data : {train_data.shape}",
    f"Shape of Test Data : {test_data.shape}",
    sep="\n",
)

Shape of Train Data : (40000, 2)
Shape of Test Data : (10000, 2)


### Text Preprocessing

In [11]:
# Fit the vocabulary on the training reviews only, so the test reviews do not
# influence it. `num_words=5000` caps the vocabulary used when encoding text.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['review'])


def _to_padded_ids(reviews):
    """Turn reviews into integer sequences padded/truncated to length 200."""
    return pad_sequences(tokenizer.texts_to_sequences(reviews), maxlen=200)


X_train = _to_padded_ids(train_data['review'])
X_test = _to_padded_ids(test_data['review'])

In [12]:
# Target labels for each split, taken from the 'sentiment' column.
Y_train = train_data['sentiment']
Y_test = test_data['sentiment']

### Model Training Using an LSTM (Long Short-Term Memory) Network

In [13]:
# Sentiment classifier: token ids -> 128-d embeddings -> LSTM -> sigmoid score.
# `input_dim=5000` matches the `num_words=5000` vocabulary cap of the tokenizer.
# `input_length` is deliberately not passed to Embedding: Keras 3 deprecates the
# argument (it only produces a warning there), and the sequence length is taken
# from the data fed to the model.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])



In [14]:
# Binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Train for 10 epochs, holding out 20% of the training rows for validation.
# The returned History object is kept so the per-epoch loss/accuracy values stay
# available after training, and so the cell does not end with a bare
# `<...History at 0x...>` repr as its output.
history = model.fit(
    X_train,
    Y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
)

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 195ms/step - accuracy: 0.7144 - loss: 0.5438 - val_accuracy: 0.8145 - val_loss: 0.4056
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 193ms/step - accuracy: 0.8572 - loss: 0.3434 - val_accuracy: 0.8496 - val_loss: 0.3405
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 194ms/step - accuracy: 0.8754 - loss: 0.3105 - val_accuracy: 0.8487 - val_loss: 0.3517
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 192ms/step - accuracy: 0.8864 - loss: 0.2811 - val_accuracy: 0.8595 - val_loss: 0.3404
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 192ms/step - accuracy: 0.8996 - loss: 0.2543 - val_accuracy: 0.8736 - val_loss: 0.3198
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 192ms/step - accuracy: 0.9104 - loss: 0.2325 - val_accuracy: 0.8665 - val_loss: 0.3467
Epoch 7/10

<keras.src.callbacks.history.History at 0x7d36139a5b70>

In [16]:
# Score the trained model on the held-out test split and report both metrics.
loss, accuracy = model.evaluate(X_test, Y_test)
print(
    f"Test Loss: {loss}",
    f"Test Accuracy: {accuracy}",
    sep="\n",
)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 68ms/step - accuracy: 0.8388 - loss: 0.3827
Test Loss: 0.37961244583129883
Test Accuracy: 0.8413000106811523


### Prediction Time

In [17]:
def predict_sentiment(review, threshold=0.5, max_len=200):
    """Classify a single review as "positive" or "negative".

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        review: Raw review text to classify.
        threshold: Score above which the review is labelled "positive".
            Defaults to 0.5.
        max_len: Length the token sequence is padded/truncated to. Defaults
            to 200, the value used when the training data was prepared.

    Returns:
        "positive" if the model's score is strictly greater than
        ``threshold``, otherwise "negative".
    """
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    # predict() returns one row per input; [0][0] is the single score for this review.
    score = model.predict(padded_sequence)[0][0]
    return "positive" if score > threshold else "negative"

In [18]:
# Predict the sentiment of an actual IMDB review of Deadpool & Wolverine
# (this one is worded enthusiastically) and print the predicted label.

new_review = "What a crazy blast ! Bonkers !!"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 255ms/step
The sentiment of the review is: positive


In [19]:
# Predict the sentiment of a second actual IMDB review of Deadpool & Wolverine
# (this one is worded critically) and print the predicted label.

new_review = "Disappointing reference happy mess"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
The sentiment of the review is: negative
