In [22]:
!pip install kaggle



In [23]:
import os
import json
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
kaggle_dictionary = json.load(open("kaggle.json"))

In [25]:
kaggle_dictionary.keys()

dict_keys(['username', 'key'])

In [26]:
# setup kaggle credentials as environment variables
os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]

In [27]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [28]:
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


In [29]:
# unzip the dataset file
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
  zip_ref.extractall()

In [30]:
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


In [31]:
data = pd.read_csv("/content/IMDB Dataset.csv")

In [32]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [33]:
data.shape

(50000, 2)

In [34]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [35]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [36]:
data.replace({'sentiment':{'positive':1,'negative':0}},inplace=True)

  data.replace({'sentiment':{'positive':1,'negative':0}},inplace=True)


In [37]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [38]:
train_data,test_data=train_test_split(data,test_size=0.3)

In [39]:
train_data.shape

(35000, 2)

In [40]:
test_data.shape

(15000, 2)

In [41]:
#Tokenize test data
tokenizer=Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)

In [42]:
X_train

array([[   0,    0,    0, ...,  248,   10, 3676],
       [   0,    0,    0, ...,  114,   14,  288],
       [  12,  160,   28, ...,   27, 4641,  130],
       ...,
       [   0,    0,    0, ...,   42,    4,  159],
       [2701,  274,  241, ...,  241,    4,   17],
       [   0,    0,    0, ...,   79,   74,  390]], dtype=int32)

In [43]:
X_test

array([[   0,    0,    0, ..., 2914,    1,   17],
       [4554,   10,  455, ...,  445,  319, 3370],
       [   0,    0,    0, ...,   82,   74,   99],
       ...,
       [   6, 3748,   16, ...,   16,    3, 2945],
       [  21,   56,    3, ...,    5, 1733,  120],
       [   1,  618, 4391, ...,  902,    5,  103]], dtype=int32)

In [44]:
y_train=train_data['sentiment']
y_test=test_data['sentiment']

In [45]:
y_train

Unnamed: 0,sentiment
21292,1
42007,1
21311,0
31738,1
4670,1
...,...
45905,1
20419,1
10007,0
1722,0


In [55]:
models=Sequential([
    Embedding(input_dim=5000,output_dim=128,input_length=200),
    LSTM(128,dropout=0.2,recurrent_dropout=0.2),
    Dense(1,activation='sigmoid')
])



In [57]:
models.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [60]:
models.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 493ms/step - accuracy: 0.7166 - loss: 0.5361 - val_accuracy: 0.8244 - val_loss: 0.4031
Epoch 2/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 493ms/step - accuracy: 0.8579 - loss: 0.3464 - val_accuracy: 0.8439 - val_loss: 0.3638
Epoch 3/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 477ms/step - accuracy: 0.8616 - loss: 0.3348 - val_accuracy: 0.8359 - val_loss: 0.3930
Epoch 4/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 482ms/step - accuracy: 0.8704 - loss: 0.3066 - val_accuracy: 0.8507 - val_loss: 0.3655
Epoch 5/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 488ms/step - accuracy: 0.8805 - loss: 0.2968 - val_accuracy: 0.8109 - val_loss: 0.4230


<keras.src.callbacks.history.History at 0x7c4361cceaa0>

In [61]:
loss, accuracy = models.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 104ms/step - accuracy: 0.8166 - loss: 0.4115
Test Loss: 0.4114936888217926
Test Accuracy: 0.8149333596229553


In [63]:
def predict_sentiment(review):
  # tokenize and pad the review
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=200)
  prediction = models.predict(padded_sequence)
  sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
  return sentiment

In [64]:
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 228ms/step
The sentiment of the review is: positive


In [65]:
new_review = "This movie was ok but not that good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
The sentiment of the review is: negative
