In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [2]:
import pandas as pd
import os
csv_path=os.path.join(path,"IMDB Dataset.csv")
df=pd.read_csv(csv_path)

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [5]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [7]:
x=df['review']
y=df['sentiment'].map({'positive':1, 'negative':0})

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
stopword=set(stopwords.words('english'))
lemmetizer=WordNetLemmatizer()

def clean_text_deep(text):
  #lowercasing
  text=text.lower()
  #remove HTML tags
  text=re.sub(r'<.*?>',' ',text)
  #remove URL's
  text=re.sub(r'http\S+/www\S+',' ',text)
  #remove non alphabetic characters
  text=re.sub(r'[^a-zA-Z]',' ',text)
  #remove extra space
  text=re.sub(r'\s+',' ',text).strip()
  words = [
        lemmetizer.lemmatize(word)
        for word in text.split()
        if word not in stopword
    ]

  return ' '.join(words)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [9]:
df['clean_review_deep'] = df['review'].apply(clean_text_deep)

In [10]:
print("RAW REVIEW:\n")
print(df['review'].iloc[0])

print("\n" + "-"*80 + "\n")

print("CLEANED REVIEW:\n")
print(df['clean_review_deep'].iloc[0])


RAW REVIEW:

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is d

In [11]:
def simple_clean(text):
  text=re.sub(r'<.*?>',' ',text)
  text=re.sub(r'[^a-zA-Z]',' ',text)
  return text.lower()
df['clean_review_simple']=df['review'].apply(simple_clean)

In [12]:
from sklearn.model_selection import train_test_split
X=df['clean_review_simple']
Y=df['sentiment'].map({'positive':1, 'negative':0})
x_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [13]:
#use text for TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)
X_train_tfidf=tfidf.fit_transform(x_train)
X_test_tfidf=tfidf.transform(X_test)

ML model

In [14]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf,Y_train)

In [15]:
from sklearn.metrics import accuracy_score,classification_report
y_pred=model.predict(X_test_tfidf)

print("Accuracy:",accuracy_score(Y_test,y_pred))
print("Classification Report",classification_report(Y_test,y_pred))

Accuracy: 0.8973
Classification Report               precision    recall  f1-score   support

           0       0.91      0.89      0.90      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [16]:
test_sentences = [
    "This movie was absolutely fantastic and inspiring",
    "Worst movie ever, total waste of time",
    "The acting was okay but the story was boring",
    "I really loved the music and visuals",
    "Dhurandhar is a massive blockbuster hit, becoming one of the highest-grossing Indian films ever, shattering box office records for Hindi cinema"
]

test_clean = [simple_clean(text) for text in test_sentences]
test_tfidf = tfidf.transform(test_clean)

predictions = model.predict(test_tfidf)

for text, pred in zip(test_sentences, predictions):
    print(text, "→", "Positive" if pred == 1 else "Negative")


This movie was absolutely fantastic and inspiring → Positive
Worst movie ever, total waste of time → Negative
The acting was okay but the story was boring → Negative
I really loved the music and visuals → Positive
Dhurandhar is a massive blockbuster hit, becoming one of the highest-grossing Indian films ever, shattering box office records for Hindi cinema → Positive


Light cleaning + DL Model

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer=Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)

x_train_seq=tokenizer.texts_to_sequences(x_train)
x_test_seq=tokenizer.texts_to_sequences(X_test)

x_train_pad=pad_sequences(x_train_seq, maxlen=200, padding='post')
x_test_pad=pad_sequences(x_test_seq, maxlen=200, padding='post')


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Bidirectional,Dense,Dropout,LSTM

model=Sequential([
    Embedding(input_dim=20000,output_dim=128,input_length=200),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(1,activation="sigmoid")
])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)



Training the Model

In [19]:
model.fit(
    x_train_pad,Y_train,
    epochs=3,
    batch_size=64,
    validation_split=0.2

)

Epoch 1/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 296ms/step - accuracy: 0.6854 - loss: 0.5720 - val_accuracy: 0.8570 - val_loss: 0.3537
Epoch 2/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 262ms/step - accuracy: 0.8993 - loss: 0.2660 - val_accuracy: 0.8826 - val_loss: 0.2979
Epoch 3/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 264ms/step - accuracy: 0.9442 - loss: 0.1617 - val_accuracy: 0.8671 - val_loss: 0.3185


<keras.src.callbacks.history.History at 0x7d14b4fd8f50>

In [20]:
loss, accuracy = model.evaluate(x_test_pad, Y_test)
print("Test Accuracy:", accuracy)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 35ms/step - accuracy: 0.8634 - loss: 0.3247
Test Accuracy: 0.8690999746322632


In [21]:
sample_texts = [
    "This movie was absolutely amazing",
    "Worst film ever made",
    "The plot was slow but acting was good",
    "Dhurandhar is a massive blockbuster hit, becoming one of the highest-grossing Indian films ever, shattering box office records for Hindi cinema"
]

# Clean text
sample_clean = [simple_clean(text) for text in sample_texts]

# Tokenize + pad
sample_seq = tokenizer.texts_to_sequences(sample_clean)
sample_pad = pad_sequences(sample_seq, maxlen=200, padding='post')

# Predict
predictions = model.predict(sample_pad)

for text, pred in zip(sample_texts, predictions):
    sentiment = "Positive" if pred > 0.5 else "Negative"
    print(f"{text} → {sentiment}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step
This movie was absolutely amazing → Negative
Worst film ever made → Negative
The plot was slow but acting was good → Negative
Dhurandhar is a massive blockbuster hit, becoming one of the highest-grossing Indian films ever, shattering box office records for Hindi cinema → Positive


Experimenting Deep Text Cleaning + DL Model

In [22]:
X_deep=df["clean_review_deep"]
Y_deep=df["sentiment"].map({'positive':1,'negative':0})

X_train_deep,X_test_deep,Y_train_deep,Y_test_deep=train_test_split(X,y,test_size=0.2,random_state=42)


In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer=Tokenizer(num_words=20000,oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_deep)

X_train_deep_seq=tokenizer.texts_to_sequences(X_train_deep)
X_test_deep_seq=tokenizer.texts_to_sequences(X_test_deep)

X_train_deep_pad=pad_sequences(X_train_deep_seq,maxlen=200,padding='post')
X_test_deep_pad=pad_sequences(X_test_deep_seq,maxlen=200,padding='post')

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,Dropout,Bidirectional,LSTM

model=Sequential([
    Embedding(20000,128,input_length=200),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)



In [31]:
model.fit(
    X_train_deep_pad,Y_train_deep,
    epochs=3,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 263ms/step - accuracy: 0.9777 - loss: 0.0689 - val_accuracy: 0.8540 - val_loss: 0.4686
Epoch 2/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 269ms/step - accuracy: 0.9762 - loss: 0.0737 - val_accuracy: 0.8729 - val_loss: 0.5280
Epoch 3/3
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 267ms/step - accuracy: 0.9916 - loss: 0.0292 - val_accuracy: 0.8668 - val_loss: 0.5978


<keras.src.callbacks.history.History at 0x7d143f882600>

In [32]:
loss,accuracy=model.evaluate(X_test_deep_pad,Y_test)
print("Test Accuracy:",accuracy)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 36ms/step - accuracy: 0.8620 - loss: 0.5781
Test Accuracy: 0.8669000267982483


In [27]:
positive_samples = [
    "This movie was absolutely fantastic and emotionally touching",
    "Brilliant performances and an engaging storyline",
    "I loved every minute of this film, highly recommended",
    "The direction, music, and acting were outstanding"
]
negative_samples = [
    "This was the worst movie I have ever seen",
    "Completely boring and a total waste of time",
    "The plot made no sense and the acting was terrible",
    "Painfully slow and extremely disappointing"
]
mixed_samples = [
    "The acting was good but the story was very weak",
    "Not a great movie, but it had a few nice moments",
    "The film started well but became boring later",
    "I expected more, although some scenes were decent"
]

sample_texts =positive_samples+negative_samples+mixed_samples

sample_clean = [clean_text_deep(text) for text in sample_texts]  # or clean_text_dl
sample_seq = tokenizer.texts_to_sequences(sample_clean)
sample_pad = pad_sequences(sample_seq, maxlen=200, padding='post')

predictions = model.predict(sample_pad)

for text, pred in zip(sample_texts, predictions):
    sentiment = "Positive" if pred > 0.5 else "Negative"
    print(f"{text} → {sentiment}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step
This movie was absolutely fantastic and emotionally touching → Positive
Brilliant performances and an engaging storyline → Positive
I loved every minute of this film, highly recommended → Positive
The direction, music, and acting were outstanding → Positive
This was the worst movie I have ever seen → Negative
Completely boring and a total waste of time → Negative
The plot made no sense and the acting was terrible → Negative
Painfully slow and extremely disappointing → Negative
The acting was good but the story was very weak → Negative
Not a great movie, but it had a few nice moments → Positive
The film started well but became boring later → Negative
I expected more, although some scenes were decent → Negative


In [34]:
import pandas as pd

data = {
    "Pipeline": ["TF-IDF + LR", "BiLSTM (Light Clean)", "BiLSTM (Heavy Clean)"],
    "Cleaning": ["Light", "Light", "Heavy"],
    "Model": ["Logistic Regression", "BiLSTM", "BiLSTM"],
    "Accuracy": [89.73, 86.90, 86.69]
}

df = pd.DataFrame(data)
df


Unnamed: 0,Pipeline,Cleaning,Model,Accuracy
0,TF-IDF + LR,Light,Logistic Regression,89.73
1,BiLSTM (Light Clean),Light,BiLSTM,86.9
2,BiLSTM (Heavy Clean),Heavy,BiLSTM,86.69
