In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [9]:
df=pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [13]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [15]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [17]:
# **Step 3: Text Preprocessing (Cleaning)**
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [19]:
df['review'] = df['review'].apply(clean_text)

In [21]:
# **Step 4: Convert Sentiments to Binary**
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [23]:
# ✅ Tokenization without limiting vocab size
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])

In [25]:
# ✅ Padding based on the longest review in the dataset
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [27]:
# ✅ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, df['sentiment'], test_size=0.2, random_state=42
)


In [29]:
# ✅ Model Definition using actual vocab size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
embedding_dim = 64

In [31]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid')
])



In [33]:
# ✅ Compile Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [37]:
# ✅ Train Model
history = model.fit(X_train, y_train, epochs=4, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 490ms/step - accuracy: 0.6998 - loss: 0.5243 - val_accuracy: 0.8959 - val_loss: 0.2480
Epoch 2/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 504ms/step - accuracy: 0.9487 - loss: 0.1441 - val_accuracy: 0.9055 - val_loss: 0.2360
Epoch 3/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m488s[0m 780ms/step - accuracy: 0.9934 - loss: 0.0319 - val_accuracy: 0.8999 - val_loss: 0.2967
Epoch 4/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m298s[0m 477ms/step - accuracy: 0.9997 - loss: 0.0039 - val_accuracy: 0.9041 - val_loss: 0.3280


In [39]:
# ✅ Evaluate Model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"\n✅ Test Accuracy: {test_acc:.2f}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 44ms/step - accuracy: 0.9081 - loss: 0.3088

✅ Test Accuracy: 0.90


In [41]:
# ✅ Predictions & Misclassified Samples
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32").flatten()
misclassified_indices = np.where(y_pred != y_test.to_numpy())[0]
print(f"\n🔹 Number of Misclassified Samples: {len(misclassified_indices)}")

num_samples_to_display = 5
misclassified_samples = misclassified_indices[:num_samples_to_display]

print("\n🔹 Sample Misclassified Reviews:")
for idx in misclassified_samples:
    print(f"\n🔹 Review: {df.iloc[idx]['review'][:300]}...")
    print(f"   ✅ Actual Sentiment: {'Positive' if y_test.iloc[idx] == 1 else 'Negative'}")
    print(f"   ❌ Predicted Sentiment: {'Positive' if y_pred[idx] == 1 else 'Negative'}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 43ms/step

🔹 Number of Misclassified Samples: 959

🔹 Sample Misclassified Reviews:

🔹 Review: first of all lets get a few things straight here a i am an anime fan always has been as a matter of fact i used to watch speed racer all the time in preschool b i do like several bmovies because theyre hilarious c i like the godzilla movies a lotmoving on when the movie first comes on it seems like ...
   ✅ Actual Sentiment: Positive
   ❌ Predicted Sentiment: Negative

🔹 Review: this movie was so frustrating everything seemed energetic and i was totally prepared to have a good time i at least thought id be able to stand it but i was wrong first the weird looping it was like watching americas funniest home videos the damn parents i hated them so much the stereotypical latino...
   ✅ Actual Sentiment: Negative
   ❌ Predicted Sentiment: Positive

🔹 Review: war movie is a hollywood genre that has been done and redone so many times 