In [1]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, roc_auc_score
import joblib

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load dataset
train = pd.read_csv("/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv")
valid = pd.read_csv("/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Valid.csv")
test = pd.read_csv("/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv")

In [3]:
# Function for text preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove numbers and punctuation
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [4]:
train['text'] = train['text'].apply(clean_text)
valid['text'] = valid['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

In [5]:
# Feature Engineering using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X_train = vectorizer.fit_transform(train['text'])
X_valid = vectorizer.transform(valid['text'])
X_test = vectorizer.transform(test['text'])

In [6]:
y_train = train['label'].values
y_valid = valid['label'].values
y_test = test['label'].values

In [7]:
# Neural Network Model with Batch Normalization and Dropout
model = Sequential([
    Dense(128, activation='relu', input_dim=X_train.shape[1]),
    BatchNormalization(),
    Dropout(0.4),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0005), metrics=['accuracy'])

In [9]:
# Train Model
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10, batch_size=64, verbose=1)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.7204 - loss: 0.5399 - val_accuracy: 0.8848 - val_loss: 0.2952
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.8969 - loss: 0.2542 - val_accuracy: 0.8854 - val_loss: 0.2758
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9181 - loss: 0.2075 - val_accuracy: 0.8806 - val_loss: 0.2882
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9366 - loss: 0.1634 - val_accuracy: 0.8774 - val_loss: 0.3096
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9518 - loss: 0.1269 - val_accuracy: 0.8778 - val_loss: 0.3614
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9652 - loss: 0.0936 - val_accuracy: 0.8706 - val_loss: 0.4231
Epoch 7/10
[1m625/625[0m 

In [10]:
# Evaluate Model
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {acc * 100:.2f}%')

Test Accuracy: 88.30%


In [11]:
# Confusion Matrix & AUC Score
y_pred = (model.predict(X_test) > 0.5).astype('int32')
cm = confusion_matrix(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print("Confusion Matrix:\n", cm)
print("AUC Score:", auc)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Confusion Matrix:
 [[2162  333]
 [ 252 2253]]
AUC Score: 0.8829671318685276


In [12]:
# Making Predictions
example_texts = ["This movie was fantastic!", "Absolutely terrible film, never watching again"]
example_transformed = vectorizer.transform(example_texts)
predictions = (model.predict(example_transformed) > 0.5).astype('int32')
print("Predictions:", predictions.flatten())

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Predictions: [1 0]


In [13]:
# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [14]:
# Save the model and weights
model.save("sentiment_model.h5")