In [3]:
# Following command to install tensorflow
!pip install tensorflow

# Following command to install scikit-learn
!pip install scikit-learn

# Used existing outline revamping model for increased efficiency and generalization to other text elements (product reviews).

Collecting tensorflow
  Downloading tensorflow-2.17.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.11.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting ml-dtypes<0.5.0,>=0.3.1 (from tensorf

In [4]:
# Import necessary libraries
import os
import json
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Set up Kaggle credentials
# Read the API token from `kaggle.json` in the working directory. The context manager
# closes the file handle deterministically (the bare `open(...)` it replaces left the
# handle open until garbage collection).
with open("kaggle.json", encoding="utf-8") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

# Export the token through environment variables instead of hardcoding it in the notebook
os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]

In [18]:
# Download the IMDb 50k movie-review dataset from Kaggle with the `kaggle` command-line tool.
# NOTE(review): presumably the CLI authenticates with the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables exported in the credentials cell — confirm.
# Per the recorded output, the archive `imdb-dataset-of-50k-movie-reviews.zip` is written
# to the directory the command runs in.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /Users/kunalk
100%|██████████████████████████████████████| 25.7M/25.7M [00:02<00:00, 14.3MB/s]
100%|██████████████████████████████████████| 25.7M/25.7M [00:02<00:00, 11.4MB/s]


In [None]:
# Extract every member of the downloaded Kaggle archive into the current working directory
archive_name = 'imdb-dataset-of-50k-movie-reviews.zip'
with ZipFile(archive_name, mode='r') as archive:
    archive.extractall()

In [51]:
# Read the review CSV into a DataFrame; later cells use its `review` and `sentiment` columns
dataset_csv = 'IMDB Dataset.csv'
df = pd.read_csv(dataset_csv)

In [52]:
# Preprocess the data
# Encode the string labels as integers (positive -> 1, negative -> 0).
# The identity entries (1 -> 1, 0 -> 0) make the cell idempotent: with a plain
# string-only mapping, running the cell a second time mapped the already-encoded
# integers to NaN, because `df['sentiment']` is overwritten in place.
sentiment_to_label = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
encoded_sentiment = df['sentiment'].map(sentiment_to_label)

# Fail loudly on unknown labels instead of silently feeding NaN targets to the model
if encoded_sentiment.isna().any():
    raise ValueError("Unexpected values in the 'sentiment' column; expected 'positive' or 'negative'.")

df['sentiment'] = encoded_sentiment

# Raw review texts (features) and integer labels (targets) as NumPy arrays
X = df['review'].values
y = df['sentiment'].values

In [53]:
# Tokenize the text
# Always tokenize the raw review strings taken from `df`, not the current value of `X`.
# The cell assigns its result back to `X`, so reading `X` as the input meant that a
# second run silently fitted the tokenizer on integer sequences instead of text.
# On a first run the result is identical, since `X` is `df['review'].values` at that point.
reviews = df['review'].values

# num_words=5000 caps the vocabulary used by `texts_to_sequences` at the most frequent words;
# the Embedding layers in the model cells use input_dim=5000 to match.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(reviews)

# Each review becomes a list of integer word ids
X = tokenizer.texts_to_sequences(reviews)

In [54]:
# Bring every tokenized review to exactly 200 ids. The padding/truncating sides are the
# library defaults, spelled out here so the choice is visible to the reader.
X = pad_sequences(X, maxlen=200, padding='pre', truncating='pre')

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [43]:
# Print the layer-by-layer summary of the current `model`.
# NOTE(review): in the cells visible here, `model` is first assigned in a later cell, so on
# a fresh kernel run top-to-bottom this line would raise NameError — confirm, and consider
# moving this cell after the model is built.
model.summary()

In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input

# Build a more complex (two-layer, stacked) LSTM model
# NOTE(review): in the cells visible here, the next cell rebuilds `model` before any
# training happens, so this architecture appears to be unused — confirm.
model = Sequential()

# Explicit input: 200 token ids per review, matching `maxlen=200` used when padding.
# This replaces the `input_length=200` argument of Embedding, which Keras 3 (bundled with
# the TensorFlow 2.17 installed above) deprecates and ignores.
model.add(Input(shape=(200,)))

# Embedding layer: maps each of the 5000 word ids to a dense 128-dimensional vector
model.add(Embedding(input_dim=5000, output_dim=128))

# First LSTM layer: returns the full sequence of outputs (return_sequences=True)
# so that the next recurrent layer receives one vector per time step
model.add(LSTM(units=128, return_sequences=True))

# Second LSTM layer: consumes that sequence and returns only its final output
model.add(LSTM(units=64))

# Output layer: a single sigmoid unit for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
# Optimizer: Adam
# Loss function: binary_crossentropy - matches the single sigmoid output and 0/1 labels
# Metrics: accuracy - reported during training
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [64]:
from tensorflow.keras.layers import Bidirectional, Input

# Build a Bidirectional LSTM model
model = Sequential()

# Explicit input: 200 token ids per review, matching `maxlen=200` used when padding.
# This replaces the `input_length=200` argument of Embedding, which Keras 3 (bundled with
# the TensorFlow 2.17 installed above) deprecates and ignores.
model.add(Input(shape=(200,)))

# Embedding layer: maps each of the 5000 word ids to a dense 128-dimensional vector
model.add(Embedding(input_dim=5000, output_dim=128))

# Bidirectional LSTM layer: reads the sequence in both directions and returns one
# output per time step for the next recurrent layer
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Second LSTM layer: reduces the sequence to its final output
model.add(LSTM(units=64))

# Output layer: a single sigmoid unit for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
# Optimizer: Adam
# Loss function: binary_crossentropy - matches the single sigmoid output and 0/1 labels
# Metrics: accuracy - reported during training
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [68]:
# Fit the current `model` for 10 epochs in mini-batches of 64 samples.
# The held-out test split doubles as the per-epoch validation set.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 261ms/step - accuracy: 0.9959 - loss: 0.0148 - val_accuracy: 0.8783 - val_loss: 0.6426
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 262ms/step - accuracy: 0.9972 - loss: 0.0113 - val_accuracy: 0.8728 - val_loss: 0.6332
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 263ms/step - accuracy: 0.9954 - loss: 0.0143 - val_accuracy: 0.8742 - val_loss: 0.6230
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 262ms/step - accuracy: 0.9933 - loss: 0.0203 - val_accuracy: 0.8712 - val_loss: 0.5694
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 261ms/step - accuracy: 0.9968 - loss: 0.0104 - val_accuracy: 0.8718 - val_loss: 0.7001
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 262ms/step - accuracy: 0.9961 - loss: 0.0125 - val_accuracy: 0.8676 - val_loss: 0.6584
Epoc

<keras.src.callbacks.history.History at 0x2ada5be10>

In [69]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow as tf

# Learning rate scheduler function: adjusts the learning rate during training
def scheduler(epoch, lr, warmup_epochs=3, decay_rate=0.1):
    """Return the learning rate to use for the given epoch.

    The rate is left untouched for the first ``warmup_epochs`` epochs and is then
    multiplied by ``exp(-decay_rate)`` on every subsequent epoch.

    Args:
        epoch: Zero-based index of the epoch about to start.
        lr: Learning rate currently in use.
        warmup_epochs: Number of initial epochs during which ``lr`` is kept as is.
            Defaults to 3, the value previously hard-coded.
        decay_rate: Exponent of the per-epoch decay factor. Defaults to 0.1,
            the value previously hard-coded.

    Returns:
        ``lr`` unchanged while ``epoch < warmup_epochs``; otherwise the decayed
        rate as a built-in ``float``.
    """
    # Local import keeps the function self-contained. The decay factor is a plain scalar
    # constant, so the standard library is enough and no TensorFlow op is created per epoch.
    import math

    if epoch < warmup_epochs:
        return lr  # Keep the incoming learning rate during the warm-up epochs

    # Decay exponentially after the warm-up and ensure the output is a float
    return float(lr * math.exp(-decay_rate))

# Wrap the schedule function in a Keras callback
lr_scheduler = LearningRateScheduler(scheduler)

# Re-compile the existing `model` with an explicit Adam optimizer starting at 1e-3,
# keeping the binary cross-entropy loss and the accuracy metric
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=Adam(learning_rate=0.001),
)

# Train for 10 epochs with batches of 64, validating on the test split and letting the
# callback adjust the learning rate
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
    callbacks=[lr_scheduler],
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 254ms/step - accuracy: 0.9966 - loss: 0.0107 - val_accuracy: 0.8814 - val_loss: 0.6969 - learning_rate: 0.0010
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 262ms/step - accuracy: 0.9993 - loss: 0.0025 - val_accuracy: 0.8814 - val_loss: 0.7891 - learning_rate: 0.0010
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 266ms/step - accuracy: 0.9984 - loss: 0.0055 - val_accuracy: 0.8755 - val_loss: 0.7343 - learning_rate: 0.0010
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 277ms/step - accuracy: 0.9980 - loss: 0.0055 - val_accuracy: 0.8767 - val_loss: 0.6501 - learning_rate: 9.0484e-04
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 280ms/step - accuracy: 0.9982 - loss: 0.0058 - val_accuracy: 0.8805 - val_loss: 0.8464 - learning_rate: 8.1873e-04
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x2ada71c10>

In [70]:
from tensorflow.keras.layers import Dropout, Input

# Build a model with Dropout layers to prevent overfitting
# NOTE(review): in the cells visible here, the cross-validation cell that follows rebuilds
# `model` without Dropout, so this regularized model does not appear to be trained — confirm.
model = Sequential()

# Explicit input: 200 token ids per review, matching `maxlen=200` used when padding.
# This replaces the `input_length=200` argument of Embedding, which Keras 3 (bundled with
# the TensorFlow 2.17 installed above) deprecates and ignores.
model.add(Input(shape=(200,)))

# Embedding layer: maps each of the 5000 word ids to a dense 128-dimensional vector
model.add(Embedding(input_dim=5000, output_dim=128))

# Bidirectional LSTM layer: processes input sequences in both directions
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Dropout layer: randomly zeroes half of the incoming units during training
model.add(Dropout(0.5))

# Second LSTM layer: processes the sequence output from the previous layer
model.add(LSTM(units=64))

# Another Dropout layer: more regularization after the second LSTM layer
model.add(Dropout(0.5))

# Output layer: a single sigmoid unit for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model with the Adam optimizer, binary crossentropy loss, and accuracy metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [80]:
from sklearn.model_selection import KFold
from tensorflow.keras.layers import Input
import numpy as np

# Cross-validate on the training split only. Dedicated names are used so the full padded
# dataset held in `X` / `y` is not overwritten; clobbering them made the padding/split
# cell operate on the training subset if it was ever re-run.
X_cv = np.asarray(X_train)
y_cv = np.asarray(y_train)

# Initialize KFold cross-validation with 5 splits
kf = KFold(n_splits=5)

# Final validation accuracy of each fold, so the run yields a summary and not only logs
fold_val_accuracies = []

# Perform cross-validation
for fold_number, (train_index, val_index) in enumerate(kf.split(X_cv), start=1):
    # Split data into training and validation sets for this fold
    X_train_cv, X_val_cv = X_cv[train_index], X_cv[val_index]
    y_train_cv, y_val_cv = y_cv[train_index], y_cv[val_index]

    # Build a fresh model for this fold so no weights carry over between folds
    model = Sequential()
    # Explicit input of 200 token ids replaces Embedding's `input_length=200`,
    # which Keras 3 deprecates and ignores
    model.add(Input(shape=(200,)))
    model.add(Embedding(input_dim=5000, output_dim=128))  # Embedding layer
    model.add(Bidirectional(LSTM(units=128, return_sequences=True)))  # Bidirectional LSTM layer
    model.add(LSTM(units=64))  # Second LSTM layer
    model.add(Dense(1, activation='sigmoid'))  # Output layer

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train on this fold's training part and validate on its held-out part
    history = model.fit(X_train_cv, y_train_cv, epochs=5, batch_size=64, validation_data=(X_val_cv, y_val_cv))

    # Record the last epoch's validation accuracy for the cross-validation summary
    fold_val_accuracies.append(history.history['val_accuracy'][-1])
    print(f"Fold {fold_number}: final val_accuracy = {fold_val_accuracies[-1]:.4f}")

# Aggregate the folds. After the loop, `model` is the network trained on the last fold.
print(
    f"Mean val_accuracy over {len(fold_val_accuracies)} folds: "
    f"{np.mean(fold_val_accuracies):.4f} (std {np.std(fold_val_accuracies):.4f})"
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 248ms/step - accuracy: 0.7543 - loss: 0.4898 - val_accuracy: 0.8655 - val_loss: 0.3187
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 260ms/step - accuracy: 0.8931 - loss: 0.2675 - val_accuracy: 0.8714 - val_loss: 0.3096
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 257ms/step - accuracy: 0.9172 - loss: 0.2133 - val_accuracy: 0.8795 - val_loss: 0.3004
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 251ms/step - accuracy: 0.9356 - loss: 0.1723 - val_accuracy: 0.8730 - val_loss: 0.3162
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 257ms/step - accuracy: 0.9505 - loss: 0.1340 - val_accuracy: 0.8761 - val_loss: 0.3317
Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 251ms/step - accuracy: 0.7272 - loss: 0.5279 - val_accuracy: 0.7308 - val_loss: 0.5356
Epoch 2/5


In [87]:
# Sample review for sentiment analysis: a product review rather than a movie review
# NOTE(review): `predict_sentiment` is not defined in the cells visible here — confirm it is
# defined earlier in the notebook so this cell runs on a fresh kernel.
review_2 = "The product quality was exceptional and I would definitely recommend it."
sentiment_2 = predict_sentiment(review_2)  # recorded output for this input: "positive"
print(f"Sentiment Analysis Result: {sentiment_2}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Sentiment Analysis Result: positive


In [88]:
# Analyze the sentiment of a clearly negative customer-service review
# NOTE(review): relies on `predict_sentiment`, which is not defined in the cells visible here.
review_3 = "I am disappointed with the late delivery and poor customer service."
sentiment_3 = predict_sentiment(review_3)  # recorded output for this input: "negative"
print(f"Review Sentiment: {sentiment_3}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Review Sentiment: negative


In [90]:
# Determine the sentiment of a clearly positive restaurant review
# NOTE(review): relies on `predict_sentiment`, which is not defined in the cells visible here.
review_4 = "Amazing experience! The food was delicious and the service was excellent."
sentiment_4 = predict_sentiment(review_4)  # recorded output for this input: "positive"
print(f"Sentiment for the review: {sentiment_4}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Sentiment for the review: positive


In [91]:
# Classify a piece of mixed feedback (praise followed by a complaint).
# The recorded output is a label ("positive"), not a numeric score.
# NOTE(review): relies on `predict_sentiment`, which is not defined in the cells visible here.
review_5 = "The hotel was clean and comfortable, but the check-in process was too slow."
sentiment_5 = predict_sentiment(review_5)
print(f"Feedback sentiment: {sentiment_5}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Feedback sentiment: positive
