In [1]:
import pandas as pd
import numpy as np
import random

In [2]:

np.random.seed(42)
random.seed(42)


In [3]:
df = pd.read_csv('../2. Dataset/Customer_Sentiment.csv')
df.head()

Unnamed: 0,customer_id,gender,age_group,region,product_category,purchase_channel,platform,customer_rating,review_text,sentiment,response_time_hours,issue_resolved,complaint_registered
0,1,male,60+,north,automobile,online,flipkart,1,very disappointed with the quality.,negative,46,yes,yes
1,2,other,46-60,central,books,online,swiggy instamart,5,fast delivery and great packaging.,positive,5,yes,no
2,3,female,36-45,east,sports,online,facebook marketplace,1,very disappointed with the quality.,negative,38,yes,yes
3,4,female,18-25,central,groceries,online,zepto,2,product stopped working after few days.,negative,16,yes,yes
4,5,female,18-25,east,electronics,online,croma,3,neutral about the quality.,neutral,15,yes,no


In [113]:
df.columns

Index(['customer_id', 'gender', 'age_group', 'region', 'product_category',
       'purchase_channel', 'platform', 'customer_rating', 'review_text',
       'sentiment', 'response_time_hours', 'issue_resolved',
       'complaint_registered'],
      dtype='object')

In [114]:

df['sentiment_label'] = df['sentiment'].map({
    'positive' : 2,
     'neutral' : 1,
       'negative' : 0
})

df[['sentiment', 'sentiment_label']].head()



Unnamed: 0,sentiment,sentiment_label
0,negative,0
1,positive,2
2,negative,0
3,negative,0
4,neutral,1


In [115]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['gender', 'age_group', 'region', 'product_category', 
                    'purchase_channel', 'platform', 'issue_resolved', 'complaint_registered']

for col in categorical_cols:
    print(df[col].unique())


label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


import pickle

with open("rnn_weights/label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)


['male' 'other' 'female']
['60+' '46-60' '36-45' '18-25' '26-35']
['north' 'central' 'east' 'south' 'west']
['automobile' 'books' 'sports' 'groceries' 'electronics' 'travel'
 'fashion' 'home & kitchen' 'beauty']
['online']
['flipkart' 'swiggy instamart' 'facebook marketplace' 'zepto' 'croma'
 'amazon' 'shopclues' 'tata cliq' 'snapdeal' 'paytm mall' 'ajio' 'myntra'
 'nykaa' 'reliance digital' 'meesho' 'bigbasket' 'lenskart' 'jiomart'
 'others' 'boat']
['yes' 'no']
['yes' 'no']


In [116]:
from sklearn.preprocessing import StandardScaler

numerical_cols = ['customer_rating', 'response_time_hours']

print(df['customer_rating'].unique())
print(df['response_time_hours'].unique())


scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

import pickle

with open("rnn_weights/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

[1 5 2 3 4]
[46  5 38 16 15 10 53  7 56 69 70 48 55  4 44 32 61 11 51 31 59  8 12 52
 57 36 68 64 26 34 18  2  9 24  6 35 45 47 25 29 58 13 17 42 20  1 41 63
 39 67 22 19 43 27  3 28 40 49 14 21 54 23 71 62 66 60 33 50 30 65 37]


In [117]:
df['review_text'].head()

0        very disappointed with the quality.
1         fast delivery and great packaging.
2        very disappointed with the quality.
3    product stopped working after few days.
4                 neutral about the quality.
Name: review_text, dtype: object

In [118]:
def tokenize(text):

  text = text.lower()
  text = text.replace('?','')
  text = text.replace("'","")
  
  return text.split()


vocab = {'<PAD>':0 ,'<UNK>': 1}

def build_vocab(row):
  
  tokenized_question = tokenize(row['review_text'])
  
  for token in tokenized_question:

    if token not in vocab:
      
      vocab[token] = len(vocab)
    

df.apply(build_vocab, axis=1) # for each row

import pickle
with open('rnn_weights/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)


def text_to_indices(text, vocab):

  indexed_text = []

  for token in tokenize(text):

    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])

  return indexed_text

df['review_text'] = df['review_text'].apply(lambda x: text_to_indices(x, vocab))

def pad_sequence(seq, max_len, pad_value=vocab['<PAD>']):
    if len(seq) < max_len:
        seq = seq + [pad_value] * (max_len - len(seq))
    else:
        seq = seq[:max_len]  # decrease
    return seq

max_len = max(df['review_text'].apply(len))

df['review_text'] = df['review_text'].apply(lambda x: pad_sequence(x, max_len))

df['review_text'].head()

0          [2, 3, 4, 5, 6, 0]
1        [7, 8, 9, 10, 11, 0]
2          [2, 3, 4, 5, 6, 0]
3    [12, 13, 14, 15, 16, 17]
4        [18, 19, 5, 6, 0, 0]
Name: review_text, dtype: object

In [119]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomerDataset(Dataset):

    def __init__(self, df, seq_cols, feature_cols, target_col):
    
        self.sequences = df[seq_cols]
        self.features = df[feature_cols]
        self.target = df[target_col]
        
    def __len__(self):
        return len(self.target)
    
    def __getitem__(self, idx):
        
         sequence = torch.tensor(self.sequences.iloc[idx], dtype=torch.long).squeeze(0)
         feature =  torch.tensor(self.features.iloc[idx], dtype=torch.float)
         target = torch.tensor(self.target.iloc[idx], dtype=torch.long)

         return sequence , feature , target
    

In [120]:
seq_cols = ['review_text']
features_cols = categorical_cols + numerical_cols
target_col = 'sentiment_label' 

In [121]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [122]:
train_dataset = CustomerDataset(train_df, seq_cols, features_cols, target_col)
test_dataset = CustomerDataset(test_df, seq_cols, features_cols, target_col)


In [123]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [124]:
import torch
import torch.nn as nn

class SimpleRNNModel(nn.Module):

    def __init__(self,vocab_size, text_emb_dim, rnn_hidden, feature_dim, num_classes , dropout_rate = 0.3):
       
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, text_emb_dim, padding_idx=0)  
        self.rnn = nn.RNN(text_emb_dim, rnn_hidden, batch_first=True)
        
        self.fc = nn.Sequential(
            nn.Linear(rnn_hidden + feature_dim , 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, num_classes)
        )

    def forward(self, text_seq, feature):

        x = self.embedding(text_seq)     
    
        hidden_combined , hidden_last = self.rnn(x)  # (all hidden combined , hidden for last step )

        hidden_last = hidden_last.squeeze(0)

        x = torch.cat([hidden_last, feature], dim=1)                  

        out = self.fc(x)                     

        return out


In [125]:
device = 'cpu'
if hasattr(torch,'mps') and torch.backends.mps.is_available():
    device = 'mps'
    print("MPS is available")

MPS is available


In [126]:

def objective(trial):

    vocab_size = len(vocab)
    feature_dim = 10
    num_classes = 3

    text_emb_dim = trial.suggest_categorical('text_emb_dim', [8,16, 32])
    rnn_hidden = trial.suggest_categorical('rnn_hidden', [16,32,64])    
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
    num_epochs = trial.suggest_int('num_epochs', 10, 30)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-2)

    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    optimizer_name = trial.suggest_categorical('optimizer', ['SGD', 'Adam', 'RMSprop'])
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = SimpleRNNModel(vocab_size, text_emb_dim, rnn_hidden, feature_dim, num_classes , dropout_rate = 0.3)
    model = model.to(device)

    if optimizer_name == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    elif optimizer_name == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    elif optimizer_name == 'RMSprop':
        optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    criterion = nn.CrossEntropyLoss()


    for epoch in range(num_epochs):
        
        model.train()

        total_epoch_loss = 0

        for batch_sequence , batch_feature , batch_target in train_loader:
            
            batch_sequence , batch_feature , batch_target = batch_sequence.to(device) , batch_feature.to(device) , batch_target.to(device)
            
            optimizer.zero_grad()
            
            outputs = model(batch_sequence,batch_feature)
            
            loss = criterion(outputs, batch_target)
            
            loss.backward()
            
            optimizer.step()

            total_epoch_loss = total_epoch_loss + loss.item()
        
        avg_loss = total_epoch_loss/len(train_loader)
        print(f'Epoch: {epoch + 1} , Loss: {avg_loss}')


    model.eval() 

    correct = 0
    total = 0
    
    with torch.no_grad():

        for batch_sequence , batch_feature , batch_target in test_loader:

            batch_sequence , batch_feature , batch_target = batch_sequence.to(device) , batch_feature.to(device) , batch_target.to(device)
        
            outputs = model(batch_sequence , batch_feature)
            _ , predicted = torch.max(outputs, 1)
        
            total += batch_target.size(0)
            
            correct += (predicted == batch_target).sum().item()

    accuracy = correct / total
    return accuracy

In [127]:
import optuna

pruner = optuna.pruners.MedianPruner() 

study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(objective, n_trials=5)  # Run 5 trials

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-12-08 18:01:07,031] A new study created in memory with name: no-name-399350a7-1806-4265-b7be-d1ae806c91d7
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-2)
  sequence = torch.tensor(self.sequences.iloc[idx], dtype=torch.long).squeeze(0)
  feature =  torch.tensor(self.features.iloc[idx], dtype=torch.float)


Epoch: 1 , Loss: 1.1639169929504394
Epoch: 2 , Loss: 1.1316650966644286
Epoch: 3 , Loss: 1.112564157485962
Epoch: 4 , Loss: 1.0894032797813415
Epoch: 5 , Loss: 1.0706944945335388
Epoch: 6 , Loss: 1.0553609543800353
Epoch: 7 , Loss: 1.0403046257019044
Epoch: 8 , Loss: 1.0268815011024475
Epoch: 9 , Loss: 1.0155579345703125
Epoch: 10 , Loss: 0.9995580371856689
Epoch: 11 , Loss: 0.9856505519866944
Epoch: 12 , Loss: 0.9733967250823975


[I 2025-12-08 18:01:57,345] Trial 0 finished with value: 0.7064 and parameters: {'text_emb_dim': 8, 'rnn_hidden': 16, 'dropout_rate': 0.4697624091511885, 'num_epochs': 12, 'learning_rate': 0.00012033611358447874, 'weight_decay': 0.0033276807243787085, 'batch_size': 32, 'optimizer': 'SGD'}. Best is trial 0 with value: 0.7064.


Epoch: 1 , Loss: 1.0525310355634354
Epoch: 2 , Loss: 0.9584301708224482
Epoch: 3 , Loss: 0.8690450549506532
Epoch: 4 , Loss: 0.7695131638941293
Epoch: 5 , Loss: 0.6639600550404753
Epoch: 6 , Loss: 0.5673502084736626
Epoch: 7 , Loss: 0.47986580712345844
Epoch: 8 , Loss: 0.39978536154134586
Epoch: 9 , Loss: 0.32887126024538715
Epoch: 10 , Loss: 0.26644053845740734
Epoch: 11 , Loss: 0.21289273934623304
Epoch: 12 , Loss: 0.16629178864887348
Epoch: 13 , Loss: 0.13210844981689424
Epoch: 14 , Loss: 0.10279382170198824
Epoch: 15 , Loss: 0.08400184501664706
Epoch: 16 , Loss: 0.0674402797326874
Epoch: 17 , Loss: 0.056356005418224456
Epoch: 18 , Loss: 0.04868805291244207
Epoch: 19 , Loss: 0.04291472626260866
Epoch: 20 , Loss: 0.036977231448974475


[I 2025-12-08 18:02:51,698] Trial 1 finished with value: 1.0 and parameters: {'text_emb_dim': 16, 'rnn_hidden': 64, 'dropout_rate': 0.35541525567992005, 'num_epochs': 20, 'learning_rate': 0.0015601527167279818, 'weight_decay': 0.0008968231476781962, 'batch_size': 64, 'optimizer': 'SGD'}. Best is trial 1 with value: 1.0.


Epoch: 1 , Loss: 0.054129169705719835
Epoch: 2 , Loss: 0.0004970703834214005
Epoch: 3 , Loss: 0.00040124133824308517
Epoch: 4 , Loss: 0.016528387436647692
Epoch: 5 , Loss: 0.0003899707883236052
Epoch: 6 , Loss: 0.0002023693848934075
Epoch: 7 , Loss: 0.00017708394937176452
Epoch: 8 , Loss: 0.00019520984985383678
Epoch: 9 , Loss: 0.0001888210707684092
Epoch: 10 , Loss: 0.0001917871451985972
Epoch: 11 , Loss: 0.00018065803058005744
Epoch: 12 , Loss: 0.013732421796500667
Epoch: 13 , Loss: 0.0002853677575261433
Epoch: 14 , Loss: 0.00015868208831573604
Epoch: 15 , Loss: 0.00013978754225480637


[I 2025-12-08 18:03:21,025] Trial 2 finished with value: 1.0 and parameters: {'text_emb_dim': 8, 'rnn_hidden': 16, 'dropout_rate': 0.317251245931089, 'num_epochs': 15, 'learning_rate': 0.0036441106586338414, 'weight_decay': 0.00018880031835256228, 'batch_size': 128, 'optimizer': 'RMSprop'}. Best is trial 1 with value: 1.0.


Epoch: 1 , Loss: 1.0582829258244508
Epoch: 2 , Loss: 1.0191179324107564
Epoch: 3 , Loss: 0.9898569910389603
Epoch: 4 , Loss: 0.9570851580352541
Epoch: 5 , Loss: 0.9164553159361433
Epoch: 6 , Loss: 0.8792469000360769
Epoch: 7 , Loss: 0.832032704429262
Epoch: 8 , Loss: 0.7836378638152104
Epoch: 9 , Loss: 0.7352704963866313
Epoch: 10 , Loss: 0.6828667494901426
Epoch: 11 , Loss: 0.6341166541834545
Epoch: 12 , Loss: 0.5820750635900315
Epoch: 13 , Loss: 0.5321706383471276
Epoch: 14 , Loss: 0.4862580257616225


[I 2025-12-08 18:03:48,781] Trial 3 finished with value: 0.8848 and parameters: {'text_emb_dim': 32, 'rnn_hidden': 64, 'dropout_rate': 0.4265857117617877, 'num_epochs': 14, 'learning_rate': 0.0012843080286718295, 'weight_decay': 0.0008954231620878677, 'batch_size': 128, 'optimizer': 'SGD'}. Best is trial 1 with value: 1.0.


Epoch: 1 , Loss: 1.0672007860086214
Epoch: 2 , Loss: 1.0004623338056449
Epoch: 3 , Loss: 0.9462590013830044
Epoch: 4 , Loss: 0.8905487942238586
Epoch: 5 , Loss: 0.8293169921579452
Epoch: 6 , Loss: 0.766293353927783
Epoch: 7 , Loss: 0.7005041275923245
Epoch: 8 , Loss: 0.6411424189710769
Epoch: 9 , Loss: 0.5879790800067183
Epoch: 10 , Loss: 0.5345837174893949
Epoch: 11 , Loss: 0.4921644163398316
Epoch: 12 , Loss: 0.4552191877707887
Epoch: 13 , Loss: 0.4184858357182707
Epoch: 14 , Loss: 0.38621011195472255
Epoch: 15 , Loss: 0.35390381053232917
Epoch: 16 , Loss: 0.32604663697675396
Epoch: 17 , Loss: 0.29915914667871435
Epoch: 18 , Loss: 0.2725785464143601
Epoch: 19 , Loss: 0.24521191675251666
Epoch: 20 , Loss: 0.2228332618459726
Epoch: 21 , Loss: 0.20010551886436656
Epoch: 22 , Loss: 0.17972082849413443
Epoch: 23 , Loss: 0.15964099169729618
Epoch: 24 , Loss: 0.14188686451211144
Epoch: 25 , Loss: 0.12475116969868778
Epoch: 26 , Loss: 0.11115917556297283
Epoch: 27 , Loss: 0.10095402434135016

[I 2025-12-08 18:05:10,724] Trial 4 finished with value: 1.0 and parameters: {'text_emb_dim': 8, 'rnn_hidden': 32, 'dropout_rate': 0.33365759885764196, 'num_epochs': 30, 'learning_rate': 0.0011871510020569583, 'weight_decay': 0.004262727440115467, 'batch_size': 64, 'optimizer': 'SGD'}. Best is trial 1 with value: 1.0.


In [128]:
print("Best hyperparameters:", study.best_params)
print("Best accuracy:", study.best_value)

Best hyperparameters: {'text_emb_dim': 16, 'rnn_hidden': 64, 'dropout_rate': 0.35541525567992005, 'num_epochs': 20, 'learning_rate': 0.0015601527167279818, 'weight_decay': 0.0008968231476781962, 'batch_size': 64, 'optimizer': 'SGD'}
Best accuracy: 1.0
