In [2]:
import os
import pickle
import random

import numpy as np
import pandas as pd
import torch

In [3]:

np.random.seed(42)
random.seed(42)


In [4]:
df = pd.read_csv('../2. Dataset/Customer_Sentiment.csv')
df.head(1)

Unnamed: 0,customer_id,gender,age_group,region,product_category,purchase_channel,platform,customer_rating,review_text,sentiment,response_time_hours,issue_resolved,complaint_registered
0,1,male,60+,north,automobile,online,flipkart,1,very disappointed with the quality.,negative,46,yes,yes


In [5]:
# List the column names of the loaded frame.
df.columns

Index(['customer_id', 'gender', 'age_group', 'region', 'product_category',
       'purchase_channel', 'platform', 'customer_rating', 'review_text',
       'sentiment', 'response_time_hours', 'issue_resolved',
       'complaint_registered'],
      dtype='object')

In [6]:

# Integer class ids used as the training target.
sentiment_to_label = {
    'positive': 2,
    'neutral': 1,
    'negative': 0,
}

df['sentiment_label'] = df['sentiment'].map(sentiment_to_label)

# Series.map leaves NaN for any value missing from the mapping; fail here
# instead of letting NaN labels reach the tensor conversion later on.
unmapped_values = df.loc[df['sentiment_label'].isna(), 'sentiment'].unique()
if len(unmapped_values) > 0:
    raise ValueError(f"Unmapped sentiment values: {list(unmapped_values)}")

df[['sentiment', 'sentiment_label']].head()



Unnamed: 0,sentiment,sentiment_label
0,negative,0
1,positive,2
2,negative,0
3,negative,0
4,neutral,1


In [7]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['gender', 'age_group', 'region', 'product_category',
                    'purchase_channel', 'platform', 'issue_resolved', 'complaint_registered']

# Show the distinct raw values of every categorical column before encoding.
for col in categorical_cols:
    print(df[col].unique())


# Fit one encoder per column and keep it so the same mapping can be reapplied.
# NOTE(review): this overwrites the columns in place, so re-running the cell
# would fit the encoders on already-encoded integers.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


# Create the artefact directory first so a fresh checkout does not fail on open().
os.makedirs("rnn_weights", exist_ok=True)

with open("rnn_weights/label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)


['male' 'other' 'female']
['60+' '46-60' '36-45' '18-25' '26-35']
['north' 'central' 'east' 'south' 'west']
['automobile' 'books' 'sports' 'groceries' 'electronics' 'travel'
 'fashion' 'home & kitchen' 'beauty']
['online']
['flipkart' 'swiggy instamart' 'facebook marketplace' 'zepto' 'croma'
 'amazon' 'shopclues' 'tata cliq' 'snapdeal' 'paytm mall' 'ajio' 'myntra'
 'nykaa' 'reliance digital' 'meesho' 'bigbasket' 'lenskart' 'jiomart'
 'others' 'boat']
['yes' 'no']
['yes' 'no']


In [8]:
from sklearn.preprocessing import StandardScaler

numerical_cols = ['customer_rating', 'response_time_hours']

# Show the distinct raw values of both numeric columns before scaling.
print(df['customer_rating'].unique())
print(df['response_time_hours'].unique())


# Standardise both columns and keep the fitted scaler for later reuse.
# NOTE(review): this overwrites the columns in place, so re-running the cell
# would fit the scaler on already-scaled values.
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Create the artefact directory first so a fresh checkout does not fail on open().
os.makedirs("rnn_weights", exist_ok=True)

with open("rnn_weights/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

[1 5 2 3 4]
[46  5 38 16 15 10 53  7 56 69 70 48 55  4 44 32 61 11 51 31 59  8 12 52
 57 36 68 64 26 34 18  2  9 24  6 35 45 47 25 29 58 13 17 42 20  1 41 63
 39 67 22 19 43 27  3 28 40 49 14 21 54 23 71 62 66 60 33 50 30 65 37]


In [9]:
# Preview the first few values of the review text column.
df['review_text'].head()

0        very disappointed with the quality.
1         fast delivery and great packaging.
2        very disappointed with the quality.
3    product stopped working after few days.
4                 neutral about the quality.
Name: review_text, dtype: object

In [10]:
def tokenize(text):
  """Lower-case `text`, drop '?' and apostrophes, and split on whitespace.

  Other punctuation is left attached to its word, so "quality." stays a
  single token. Returns a list of string tokens.
  """
  cleaned = text.lower().replace('?', '').replace("'", "")
  return cleaned.split()


# Token -> integer id table; ids 0 and 1 are reserved for padding and unknown tokens.
vocab = {'<PAD>': 0, '<UNK>': 1}

def build_vocab(row):
  """Add every unseen token of `row['review_text']` to the module-level `vocab`.

  New tokens get the next free id, which is the current size of the table.
  Returns None; the table is updated in place.
  """
  for word in tokenize(row['review_text']):
    if word in vocab:
      continue
    vocab[word] = len(vocab)
    

# Feed every row through build_vocab so the table covers all reviews.
df.apply(build_vocab, axis=1) # for each row

# Create the artefact directory first so a fresh checkout does not fail on open().
os.makedirs('rnn_weights', exist_ok=True)

with open('rnn_weights/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)


def text_to_indices(text, vocab):
  """Turn `text` into a list of vocabulary ids.

  The text is split with `tokenize`; tokens missing from `vocab` are
  replaced by the id stored under '<UNK>'.
  """
  return [
    vocab[word] if word in vocab else vocab['<UNK>']
    for word in tokenize(text)
  ]

# Replace each review string with its list of vocabulary ids.
df['review_text'] = df['review_text'].apply(text_to_indices, args=(vocab,))

def pad_sequence(seq, max_len, pad_value=vocab['<PAD>']):
    """Return a copy of `seq` that is exactly `max_len` items long.

    Shorter sequences are extended on the right with `pad_value`; sequences
    that are already long enough are cut down to their first `max_len` items.
    """
    shortfall = max_len - len(seq)
    if shortfall > 0:
        return seq + [pad_value] * shortfall
    return seq[:max_len]

# The longest encoded review sets the common sequence length.
max_len = max(len(indices) for indices in df['review_text'])

# Bring every review to that length, then preview the result.
df['review_text'] = df['review_text'].apply(pad_sequence, args=(max_len,))

df['review_text'].head()

0          [2, 3, 4, 5, 6, 0]
1        [7, 8, 9, 10, 11, 0]
2          [2, 3, 4, 5, 6, 0]
3    [12, 13, 14, 15, 16, 17]
4        [18, 19, 5, 6, 0, 0]
Name: review_text, dtype: object

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomerDataset(Dataset):
    """Map-style dataset yielding (token sequence, tabular features, label).

    Args:
        df: frame holding the sequence, feature and target columns.
        seq_cols: column selector for the token-index lists.
        feature_cols: column selector for the numeric model inputs.
        target_col: name of the integer label column.
    """

    def __init__(self, df, seq_cols, feature_cols, target_col):

        self.sequences = df[seq_cols]
        self.features = df[feature_cols]
        self.target = df[target_col]

    def __len__(self):
        """Number of samples, taken from the target column."""
        return len(self.target)

    def __getitem__(self, idx):
        """Return the `idx`-th sample as (long sequence, float features, long target)."""

        # Handing a pandas Series straight to torch.tensor makes torch index it
        # by integer position even though it is labelled by column name, which
        # pandas warns about. Convert to plain Python / NumPy containers first.
        seq_row = self.sequences.iloc[idx]
        if isinstance(seq_row, pd.Series):
            seq_row = seq_row.tolist()
        # squeeze(0) drops the leading axis when a single sequence column is selected.
        sequence = torch.tensor(seq_row, dtype=torch.long).squeeze(0)

        feature_row = np.asarray(self.features.iloc[idx], dtype=np.float32)
        feature = torch.tensor(feature_row, dtype=torch.float)

        target = torch.tensor(self.target.iloc[idx], dtype=torch.long)

        return sequence , feature , target
    

In [12]:
# Column holding the padded token-index lists (kept as a one-element list).
seq_cols = ['review_text']
# Tabular model inputs: the categorical columns followed by the numeric ones.
features_cols = categorical_cols + numerical_cols
# Integer class label mapped from the `sentiment` column.
target_col = 'sentiment_label' 

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): the scaler, label encoders and vocabulary are all fitted on the
# full frame before this split, so test rows influence preprocessing — confirm
# this is acceptable for the reported accuracy.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [14]:
# Wrap each split in a CustomerDataset so DataLoader can batch it.
train_dataset = CustomerDataset(train_df, seq_cols, features_cols, target_col)
test_dataset = CustomerDataset(test_df, seq_cols, features_cols, target_col)


In [15]:
import torch
import torch.nn as nn

class SimpleRNNModel(nn.Module):
    """Classifier combining an RNN text encoder with tabular features.

    The review tokens are embedded and run through a single-layer RNN; its
    final hidden state is concatenated with the feature vector and passed to
    a two-layer feed-forward head that outputs one logit per class.
    """

    def __init__(self,vocab_size, text_emb_dim, rnn_hidden, feature_dim, num_classes , dropout_rate = 0.3):
        super().__init__()

        # Id 0 is the padding token, so its embedding is kept at zero.
        self.embedding = nn.Embedding(vocab_size, text_emb_dim, padding_idx=0)
        self.rnn = nn.RNN(text_emb_dim, rnn_hidden, batch_first=True)

        # The head sees the text summary and the tabular features side by side.
        head_in = rnn_hidden + feature_dim
        head_layers = [
            nn.Linear(head_in, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, num_classes),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, text_seq, feature):
        """Return class logits for a batch of token ids and feature vectors."""
        embedded = self.embedding(text_seq)

        # nn.RNN returns (outputs for every step, final hidden state); only the
        # final state is used.
        # NOTE(review): the whole padded sequence is consumed, so for padded
        # reviews the final state is taken after the padding steps — confirm
        # this is intended.
        _, final_state = self.rnn(embedded)

        # Drop the leading layer axis of the single-layer RNN state.
        text_summary = final_state.squeeze(0)

        fused = torch.cat([text_summary, feature], dim=1)
        return self.fc(fused)


In [16]:
# Pick the best available accelerator, falling back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
    print("CUDA is available")
# Probe the same namespace that is queried: torch.backends.mps can exist on
# builds that do not expose a top-level torch.mps module.
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = 'mps'
    print("MPS is available")

MPS is available


In [17]:
# Model dimensions derived from the prepared data.
vocab_size = len(vocab)
# Derived from the feature list so the head input width cannot drift out of
# sync if columns are added or removed.
feature_dim = len(features_cols)
num_classes = 3

# Hyperparameters.
text_emb_dim = 16
rnn_hidden = 64
dropout_rate = 0.35541525567992005
num_epochs = 20
learning_rate = 0.0015601527167279818
weight_decay = 0.000896823

batch_size = 64

# Only the training batches are shuffled; evaluation keeps the row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


model = SimpleRNNModel(vocab_size, text_emb_dim, rnn_hidden, feature_dim, num_classes , dropout_rate)
model = model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

criterion = nn.CrossEntropyLoss()

In [18]:
for epoch in range(num_epochs):

    # Training mode keeps dropout active during optimisation.
    model.train()

    total_epoch_loss = 0

    for batch in train_loader:

        # Move the (sequence, feature, target) triple onto the compute device.
        batch_sequence, batch_feature, batch_target = (part.to(device) for part in batch)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(batch_sequence, batch_feature)
        loss = criterion(outputs, batch_target)

        loss.backward()
        optimizer.step()

        total_epoch_loss += loss.item()

    # Report the mean batch loss for this epoch.
    avg_loss = total_epoch_loss / len(train_loader)
    print(f'Epoch: {epoch + 1} , Loss: {avg_loss}')



  sequence = torch.tensor(self.sequences.iloc[idx], dtype=torch.long).squeeze(0)
  feature =  torch.tensor(self.features.iloc[idx], dtype=torch.float)


Epoch: 1 , Loss: 1.0485336281621038
Epoch: 2 , Loss: 0.9580755129028052
Epoch: 3 , Loss: 0.8704612159881348
Epoch: 4 , Loss: 0.7687160155643671
Epoch: 5 , Loss: 0.6670671840445302
Epoch: 6 , Loss: 0.566834348078353
Epoch: 7 , Loss: 0.4787015734960477
Epoch: 8 , Loss: 0.40014756249543576
Epoch: 9 , Loss: 0.33287496679126266
Epoch: 10 , Loss: 0.27421397037399464
Epoch: 11 , Loss: 0.22591518284604192
Epoch: 12 , Loss: 0.18429091456123053
Epoch: 13 , Loss: 0.15319408326388928
Epoch: 14 , Loss: 0.12521330326700364
Epoch: 15 , Loss: 0.10160464799394622
Epoch: 16 , Loss: 0.08392825040716333
Epoch: 17 , Loss: 0.06948142446363315
Epoch: 18 , Loss: 0.05675648315884054
Epoch: 19 , Loss: 0.048536427842732814
Epoch: 20 , Loss: 0.04158819070496498


In [19]:

# Evaluation mode disables dropout.
model.eval()

correct = 0
total = 0

# No gradients are needed while scoring the held-out split.
with torch.no_grad():

    for batch in test_loader:

        batch_sequence, batch_feature, batch_target = (part.to(device) for part in batch)

        outputs = model(batch_sequence, batch_feature)
        # The predicted class is the index of the largest logit in each row.
        predicted = outputs.max(dim=1).indices

        total += batch_target.size(0)
        correct += (predicted == batch_target).sum().item()

# Fraction of test samples classified correctly.
accuracy = correct / total
print(accuracy)

  sequence = torch.tensor(self.sequences.iloc[idx], dtype=torch.long).squeeze(0)
  feature =  torch.tensor(self.features.iloc[idx], dtype=torch.float)


1.0


In [20]:
# Create the artefact directory first so saving does not fail on a fresh checkout.
os.makedirs("rnn_weights", exist_ok=True)

# Persist only the learned parameters (state dict), not the whole module.
torch.save(model.state_dict(), "rnn_weights/model_weights.pth")