# Enhancing Customer Experiences through Deep Learning-Powered Sentiment Analysis of Reviews

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from transformers import BertTokenizer

In [None]:
# Define your dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, sentiments, labels, tokenizer, max_len):
        self.texts = texts
        self.sentiments = sentiments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        sentiment = str(self.sentiments[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'sentiment': sentiment,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out[:, -1, :])  # Use the last hidden state
        return out

In [None]:
# Load the dataset from CSV
df = pd.read_csv('/content/Data.csv')

# Assuming the CSV file has 'textID', 'text', 'selected_text', and 'sentiment' columns
texts = df['text'].values
sentiments = df['sentiment'].values
labels = df['selected_text'].values

# Tokenize and encode the texts using a tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 100  # maximum sequence length

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create DataLoader for train and test sets
train_dataset = CustomDataset(X_train, sentiments, y_train, tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = CustomDataset(X_test, sentiments, y_test, tokenizer, max_len)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Initialize the LSTM model
input_dim = len(tokenizer)
hidden_dim = 128
output_dim = len(label_encoder.classes_)
model = LSTMModel(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        optimizer.zero_grad()

        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}")

# Evaluation
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Evaluating'):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        outputs = model(input_ids)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy}")


Epoch 1/10: 100%|██████████| 687/687 [02:36<00:00,  4.39it/s]


Epoch 1/10, Loss: 10.028289818520802


Epoch 2/10: 100%|██████████| 687/687 [02:33<00:00,  4.48it/s]


Epoch 2/10, Loss: 9.845884815921256


Epoch 3/10: 100%|██████████| 687/687 [02:29<00:00,  4.58it/s]


Epoch 3/10, Loss: 9.624270639044749


Epoch 4/10: 100%|██████████| 687/687 [02:31<00:00,  4.52it/s]


Epoch 4/10, Loss: 9.546729501504911


Epoch 5/10: 100%|██████████| 687/687 [02:32<00:00,  4.49it/s]


Epoch 5/10, Loss: 9.523981483048575


Epoch 6/10: 100%|██████████| 687/687 [02:28<00:00,  4.62it/s]


Epoch 6/10, Loss: 9.511541656705143


Epoch 7/10: 100%|██████████| 687/687 [02:34<00:00,  4.43it/s]


Epoch 7/10, Loss: 9.504097562342926


Epoch 8/10: 100%|██████████| 687/687 [02:41<00:00,  4.27it/s]


Epoch 8/10, Loss: 9.498932678800136


Epoch 9/10: 100%|██████████| 687/687 [02:27<00:00,  4.66it/s]


Epoch 9/10, Loss: 9.495369822350826


Epoch 10/10: 100%|██████████| 687/687 [02:25<00:00,  4.72it/s]


Epoch 10/10, Loss: 9.49252475782947


Evaluating: 100%|██████████| 172/172 [00:09<00:00, 17.27it/s]

Test Accuracy: 0.008186283427323995





# Model 2:Bidirectional LSTM

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from transformers import BertTokenizer

In [None]:
# Define your dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, sentiments, labels, tokenizer, max_len):
        self.texts = texts
        self.sentiments = sentiments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        sentiment = str(self.sentiments[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'sentiment': sentiment,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Define your Bidirectional LSTM with attention model
class BiLSTMWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTMWithAttention, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.attention = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x, mask):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)

        # Attention mechanism
        attn_weights = torch.tanh(self.attention(lstm_out)).squeeze(-1)
        attn_weights = torch.softmax(attn_weights.masked_fill(mask == 0, -1e9), dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), lstm_out).squeeze(1)

        out = self.fc(context)
        return out

In [None]:
# Load the dataset from CSV
df = pd.read_csv('/content/Data.csv')

# Assuming the CSV file has 'text', 'sentiment', and 'selected_text' columns
texts = df['text'].values
sentiments = df['sentiment'].values
labels = df['selected_text'].values

# Tokenize and encode the texts using a tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 100  # maximum sequence length

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create DataLoader for train and test sets
train_dataset = CustomDataset(X_train, sentiments, y_train, tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = CustomDataset(X_test, sentiments, y_test, tokenizer, max_len)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [None]:

# Initialize the BiLSTM with attention model
input_dim = tokenizer.vocab_size
hidden_dim = 128
output_dim = len(label_encoder.classes_)
model = BiLSTMWithAttention(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}")

# Evaluation
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Evaluating'):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy}")


Epoch 1/10: 100%|██████████| 687/687 [03:48<00:00,  3.01it/s]


Epoch 1/10, Loss: 10.024166622189629


Epoch 2/10: 100%|██████████| 687/687 [04:10<00:00,  2.75it/s]


Epoch 2/10, Loss: 9.761353420551862


Epoch 3/10: 100%|██████████| 687/687 [03:49<00:00,  3.00it/s]


Epoch 3/10, Loss: 9.338948913363216


Epoch 4/10: 100%|██████████| 687/687 [03:46<00:00,  3.04it/s]


Epoch 4/10, Loss: 8.976142973531976


Epoch 5/10: 100%|██████████| 687/687 [03:44<00:00,  3.06it/s]


Epoch 5/10, Loss: 8.580507274971952


Epoch 6/10: 100%|██████████| 687/687 [03:43<00:00,  3.07it/s]


Epoch 6/10, Loss: 8.1743949046128


Epoch 7/10: 100%|██████████| 687/687 [03:41<00:00,  3.11it/s]


Epoch 7/10, Loss: 7.784196684280684


Epoch 8/10: 100%|██████████| 687/687 [03:44<00:00,  3.06it/s]


Epoch 8/10, Loss: 7.415026418333317


Epoch 9/10: 100%|██████████| 687/687 [03:45<00:00,  3.05it/s]


Epoch 9/10, Loss: 7.063102202297472


Epoch 10/10: 100%|██████████| 687/687 [03:49<00:00,  2.99it/s]


Epoch 10/10, Loss: 6.724803501862105


Evaluating: 100%|██████████| 172/172 [00:16<00:00, 10.23it/s]

Test Accuracy: 0.03310896852828816



