In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load the dataset
file_path = '/kaggle/input/product-caategory/d1.csv'  # Update the path to your dataset
data = pd.read_csv(file_path)

In [5]:
data

Unnamed: 0,Product Name,Sentiment,Review,Product Category
0,Redmi 12C (4/128GB),Positive,অসাধারণ ফোন।অনেক পছন্দ হয়েছে।একদম অথেনটিক শাও...,Smart Phones
1,Redmi 12C (4/128GB),Positive,"Phone is good according to my uses, Upgraded f...",Smart Phones
2,Redmi 12C (4/128GB),Positive,অল্প দামে দারুন একটা স্মার্টফোন 💙,Smart Phones
3,Redmi 12C (4/128GB),Positive,"Super Fast Delivery ,11200 TK te pailam",Smart Phones
4,Redmi 12C (4/128GB),Positive,Delay Delivery... Good Product.,Smart Phones
...,...,...,...,...
78125,Baseus Bipow Digital Display 20W 10000mAh Powe...,Positive,A good one,Power Bank
78126,Baseus Bipow Digital Display 20W 10000mAh Powe...,Positive,"Overall the product was good. Thanks, Pickaboo...",Power Bank
78127,Baseus Bipow Digital Display 15W 10000mAh Powe...,Positive,This is a very good powerbank in this price po...,Power Bank
78128,Baseus Bipow Digital Display 15W 10000mAh Powe...,Positive,good for long lasting but after 2 year it's fa...,Power Bank


In [2]:
# Load the dataset
file_path = '/kaggle/input/sentiment/d3_sentiment.csv'  # Update the path to your dataset
data = pd.read_csv(file_path)

# Combine the 'Product name', 'Emotion', and 'Review' columns into a single text input
data['Combined'] = data['Product Name'] + " " + data['Review']
texts = data['Combined']
sentiments = data['Sentiment']  # Assuming sentiments are labeled as 'Positive', 'Negative'

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

def encode_data(tokenizer, texts, max_length=256):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            add_special_tokens=True,
            return_attention_mask=True,
            pad_to_max_length=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

input_ids, attention_masks = encode_data(tokenizer, texts)

# Convert sentiment labels to numerical form
label_map = {'Positive': 1, 'Negative': 0}  # Update this map based on your dataset's labels
labels = torch.tensor(sentiments.map(label_map).values)

# Split data into train and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.2)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.2)

# Create DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)

# Load the BERT model
model_name = 'bert-base-multilingual-uncased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.cuda()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [3]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
import numpy as np
import time
import datetime

# Check if a GPU is available and use it; otherwise, use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the chosen device
model = model.to(device)

# Training loop parameters
epochs = 10

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Total number of training steps
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Lists to store loss values
train_loss_values = []
val_loss_values = []
test_loss_values = []  # New list for test loss

# Function to calculate elapsed time
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

# Function to evaluate loss for validation or test set
def evaluate_loss(model, dataloader, device):
    model.eval()  # Set model to evaluation mode
    total_loss = 0

    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Main training loop
for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_loss_values.append(avg_train_loss)
    training_time = format_time(time.time() - t0)

    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # Validation loss after each epoch
    print("Running Validation...")
    t0 = time.time()
    val_loss = evaluate_loss(model, validation_dataloader, device)
    val_loss_values.append(val_loss)
    print("  Validation Loss: {0:.2f}".format(val_loss))
    print("  Validation epoch took: {:}".format(format_time(time.time() - t0)))

# Testing loss after training completes
print("\nRunning Testing...")
test_loss = evaluate_loss(model, test_dataloader, device)  # Ensure test_dataloader is defined
test_loss_values.append(test_loss)
print("  Testing Loss: {0:.2f}".format(test_loss))

print("\nTraining complete!")




Training...
  Batch   100  of  1,954.    Elapsed: 0:02:12.
  Batch   200  of  1,954.    Elapsed: 0:04:33.
  Batch   300  of  1,954.    Elapsed: 0:06:54.
  Batch   400  of  1,954.    Elapsed: 0:09:15.
  Batch   500  of  1,954.    Elapsed: 0:11:36.
  Batch   600  of  1,954.    Elapsed: 0:13:57.
  Batch   700  of  1,954.    Elapsed: 0:16:19.
  Batch   800  of  1,954.    Elapsed: 0:18:40.
  Batch   900  of  1,954.    Elapsed: 0:21:01.
  Batch 1,000  of  1,954.    Elapsed: 0:23:22.
  Batch 1,100  of  1,954.    Elapsed: 0:25:44.
  Batch 1,200  of  1,954.    Elapsed: 0:28:05.
  Batch 1,300  of  1,954.    Elapsed: 0:30:26.
  Batch 1,400  of  1,954.    Elapsed: 0:32:48.
  Batch 1,500  of  1,954.    Elapsed: 0:35:09.
  Batch 1,600  of  1,954.    Elapsed: 0:37:30.
  Batch 1,700  of  1,954.    Elapsed: 0:39:51.
  Batch 1,800  of  1,954.    Elapsed: 0:42:12.
  Batch 1,900  of  1,954.    Elapsed: 0:44:33.

  Average training loss: 0.01
  Training epoch took: 0:45:48

Training...
  Batch   100  of  

In [4]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Function to get predictions from the model
def get_predictions(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []

    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.append(np.argmax(logits, axis=1))
        true_labels.append(label_ids)

    return np.concatenate(predictions), np.concatenate(true_labels)

# Get predictions and true labels for the test set
test_preds, test_labels = get_predictions(model, test_dataloader, device)

# Step 2: Calculate Accuracy
test_accuracy = accuracy_score(test_labels, test_preds)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Step 3: Generate Classification Report
print("\nClassification Report:")
print(classification_report(test_labels, test_preds, target_names=['Negative', 'Positive']))

# Step 4: Confusion Matrix
conf_matrix = confusion_matrix(test_labels, test_preds)
print("\nConfusion Matrix:")
print(conf_matrix)

# Step 5: Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()


Validation Accuracy: 0.9999
Precision: 0.9999
Recall: 0.9999
F1 Score: 0.9999

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2169
           1       1.00      1.00      1.00     13457

    accuracy                           1.00     15626
   macro avg       1.00      1.00      1.00     15626
weighted avg       1.00      1.00      1.00     15626

