# <Font color = 'indianred'>**Sentiment Analysis using Hugging Face Ecosystem** </font>

## <Font color = 'indianred'>**1. Set Environment**

In [None]:
import sys

# If in Colab, then import the drive module from google.colab
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  # Mount the Google Drive to access files stored there
  drive.mount('/content/drive')

  # Install the latest version of torchtext library quitely without showing output
  !pip install torchtext -qq
  !pip install transformers evaluate wandb datasets accelerate -U -qq
  !pip install transformers evaluate wandb datasets accelerate peft bitsandbytes -U -qq

  basepath = '/content/drive/MyDrive/NLP/Projects'

else:
  basepath = '/content/drive/MyDrive/NLP/Projects'

In [None]:
# Standard data science libraries for data handling and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import random

# Importing PyTorch library for tensor computations and neural network modules
import torch
import torch.nn as nn

# For working with textual data vocabularies and for displaying model summaries
from torchtext.vocab import vocab

# Load data files
from datasets import load_dataset, DatasetDict
from datasets import Dataset

# Utilities for efficient serialization/deserialization of Python objects and for element tallying
import joblib
from collections import Counter

# For creating lightweight attribute classes and for partial function application
from functools import partial

# For filesystem path handling, generating and displaying confusion matrices, and date-time manipulations
from sklearn.metrics import confusion_matrix
from datetime import datetime

# New libraries introduced in this notebook
from sklearn.metrics import multilabel_confusion_matrix, precision_score, recall_score, f1_score
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PreTrainedModel, PretrainedConfig
from transformers import BitsAndBytesConfig

import wandb
import evaluate
from peft import (
    TaskType,
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
)

In [None]:
# # # Setting up the device for GPU usage

# from torch import cuda
# device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Project folders, all resolved relative to `basepath`
base_folder = Path(basepath)
data_folder = base_folder.joinpath('datasets/aclImdb')
model_folder = base_folder.joinpath('models/imdb/nn')
custom_functions = base_folder.joinpath('custom-functions')

In [None]:
model_folder.mkdir(exist_ok=True, parents = True)

In [None]:
model_folder

## <Font color = 'indianred'>**2. Load  data**

In [None]:
# load the data from hugging face
emotion_data = load_dataset('harpreetmann/train_emotion_spring_2024')

In [None]:
emotion_data['train'][0:2]

In [None]:
labels = ['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [None]:
#train_data = load_dataset('csv', data_files= str(base_folder/'emotion_detection_train.csv'))
test_data = load_dataset('csv', data_files= str(base_folder/'emotion_detection_test.csv'))

In [None]:
test_data

In [None]:
test_data['train'][0]

In [None]:
# labels = [label for label in test_data['train'].features.keys() if label not in ['ID', 'Tweet']]
# labels

In [None]:
# Keep only the tweet text of the test CSV: drop the ID and the eleven emotion
# columns, then rename 'Tweet' to 'text' (the column name tokenize_fn reads).
columns_to_drop = [
    'ID',
    'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
    'optimism', 'pessimism', 'sadness', 'surprise', 'trust',
]
test_data_text = test_data.remove_columns(columns_to_drop).rename_column('Tweet', 'text')

In [None]:
# Initialize counters for each label in train and valid sets
train_label_counts = Counter()
valid_label_counts = Counter()

# Function to update counts
def update_label_counts(dataset, label_counts):
    """Add the number of positive entries per label position to ``label_counts``.

    Args:
        dataset: Mapping whose ``'label'`` entry is an iterable of label rows;
            each row is a sequence of binary flags.
        label_counts: Counter keyed by label position; updated in place.
    """
    for row in dataset['label']:
        # Positions whose flag equals 1 are the labels active for this example
        active_positions = [pos for pos, flag in enumerate(row) if flag == 1]
        for pos in active_positions:
            label_counts[pos] += 1

# Tally the positive labels of the train and validation splits
for split_name, split_counter in (('train', train_label_counts), ('valid', valid_label_counts)):
    update_label_counts(emotion_data[split_name], split_counter)

print(train_label_counts)

# Print one "Label <name>: <count>" line per label index, split by split
for header, split_counter in (
    ("Training set label distribution:", train_label_counts),
    ("\nValidation set label distribution:", valid_label_counts),
):
    print(header)
    for label_idx, count in split_counter.items():
        print(f"Label {labels[label_idx]}: {count}")

In [None]:
# Side-by-side bar charts of per-label counts for the training and validation splits.
#
# The names and counts below are hard-coded for this figure only. They live in
# plot-specific variables so that the global `labels` list (11 classes) — used by
# later cells to build id2label/label2id and to decode predictions — is not
# overwritten by this 10-item, differently ordered list.
# NOTE(review): these numbers are not derived from train_label_counts /
# valid_label_counts computed in the previous cell — TODO confirm they are current.
plot_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'love', 'anticipation', 'optimism']
train_counts = [2306, 2330, 1084, 2293, 1850, 714, 306, 656, 891, 1818]
valid_counts = [553, 591, 279, 584, 423, 82, 94, 176, 211, 473]

x = range(len(plot_labels))  # the label locations

# Create two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# (axis, counts, bar colour, split name) for each panel
panels = (
    (ax1, train_counts, 'skyblue', 'Training Set'),
    (ax2, valid_counts, 'coral', 'Validation Set'),
)
for ax, counts, color, split_label in panels:
    ax.bar(x, counts, color=color, alpha=0.7, label=split_label)
    ax.set_title(f'{split_label} Label Distribution')
    ax.set_xticks(x)
    ax.set_xticklabels(plot_labels, rotation=45, ha='right')
    ax.set_xlabel('Labels')
    ax.set_ylabel('Count')
    ax.legend()

# Tight layout
fig.tight_layout()

# Display the plot
plt.show()

# <Font color = 'skyblue'>**Task 1 -part A (LoRA)** </font>

## <Font color = 'indianred'>**1. Load pre-trained Tokenizer** </font>

In [None]:
# google-gemma
checkpoint = "google/gemma-1.1-2b-it"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_fn(batch):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    Truncation is enabled; whatever the tokenizer returns is passed through unchanged.
    """
    texts = batch["text"]
    return tokenizer(text=texts, truncation=True)

In [None]:
# Tokenize every split in batches, then drop the raw 'text' column
tokenized_dataset = (
    emotion_data
    .map(tokenize_fn, batched=True)
    .remove_columns(['text'])
)
# tokenized_dataset.set_format(type='torch')

In [None]:
tokenized_dataset

In [None]:
# Same tokenization for the unlabeled test tweets; the raw text is dropped afterwards
tokenized_test_dataset = (
    test_data_text
    .map(tokenize_fn, batched=True)
    .remove_columns(['text'])
)

In [None]:
tokenized_test_dataset

##  <font color = 'indianred'> **2. Model Training**

###  <font color = 'indianred'> **2.1. compute_metrics function** </font>


In [None]:
accuracy_metric = evaluate.load('accuracy', 'multilabel')
f1 = evaluate.load('f1','multilabel')


def compute_metrics(eval_pred):
    """Compute micro/macro F1 and accuracy for multi-label predictions.

    Args:
        eval_pred: ``(logits, labels)`` pair; ``logits`` must support ``> 0`` and
            ``.astype`` (array-like).

    Returns:
        dict with keys ``'f1_micro'``, ``'f1_macro'`` and ``'accuracy'``.
    """
    logits, labels = eval_pred
    # A positive logit is counted as a predicted label
    preds = (logits > 0).astype(int)

    accuracy = accuracy_metric.compute(predictions=preds, references=labels)['accuracy']
    f1_scores = {
        f'f1_{averaging}': f1.compute(predictions=preds, references=labels, average=averaging)['f1']
        for averaging in ('micro', 'macro')
    }
    return {**f1_scores, 'accuracy': accuracy}

###  <font color = 'indianred'> **2.2. Training Arguments** </font>

In [None]:
# Checkpoints for this run are written to <basepath>/models/<run_name>
run_name = "emotions_google_gemma"
base_folder = Path(basepath)
model_folder = base_folder.joinpath("models", run_name)
model_folder.mkdir(parents=True, exist_ok=True)  # create the directory if needed

# Optimisation schedule
optimization_args = dict(
    num_train_epochs=5,
    per_device_train_batch_size=8,   # samples per device per forward pass
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,   # 8 x 4 = 32 samples per device per optimizer step
    weight_decay=0.1,
    learning_rate=1e-4,
    lr_scheduler_type='linear',
    warmup_steps=0,                  # no learning-rate warmup
    optim='adamw_torch',
    max_grad_norm=1.0,               # gradient clipping threshold
)

# Checkpointing and evaluation: both happen every 20 training steps
checkpoint_args = dict(
    output_dir=str(model_folder),
    evaluation_strategy='steps',
    eval_steps=20,
    save_strategy="steps",
    save_steps=20,
    load_best_model_at_end=True,     # reload the best checkpoint when training ends
    save_total_limit=2,              # cap on the number of checkpoints kept on disk
    # Macro-averaged F1 on the eval set selects the best model; higher is better
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
)

# Experiment logging to Weights & Biases, every 20 steps
logging_args = dict(
    logging_strategy='steps',
    logging_steps=20,
    report_to='wandb',
    run_name=run_name,
)

training_args = TrainingArguments(
    **optimization_args,
    **checkpoint_args,
    **logging_args,
    fp16=False,  # mixed precision disabled (bf16 / tf32 are left at their defaults)
)

### <Font color='indianred'> **2.3. Specify Model** <font/>

In [None]:
# Load the checkpoint with an 11-way multi-label classification head
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=11,
    problem_type="multi_label_classification",
)

# Attach readable label names to the config the model was actually built with.
# Replacing model.config with a freshly loaded AutoConfig would drop the
# num_labels / problem_type values passed above, so the existing config is edited.
id2label = {id_: label_ for id_, label_ in enumerate(labels)}
label2id = {label_: id_ for id_, label_ in enumerate(labels)}
model.config.id2label = id2label
model.config.label2id = label2id
config = model.config  # keep the `config` name available to later cells

In [None]:
# model = model.to(device)

In [None]:
model

### <Font color='indianred'> **2.4. LoRA Setup** <font/>

In [None]:
from peft import (
    TaskType,
    LoraConfig,
    get_peft_model,
)

In [None]:
import re

# Scan the model's printed structure for entries of the form "(name): Linear..."
# and collect the names; any class whose name starts with "Linear" is matched.
model_modules = str(model.modules)
pattern = r'\((\w+)\): Linear'
linear_layer_names = re.findall(pattern, model_modules)

# De-duplicate the names (element order is not guaranteed after the set round-trip)
names = list(linear_layer_names)
target_modules = list(set(names))
target_modules

In [None]:
# Modules that receive LoRA adapters in this run (hand-picked list)
lora_target_modules = ['gate_proj', 'k_proj', 'o_proj', 'q_proj', 'score', 'v_proj']

# Sequence-classification LoRA setup: rank 128 with lora_alpha set to twice the rank
gemma_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=128,
    lora_alpha=256,
    lora_dropout=0.01,
    bias="lora_only",
    target_modules=lora_target_modules,
)

# Wrap the base model and report how many parameters remain trainable
gemma_model = get_peft_model(model, gemma_config)
gemma_model.print_trainable_parameters()

In [None]:
gemma_config.target_modules

In [None]:
gemma_model

### <Font color='indianred'> **2.5. Custom Trainer**<font/>

In [None]:
def calculate_pos_weights(dataset):
    """Return the per-label negative/positive count ratio of the training split.

    Args:
        dataset: Mapping such that ``dataset['train']['label']`` is a sequence of
            label rows, each a sequence of binary flags.

    Returns:
        1-D ``torch.Tensor`` with one ratio per label position. A label with no
        positives uses a denominator of 1 to avoid division by zero.
    """
    label_rows = dataset['train']['label']
    width = len(label_rows[0])  # number of labels, taken from the first row
    pos_counts = [0] * width
    neg_counts = [0] * width

    # Every flag equal to 1 counts as a positive; anything else as a negative
    for row in label_rows:
        for col, flag in enumerate(row):
            bucket = pos_counts if flag == 1 else neg_counts
            bucket[col] += 1

    ratios = [neg_counts[col] / max(pos_counts[col], 1) for col in range(width)]
    return torch.tensor(ratios)

# Calculate the pos_weight using the training set
pos_weights = calculate_pos_weights(emotion_data)


In [None]:
pos_weights

In [None]:
# Hand-picked per-label positive weights (11 values). This assignment replaces the
# tensor returned by calculate_pos_weights(emotion_data) in the earlier cell, so the
# loss in CustomTrainer uses these values instead of the computed ratios.
pos_weights= torch.tensor([2., 3., 2., 2., 2., 3., 2., 3., 2., 4., 4.])

In [None]:
class CustomTrainer(Trainer):
    """Trainer that scores multi-label logits with a class-weighted BCE loss.

    The per-label positive weights are read from the module-level ``pos_weights``
    tensor each time the loss is computed.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute ``BCEWithLogitsLoss`` between the model logits and the batch labels.

        Args:
            model: Model to run; called as ``model(**inputs)`` without the labels.
            inputs: Batch mapping. Its ``"labels"`` entry is removed (popped) and
                cast to float, as required by the BCE loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for compatibility with newer ``Trainer``
                versions that pass this keyword; not used by this loss.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple if ``return_outputs``.
        """
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # pos_weight up-weights positive targets per label to counter class imbalance;
        # it is moved to the logits' device so both operands of the loss agree.
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weights.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


In [None]:
trainer = CustomTrainer(
    model=gemma_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)

### <font color='indianred'>**2.6. Setup WanDB**<font/>

In [None]:
wandb.login()
%env WANDB_PROJECT = emotions_kaggle_S2024

### <font color = 'indianred'> **2.7. Start Training** <font/>

In [None]:
trainer.train()  # start training

In [None]:
trainer.train()  # start training

### <font color='indianred'> **2.8. Validation**<font/>

In [None]:
eval_results = trainer.evaluate(tokenized_dataset["valid"])

In [None]:
eval_results

In [None]:
# Log the final validation metrics to the active W&B run
summary_keys = ("eval_accuracy", "eval_loss", "eval_f1_micro", "eval_f1_macro")
wandb.log({key: eval_results[key] for key in summary_keys})

###  <font color = 'indianred'> **Check Confusion Matrix**</font>

In [None]:
# Use the trainer to generate predictions on the tokenized validation dataset.
# The resulting object, valid_output, will contain the model's logits (raw prediction scores) for each input in the validation set.
valid_output = trainer.predict(tokenized_dataset["valid"])

In [None]:
predictions_valid = (valid_output.predictions > 0).astype(int)
labels_valid = valid_output.label_ids.astype(int)

In [None]:
y_true = labels_valid
y_pred = predictions_valid
class_names = labels

# One 2x2 confusion matrix per class
mcm = multilabel_confusion_matrix(y_true, y_pred)

# 1. Individual heatmaps, one figure per class
column_ticks = ['Predicted Negative', 'Predicted Positive']
row_ticks = ['True Negative', 'True Positive']
for idx, matrix in enumerate(mcm):
    plt.figure(figsize=(5, 4))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='g',
        cmap='Blues',
        xticklabels=column_ticks,
        yticklabels=row_ticks,
    )
    plt.title(f'Confusion Matrix for {class_names[idx]}')
    plt.show()

# 2. Per-class precision / recall / F1 (average=None keeps one score per class)
precision_per_class = precision_score(y_true, y_pred, average=None)
recall_per_class = recall_score(y_true, y_pred, average=None)
f1_per_class = f1_score(y_true, y_pred, average=None)

metric_columns = ('Precision', 'Recall', 'F1-Score')
metric_values = (precision_per_class, recall_per_class, f1_per_class)
metrics_df = pd.DataFrame(dict(zip(metric_columns, metric_values)), index=class_names)

plt.figure(figsize=(10, 8))
ax = sns.heatmap(metrics_df, annot=True, cmap='Blues')
plt.title('Metrics for each class')
plt.tight_layout()  # keep tick labels from being cut off

# Log the heatmap to wandb before showing it
wandb.log({"Metrics Heatmap": wandb.Image(ax.get_figure())})
plt.show()

# 3. Grouped bar chart of the same three metrics
metrics_df.plot.bar(figsize=(12, 7))
plt.ylabel('Score')
plt.title('Precision, Recall, and F1-Score for Each Class')
plt.show()

In [None]:
wandb.finish()


###  <font color = 'indianred'> **Save the model on HuggingFace**</font>

In [None]:
# Trainer.push_to_hub() takes the commit message as its first positional argument,
# not a repository id, so the target repo is set through `hub_model_id` instead.
# NOTE(review): assumes this trainer has not already initialised a Hub repo — confirm.
trainer.args.hub_model_id = "yunaseo/google_gemma_emotion_detection"
trainer.push_to_hub()

## <Font color = 'indianred'>**3. Test Data Prediction** </font>

In [None]:
# Make predictions
predictions = trainer.predict(tokenized_test_dataset["train"])

In [None]:
# Post-processing for multi-label classification.
# `predictions.predictions` holds raw logits, so the probability threshold is mapped
# into logit space first (logit(0.5) == 0). This matches the `logits > 0` cut-off
# used for the validation metrics; comparing logits against 0.5 directly would not.
threshold = 0.5  # probability threshold
logit_threshold = np.log(threshold / (1 - threshold))
predicted_binary = (predictions.predictions > logit_threshold).astype(int)

# Convert each binary row into the list of label names that are switched on
predicted_labels = [
    [label for label, binary in zip(labels, binary_labels) if binary]
    for binary_labels in predicted_binary
]

# Show a small sample instead of dumping every prediction
print(predicted_labels[:5])

In [None]:
predicted_labels[0]

In [None]:
test_data

In [None]:
# Extracting data from the 'train' split of test_data
test_data_train = test_data['train']
tweet_ids = test_data_train['ID']
num_tweets = len(tweet_ids)

In [None]:
# One column of zeros per emotion, one entry per tweet
label_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
predicted_labels_dict = {category: [0] * num_tweets for category in label_columns}

# Switch on the predicted emotions of every tweet. The loop variable is deliberately
# not called `labels`: that would overwrite the global label list used by later cells.
for i, row_labels in enumerate(predicted_labels):
    for label in row_labels:
        predicted_labels_dict[label][i] = 1

In [None]:
# Create DataFrame
df = pd.DataFrame(predicted_labels_dict)

# Insert tweet IDs as the first column
df.insert(0, 'ID', tweet_ids)

In [None]:
df

In [None]:
df.to_csv('gemma_predicted.csv', index=False)

In [None]:
# Browser download of the predictions file; only available when running in Colab
if 'google.colab' in str(get_ipython()):
    from google.colab import files
    files.download('gemma_predicted.csv')

In [None]:
files.download('gemma_predicted.csv')

# <Font color = 'skyblue'>**Task 1 -part B (QLoRA)** </font>

## <Font color = 'indianred'>**1. Load pre-trained Tokenizer** </font>

In [None]:
# google-gemma
checkpoint = "google/gemma-1.1-2b-it"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_fn(batch):
    """Run the module-level ``tokenizer`` over ``batch["text"]`` with truncation on."""
    texts = batch["text"]
    return tokenizer(text=texts, truncation=True)

In [None]:
# Tokenize every split in batches, then drop the raw 'text' column
tokenized_dataset = (
    emotion_data
    .map(tokenize_fn, batched=True)
    .remove_columns(['text'])
)
# tokenized_dataset.set_format(type='torch')

In [None]:
tokenized_dataset

In [None]:
# Same tokenization for the unlabeled test tweets; the raw text is dropped afterwards
tokenized_test_dataset = (
    test_data_text
    .map(tokenize_fn, batched=True)
    .remove_columns(['text'])
)

In [None]:
tokenized_test_dataset

##  <font color = 'indianred'> **2. Model Training**

###  <font color = 'indianred'> **2.1. compute_metrics function** </font>


In [None]:
accuracy_metric = evaluate.load('accuracy', 'multilabel')
f1 = evaluate.load('f1','multilabel')


def compute_metrics(eval_pred):
    """Compute micro/macro F1 and accuracy for multi-label predictions.

    Args:
        eval_pred: ``(logits, labels)`` pair; ``logits`` must support ``> 0`` and
            ``.astype`` (array-like).

    Returns:
        dict with keys ``'f1_micro'``, ``'f1_macro'`` and ``'accuracy'``.
    """
    logits, labels = eval_pred
    # A positive logit is counted as a predicted label
    preds = (logits > 0).astype(int)

    accuracy = accuracy_metric.compute(predictions=preds, references=labels)['accuracy']
    f1_scores = {
        f'f1_{averaging}': f1.compute(predictions=preds, references=labels, average=averaging)['f1']
        for averaging in ('micro', 'macro')
    }
    return {**f1_scores, 'accuracy': accuracy}

###  <font color = 'indianred'> **2.2. Training Arguments** </font>

In [None]:
# Define the directory where model checkpoints will be saved.
# The QLoRA run gets its own name: reusing the LoRA run's "emotions_google_gemma"
# would send both runs' checkpoints to the same folder and give them the same
# Weights & Biases run name.
run_name = "emotions_google_gemma_qlora"
base_folder = Path(basepath)
model_folder = base_folder / "models" / run_name
# Create the directory if it doesn't exist
model_folder.mkdir(exist_ok=True, parents=True)

# Configure training parameters
training_args = TrainingArguments(
    # Training-specific configurations
    num_train_epochs=5,  # Total number of training epochs
    # Number of samples per batch for each device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,  # 16 x 4 = 64 samples per device per optimizer step

    weight_decay=0.1,
    learning_rate=1e-4,  # Step size for the optimizer during training
    lr_scheduler_type='linear',
    warmup_steps=0,  # No learning-rate warmup
    optim='adamw_torch',
    max_grad_norm=1.0,  # Gradient clipping threshold

    # Checkpoint saving and model evaluation settings
    output_dir=str(model_folder),  # Directory to save model checkpoints
    evaluation_strategy='steps',  # Evaluate model at specified step intervals
    eval_steps=20,  # Perform evaluation every 20 training steps
    save_strategy="steps",  # Save model checkpoint at specified step intervals
    save_steps=20,  # Save a model checkpoint every 20 training steps
    load_best_model_at_end=True,  # Reload the best model at the end of training
    save_total_limit=2,  # Cap on the number of checkpoints kept on disk
    # Macro-averaged F1 on the eval set selects the best model; higher is better
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,

    # Experiment logging configuration
    logging_strategy='steps',
    logging_steps=20,
    report_to='wandb',  # Log metrics and results to Weights & Biases platform
    run_name=run_name,  # Experiment name for Weights & Biases

    fp16=False,  # mixed precision disabled (bf16 / tf32 are left at their defaults)
)

### <Font color='indianred'> **2.3. Specify Model** <font/>

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

In [None]:
# Load the checkpoint quantized per `bnb_config`, with an 11-way multi-label head
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=11,
    problem_type="multi_label_classification",
    quantization_config=bnb_config,
)

# PEFT helper that readies a quantized model for adapter training
model = prepare_model_for_kbit_training(model)

# Attach readable label names to the config the model was actually built with.
# Replacing model.config with a freshly loaded AutoConfig would drop the
# num_labels / problem_type values passed above, so the existing config is edited.
id2label = {id_: label_ for id_, label_ in enumerate(labels)}
label2id = {label_: id_ for id_, label_ in enumerate(labels)}
model.config.id2label = id2label
model.config.label2id = label2id
config = model.config  # keep the `config` name available to later cells

In [None]:
# NOTE(review): this cell repeats the previous cell's model load — TODO confirm the
# second load is intentional.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=11,
    problem_type="multi_label_classification",
    quantization_config=bnb_config,
)

# PEFT helper that readies a quantized model for adapter training
model = prepare_model_for_kbit_training(model)

# Attach readable label names to the config the model was actually built with,
# rather than replacing it with a fresh AutoConfig that lacks the settings above.
id2label = {id_: label_ for id_, label_ in enumerate(labels)}
label2id = {label_: id_ for id_, label_ in enumerate(labels)}
model.config.id2label = id2label
model.config.label2id = label2id
config = model.config  # keep the `config` name available to later cells

In [None]:
# model = model.to(device)

In [None]:
model

### <Font color='indianred'> **2.4. LoRA Setup** <font/>

In [None]:
import re

# Scan the model's printed structure for entries of the form "(name): Linear..."
# and collect the names; any class whose name starts with "Linear" is matched.
model_modules = str(model.modules)
pattern = r'\((\w+)\): Linear'
linear_layer_names = re.findall(pattern, model_modules)

# De-duplicate the names (element order is not guaranteed after the set round-trip)
names = list(linear_layer_names)
target_modules = list(set(names))
target_modules

In [None]:
# Sequence-classification LoRA setup: rank 128 with lora_alpha set to twice the rank.
# Adapters go on every module name collected into `target_modules` by the cell above.
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,
    r=128,
    lora_alpha=256,
    lora_dropout=0.01,
    bias="lora_only",
    target_modules=target_modules,
)
gemma_config = LoraConfig(**lora_settings)

# Wrap the quantized base model and report how many parameters remain trainable
gemma_model = get_peft_model(model, gemma_config)
gemma_model.print_trainable_parameters()

In [None]:
gemma_config.target_modules

In [None]:
gemma_model

### <Font color='indianred'> **2.5. Custom Trainer**<font/>

In [None]:
def calculate_pos_weights(dataset):
    """Return the per-label negative/positive count ratio of the training split.

    Args:
        dataset: Mapping such that ``dataset['train']['label']`` is a sequence of
            label rows, each a sequence of binary flags.

    Returns:
        1-D ``torch.Tensor`` with one ratio per label position. A label with no
        positives uses a denominator of 1 to avoid division by zero.
    """
    label_rows = dataset['train']['label']
    width = len(label_rows[0])  # number of labels, taken from the first row
    pos_counts = [0] * width
    neg_counts = [0] * width

    # Every flag equal to 1 counts as a positive; anything else as a negative
    for row in label_rows:
        for col, flag in enumerate(row):
            bucket = pos_counts if flag == 1 else neg_counts
            bucket[col] += 1

    ratios = [neg_counts[col] / max(pos_counts[col], 1) for col in range(width)]
    return torch.tensor(ratios)

# Calculate the pos_weight using the training set
pos_weights = calculate_pos_weights(emotion_data)


In [None]:
pos_weights

In [None]:
# Hand-picked per-label positive weights (11 values). This assignment replaces the
# tensor returned by calculate_pos_weights(emotion_data) in the earlier cell, so the
# loss in CustomTrainer uses these values instead of the computed ratios.
pos_weights= torch.tensor([2., 3., 2., 2., 2., 3., 2., 3., 2., 4., 4.])

In [None]:
class CustomTrainer(Trainer):
    """Trainer that scores multi-label logits with a class-weighted BCE loss.

    The per-label positive weights are read from the module-level ``pos_weights``
    tensor each time the loss is computed.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute ``BCEWithLogitsLoss`` between the model logits and the batch labels.

        Args:
            model: Model to run; called as ``model(**inputs)`` without the labels.
            inputs: Batch mapping. Its ``"labels"`` entry is removed (popped) and
                cast to float, as required by the BCE loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for compatibility with newer ``Trainer``
                versions that pass this keyword; not used by this loss.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple if ``return_outputs``.
        """
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # pos_weight up-weights positive targets per label to counter class imbalance;
        # it is moved to the logits' device so both operands of the loss agree.
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weights.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


In [None]:
trainer = CustomTrainer(
    model=gemma_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)

### <font color='indianred'>**2.6. Setup WanDB**<font/>

In [None]:
wandb.login()
%env WANDB_PROJECT = emotions_kaggle_S2024

### <font color = 'indianred'> **2.7. Start Training** <font/>

In [None]:
trainer.train()  # start training

In [None]:
trainer.train()  # start training

### <font color='indianred'> **2.8. Validation**<font/>

In [None]:
eval_results = trainer.evaluate(tokenized_dataset["valid"])

In [None]:
eval_results

In [None]:
# Log the final validation metrics to the active W&B run
summary_keys = ("eval_accuracy", "eval_loss", "eval_f1_micro", "eval_f1_macro")
wandb.log({key: eval_results[key] for key in summary_keys})

###  <font color = 'indianred'> **Check Confusion Matrix**</font>

In [None]:
# Use the trainer to generate predictions on the tokenized validation dataset.
# The resulting object, valid_output, will contain the model's logits (raw prediction scores) for each input in the validation set.
valid_output = trainer.predict(tokenized_dataset["valid"])

In [None]:
predictions_valid = (valid_output.predictions > 0).astype(int)
labels_valid = valid_output.label_ids.astype(int)

In [None]:
y_true = labels_valid
y_pred = predictions_valid
class_names = labels

# One 2x2 confusion matrix per class
mcm = multilabel_confusion_matrix(y_true, y_pred)

# 1. Individual heatmaps, one figure per class
column_ticks = ['Predicted Negative', 'Predicted Positive']
row_ticks = ['True Negative', 'True Positive']
for idx, matrix in enumerate(mcm):
    plt.figure(figsize=(5, 4))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='g',
        cmap='Blues',
        xticklabels=column_ticks,
        yticklabels=row_ticks,
    )
    plt.title(f'Confusion Matrix for {class_names[idx]}')
    plt.show()

# 2. Per-class precision / recall / F1 (average=None keeps one score per class)
precision_per_class = precision_score(y_true, y_pred, average=None)
recall_per_class = recall_score(y_true, y_pred, average=None)
f1_per_class = f1_score(y_true, y_pred, average=None)

metric_columns = ('Precision', 'Recall', 'F1-Score')
metric_values = (precision_per_class, recall_per_class, f1_per_class)
metrics_df = pd.DataFrame(dict(zip(metric_columns, metric_values)), index=class_names)

plt.figure(figsize=(10, 8))
ax = sns.heatmap(metrics_df, annot=True, cmap='Blues')
plt.title('Metrics for each class')
plt.tight_layout()  # keep tick labels from being cut off

# Log the heatmap to wandb before showing it
wandb.log({"Metrics Heatmap": wandb.Image(ax.get_figure())})
plt.show()

# 3. Grouped bar chart of the same three metrics
metrics_df.plot.bar(figsize=(12, 7))
plt.ylabel('Score')
plt.title('Precision, Recall, and F1-Score for Each Class')
plt.show()

In [None]:
wandb.finish()


###  <font color = 'indianred'> **Save the model on HuggingFace**</font>

In [None]:
# Trainer.push_to_hub() takes the commit message as its first positional argument,
# not a repository id, so the target repo is set through `hub_model_id` instead.
# NOTE(review): assumes this trainer has not already initialised a Hub repo — confirm.
trainer.args.hub_model_id = "yunaseo/google_gemma_qlora_emotion_detection"
trainer.push_to_hub()

## <Font color = 'indianred'>**3. Test Data Prediction** </font>

In [None]:
# Make predictions
predictions = trainer.predict(tokenized_test_dataset["train"])

In [None]:
# Post-processing for multi-label classification.
# `predictions.predictions` holds raw logits, so the probability threshold is mapped
# into logit space first (logit(0.5) == 0). This matches the `logits > 0` cut-off
# used for the validation metrics; comparing logits against 0.5 directly would not.
threshold = 0.5  # probability threshold
logit_threshold = np.log(threshold / (1 - threshold))
predicted_binary = (predictions.predictions > logit_threshold).astype(int)

# Convert each binary row into the list of label names that are switched on
predicted_labels = [
    [label for label, binary in zip(labels, binary_labels) if binary]
    for binary_labels in predicted_binary
]

# Show a small sample instead of dumping every prediction
print(predicted_labels[:5])

In [None]:
predicted_labels[0]

In [None]:
test_data

In [None]:
# Extracting data from the 'train' split of test_data
test_data_train = test_data['train']
tweet_ids = test_data_train['ID']
num_tweets = len(tweet_ids)

In [None]:
# One column of zeros per emotion, one entry per tweet
label_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
predicted_labels_dict = {category: [0] * num_tweets for category in label_columns}

# Switch on the predicted emotions of every tweet. The loop variable is deliberately
# not called `labels`: that would overwrite the global label list used by later cells.
for i, row_labels in enumerate(predicted_labels):
    for label in row_labels:
        predicted_labels_dict[label][i] = 1

In [None]:
# Create DataFrame
df = pd.DataFrame(predicted_labels_dict)

# Insert tweet IDs as the first column
df.insert(0, 'ID', tweet_ids)

In [None]:
df

In [None]:
df.to_csv('gemma_qlora_predicted.csv', index=False)

In [None]:
# Browser download of the predictions file; only available when running in Colab
if 'google.colab' in str(get_ipython()):
    from google.colab import files
    files.download('gemma_qlora_predicted.csv')

# <Font color = 'skyblue'>**Task 2. MTEB(BERT) with QLoRA** </font>

## <Font color = 'indianred'>**1. Load pre-trained Tokenizer** </font>

In [None]:
# bert-base-uncased
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_fn(batch):
    """Run the module-level ``tokenizer`` over ``batch["text"]`` with truncation on."""
    texts = batch["text"]
    return tokenizer(text=texts, truncation=True)

In [None]:
# Tokenize every split in batches, then drop the raw 'text' column
tokenized_dataset = (
    emotion_data
    .map(tokenize_fn, batched=True)
    .remove_columns(['text'])
)
# tokenized_dataset.set_format(type='torch')

In [None]:
tokenized_dataset

In [None]:
# Same tokenization for the unlabeled test tweets; the raw text is dropped afterwards
tokenized_test_dataset = (
    test_data_text
    .map(tokenize_fn, batched=True)
    .remove_columns(['text'])
)

In [None]:
tokenized_test_dataset

##  <font color = 'indianred'> **2. Model Training**

###  <font color = 'indianred'> **2.1. compute_metrics function** </font>


In [None]:
# Multilabel variants of the metrics, loaded once and reused on every evaluation
accuracy_metric = evaluate.load('accuracy', 'multilabel')
f1 = evaluate.load('f1','multilabel')


def compute_metrics(eval_pred):
    """Return micro-F1, macro-F1 and accuracy for a (logits, labels) pair.

    A label counts as predicted when its logit is positive.
    """
    logits, references = eval_pred
    binary_preds = (logits > 0).astype(int)

    scores = {}
    for averaging in ('micro', 'macro'):
        result = f1.compute(predictions=binary_preds, references=references, average=averaging)
        scores[f'f1_{averaging}'] = result['f1']
    scores['accuracy'] = accuracy_metric.compute(
        predictions=binary_preds, references=references
    )['accuracy']
    return scores

###  <font color = 'indianred'> **2.2. Training Arguments** </font>

In [None]:
# Define the directory where model checkpoints will be saved
run_name = "emotions_bert_qlora"
base_folder = Path(basepath)
model_folder = base_folder / "models"/run_name
# Create the directory if it doesn't exist
model_folder.mkdir(exist_ok=True, parents=True)

# Configure training parameters
training_args = TrainingArguments(
    # Training-specific configurations
    num_train_epochs=10,  # Total number of training epochs
    # Number of samples per batch on each device (training / evaluation)
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # auto_find_batch_size=True,
    # NOTE(review): 1.0 is a very strong decay (values near 0.01 are more usual) - confirm it is intended
    weight_decay=1.0,  # Weight-decay coefficient handed to the optimizer
    learning_rate=1e-4,  # Initial learning rate
    lr_scheduler_type='linear',
    warmup_steps=0,  # Number of warmup steps for the learning rate scheduler
    optim='adamw_torch',  # Optimizer

    # Checkpoint saving and model evaluation settings
    output_dir=str(model_folder),  # Directory to save model checkpoints
    # NOTE(review): recent transformers releases appear to rename this argument
    # to `eval_strategy` - confirm against the installed version
    evaluation_strategy='steps',  # Evaluate model at specified step intervals
    eval_steps=20,  # Perform evaluation every 20 training steps
    save_strategy="steps",  # Save model checkpoint at specified step intervals
    save_steps=20,  # Save a model checkpoint every 20 training steps
    load_best_model_at_end=True,  # Reload the best model at the end of training
    save_total_limit=2,  # Keep at most two checkpoints on disk
    # Use macro-F1 on the evaluation set to determine the best model
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,  # A model is 'better' if its macro-F1 is higher


    # Experiment logging configurations
    logging_strategy='steps',
    logging_steps=20,  # Log every 20 training steps
    report_to='wandb',  # Log metrics and results to Weights & Biases platform
    run_name=run_name,  # Experiment name for Weights & Biases

    fp16=True,  # Mixed-precision (fp16) training
)


### <Font color='indianred'> **2.3. Specify Model** </font>

In [None]:
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  llm_int8_skip_modules = ['score'],
  bnb_4bit_quant_type="nf4",
  bnb_4bit_use_double_quant=True,
  bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,
                                                           num_labels=11,
                                                           problem_type="multi_label_classification" )


model = prepare_model_for_kbit_training(model)

config = AutoConfig.from_pretrained(checkpoint)
id2label= {id_: label_ for id_, label_ in enumerate(labels)}
label2id = {label_: id_ for id_, label_ in enumerate(labels)}
config.id2label = id2label
config.label2id = label2id
model.config = config
model.config.pad_token_id = tokenizer.pad_token_id
# model

In [None]:
model

In [None]:
model.config

### <Font color='indianred'> **2.4. QLoRA Setup** </font>

In [None]:
from peft import (
    TaskType,
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
)

In [None]:
import re

# The string form of `model.modules` contains the printed module tree, with
# one "(name): ClassName(...)" entry per submodule. Capture the attribute name
# of every entry whose class name starts with "Linear".
model_modules = str(model.modules)
pattern = r'\((\w+)\): Linear'
linear_layer_names = re.findall(pattern, model_modules)

# Collapse repeated names (the same layer name recurs in every encoder block).
names = list(linear_layer_names)
target_modules = list(set(names))
target_modules

In [None]:
# LoRA adapter settings for the sequence-classification model.
bert_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    # Layers that receive LoRA adapters, matched by module name
    target_modules=['dense', 'query', 'value', 'key'],
    # The classification head is trained and saved in full alongside the adapters
    modules_to_save=['classifier'],
    r=128,
    lora_alpha=256,
    lora_dropout=0.01,
    bias="lora_only",
)
bert_model = get_peft_model(model, bert_config)
bert_model.print_trainable_parameters()

In [None]:
bert_config.target_modules

In [None]:
bert_model

### <Font color='indianred'> **2.5. Custom Trainer**</font>

In [None]:
def calculate_pos_weights(dataset):
    """Compute one `pos_weight` per label for `nn.BCEWithLogitsLoss`.

    Args:
        dataset: Mapping with a 'train' split whose 'label' column holds one
            equal-length multi-hot row per example.

    Returns:
        torch.Tensor: float32 tensor with one entry per label, equal to
        (#rows where the label != 1) / max(#rows where the label == 1, 1).

    Raises:
        ValueError: If the 'train' split has no rows.
    """
    # Read the column once (it was previously fetched twice) and count with
    # array operations instead of nested Python loops.
    label_rows = list(dataset['train']['label'])
    if not label_rows:
        raise ValueError("Cannot compute pos_weight: dataset['train']['label'] is empty.")

    label_matrix = np.asarray(label_rows)
    positives = (label_matrix == 1).sum(axis=0)
    # Everything that is not exactly 1 counts as a negative, as before.
    negatives = label_matrix.shape[0] - positives

    # Clamp the denominator so a label with no positives does not divide by zero.
    pos_weight = negatives / np.maximum(positives, 1)
    return torch.tensor(pos_weight, dtype=torch.float32)

# Calculate the pos_weight using the training set (the 'train' split of emotion_data)
pos_weights = calculate_pos_weights(emotion_data)

In [None]:
pos_weights

In [None]:
# Hand-set per-label weights; these replace the values computed from the data
# (NOTE(review): presumably a rounded version of the computed weights - confirm).
pos_weights= torch.tensor([2., 3., 2., 2., 2., 3., 2., 3., 2., 4., 4.])

In [None]:
pos_weights

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted BCE-with-logits loss for multi-label targets."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted multi-label loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: When True, return ``(loss, outputs)`` instead of just the loss.
            num_items_in_batch: Accepted (and ignored) because newer Trainer
                versions pass this keyword; without it the override raises
                a TypeError there. Older versions simply never supply it.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        # Pop the labels so the model does not compute its own loss;
        # BCEWithLogitsLoss needs float targets.
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # pos_weight (notebook-level tensor) up-weights positive examples per
        # label; it must sit on the same device as the logits it is combined with.
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weights.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


In [None]:
# Wire the LoRA-wrapped model, the tokenized splits and the metric function
# into the trainer that uses the weighted loss.
trainer = CustomTrainer(
    model=bert_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)

### <font color='indianred'>**2.5 Setup WandB**</font>

In [None]:
# Authenticate with Weights & Biases and set the project name via the environment
wandb.login()
%env WANDB_PROJECT = emotions_kaggle_S2024

### <font color = 'indianred'> **2.6 Start Training** </font>

In [None]:
# Evaluation, checkpointing and logging during the run follow training_args
trainer.train()  # start training

### <font color='indianred'> **2.7 Validation**</font>

In [None]:
# Score the validation split; training_args sets load_best_model_at_end=True,
# so this is expected to evaluate the best checkpoint.
eval_results = trainer.evaluate(tokenized_dataset["valid"])

In [None]:
eval_results

In [None]:
# Log the final validation metrics to the active W&B run.
logged_metric_names = ("eval_accuracy", "eval_loss", "eval_f1_micro", "eval_f1_macro")
wandb.log({metric_name: eval_results[metric_name] for metric_name in logged_metric_names})

In [None]:
# After training, let us check the best checkpoint
# We need this for Inference
best_model_checkpoint = trainer.state.best_model_checkpoint
if best_model_checkpoint is None:
    # No best checkpoint is recorded when no evaluation/save happened during
    # training; report that instead of failing on `.split`.
    best_model_checkpoint_step = None
    print("No best model checkpoint was recorded during training.")
else:
    # Checkpoint folders end in "-<step>"; take the part after the last dash.
    best_model_checkpoint_step = best_model_checkpoint.rsplit('-', 1)[-1]
    print(f"The best model was saved at step {best_model_checkpoint_step}.")

###  <font color = 'indianred'> **Check Confusion Matrix**</font>

In [None]:
# Run inference on the tokenized validation split. The next cell reads
# `valid_output.predictions` (raw logits) and `valid_output.label_ids`
# (the reference labels).
valid_output = trainer.predict(tokenized_dataset["valid"])

In [None]:
# A logit above 0 corresponds to a sigmoid probability above 0.5, so this
# turns the logits into 0/1 predictions per label.
predictions_valid = (valid_output.predictions > 0).astype(int)
# Reference labels as integers so they can be compared with the 0/1 predictions
labels_valid = valid_output.label_ids.astype(int)

In [None]:
# Ground truth, binarised predictions and the class names used to title/index the plots
y_true, y_pred = labels_valid, predictions_valid
class_names = labels

# One 2x2 confusion matrix per class
mcm = multilabel_confusion_matrix(y_true, y_pred)

# 1. Individual Heatmaps
for class_idx, class_matrix in enumerate(mcm):
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        class_matrix,
        annot=True,
        fmt='g',
        cmap='Blues',
        xticklabels=['Predicted Negative', 'Predicted Positive'],
        yticklabels=['True Negative', 'True Positive'],
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for {class_names[class_idx]}')
    plt.show()

# 2. Aggregate Metrics Heatmap
# average=None yields one score per class rather than a single aggregate
precision_per_class = precision_score(y_true, y_pred, average=None)
recall_per_class = recall_score(y_true, y_pred, average=None)
f1_per_class = f1_score(y_true, y_pred, average=None)

metrics_df = pd.DataFrame(
    {
        'Precision': precision_per_class,
        'Recall': recall_per_class,
        'F1-Score': f1_per_class,
    },
    index=class_names,
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(metrics_df, annot=True, cmap='Blues', ax=ax)
ax.set_title('Metrics for each class')
fig.tight_layout()  # Adjust layout to not cut off edges

# Log the heatmap to wandb before showing it
wandb.log({"Metrics Heatmap": wandb.Image(fig)})
plt.show()

# 3. Histogram of Metrics
bar_ax = metrics_df.plot(kind='bar', figsize=(12, 7))
bar_ax.set_ylabel('Score')
bar_ax.set_title('Precision, Recall, and F1-Score for Each Class')
plt.show()

In [None]:
# Mark the current W&B run as finished
wandb.finish()

###  <font color = 'indianred'> **Save the model on HuggingFace**</font>

In [None]:
#trainer.push_to_hub("yunase/Bert_QLoRA_emotion_detection")

## <Font color = 'indianred'>**3. Test Data Prediction** </font>

In [None]:
# Make predictions on the tokenized test data (stored under its 'train' split)
predictions = trainer.predict(tokenized_test_dataset["train"])

In [None]:
# Post-processing for multi-label classification.
# `predictions.predictions` holds raw logits (the validation code binarises
# them with `> 0`), so the probability threshold is mapped into logit space
# before comparing. Comparing logits directly against 0.5 would mean a
# probability cut-off of about 0.62 and disagree with the validation metrics.
threshold = 0.5  # Probability threshold
logit_threshold = np.log(threshold / (1 - threshold))  # logit(0.5) == 0
predicted_labels = (predictions.predictions > logit_threshold).astype(int)

# Convert each 0/1 row into the list of class names that were switched on
predicted_labels = [[label for label, binary in zip(labels, binary_labels) if binary] for binary_labels in predicted_labels]

# Show a small sample rather than dumping every row into the output
print(predicted_labels[:5])

In [None]:
predicted_labels[0]

In [None]:
test_data

In [None]:
# Extracting data from the 'train' split of test_data
test_data_train = test_data['train']
# The tweet IDs become the first column of the submission table;
# their count fixes how many rows every per-emotion column needs.
tweet_ids = test_data_train['ID']
num_tweets = len(tweet_ids)

In [None]:
# Start from an all-zero indicator column per emotion (one entry per tweet).
predicted_labels_dict = {category: [0] * num_tweets for category in ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']}

# Flip the indicator to 1 for every emotion predicted for a tweet.
# The loop variables are deliberately NOT named `labels`: that name holds the
# notebook-wide list of class names (used for id2label, class_names and the
# decoding of predictions), and rebinding it here would silently corrupt any
# cell that runs afterwards.
for row_idx, row_labels in enumerate(predicted_labels):
    for emotion in row_labels:
        predicted_labels_dict[emotion][row_idx] = 1

In [None]:
# Build the submission table in one step: the tweet IDs lead,
# followed by one 0/1 indicator column per emotion.
df = pd.DataFrame({'ID': tweet_ids, **predicted_labels_dict})

In [None]:
df

In [None]:
# Write the submission table to disk without the DataFrame index column
df.to_csv('bert_qlora_predicted.csv', index=False)

In [None]:
# The browser download helper only exists inside Colab, so guard it the same
# way the setup cell guards its Colab-only import; elsewhere the CSV simply
# stays in the working directory.
if 'google.colab' in str(get_ipython()):
    from google.colab import files
    files.download('bert_qlora_predicted.csv')
else:
    print("Not running in Colab; 'bert_qlora_predicted.csv' was saved locally.")

# <Font color = 'skyblue'>**Task 3 (optional) BERT** </font>

## <Font color = 'indianred'>**1. Load pre-trained Tokenizer** </font>

In [None]:
# Checkpoint used for this task; the tokenizer is loaded from the same checkpoint
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_fn(batch):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    texts = batch["text"]
    return tokenizer(text=texts, truncation=True)

In [None]:
# Tokenize every split in batches, then drop the raw 'text' column.
tokenized_dataset = emotion_data.map(tokenize_fn, batched=True).remove_columns(['text'])

In [None]:
tokenized_dataset

In [None]:
# Same preprocessing for the test data: tokenize in batches, then drop the raw 'text' column.
tokenized_test_dataset = test_data_text.map(tokenize_fn, batched=True).remove_columns(['text'])

In [None]:
tokenized_test_dataset

##  <font color = 'indianred'> **2. Model Training** </font>

###  <font color = 'indianred'> **2.1. compute_metrics function** </font>


In [None]:
# Multilabel variants of the metrics, loaded once and reused on every evaluation
accuracy_metric = evaluate.load('accuracy', 'multilabel')
f1 = evaluate.load('f1','multilabel')


def compute_metrics(eval_pred):
    """Return micro-F1, macro-F1 and accuracy for a (logits, labels) pair.

    A label counts as predicted when its logit is positive.
    """
    logits, references = eval_pred
    binary_preds = (logits > 0).astype(int)

    scores = {}
    for averaging in ('micro', 'macro'):
        result = f1.compute(predictions=binary_preds, references=references, average=averaging)
        scores[f'f1_{averaging}'] = result['f1']
    scores['accuracy'] = accuracy_metric.compute(
        predictions=binary_preds, references=references
    )['accuracy']
    return scores

###  <font color = 'indianred'> **2.2. Training Arguments** </font>

In [None]:
# Define the directory where model checkpoints will be saved
run_name = "emotions_bert"
base_folder = Path(basepath)
model_folder = base_folder / "models"/run_name
# Create the directory if it doesn't exist
model_folder.mkdir(exist_ok=True, parents=True)

# Configure training parameters
training_args = TrainingArguments(
    # Training-specific configurations
    num_train_epochs=10,  # Total number of training epochs
    # Number of samples per batch on each device (training / evaluation)
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # auto_find_batch_size=True,
    # NOTE(review): 1.0 is a very strong decay (values near 0.01 are more usual) - confirm it is intended
    weight_decay=1.0,  # Weight-decay coefficient handed to the optimizer
    learning_rate=1e-4,  # Initial learning rate
    lr_scheduler_type='linear',
    warmup_steps=0,  # Number of warmup steps for the learning rate scheduler
    optim='adamw_torch',  # Optimizer

    # Checkpoint saving and model evaluation settings
    output_dir=str(model_folder),  # Directory to save model checkpoints
    # NOTE(review): recent transformers releases appear to rename this argument
    # to `eval_strategy` - confirm against the installed version
    evaluation_strategy='steps',  # Evaluate model at specified step intervals
    eval_steps=20,  # Perform evaluation every 20 training steps
    save_strategy="steps",  # Save model checkpoint at specified step intervals
    save_steps=20,  # Save a model checkpoint every 20 training steps
    load_best_model_at_end=True,  # Reload the best model at the end of training
    save_total_limit=2,  # Keep at most two checkpoints on disk
    # Use macro-F1 on the evaluation set to determine the best model
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,  # A model is 'better' if its macro-F1 is higher


    # Experiment logging configurations
    logging_strategy='steps',
    logging_steps=20,  # Log every 20 training steps
    report_to='wandb',  # Log metrics and results to Weights & Biases platform
    run_name=run_name,  # Experiment name for Weights & Biases

    fp16=True,  # Mixed-precision (fp16) training
)


### <Font color='indianred'> **2.3. Specify Model** </font>

In [None]:
# Label maps are handed to from_pretrained so they land in the model's own
# config. Replacing model.config with a freshly loaded AutoConfig (as before)
# threw away problem_type="multi_label_classification", which would then also
# be missing from any saved or pushed config.
id2label= {id_: label_ for id_, label_ in enumerate(labels)}
label2id = {label_: id_ for id_, label_ in enumerate(labels)}

model = AutoModelForSequenceClassification.from_pretrained(checkpoint,
                                                           num_labels=len(labels),
                                                           problem_type="multi_label_classification",
                                                           id2label=id2label,
                                                           label2id=label2id)

# Kept for any later cell that refers to `config`; it is the model's own config object.
config = model.config

In [None]:
model

In [None]:
model.config

### <Font color='indianred'> **2.4. Custom Trainer**</font>

In [None]:
def calculate_pos_weights(dataset):
    """Compute one `pos_weight` per label for `nn.BCEWithLogitsLoss`.

    Args:
        dataset: Mapping with a 'train' split whose 'label' column holds one
            equal-length multi-hot row per example.

    Returns:
        torch.Tensor: float32 tensor with one entry per label, equal to
        (#rows where the label != 1) / max(#rows where the label == 1, 1).

    Raises:
        ValueError: If the 'train' split has no rows.
    """
    # Read the column once (it was previously fetched twice) and count with
    # array operations instead of nested Python loops.
    label_rows = list(dataset['train']['label'])
    if not label_rows:
        raise ValueError("Cannot compute pos_weight: dataset['train']['label'] is empty.")

    label_matrix = np.asarray(label_rows)
    positives = (label_matrix == 1).sum(axis=0)
    # Everything that is not exactly 1 counts as a negative, as before.
    negatives = label_matrix.shape[0] - positives

    # Clamp the denominator so a label with no positives does not divide by zero.
    pos_weight = negatives / np.maximum(positives, 1)
    return torch.tensor(pos_weight, dtype=torch.float32)

# Calculate the pos_weight using the training set (the 'train' split of emotion_data)
pos_weights = calculate_pos_weights(emotion_data)

In [None]:
pos_weights

In [None]:
# Hand-set per-label weights; these replace the values computed from the data
# (NOTE(review): presumably a rounded version of the computed weights - confirm).
pos_weights= torch.tensor([2., 3., 2., 2., 2., 3., 2., 3., 2., 4., 4.])

In [None]:
pos_weights

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted BCE-with-logits loss for multi-label targets."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted multi-label loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: When True, return ``(loss, outputs)`` instead of just the loss.
            num_items_in_batch: Accepted (and ignored) because newer Trainer
                versions pass this keyword; without it the override raises
                a TypeError there. Older versions simply never supply it.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        # Pop the labels so the model does not compute its own loss;
        # BCEWithLogitsLoss needs float targets.
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # pos_weight (notebook-level tensor) up-weights positive examples per
        # label; it must sit on the same device as the logits it is combined with.
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weights.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


In [None]:
# Wire the model, the tokenized splits and the metric function
# into the trainer that uses the weighted loss.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)

### <font color='indianred'>**2.5 Setup WandB**</font>

In [None]:
# Authenticate with Weights & Biases and set the project name via the environment
wandb.login()
%env WANDB_PROJECT = emotions_kaggle_S2024

### <font color = 'indianred'> **2.6 Start Training** </font>

In [None]:
# Evaluation, checkpointing and logging during the run follow training_args
trainer.train()  # start training

### <font color='indianred'> **2.7 Validation**</font>

In [None]:
# Score the validation split; training_args sets load_best_model_at_end=True,
# so this is expected to evaluate the best checkpoint.
eval_results = trainer.evaluate(tokenized_dataset["valid"])

In [None]:
eval_results

In [None]:
# Log the final validation metrics to the active W&B run.
logged_metric_names = ("eval_accuracy", "eval_loss", "eval_f1_micro", "eval_f1_macro")
wandb.log({metric_name: eval_results[metric_name] for metric_name in logged_metric_names})

In [None]:
# After training, let us check the best checkpoint
# We need this for Inference
best_model_checkpoint = trainer.state.best_model_checkpoint
if best_model_checkpoint is None:
    # No best checkpoint is recorded when no evaluation/save happened during
    # training; report that instead of failing on `.split`.
    best_model_checkpoint_step = None
    print("No best model checkpoint was recorded during training.")
else:
    # Checkpoint folders end in "-<step>"; take the part after the last dash.
    best_model_checkpoint_step = best_model_checkpoint.rsplit('-', 1)[-1]
    print(f"The best model was saved at step {best_model_checkpoint_step}.")

###  <font color = 'indianred'> **Check Confusion Matrix**</font>

In [None]:
# Run inference on the tokenized validation split. The next cell reads
# `valid_output.predictions` (raw logits) and `valid_output.label_ids`
# (the reference labels).
valid_output = trainer.predict(tokenized_dataset["valid"])

In [None]:
# A logit above 0 corresponds to a sigmoid probability above 0.5, so this
# turns the logits into 0/1 predictions per label.
predictions_valid = (valid_output.predictions > 0).astype(int)
# Reference labels as integers so they can be compared with the 0/1 predictions
labels_valid = valid_output.label_ids.astype(int)

In [None]:
# Ground truth, binarised predictions and the class names used to title/index the plots
y_true, y_pred = labels_valid, predictions_valid
class_names = labels

# One 2x2 confusion matrix per class
mcm = multilabel_confusion_matrix(y_true, y_pred)

# 1. Individual Heatmaps
for class_idx, class_matrix in enumerate(mcm):
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        class_matrix,
        annot=True,
        fmt='g',
        cmap='Blues',
        xticklabels=['Predicted Negative', 'Predicted Positive'],
        yticklabels=['True Negative', 'True Positive'],
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for {class_names[class_idx]}')
    plt.show()

# 2. Aggregate Metrics Heatmap
# average=None yields one score per class rather than a single aggregate
precision_per_class = precision_score(y_true, y_pred, average=None)
recall_per_class = recall_score(y_true, y_pred, average=None)
f1_per_class = f1_score(y_true, y_pred, average=None)

metrics_df = pd.DataFrame(
    {
        'Precision': precision_per_class,
        'Recall': recall_per_class,
        'F1-Score': f1_per_class,
    },
    index=class_names,
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(metrics_df, annot=True, cmap='Blues', ax=ax)
ax.set_title('Metrics for each class')
fig.tight_layout()  # Adjust layout to not cut off edges

# Log the heatmap to wandb before showing it
wandb.log({"Metrics Heatmap": wandb.Image(fig)})
plt.show()

# 3. Histogram of Metrics
bar_ax = metrics_df.plot(kind='bar', figsize=(12, 7))
bar_ax.set_ylabel('Score')
bar_ax.set_title('Precision, Recall, and F1-Score for Each Class')
plt.show()

In [None]:
# Mark the current W&B run as finished
wandb.finish()

###  <font color = 'indianred'> **Save the model on HuggingFace**</font>

In [None]:
# `Trainer.push_to_hub` takes a commit message as its first positional
# argument, not a repository name, so the previous call did not select this
# repository. Push the model and the tokenizer to the intended repo explicitly.
hub_repo_id = "yunase/Bert_emotion_detection"
trainer.model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

## <Font color = 'indianred'>**3. Test Data Prediction** </font>

In [None]:
# Make predictions on the tokenized test data (stored under its 'train' split)
predictions = trainer.predict(tokenized_test_dataset["train"])

In [None]:
# Post-processing for multi-label classification.
# `predictions.predictions` holds raw logits (the validation code binarises
# them with `> 0`), so the probability threshold is mapped into logit space
# before comparing. Comparing logits directly against 0.5 would mean a
# probability cut-off of about 0.62 and disagree with the validation metrics.
threshold = 0.5  # Probability threshold
logit_threshold = np.log(threshold / (1 - threshold))  # logit(0.5) == 0
predicted_labels = (predictions.predictions > logit_threshold).astype(int)

# Convert each 0/1 row into the list of class names that were switched on
predicted_labels = [[label for label, binary in zip(labels, binary_labels) if binary] for binary_labels in predicted_labels]

# Show a small sample rather than dumping every row into the output
print(predicted_labels[:5])

In [None]:
predicted_labels[0]

In [None]:
test_data

In [None]:
# Extracting data from the 'train' split of test_data
test_data_train = test_data['train']
# The tweet IDs become the first column of the submission table;
# their count fixes how many rows every per-emotion column needs.
tweet_ids = test_data_train['ID']
num_tweets = len(tweet_ids)

In [None]:
# Start from an all-zero indicator column per emotion (one entry per tweet).
predicted_labels_dict = {category: [0] * num_tweets for category in ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']}

# Flip the indicator to 1 for every emotion predicted for a tweet.
# The loop variables are deliberately NOT named `labels`: that name holds the
# notebook-wide list of class names (used for id2label, class_names and the
# decoding of predictions), and rebinding it here would silently corrupt any
# cell that is re-run afterwards.
for row_idx, row_labels in enumerate(predicted_labels):
    for emotion in row_labels:
        predicted_labels_dict[emotion][row_idx] = 1

In [None]:
# Build the submission table in one step: the tweet IDs lead,
# followed by one 0/1 indicator column per emotion.
df = pd.DataFrame({'ID': tweet_ids, **predicted_labels_dict})

In [None]:
df

In [None]:
# Write the submission table to disk without the DataFrame index column
df.to_csv('bert_predicted.csv', index=False)

In [None]:
# The browser download helper only exists inside Colab, so guard it the same
# way the setup cell guards its Colab-only import; elsewhere the CSV simply
# stays in the working directory.
if 'google.colab' in str(get_ipython()):
    from google.colab import files
    files.download('bert_predicted.csv')
else:
    print("Not running in Colab; 'bert_predicted.csv' was saved locally.")