In [28]:
import torch
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    AlbertTokenizer, AlbertForSequenceClassification,
    DistilBertTokenizer, DistilBertForSequenceClassification,
    ElectraTokenizer, ElectraForSequenceClassification,
    RobertaTokenizer, RobertaForSequenceClassification,
    Trainer, TrainingArguments
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix

# Custom dataset class
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Load datasets
selected_label_samples = pd.read_csv('cleaned_selected_label_samples_withothers.csv')

# Extract the necessary columns
selected_texts = selected_label_samples['Cleaned_Review'].fillna('').tolist()

# Create multi-label targets
label_columns = ['Electric Fit', 'Affordability', 'Customer Care', 'Other']
labels = selected_label_samples[label_columns].fillna('').values.tolist()
labels = [list(filter(lambda x: x != '', label)) for label in labels]

# Binarize the labels
mlb = MultiLabelBinarizer(classes=label_columns)
labels = mlb.fit_transform(labels)

# Check if there are any issues with individual texts or labels
for i, text in enumerate(selected_texts):
    if not isinstance(text, str):
        print(f"Non-string text found at index {i}: {text}")
for i, label in enumerate(labels):
    if not isinstance(label, (list, tuple)):
        print(f"Non-list/tuple label found at index {i}: {label}")

# Check and remove any unintended labels
intended_labels = label_columns  # the intended labels
unwanted_labels = set(mlb.classes_) - set(intended_labels)

if unwanted_labels:
    print(f"Unwanted labels detected: {unwanted_labels}")
    for label in unwanted_labels:
        # Find the index of the unwanted label
        index = list(mlb.classes_).index(label)
        # Remove the column from labels
        labels = np.delete(labels, index, axis=1)
        # Remove the label from mlb.classes_
        mlb.classes_ = np.delete(mlb.classes_, index)

# Verify the cleaned labels
print("Final labels:", mlb.classes_)
print("Sample labels after cleaning:", labels[:5])

# Function to encode data
def encode_data(texts, tokenizer, labels=None, max_length=512):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
    dataset = CustomDataset(encodings, labels)
    return dataset

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(selected_texts, labels, test_size=0.2)

# Dictionary of pre-trained model architectures and their corresponding tokenizers and model classes
architectures = {
    "ALBERT": ("albert-base-v2", AlbertTokenizer, AlbertForSequenceClassification),
    "BERT": ("bert-base-uncased", BertTokenizer, BertForSequenceClassification),
    "DistilBERT": ("distilbert-base-uncased", DistilBertTokenizer, DistilBertForSequenceClassification),
    "ELECTRA": ("google/electra-base-discriminator", ElectraTokenizer, ElectraForSequenceClassification),
    "RoBERTa": ("roberta-base", RobertaTokenizer, RobertaForSequenceClassification)
}

# Function to train and evaluate models
def train_and_evaluate_model(name, model_name, tokenizer_class, model_class):
    print(f"Training {name} model...")

    # Load pre-trained tokenizer and model
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name, num_labels=len(mlb.classes_))

    # Encode datasets
    train_dataset = encode_data(train_texts, tokenizer, train_labels)
    val_dataset = encode_data(val_texts, tokenizer, val_labels)

    # Set up training arguments and trainer
    training_args = TrainingArguments(
        output_dir=f'./results/{name}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs/{name}',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Save the trained model and tokenizer
    model.save_pretrained(f'./{name.lower()}_model')
    tokenizer.save_pretrained(f'./{name.lower()}_tokenizer')

    # Evaluate the model
    trainer.evaluate()

    # Get predictions on the validation set
    predictions = trainer.predict(val_dataset)
    preds = torch.sigmoid(torch.tensor(predictions.predictions)).round().numpy()
    labels = predictions.label_ids

    # Compute overall metrics
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    roc_auc = roc_auc_score(labels, preds, average='weighted')
    tn, fp, fn, tp = confusion_matrix(labels.ravel(), preds.ravel()).ravel()

    # Collect overall metrics
    overall_metrics = [name, labels.shape[0], tp, fn, fp, tn, accuracy, roc_auc, precision, recall, f1]

    # Compute per-label metrics
    precision_per_label, recall_per_label, f1_per_label, _ = precision_recall_fscore_support(labels, preds, average=None, labels=[0, 1, 2, 3])

    # Collect per-label metrics
    per_label_metrics = []
    for i, label in enumerate(label_columns):
        per_label_metrics.append([name, label, precision_per_label[i], recall_per_label[i], f1_per_label[i]])

    print(f'{name} - Accuracy: {accuracy}')
    print(f'{name} - Precision: {precision}')
    print(f'{name} - Recall: {recall}')
    print(f'{name} - F1 Score: {f1}')
    print(f'{name} - ROC AUC: {roc_auc}')

    for i, label in enumerate(label_columns):
        print(f'{name} - {label} - Precision: {precision_per_label[i]}')
        print(f'{name} - {label} - Recall: {recall_per_label[i]}')
        print(f'{name} - {label} - F1 Score: {f1_per_label[i]}')

    # Return the results for this model
    return overall_metrics, per_label_metrics

# Train and evaluate each model
overall_results = []
label_results = []

for name, (model_name, tokenizer_class, model_class) in architectures.items():
    overall_metrics, per_label_metrics = train_and_evaluate_model(name, model_name, tokenizer_class, model_class)
    overall_results.append(overall_metrics)
    label_results.extend(per_label_metrics)

# Convert lists to dataframes
def process_and_save_overall_results(results):
    columns = ["Name", "Share", "True-Positives", "False-Negatives", "False-Positives", 
               "True-Negatives", "Accuracy", "AUC", "Precision", "Recall", "F1"]

    overall_results_table = pd.DataFrame(results, columns=columns)
    overall_results_table["Type"] = "Transformer"
    overall_results_table = overall_results_table[["Name", "Type", "Share", "True-Positives", 
                                                   "False-Negatives", "False-Positives", 
                                                   "True-Negatives", "Accuracy", "AUC", 
                                                   "Precision", "Recall", "F1"]]

    # Output results
    overall_results_table.sort_values("Accuracy", ascending=False).to_csv("Transformer Overall Model Performance.csv", index=False)
    print(overall_results_table.sort_values("Accuracy", ascending=False))

def process_and_save_label_results(results):
    columns = ["Name", "Label", "Precision", "Recall", "F1"]
    label_results_table = pd.DataFrame(results, columns=columns)

    # Output results
    label_results_table.to_csv("Transformer Per-Label Model Performance.csv", index=False)
    print(label_results_table)

# Process and save the results
process_and_save_overall_results(overall_results)
process_and_save_label_results(label_results)




Non-list/tuple label found at index 0: [1 1 0 0]
Non-list/tuple label found at index 1: [1 0 0 0]
Non-list/tuple label found at index 2: [1 1 0 0]
Non-list/tuple label found at index 3: [1 0 0 0]
Non-list/tuple label found at index 4: [1 0 0 0]
Non-list/tuple label found at index 5: [1 0 0 0]
Non-list/tuple label found at index 6: [1 1 0 0]
Non-list/tuple label found at index 7: [1 1 0 0]
Non-list/tuple label found at index 8: [1 1 0 0]
Non-list/tuple label found at index 9: [1 0 0 0]
Non-list/tuple label found at index 10: [1 0 0 0]
Non-list/tuple label found at index 11: [1 0 0 0]
Non-list/tuple label found at index 12: [1 0 0 0]
Non-list/tuple label found at index 13: [1 0 0 0]
Non-list/tuple label found at index 14: [1 0 0 0]
Non-list/tuple label found at index 15: [1 0 0 0]
Non-list/tuple label found at index 16: [1 0 0 0]
Non-list/tuple label found at index 17: [1 1 0 0]
Non-list/tuple label found at index 18: [1 0 0 0]
Non-list/tuple label found at index 19: [1 0 0 0]
Non-list/t

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/1095 [00:00<?, ?it/s]

{'loss': 0.6959, 'grad_norm': 6.11526346206665, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 0.6817, 'grad_norm': 5.730230331420898, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.05}
{'loss': 0.657, 'grad_norm': 6.248964309692383, 'learning_rate': 3e-06, 'epoch': 0.08}
{'loss': 0.6325, 'grad_norm': 6.530795574188232, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.11}
{'loss': 0.6059, 'grad_norm': 6.6633992195129395, 'learning_rate': 5e-06, 'epoch': 0.14}
{'loss': 0.6009, 'grad_norm': 6.805171489715576, 'learning_rate': 6e-06, 'epoch': 0.16}
{'loss': 0.5847, 'grad_norm': 6.675745964050293, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.19}
{'loss': 0.5859, 'grad_norm': 9.082491874694824, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.22}
{'loss': 0.597, 'grad_norm': 6.636030197143555, 'learning_rate': 9e-06, 'epoch': 0.25}
{'loss': 0.6054, 'grad_norm': 3.6249876022338867, 'learning_rate': 1e-05, 'epoch': 0.27}
{'loss': 0.5572, 'grad_norm': 4.8425388

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

ALBERT - Accuracy: 0.5912208504801097
ALBERT - Precision: 0.7399511993300734
ALBERT - Recall: 0.6730310262529833
ALBERT - F1 Score: 0.7036700479327372
ALBERT - ROC AUC: 0.7880567884486863
ALBERT - Electric Fit - Precision: 0.7216981132075472
ALBERT - Electric Fit - Recall: 0.6401673640167364
ALBERT - Electric Fit - F1 Score: 0.6784922394678492
ALBERT - Affordability - Precision: 0.7635467980295566
ALBERT - Affordability - Recall: 0.7110091743119266
ALBERT - Affordability - F1 Score: 0.7363420427553444
ALBERT - Customer Care - Precision: 0.7537688442211056
ALBERT - Customer Care - Recall: 0.7653061224489796
ALBERT - Customer Care - F1 Score: 0.759493670886076
ALBERT - Other - Precision: 0.7210884353741497
ALBERT - Other - Recall: 0.572972972972973
ALBERT - Other - F1 Score: 0.6385542168674698
Training BERT model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/1095 [00:00<?, ?it/s]

{'loss': 0.7057, 'grad_norm': 1.9634474515914917, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 0.7034, 'grad_norm': 2.2877564430236816, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.05}
{'loss': 0.699, 'grad_norm': 2.1018452644348145, 'learning_rate': 3e-06, 'epoch': 0.08}
{'loss': 0.667, 'grad_norm': 3.4851415157318115, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.11}
{'loss': 0.6296, 'grad_norm': 5.06710147857666, 'learning_rate': 5e-06, 'epoch': 0.14}
{'loss': 0.6266, 'grad_norm': 2.5439136028289795, 'learning_rate': 6e-06, 'epoch': 0.16}
{'loss': 0.6014, 'grad_norm': 2.4772183895111084, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.19}
{'loss': 0.6013, 'grad_norm': 3.286633253097534, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.22}
{'loss': 0.6049, 'grad_norm': 2.634096384048462, 'learning_rate': 9e-06, 'epoch': 0.25}
{'loss': 0.5865, 'grad_norm': 1.6950933933258057, 'learning_rate': 1e-05, 'epoch': 0.27}
{'loss': 0.5699, 'grad_norm': 2.24

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

BERT - Accuracy: 0.6131687242798354
BERT - Precision: 0.7362315767933688
BERT - Recall: 0.7159904534606205
BERT - F1 Score: 0.7232841627969065
BERT - ROC AUC: 0.8051912395787095
BERT - Electric Fit - Precision: 0.7142857142857143
BERT - Electric Fit - Recall: 0.6903765690376569
BERT - Electric Fit - F1 Score: 0.7021276595744681
BERT - Affordability - Precision: 0.7877358490566038
BERT - Affordability - Recall: 0.7660550458715596
BERT - Affordability - F1 Score: 0.7767441860465116
BERT - Customer Care - Precision: 0.72
BERT - Customer Care - Recall: 0.826530612244898
BERT - Customer Care - F1 Score: 0.7695961995249406
BERT - Other - Precision: 0.7210884353741497
BERT - Other - Recall: 0.572972972972973
BERT - Other - F1 Score: 0.6385542168674698
Training DistilBERT model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/1095 [00:00<?, ?it/s]

{'loss': 0.6976, 'grad_norm': 0.6479833722114563, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 0.6974, 'grad_norm': 0.7836894392967224, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.05}
{'loss': 0.6858, 'grad_norm': 0.7924975156784058, 'learning_rate': 3e-06, 'epoch': 0.08}
{'loss': 0.6785, 'grad_norm': 0.7940378189086914, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.11}
{'loss': 0.6677, 'grad_norm': 0.8157926201820374, 'learning_rate': 5e-06, 'epoch': 0.14}
{'loss': 0.6477, 'grad_norm': 1.0488625764846802, 'learning_rate': 6e-06, 'epoch': 0.16}
{'loss': 0.6177, 'grad_norm': 1.21943199634552, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.19}
{'loss': 0.6044, 'grad_norm': 1.2522403001785278, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.22}
{'loss': 0.6072, 'grad_norm': 1.1732819080352783, 'learning_rate': 9e-06, 'epoch': 0.25}
{'loss': 0.6044, 'grad_norm': 0.8032050132751465, 'learning_rate': 1e-05, 'epoch': 0.27}
{'loss': 0.5719, 'grad_norm': 

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

DistilBERT - Accuracy: 0.6200274348422496
DistilBERT - Precision: 0.7504149379635892
DistilBERT - Recall: 0.698090692124105
DistilBERT - F1 Score: 0.7197943647025733
DistilBERT - ROC AUC: 0.8008707580124058
DistilBERT - Electric Fit - Precision: 0.7280701754385965
DistilBERT - Electric Fit - Recall: 0.694560669456067
DistilBERT - Electric Fit - F1 Score: 0.7109207708779444
DistilBERT - Affordability - Precision: 0.7871287128712872
DistilBERT - Affordability - Recall: 0.7293577981651376
DistilBERT - Affordability - F1 Score: 0.7571428571428571
DistilBERT - Customer Care - Precision: 0.7407407407407407
DistilBERT - Customer Care - Recall: 0.8163265306122449
DistilBERT - Customer Care - F1 Score: 0.7766990291262136
DistilBERT - Other - Precision: 0.746268656716418
DistilBERT - Other - Recall: 0.5405405405405406
DistilBERT - Other - F1 Score: 0.6269592476489029
Training ELECTRA model...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/1095 [00:00<?, ?it/s]

{'loss': 0.7005, 'grad_norm': 0.7063828706741333, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 0.699, 'grad_norm': 0.7650415897369385, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.05}
{'loss': 0.6877, 'grad_norm': 0.7011369466781616, 'learning_rate': 3e-06, 'epoch': 0.08}
{'loss': 0.6808, 'grad_norm': 0.8205815553665161, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.11}
{'loss': 0.6674, 'grad_norm': 0.678305983543396, 'learning_rate': 5e-06, 'epoch': 0.14}
{'loss': 0.6545, 'grad_norm': 0.8077537417411804, 'learning_rate': 6e-06, 'epoch': 0.16}
{'loss': 0.6368, 'grad_norm': 0.8415026664733887, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.19}
{'loss': 0.6241, 'grad_norm': 0.9668040871620178, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.22}
{'loss': 0.6173, 'grad_norm': 0.8034710884094238, 'learning_rate': 9e-06, 'epoch': 0.25}
{'loss': 0.6155, 'grad_norm': 0.6466066241264343, 'learning_rate': 1e-05, 'epoch': 0.27}
{'loss': 0.5822, 'grad_norm': 

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

ELECTRA - Accuracy: 0.598079561042524
ELECTRA - Precision: 0.7492117501636618
ELECTRA - Recall: 0.665871121718377
ELECTRA - F1 Score: 0.7005164156552552
ELECTRA - ROC AUC: 0.7859240156285686
ELECTRA - Electric Fit - Precision: 0.704225352112676
ELECTRA - Electric Fit - Recall: 0.6276150627615062
ELECTRA - Electric Fit - F1 Score: 0.6637168141592921
ELECTRA - Affordability - Precision: 0.7522522522522522
ELECTRA - Affordability - Recall: 0.7660550458715596
ELECTRA - Affordability - F1 Score: 0.759090909090909
ELECTRA - Customer Care - Precision: 0.7842105263157895
ELECTRA - Customer Care - Recall: 0.7602040816326531
ELECTRA - Customer Care - F1 Score: 0.772020725388601
ELECTRA - Other - Precision: 0.7666666666666667
ELECTRA - Other - Recall: 0.4972972972972973
ELECTRA - Other - F1 Score: 0.6032786885245902
Training RoBERTa model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/1095 [00:00<?, ?it/s]

{'loss': 0.6907, 'grad_norm': 1.1890785694122314, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 0.6787, 'grad_norm': 1.5389797687530518, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.05}
{'loss': 0.6895, 'grad_norm': 0.9579929709434509, 'learning_rate': 3e-06, 'epoch': 0.08}
{'loss': 0.6677, 'grad_norm': 1.0940566062927246, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.11}
{'loss': 0.6378, 'grad_norm': 1.3059630393981934, 'learning_rate': 5e-06, 'epoch': 0.14}
{'loss': 0.6164, 'grad_norm': 2.471775770187378, 'learning_rate': 6e-06, 'epoch': 0.16}
{'loss': 0.5913, 'grad_norm': 2.1235299110412598, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.19}
{'loss': 0.5862, 'grad_norm': 2.2028725147247314, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.22}
{'loss': 0.5861, 'grad_norm': 1.8784575462341309, 'learning_rate': 9e-06, 'epoch': 0.25}
{'loss': 0.6078, 'grad_norm': 1.8240472078323364, 'learning_rate': 1e-05, 'epoch': 0.27}
{'loss': 0.5425, 'grad_norm':

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

RoBERTa - Accuracy: 0.607681755829904
RoBERTa - Precision: 0.7507808650196306
RoBERTa - Recall: 0.7136038186157518
RoBERTa - F1 Score: 0.7247392374468419
RoBERTa - ROC AUC: 0.8057098203169629
RoBERTa - Electric Fit - Precision: 0.7024793388429752
RoBERTa - Electric Fit - Recall: 0.7112970711297071
RoBERTa - Electric Fit - F1 Score: 0.7068607068607069
RoBERTa - Affordability - Precision: 0.7681818181818182
RoBERTa - Affordability - Recall: 0.7752293577981652
RoBERTa - Affordability - F1 Score: 0.771689497716895
RoBERTa - Customer Care - Precision: 0.7454545454545455
RoBERTa - Customer Care - Recall: 0.8367346938775511
RoBERTa - Customer Care - F1 Score: 0.7884615384615384
RoBERTa - Other - Precision: 0.7983193277310925
RoBERTa - Other - Recall: 0.5135135135135135
RoBERTa - Other - F1 Score: 0.625
         Name         Type  Share  True-Positives  False-Negatives  \
2  DistilBERT  Transformer    729             585              253   
1        BERT  Transformer    729             600    

In [29]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Load the trained model and tokenizer from the specified directory
model_path = './roberta_model'
tokenizer_path = './roberta_tokenizer'
model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

# Move model to device (CPU or MPS)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

# Load the cleaned unlabelled main dataset
cleaned_speakev = pd.read_csv('cleaned_filtered_main_removed_missing_duplicates.csv')

# Ensure all texts in cleaned_speakev are strings
cleaned_speakev_texts = [str(text) for text in cleaned_speakev['Cleaned_Review'].tolist() if isinstance(text, (str, int, float))]

# Log the length of the texts to identify potential issues
text_lengths = [len(text) for text in cleaned_speakev_texts]
print(f"Text length stats - Min: {min(text_lengths)}, Max: {max(text_lengths)}, Average: {sum(text_lengths)/len(text_lengths)}")

# Define batch size
batch_size = 32

# Initialize an empty list to store predictions
all_predictions = []

# Process the texts in batches
for i in range(0, len(cleaned_speakev_texts), batch_size):
    batch_texts = cleaned_speakev_texts[i:i + batch_size]
    
    # Tokenize the current batch of texts
    cleaned_speakev_encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=512, return_tensors='pt')
    
    # Move tensors to device
    input_ids = cleaned_speakev_encodings['input_ids'].to(device)
    attention_mask = cleaned_speakev_encodings['attention_mask'].to(device)
    
    # Verify the shape of the encoded inputs
    print(f"Processing batch {i//batch_size + 1} / {len(cleaned_speakev_texts)//batch_size + 1}")
    print(f"Input IDs shape: {input_ids.shape}")
    print(f"Attention mask shape: {attention_mask.shape}")
    
    # Perform inference
    with torch.no_grad():
        try:
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.sigmoid(outputs.logits).round().tolist()
            all_predictions.extend(predictions)
        except RuntimeError as e:
            print(f"RuntimeError: {e}")
            # Additional debugging or logging can be added here
            raise

# Convert all_predictions to a NumPy array
all_predictions_array = np.array(all_predictions)

# Ensure the shape of the array is correct
if all_predictions_array.ndim == 1:
    all_predictions_array = all_predictions_array.reshape(-1, 1)

# Define the label columns
label_columns = ['Electric Fit', 'Affordability', 'Customer Care', 'Other']

# Load the MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=label_columns)
mlb.fit([label_columns])  # Fit the MultiLabelBinarizer with label columns

# Convert predictions to labels
predicted_labels = mlb.inverse_transform(all_predictions_array)

# Add predictions to the cleaned_speakev dataframe
cleaned_speakev['Predicted_Label'] = predicted_labels

# Save the cleaned_speakev dataframe with predictions
cleaned_speakev.to_csv('cleaned_filtered_main_removed_missing_duplicates_with_predictions.csv', index=False)

print("Predictions added and saved to 'main_with_predictions.csv'")


  cleaned_speakev = pd.read_csv('cleaned_filtered_main_removed_missing_duplicates.csv')


Text length stats - Min: 1, Max: 31104, Average: 236.7796230944026
Processing batch 1 / 37599
Input IDs shape: torch.Size([32, 512])
Attention mask shape: torch.Size([32, 512])
Processing batch 2 / 37599
Input IDs shape: torch.Size([32, 170])
Attention mask shape: torch.Size([32, 170])
Processing batch 3 / 37599
Input IDs shape: torch.Size([32, 141])
Attention mask shape: torch.Size([32, 141])
Processing batch 4 / 37599
Input IDs shape: torch.Size([32, 206])
Attention mask shape: torch.Size([32, 206])
Processing batch 5 / 37599
Input IDs shape: torch.Size([32, 101])
Attention mask shape: torch.Size([32, 101])
Processing batch 6 / 37599
Input IDs shape: torch.Size([32, 230])
Attention mask shape: torch.Size([32, 230])
Processing batch 7 / 37599
Input IDs shape: torch.Size([32, 155])
Attention mask shape: torch.Size([32, 155])
Processing batch 8 / 37599
Input IDs shape: torch.Size([32, 111])
Attention mask shape: torch.Size([32, 111])
Processing batch 9 / 37599
Input IDs shape: torch.Siz