## Dependencies

In [None]:
! pip install torch pandas datasets transformers scikit-learn datasets 

## Imports

In [None]:
import torch, gc
import json
from torch import cuda
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    BertForSequenceClassification, 
    BertTokenizerFast,
    TrainingArguments,
    Trainer
)
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,confusion_matrix, ConfusionMatrixDisplay

## Data preprocessing

Load the completions generated by each model (JSON files obtained from Hugging Face) into pandas DataFrames, tag every row with its source model, and combine them into a single DataFrame so the data can be analysed and split.

In [None]:
def load_and_process_data(file_path, label, category):
    """Read one model's completions from a JSON file into a labelled DataFrame.

    The file must contain a list of records, each with a ``Completion`` key.
    That key is renamed to ``text`` and every record is tagged with the given
    class id and model name.

    Args:
        file_path: Path to the JSON file.
        label: Class id stored in the ``labels`` column of every row.
        category: Model name stored in the ``category`` column of every row.

    Returns:
        A ``pd.DataFrame`` with one row per record: the record's remaining
        keys plus ``text``, ``labels`` and ``category``.

    Raises:
        KeyError: If a record has no ``Completion`` key.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which can garble or reject non-ASCII completions.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    for e in data:
        e['text'] = e.pop('Completion')
        e['labels'] = label
        e['category'] = category
    return pd.DataFrame(data)

# One file per source model. The integer label of each model must equal the
# position of its name in the `labels` list used to build id2label/label2id.
df_llama = load_and_process_data('meta-llama/Meta-Llama-3-8B-Instruct_valid.json', 0, 'Meta-Llama-3-8B-Instruct')
df_phi3 = load_and_process_data('microsoft/Phi-3-mini-4k-instruct_valid.json', 1, 'Phi-3-mini-4k-instruct')
df_mixtral = load_and_process_data('mistralai/Mixtral-8x7B-Instruct-v0.1_valid.json', 2, 'Mixtral-8x7B-Instruct-v0.1')
df_gpt4 = load_and_process_data('openai/GPT4_valid.json', 3, 'GPT4')

df_combined = pd.concat([df_llama, df_phi3, df_mixtral, df_gpt4], ignore_index=True)

# The train/test/eval splits are drawn over these ids, not over individual rows.
# NOTE(review): presumably the same id appears once per model file - confirm.
unique_ids = df_combined['id'].unique()
unique_ids_list = unique_ids.tolist()

# Show the count and a small sample instead of dumping every id into the output.
print(f"{len(unique_ids_list)} unique ids")
unique_ids_list[:10]

### Train dataset

In [None]:
# (rows, columns) of the combined frame.
df_combined.shape

In [None]:
# Peek at the first rows to check the columns produced by load_and_process_data.
df_combined.head()

In [None]:
# Number of rows per class id in the combined frame (class-balance check).
labels_combined = df_combined['labels']
label_counts_combined = labels_combined.value_counts()
label_counts_combined

In [None]:
# Display names of the classes. The list position is the class id, so the order
# must match the integer labels passed to load_and_process_data.
labels = ['Meta-Llama-3-8B-Instruct','Phi-3-mini-4k-instruct', 'Mixtral-8x7B-Instruct-v0.1', 'GPT4']
labels

In [None]:
# Number of target classes and the two lookup tables handed to the model.
NUM_LABELS = len(labels)

id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Class id -> model name.
id2label

In [None]:
# Model name -> class id.
label2id

In [None]:
# Split on ids rather than on rows, so every row sharing an id lands in the
# same split. 80% of the ids go to training; the rest is held out.
train_size = 0.8
train_ids, test_eval_ids = train_test_split(
    unique_ids,
    train_size=train_size,
    random_state=42,
)

in_train = df_combined['id'].isin(train_ids)
df_train = df_combined[in_train]
print("Train Set Shape:", df_train.shape)

In [None]:
# Class balance of the training split.
ax = df_train['category'].value_counts().plot(
    kind='bar',
    figsize=(10, 10),
    title='Training set: samples per model',
)
ax.set_xlabel('Model')
ax.set_ylabel('Number of samples');

In [None]:
# Halve the held-out ids. train_test_split returns (first part, second part):
# test_ids receives the first part, and eval_ids the second part, whose share
# is `test_size`.
test_size = 0.5

test_ids, eval_ids = train_test_split(
    test_eval_ids,
    test_size=test_size,
    random_state=42,
)

in_test = df_combined['id'].isin(test_ids)
in_eval = df_combined['id'].isin(eval_ids)
df_test = df_combined[in_test]
df_eval = df_combined[in_eval]

print("Test Set Shape:", df_test.shape)
print("Evaluation Set Shape:", df_eval.shape)

In [None]:
# Class balance of the test split.
ax = df_test['category'].value_counts().plot(
    kind='bar',
    figsize=(10, 10),
    title='Test set: samples per model',
)
ax.set_xlabel('Model')
ax.set_ylabel('Number of samples');

In [None]:
# Class balance of the evaluation split.
ax = df_eval['category'].value_counts().plot(
    kind='bar',
    figsize=(10, 10),
    title='Evaluation set: samples per model',
)
ax.set_xlabel('Model')
ax.set_ylabel('Number of samples');

In [None]:
model_id = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize the ``text`` column of a batch, truncating to 512 tokens.

    ``padding=True`` pads every sequence to the longest one in the batch it is
    called with.
    """
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)


def _tokenized_split(frame):
    """Convert a DataFrame split to a ``Dataset`` and tokenize it in one batch."""
    split = Dataset.from_pandas(frame)
    # One batch spanning the whole split: together with padding=True this pads
    # every example to the longest sequence of that split, so all rows of a
    # split end up with the same length.
    return split.map(tokenize, batched=True, batch_size=len(split))


train_dataset = _tokenized_split(df_train)
eval_dataset = _tokenized_split(df_eval)
test_dataset = _tokenized_split(df_test)

In [None]:
# Summary of the tokenized training split.
train_dataset

In [None]:
# Summary of the tokenized test split.
test_dataset

In [None]:
# Summary of the tokenized evaluation split.
eval_dataset

## Training

Before anything else, we need to check whether a GPU is available and select the device to train on.

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

Load the model locally.

In [None]:
# Pretrained BERT checkpoint with a NUM_LABELS-way sequence-classification head;
# the id <-> label maps are passed along so predictions can be reported by name.
model = BertForSequenceClassification.from_pretrained(
    model_id,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)
# Trailing semicolon keeps the cell from printing the entire module tree.
model.to(device);

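Choose an output directory for checkpoints and logs (use the name of your Hugging Face repository here if you plan to push the model to the Hub), then configure the training run.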

In [None]:
import inspect

# Checkpoints and logs are written below this directory.
output_dir = 'logs/bert-base-uncased-llm-classificator'

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, and the old name is no longer accepted by recent ones. Use whichever
# keyword the installed version exposes so this cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# NOTE(review): save_steps is left at its library default, so checkpoints (the
# candidates for load_best_model_at_end) may be written less often than the
# evaluation every 50 steps - confirm this is intended.
training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_dir=f"{output_dir}/logs",
    logging_steps=50,
    eval_steps=50,
    save_strategy="steps",
    load_best_model_at_end=True,
    **{_eval_strategy_kwarg: "steps"},
)

def compute_metrics(pred):
    """Compute accuracy plus macro and per-class precision/recall/F1.

    Args:
        pred: Prediction object handed over by ``Trainer``. Only ``label_ids``
            (true class ids) and ``predictions`` (per-class scores, classes on
            the last axis) are read.

    Returns:
        dict with scalar ``Accuracy``, ``Macro_F1``, ``Macro_Precision`` and
        ``Macro_Recall``, and list-valued ``Class_Precision``,
        ``Class_Recall`` and ``Class_F1`` holding one entry per class id, in
        class-id order.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # zero_division=0 yields the same values as the default but without an
    # UndefinedMetricWarning whenever a class is never predicted.
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )

    # Pin the class ids to 0..C-1 (C = width of the score matrix) so the
    # per-class lists always have C entries and entry i always refers to class
    # i, even when a class is absent from both the labels and the predictions.
    class_ids = list(range(pred.predictions.shape[-1]))
    class_precision, class_recall, class_f1, _ = precision_recall_fscore_support(
        labels, preds, labels=class_ids, average=None, zero_division=0
    )

    acc = accuracy_score(labels, preds)

    metrics = {
        'Accuracy': acc,
        'Macro_F1': macro_f1,
        'Macro_Precision': macro_precision,
        'Macro_Recall': macro_recall,
        'Class_Precision': class_precision.tolist(),
        'Class_Recall': class_recall.tolist(),
        'Class_F1': class_f1.tolist()
    }

    return metrics
    

# Tie together the model, the training configuration, the train/eval splits
# and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)




In [None]:
# Run the fine-tuning loop configured in training_args.
trainer.train()

In [None]:
# Disabled: evaluate on the held-out test split and tabulate the first metrics.
# Kept as comments so the cell no longer echoes a stray string literal.
# q = [trainer.evaluate(eval_dataset=test_dataset)]
# pd.DataFrame(q, index=["test"]).iloc[:, :8]

In [None]:
def plot_confusion_matrix(y_predicted, y_true, labels):
    """Plot a confusion matrix normalised over the true classes.

    Args:
        y_predicted: Predicted class ids, one per sample.
        y_true: Ground-truth class ids, one per sample.
        labels: Tick labels, one per class, given in the row/column order of
            the matrix.
    """
    # normalize='true' divides each row by the number of samples of that true
    # class.
    cm = confusion_matrix(y_true, y_predicted, normalize='true')
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    # Rotate the x tick labels: long class names overlap when drawn horizontally.
    disp.plot(cmap='Blues', values_format='.2f', ax=ax, colorbar=False, xticks_rotation=45)
    ax.set_title('Normalized Confusion Matrix')
    fig.tight_layout()
    plt.show()


# Score the test split and keep the highest-scoring class for each sample.
predictions = trainer.predict(test_dataset)
y_predictions = predictions.predictions.argmax(-1)

# Ground-truth class ids of the same rows.
y_valid = test_dataset['labels']

plot_confusion_matrix(y_predictions, y_valid, labels)


In [None]:
# Metrics computed by trainer.predict on the test split.
predictions.metrics

## Save the model locally

In [None]:
# Directory that receives the exported model and tokenizer.
model_path = 'bert-base-uncased-llm-classificator'
trainer.save_model(model_path)
# The tokenizer was not handed to the Trainer, so it is saved explicitly into
# the same directory.
tokenizer.save_pretrained(model_path)

In [None]:
# Drop the notebook's reference first, then collect garbage, then return cached
# CUDA blocks to the driver. Collecting before the `del` cannot reclaim the model.
# NOTE: `trainer` still references the same model; delete it as well if the
# memory must actually be released.
del model
gc.collect()
torch.cuda.empty_cache()