## Dependencies

In [None]:
! pip install torch pandas datasets transformers scikit-learn datasets 

## Imports

In [None]:
import torch, gc
import json
from torch import cuda
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    BertForSequenceClassification, 
    BertTokenizerFast,
    TrainingArguments,
    Trainer
)
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

## Data preprocessing

Load each model's validation completions from its local JSON file, tag every record with a numeric label and the model name, and combine everything into a single pandas DataFrame so the data can be analysed.

In [None]:
def load_completions(path, label, category):
    """Read one model's completions file and return it as a labelled DataFrame.

    Every record of the JSON array loses its ``id`` field, has ``Completion``
    renamed to ``text`` and gains the integer ``labels`` class id and the
    ``category`` model name. All other fields are kept unchanged.

    Args:
        path: Location of the JSON file (an array of objects).
        label: Integer class id written to the ``labels`` column.
        category: Model name written to the ``category`` column.

    Returns:
        A ``pandas.DataFrame`` with one row per record.

    Raises:
        KeyError: If a record lacks the ``id`` or ``Completion`` field.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(path, encoding='utf-8') as f:
        records = json.load(f)

    rows = []
    for record in records:
        row = dict(record)  # work on a copy so the parsed input stays untouched
        del row['id']
        row['text'] = row.pop('Completion')
        row['labels'] = label
        row['category'] = category
        rows.append(row)
    return pd.DataFrame(rows)


# One frame per source model; the integer label identifies the generating model.
df_llama = load_completions(
    'meta-llama/Meta-Llama-3-8B-Instruct_valid.json', 0, 'Meta-Llama-3-8B-Instruct'
)
df_phi3 = load_completions(
    'microsoft/Phi-3-mini-4k-instruct_valid.json', 1, 'Phi-3-mini-4k-instruct'
)
df_mixtral = load_completions(
    'mistralai/Mixtral-8x7B-Instruct-v0.1_valid.json', 2, 'Mixtral-8x7B-Instruct-v0.1'
)
df_gpt4 = load_completions('openai/GPT4_valid.json', 3, 'GPT4')

df_combined = pd.concat([df_llama, df_phi3, df_mixtral, df_gpt4], ignore_index=True)

df_combined

### Combined dataset

In [None]:
# (rows, columns) of the combined frame.
df_combined.shape

In [None]:
# Preview the first rows of the combined frame.
df_combined.head()

In [None]:
# Number of samples contributed by each class id in the combined data.
labels_combined = df_combined['labels']
label_counts_combined = labels_combined.value_counts()
label_counts_combined

In [None]:
# Class names, ordered so that a name's position is the integer id stored in
# the `labels` column when the data was loaded.
labels = [
    'Meta-Llama-3-8B-Instruct',
    'Phi-3-mini-4k-instruct',
    'Mixtral-8x7B-Instruct-v0.1',
    'GPT4',
]
labels

In [None]:
NUM_LABELS = len(labels)

# Two-way lookup between integer class ids and class names.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Integer class id -> class name.
id2label

In [None]:
# Class name -> integer class id.
label2id

In [None]:
target_variable = 'labels'

# train_test_split returns the (1 - test_size) share first and the test_size
# share second. Here the second, larger share (80%) is used as the training
# set and the first, smaller share is held out for testing and evaluation.
test_size = 0.8

df_test_eval, df_train = train_test_split(
    df_combined,
    test_size=test_size,
    stratify=df_combined[target_variable],  # keep class proportions in both parts
    random_state=42,
)

print("Train Set Shape:", df_train.shape)
print("Test and Evaluation Sets Shape:", df_test_eval.shape)

# Per-class sample counts, to confirm the stratification.
label_counts_train = df_train[target_variable].value_counts()
label_counts_test_eval = df_test_eval[target_variable].value_counts()
print("Label counts in train set:\n", label_counts_train)
print("Label counts in test and evaluation sets:\n", label_counts_test_eval)

In [None]:
# Samples per source model in the training split.
df_train['category'].value_counts().plot.bar(figsize=(10, 10))

Split the held-out data into a test set and an evaluation (validation) set. `train_test_split` lets us control the proportion of data that goes to the evaluation set, and stratifying on the label keeps the class distribution similar in both sets.

In [None]:
target_variable = 'labels'

# Halve the held-out data: one half for testing, the other for evaluation.
test_size = 0.5

df_test, df_eval = train_test_split(
    df_test_eval,
    test_size=test_size,
    stratify=df_test_eval[target_variable],  # keep class proportions in both halves
    random_state=42,
)

print("Test Set Shape:", df_test.shape)
print("Evaluation Set Shape:", df_eval.shape)

# Per-class sample counts, to confirm the stratification.
label_counts_test = df_test[target_variable].value_counts()
label_counts_eval = df_eval[target_variable].value_counts()
print("Label counts in test set:\n", label_counts_test)
print("Label counts in evaluation set:\n", label_counts_eval)

In [None]:
# Samples per source model in the test split.
df_test['category'].value_counts().plot.bar(figsize=(10, 10))

In [None]:
# Samples per source model in the evaluation split.
df_eval['category'].value_counts().plot.bar(figsize=(10, 10))

In [None]:
model_id = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize the ``text`` field of a batch.

    Sequences are truncated to at most 512 tokens and padded to the longest
    sequence in the batch.
    """
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)


def _tokenize_frame(frame):
    """Turn a DataFrame into a ``Dataset`` and tokenize it as one single batch."""
    dataset = Dataset.from_pandas(frame)
    # The batch spans the whole split, so every row of a split is padded to
    # the same length.
    return dataset.map(tokenize, batched=True, batch_size=len(dataset))


train_dataset = _tokenize_frame(df_train)
eval_dataset = _tokenize_frame(df_eval)
test_dataset = _tokenize_frame(df_test)

In [None]:
# Summary of the tokenized training split.
train_dataset

In [None]:
# Summary of the tokenized test split.
test_dataset

In [None]:
# Summary of the tokenized evaluation split.
eval_dataset

## Training

Before training, check whether a GPU is available and select the device to run on.

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

Load the model locally.

In [None]:
# Load the pretrained checkpoint as a sequence classifier with one output per
# class and the id <-> name mappings attached, then move it to the chosen device.
model = BertForSequenceClassification.from_pretrained(
    model_id,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

Choose the local output directory for checkpoints and logs, then set up the training arguments, the metrics function and the `Trainer`.

In [None]:
output_dir = 'logs/bert-base-uncased-llm-classificator'

# The `evaluation_strategy` keyword was renamed to `eval_strategy`; the old
# spelling is deprecated and rejected by recent transformers releases, while
# older releases only know the old one. Use whichever name the installed
# TrainingArguments dataclass declares, so the cell runs on either.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

# NOTE(review): `save_steps` is left at the library default. As far as I can
# tell, `load_best_model_at_end` can only restore a checkpoint that was
# actually saved -- confirm the default save interval suits the dataset size.
training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_dir=f"{output_dir}/logs",
    logging_steps=50,
    eval_steps=50,               # evaluate at the same cadence as logging
    save_strategy="steps",
    load_best_model_at_end=True,
    **{eval_strategy_key: "steps"},
)

def compute_metrics(pred):
    """Compute accuracy plus micro- and macro-averaged precision, recall and F1.

    Args:
        pred: Evaluation output exposing ``label_ids`` (the true class ids) and
            ``predictions`` (an array whose last axis is arg-maxed to obtain
            the predicted class id).

    Returns:
        dict with the keys ``Accuracy``, ``Micro_F1``, ``Micro_Precision``,
        ``Micro_Recall``, ``Macro_F1``, ``Macro_Precision`` and
        ``Macro_Recall``, in that order.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    metrics = {'Accuracy': accuracy_score(y_true, y_pred)}

    for prefix, average in (('Micro', 'micro'), ('Macro', 'macro')):
        # zero_division=0 yields the same values as the default but without a
        # warning whenever a class receives no predictions (common early on).
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average=average, zero_division=0
        )
        metrics[f'{prefix}_F1'] = f1
        metrics[f'{prefix}_Precision'] = precision
        metrics[f'{prefix}_Recall'] = recall

    return metrics
    

# Bundle the model, the training configuration, the train/evaluation splits
# and the metrics callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)




In [None]:
# Fine-tune the model with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Score the trained model on the test split.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
q = [test_metrics]

# One-row table labelled "test", limited to the first eight metric columns.
pd.DataFrame(q, index=["test"]).iloc[:, :8]

## Save the model locally

In [None]:
# Store the model and its tokenizer side by side in the same directory.
model_path = 'bert-base-uncased-llm-classificator'
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Drop the notebook's references to the model first -- the Trainer holds one
# of its own -- so the garbage collector has something to reclaim, and only
# then ask PyTorch to release its cached GPU memory.
del model, trainer
gc.collect()
torch.cuda.empty_cache()