# Installs

In [None]:
#!pip install transformers accelerate peft bitsandbytes datasets torch mlflow -q
#!pip install flash-attn --no-build-isolation -q

Note: you may need to restart the kernel to use updated packages.


In [None]:
#!pip install mlflow 

# Imports

In [None]:
import os
import pdb
from inspect import signature
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow

import numpy as np
import torch
import torch.nn as nn
from accelerate import PartialState
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score, make_scorer, recall_score, \
                            confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
    TrainerCallback
)
from transformers.modeling_outputs import SequenceClassifierOutput

from datasets import Dataset, load_dataset

## Device

In [None]:
# Use a CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Config

In [None]:
# Identifier of the pretrained checkpoint used for both the tokenizer and the model.
checkpoint = "deepseek-ai/DeepSeek-Coder-V2-Lite-Base"
# Per-device batch size; passed to TrainingArguments for training and evaluation.
batch_size=16

# MLFlow config

In [None]:
# Fail fast instead of hanging for a long time when the tracking server is unreachable.
# The timeout (in seconds) has to be set through the environment variable:
# assigning to `mlflow.environment_variables.MLFLOW_HTTP_REQUEST_TIMEOUT` only
# rebinds a module attribute and replaces MLflow's setting object with a plain int.
os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "10"

# Tracking server that receives runs, metrics and artifacts.
mlflow.set_tracking_uri("https://nondecayed-laurinda-pleiophyllous.ngrok-free.dev")

# Experiment under which all runs from this notebook are grouped.
mlflow.set_experiment("MLFlow and metrics interpretability testing")

## Google drive mount


from google.colab import drive
drive.mount("/content/drive")

# Dataset 

## Paths

In [None]:
# Directory containing this file. Notebooks and interactive sessions do not
# define `__file__`, so fall back to the current working directory there.
try:
    BASE_LOCATION: Path = Path(__file__).parent
except NameError:
    BASE_LOCATION = Path.cwd()

# Root used for the "local" layout: the 4th ancestor of BASE_LOCATION. Shallow
# directories (e.g. a notebook working directory close to the filesystem root)
# do not have that many ancestors; use BASE_LOCATION itself in that case so the
# lookup simply misses instead of raising IndexError.
_ancestors = BASE_LOCATION.parents
_LOCAL_ROOT: Path = _ancestors[3] if len(_ancestors) > 3 else BASE_LOCATION


# Datasets
# Candidate locations of the train/eval CSV files, keyed by environment name.
DATASET_PATHS = {
    "local": {
        "train": _LOCAL_ROOT / "datasets/data/train_set.csv",
        "eval": _LOCAL_ROOT / "datasets/data/eval_set.csv",
    },
    "local_two": {"train": "train_set.csv", "eval": "eval_set.csv"},
    "local_three": {
        "train": "drive/MyDrive/fine_tuning/train_set.csv",
        "eval": "drive/MyDrive/fine_tuning/eval_set.csv",
    },
    "kaggle": {
        "train": "/kaggle/input/python-codes-time-complexity/train_set.csv",
        "eval": "/kaggle/input/python-codes-time-complexity/eval_set.csv",
    },
}

## Uploading

In [None]:
def upload_datasets(dataset_paths=None):
    """Return the train/eval CSV paths of the first location where both files exist.

    Args:
        dataset_paths: Mapping of a location name to a
            ``{"train": path, "eval": path}`` dict. Locations are tried in
            insertion order. Defaults to the module-level ``DATASET_PATHS``,
            looked up at call time so a re-defined mapping is picked up.

    Returns:
        A ``(train_path, eval_path)`` tuple taken from the first location whose
        two files both exist.

    Raises:
        FileNotFoundError: If no location contains both files.
    """
    if dataset_paths is None:
        dataset_paths = DATASET_PATHS

    for location in dataset_paths.values():
        train_path, eval_path = location["train"], location["eval"]
        if os.path.exists(train_path) and os.path.exists(eval_path):
            print("Data found!")
            return train_path, eval_path

    # Raise (not return) the error so callers do not try to unpack an exception object.
    raise FileNotFoundError(f"Datasets do not exist in the current paths: {dataset_paths}")

# Resolve the CSV paths from the first location in DATASET_PATHS that has both files.
train_set_path, eval_set_path = upload_datasets()

# Each call loads a single CSV; load_dataset exposes a lone data file under the
# default "train" split, which is why both sets are read from ["train"].
train_set = load_dataset("csv", data_files=train_set_path)["train"]
eval_set = load_dataset("csv", data_files=eval_set_path)["train"]

# Metrics

### Hierarchy score

In [None]:
# Complexity classes with their rank in the hierarchy (1 = constant ... 7 = np).
LABELS_HIERARCHY = {
    'constant': 1,
    'logn': 2,
    'linear': 3,
    'nlogn': 4,
    'quadratic': 5,
    'cubic': 6,
    'np': 7
}

N_CLASSES = len(LABELS_HIERARCHY)


# Hierarchy score
def hc_score(y_true, y_pred, n_classes=N_CLASSES):
    """Return the mean absolute label distance, scaled down by the number of classes.

    Computes ``sum(|y_pred - y_true|) / n_classes / n_samples``. A value of 0
    means every prediction equals its label; larger values mean predictions
    are numerically further away from the labels.

    Args:
        y_true: Sequence or array of integer labels.
        y_pred: Sequence or array of integer predictions, same length as ``y_true``.
        n_classes: Number of classes used for scaling. Defaults to ``N_CLASSES``.

    Returns:
        The score as a NumPy floating-point scalar.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # Accept plain lists/tuples as well as arrays (list - list is not defined).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Explicit check instead of `assert`, which is stripped when running with -O.
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"The amount of y_true labels: {len(y_true)} does not equal to the amount of y_pred: {len(y_pred)}."
        )

    n_samples = len(y_true)

    return (np.sum(np.abs(y_pred - y_true)) / n_classes) / n_samples

### Other metrics

In [None]:
def compute_metrics(eval_preds):
    """Compute accuracy, macro F1, per-class recall and the hierarchy score.

    Side effect: stores the predictions and labels of this evaluation in the
    module-level globals ``last_preds`` / ``last_labels`` so that callbacks can
    read them.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` may be an array or a
            tuple whose first element holds the class logits.

    Returns:
        Dict with ``accuracy``, ``f1_macro``, ``recall_score`` (dict mapping
        each complexity name to its recall rounded to 2 decimals) and
        ``hierarchy_score``.
    """
    # Make preds & labels global for access in callbacks
    global last_preds, last_labels

    logits, labels = eval_preds
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)

    # Save for callbacking
    last_preds, last_labels = preds, labels

    accuracy = accuracy_score(y_true=labels, y_pred=preds)
    f1_macro_score = f1_score(y_true=labels, y_pred=preds, average="macro")

    # Encoder ids listed in ascending complexity order: the label encoder does
    # not assign ids in that order, so position i here is the id of the i-th
    # class of LABELS_HIERARCHY.
    encoder_ids_by_complexity = [0, 1, 4, 5, 2, 3, 6]

    recall_scores = recall_score(
        y_true=labels, y_pred=preds, average=None, labels=encoder_ids_by_complexity
    )
    recall_per_class = {
        label: np.round(score, 2)
        for label, score in zip(LABELS_HIERARCHY, recall_scores)
    }

    # The hierarchy score measures distance between complexity ranks, so map
    # encoder ids to their rank first; subtracting raw encoder ids would
    # measure distance in encoder order instead. argsort of the ordering is
    # its inverse permutation (id -> rank).
    rank_of_encoder_id = np.argsort(encoder_ids_by_complexity)
    hierarchy_score = hc_score(
        y_true=rank_of_encoder_id[np.asarray(labels)],
        y_pred=rank_of_encoder_id[np.asarray(preds)],
    )

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro_score,
        "recall_score": recall_per_class,
        "hierarchy_score": hierarchy_score,
    }

### Eval metric callbacks

In [None]:
class ConfusionMatrixCallback(TrainerCallback):
    """Trainer callback that logs a confusion-matrix image to MLflow after an evaluation."""

    def on_evaluate(self, args, state, control, **kwargs):
        """Plot the confusion matrix of the latest evaluation and log it as an artifact.

        Reads the module-level globals ``last_preds`` / ``last_labels`` written
        by ``compute_metrics``. Does nothing when they have not been set yet.
        """
        # Look the globals up defensively: a bare `last_preds` raises NameError
        # if no evaluation has populated it, which made the None check useless.
        preds = globals().get("last_preds")
        labels = globals().get("last_labels")
        if preds is None or labels is None:
            return

        disp = ConfusionMatrixDisplay.from_predictions(
            y_true=labels,
            y_pred=preds,
            # Encoder ids in ascending complexity order (the label encoder does
            # not assign ids in that order); display_labels follow the same order.
            labels=[0, 1, 4, 5, 2, 3, 6],
            display_labels=["O(1)", "O(logn)", "O(n)", "O(nlogn)",
                            "O(n^2)", "O(n^3)", "np"],
        )
        fig = disp.figure_

        # Make slightly wider to fit xtick labels
        fig.set_size_inches(10, 5)
        fig.tight_layout()

        # Save as png, then close the figure so the notebook does not render it
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")


class RecallScoreCallback(TrainerCallback):
    """Trainer callback that logs a per-class recall bar chart to MLflow after an evaluation."""

    def on_evaluate(self, args, state, control, **kwargs):
        """Draw the recall-per-class bar plot from the evaluation metrics and log it."""
        # Per-class recall dict found under 'eval_recall_score' in the metrics
        per_class_recall = kwargs['metrics']['eval_recall_score']
        class_names = np.array(list(per_class_recall.keys()))
        recall_values = np.array(list(per_class_recall.values()))

        # One bar per complexity class
        axes = sns.barplot(x=class_names, y=recall_values)
        axes.set_xlabel(xlabel="Complexity", labelpad=20, fontsize=14)
        axes.set_ylabel(ylabel="Recall score", labelpad=20, fontsize=14)

        # Write the png, close the figure so the notebook does not render it,
        # then upload the file to MLflow
        figure = axes.get_figure()
        figure.savefig("recall_per_score.png")
        plt.close(figure)
        mlflow.log_artifact("recall_per_score.png")

# Tokenizing

In [None]:
# Tokenization
# Setting up Label Encoder
# Fit the encoder on the training labels; it is used to turn the "complexity"
# strings into integer class ids during tokenization and back during inference.
labelEncoder = LabelEncoder()
labelEncoder.fit(train_set["complexity"])

def tokenize_data(data, tokenizer):
    """Tokenize the "code" column and attach encoded "complexity" labels.

    Args:
        data: Batch with "code" and "complexity" entries.
        tokenizer: Tokenizer callable applied to the code snippets.

    Returns:
        The tokenizer output with an added "labels" entry produced by the
        module-level ``labelEncoder``.
    """
    # Sequences longer than 512 tokens are truncated
    encoded = tokenizer(data["code"], truncation=True, max_length=512)
    encoded["labels"] = labelEncoder.transform(data["complexity"])
    return encoded

def set_tokenizer(checkpoint):
    """Load the tokenizer, tokenize the global train/eval sets and build a padding collator.

    If the tokenizer for ``checkpoint`` cannot be loaded, a shortened checkpoint
    name (the first two "-"-separated parts) is tried instead.

    Args:
        checkpoint: Name or path of the checkpoint whose tokenizer is loaded.

    Returns:
        ``(tokenizer, data_collator, X_train, X_eval)`` where the two datasets
        are the tokenized versions of the module-level ``train_set`` and
        ``eval_set`` with their original columns removed.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, pad_token="<pad>")
    except Exception as e:
        # Deliberate best-effort fallback: report the failure and retry with a shorter name.
        print(f"Failed to load {checkpoint}: {e}")
        checkpoint = "-".join(checkpoint.split("-")[:2])
        print(f"Falling back to {checkpoint}")
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # The fallback tokenizer is loaded without an explicit pad token; the
    # padding collator needs one, so make sure it exists on either path.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "<pad>"})

    def tokenize_batch(batch):
        return tokenize_data(batch, tokenizer)

    X_train = train_set.map(
        tokenize_batch,
        batched=True,
        remove_columns=train_set.column_names,
    )
    X_eval = eval_set.map(
        tokenize_batch,
        batched=True,
        remove_columns=eval_set.column_names,
    )

    # Data Collator: pads each batch dynamically, on the left side
    tokenizer.padding_side = "left"
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    return tokenizer, data_collator, X_train, X_eval

In [None]:
# NOTE: this rebinds train_set / eval_set to their tokenized versions; the raw
# columns are removed by set_tokenizer, so "code" / "complexity" are no longer
# available under these names afterwards.
tokenizer, data_collator, train_set, eval_set = set_tokenizer(checkpoint)

# Model

## General model setup

In [None]:
# Model loading
def set_model(checkpoint, tokenizer, ModelType=AutoModel):
    """Load a pretrained model in bfloat16 and align it with the tokenizer.

    Args:
        checkpoint: Name or path of the pretrained checkpoint.
        tokenizer: Tokenizer whose vocabulary size and pad token id the model
            is aligned with.
        ModelType: Auto-class (or model class) whose ``from_pretrained`` is
            used. Defaults to ``AutoModel``.

    Returns:
        The loaded model with ``config.pad_token_id`` set from the tokenizer.
    """
    # Quantization is currently disabled. To enable it, build the config with
    # setup_bnb_config() and pass it as `quantization_config` below.
    model = ModelType.from_pretrained(
        checkpoint,
        torch_dtype=torch.bfloat16,
        num_labels=N_CLASSES,
        trust_remote_code=True,
        device_map="auto",
        # device_map=PartialState().process_index,
        # quantization_config=quant_config,
        # attn_implementation="flash_attention_2",  # only for newer models
    )

    # Accommodate tokens the tokenizer has but the model does not (e.g. an added
    # <pad> token). Only ever grow: when the model already has at least as many
    # embedding rows as the tokenizer has tokens, resizing would shrink the
    # matrix and discard pretrained rows for no benefit.
    embedding_rows = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_rows:
        model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

    # Passing the pad token id to the model config
    model.config.pad_token_id = tokenizer.pad_token_id
    return model

## Custom head for Deepseek v2

In [None]:
# Custom classifier head
class DeepseekV2ForSequenceClassification(PreTrainedModel):
    """Sequence-classification wrapper around an already loaded base model.

    A bias-free linear layer maps every hidden state to ``num_labels`` logits;
    the logits at the last non-padding token of each sequence are returned as
    the sequence-level prediction.
    """

    config_class = AutoConfig

    def __init__(self, base_model, config):
        """Wrap ``base_model`` and create the classification layer.

        Args:
            base_model: Model whose output exposes ``last_hidden_state``.
            config: Config providing ``num_labels``, ``pad_token_id`` and the
                hidden width (``n_embd`` or ``hidden_size``).
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = base_model

        # Prefer `n_embd` as before; fall back to `hidden_size` for configs
        # that name the hidden width that way.
        hidden_dim = getattr(config, "n_embd", None) or config.hidden_size
        self.dense = nn.Linear(hidden_dim, config.num_labels, bias=False, dtype=self.model.dtype)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the input embedding module of the wrapped base model."""
        # Must be the embedding module itself (not the embedding width), since
        # callers such as embedding resizing operate on the returned module.
        return self.model.get_input_embeddings()

    def forward(self, input_ids=None, attention_mask=None, labels=None, *args, **kwargs):
        """Run the base model and classify each sequence from its last non-padding token.

        Args:
            input_ids: Token ids, indexed as (batch, sequence). Required.
            attention_mask: Attention mask forwarded to the base model.
            labels: Optional target class ids; when given, a loss is computed.

        Returns:
            ``SequenceClassifierOutput`` with ``loss`` (or None) and the pooled
            per-sequence ``logits``.

        Raises:
            ValueError: If ``input_ids`` is missing, or if the batch holds more
                than one sequence while no padding token id is configured.
        """
        # Pooling below needs the token ids; fail with a clear message instead
        # of an UnboundLocalError further down.
        if input_ids is None:
            raise ValueError("input_ids must be provided.")

        outputs = self.model(input_ids, attention_mask)

        hidden_states = outputs.last_hidden_state
        logits = self.dense(hidden_states)

        batch_size = input_ids.shape[0]

        if self.config.pad_token_id is None:
            # Without a padding token only a single, unpadded sequence can be pooled
            if batch_size != 1:
                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
            last_non_pad_token = -1
        else:
            # Index of the right-most non-padding token of every sequence:
            # padded positions are zeroed out, so argmax picks the largest
            # remaining position index.
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, dtype=torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)

        # Pooling logits from the last non padded token across the batches
        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        # Calculating loss if labels are provided
        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                pooled_logits=pooled_logits,
                config=self.config,
            )

        return SequenceClassifierOutput(loss=loss, logits=pooled_logits)

## Quantization (BnB)

In [None]:
# Bitsandbytes (Quantization)
def setup_bnb_config():
    """Build the bitsandbytes config for 4-bit NF4 loading with bfloat16 compute/storage."""
    options = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_storage": torch.bfloat16,
    }
    return BitsAndBytesConfig(**options)

## LoRA

In [None]:
# LoRA config
# LoRA adapter settings. Only takes effect when the model is wrapped with
# get_peft_model(model=model, peft_config=peft_config).
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    # Narrower alternative that was used for Qwen:
    # target_modules = ['q_proj', 'v_proj'], # Qwen
    target_modules="all-linear",  # Heavy, universal: adapts every linear layer
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",  # NOTE(review): flagged by the author as possibly not working — verify
)

In [None]:
# Load the checkpoint through AutoModelForSequenceClassification (N_CLASSES labels).
model = set_model(checkpoint, tokenizer, AutoModelForSequenceClassification)
# Alternative (disabled): wrap a base model with the custom classification head.
# model = DeepseekV2ForSequenceClassification(model, model.config)
# Optional (disabled): add the LoRA adapters configured in peft_config.
#model = get_peft_model(model=model, peft_config=peft_config)

#print(f"Model: {model}")
# print(f"Model config: {model.config}")

# Training

## TrainingArgs

In [None]:
# Training configuration: evaluate, checkpoint and log once per epoch, train in
# bf16 for 3 epochs and report to MLflow.
training_args = TrainingArguments(
    # `checkpoint` contains a "/", so results land in a nested directory.
    output_dir=f"training_results/{checkpoint}/",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # eval_steps=5,
    # learning_rate=2e-4, # Testing
    bf16=True,
    #gradient_checkpointing=True,
    report_to="mlflow",
    num_train_epochs=3,
    #warmup_steps=100,  # Testing
    # Name of the label column produced by tokenize_data.
    label_names=["labels"],
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=1,
    # No metric_for_best_model is set here.
    # NOTE(review): "best" presumably falls back to the Trainer default (evaluation loss) — confirm.
    load_best_model_at_end=True,
    run_name="full data 3 epochs",
    #deepspeed="configs/ds_config.json",
)

## Trainer

In [None]:
# Build the Trainer: tokenized datasets, dynamic padding collator, the metric
# function, and the two callbacks that log plots to MLflow after each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=eval_set,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[ConfusionMatrixCallback(), RecallScoreCallback()]
)

In [None]:
# Run the training loop configured by training_args.
trainer.train()

# Evaluating

In [None]:
# Run a final evaluation on the eval set and persist its metrics for the "eval" split.
eval_metrics = trainer.evaluate(eval_dataset=eval_set)
trainer.save_metrics(split="eval", metrics=eval_metrics)

# Saving the full model: under FSDP, switch to a full state dict before saving.
if trainer.is_fsdp_enabled:
    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")

# `checkpoint` contains a "/", so the model is written to a nested directory.
trainer.save_model(f"./best_model/{checkpoint}/")
print("The best model was saved.")

# Flushing CUDA

In [None]:
!pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Release cached GPU memory and report GPU utilization before and after.

    Does nothing (apart from printing a notice) when CUDA is not available, so
    the cell is safe to run on CPU-only machines.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available; nothing to free.")
        return

    print("Initial GPU Usage")
    gpu_usage()

    # Hand PyTorch's cached, unused blocks back to the driver
    torch.cuda.empty_cache()

    # Hard reset of device 0 through numba.
    # NOTE(review): closing the CUDA context likely invalidates tensors/models
    # still living on that device — reload them before further GPU work; confirm.
    cuda.select_device(0)
    cuda.close()

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()

# Inference

In [None]:
# `torch.device` builds a device object; `torch.cuda.device` is a context
# manager for switching the current GPU and cannot represent "cpu".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Once the tokenizing cell has run, train_set / eval_set are already tokenized
# and no longer have a "code" column, so running set_tokenizer on them again
# would fail. Only (re)tokenize when the raw datasets are still in place, e.g.
# after a kernel restart; otherwise keep the existing tokenizer and collator.
if "code" in train_set.column_names:
    tokenizer, data_collator, train_set, eval_set = set_tokenizer(checkpoint)


In [None]:
def predict(inputs):
    """Predict the complexity label of one or more code snippets.

    Args:
        inputs: A single code string or a sequence of code strings.

    Returns:
        Array of decoded complexity labels, one per input snippet.
    """
    if isinstance(inputs, str):
        inputs = [inputs]

    # Tokenize like the training data (truncate to 512 tokens). Padding and
    # device placement are left to the trainer's collator and prediction loop,
    # so no tensors need to be created or moved here.
    encoded = tokenizer(list(inputs), truncation=True, max_length=512)
    dataset = Dataset.from_dict(dict(encoded))

    # Predicting & decoding inputs
    output = trainer.predict(test_dataset=dataset)
    logits = output.predictions
    # Predictions may be a tuple (logits first) or the logits array itself;
    # indexing an array with [0] would keep only the first sample.
    if isinstance(logits, tuple):
        logits = logits[0]

    class_ids = np.ravel(np.argmax(logits, axis=-1))
    return labelEncoder.inverse_transform(class_ids)

In [None]:
test_sample = """
class Solution:
    def topKFrequent(self, nums: List[int], k: int) -> List[int]:
        count = {}
        for num in nums:
            count[num] = 1 + count.get(num, 0)

        arr = []
        for num, cnt in count.items():
            arr.append([cnt, num])
        arr.sort()

        res = []
        while len(res) < k:
            res.append(arr.pop()[1])
        return res
        """

predict(test_sample)