In [1]:
pip install pandas scikit-learn peft datasets tensorboardX numba --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, make_scorer
import os 
import torch
import pdb

# Dataset uploading

In [3]:
DATASET_PATHS = {
    "local": {
        "train": "../../datasets/train_set.csv",
        "test": "../../datasets/test_set.csv"
    },
    "local_two": {
        "train": "train_set.csv",
        "test": "test_set.csv"
    },
    
    "kaggle": {
        "train": "/kaggle/input/python-codes-time-complexity/train_set.csv",
        "test": "/kaggle/input/python-codes-time-complexity/test_set.csv"
    }
}

def upload_datasets(dataset_paths=DATASET_PATHS):
    for path in dataset_paths:
        if os.path.exists(dataset_paths[path]['train']) and os.path.exists(dataset_paths[path]['test']):
            return dataset_paths[path]['train'], dataset_paths[path]['test']

    return FileNotFoundError(f"Datasets do not exist in the current paths: {dataset_paths}")
            

train_set_path, test_set_path = upload_datasets()

# Metrics

### Ordering labels by Hierarchy

In [4]:
LABELS_HIERARCHY = {
    'constant': 1,
    'logn': 2,
    'linear': 3,
    'nlogn': 4,
    'quadratic': 5,
    'cubic': 6,
    'np': 7
}

N_CLASSES = len(LABELS_HIERARCHY)

# Dataset uploading

In [5]:
train_set = load_dataset("csv", data_files=train_set_path)['train']
test_set = load_dataset("csv", data_files=test_set_path)['train']

train_labels = train_set['complexity']
test_labels = test_set['complexity']

# Checkpoint

In [6]:
checkpoint = "deepseek-ai/deepseek-coder-1.3b-base"

# Evaluating

### Writing the custom metric *Hierarchy Complexity Score*

In [7]:
def hc_score(y_true, y_pred, n_classes=N_CLASSES):
    assert len(y_true) == len(y_pred), f"The amount of y_true labels: {len(y_true)} does not equal to the amount of y_pred: {len(y_pred)}."

    n_samples = len(y_true)
    
    return (np.sum(np.abs(y_pred - y_true)) / n_classes) / n_samples

## Computing metrics

In [8]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits[0], axis=-1) if isinstance(logits, tuple) else np.argmax(logits, axis=-1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    # Calculate F-1 Macro
    f1_macro_score = f1_score(labels, preds, average='macro')
    # Calculate Hierarchy Score
    hierarchy_score = hc_score(labels, preds)

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro_score,
        "hierarchy_score": hierarchy_score
    }

# Tokenizing

## Label tokenizing

In [9]:
labelEncoder = LabelEncoder()
labelEncoder.fit(train_set['complexity'])

## Feature tokenizing

In [10]:
def tokenize_data(samples, tokenizer):
    tokenized = tokenizer(samples['code'], truncation=True, max_length=512)
    tokenized['labels'] = labelEncoder.transform(samples['complexity'])
    return tokenized


def set_tokenizer(checkpoint):
    try:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    except Exception as e:
        print(f"Failed to load {checkpoint}: {e}")
        checkpoint = "-".join(checkpoint.split("-")[:2])
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        print(f"Falling back to {checkpoint}")
    
    X_train = train_set.map(lambda x: tokenize_data(x, tokenizer), batched=True)
    X_eval = test_set.map(lambda x: tokenize_data(x, tokenizer), batched=True)

    # Collator for batch padding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    return tokenizer, data_collator, X_train, X_eval

# Model

In [11]:
def set_model(checkpoint):
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=7, torch_dtype='bfloat16')

    return model

# Hyperparameters

In [12]:
def set_training_args(checkpoint, batch_size=16):
    training_args = TrainingArguments(output_dir=f"training_results/{checkpoint}/", 
                                      eval_strategy="epoch",
                                      save_strategy="epoch",
                                      logging_strategy="epoch",
                                      #learning_rate=2e-4, # Testing
                                      bf16=True, 
                                      report_to='tensorboard',
                                      num_train_epochs=3,
                                      #warmup_steps=100, # Testing
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=batch_size,
                                      gradient_accumulation_steps = 4, # Testing
                                      load_best_model_at_end=True,
                                     )
    return training_args

# LoRA

#### Check module names in the model to specify them in *target_modules* param

model = set_model(checkpoint)
for name, module in model.named_modules():
    print(name)

## LoRA config

config = LoraConfig(
    r=8, 
    lora_alpha=8,
    #target_modules = ["query", "key", 'value', 'dense', 'word_embeddings'], # Not sure about this
    lora_dropout=0.1,
    bias='none',
    modules_to_save=['classifier'], # Not sure about this one either
    task_type = "SEQ_CLS"
)

model_lora = get_peft_model(model=set_model(checkpoint), peft_config=config)
model_lora.print_trainable_parameters()

### Flash the drive

!rm -rf training_results

# Trainer

In [13]:
model = set_model(checkpoint)

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-coder-1.3b-base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def finetune(checkpoint):
    # Collecting
    tokenizer, data_collator, train_set, eval_set = set_tokenizer(checkpoint)
    #model = model_lora
    training_args = set_training_args(checkpoint=checkpoint, batch_size=2)
    
    # Building
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=eval_set,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    
    # Train
    trainer.train()
    
    # Save metrics
    test_metrics = trainer.evaluate(eval_dataset=eval_set)
    trainer.save_metrics(split="test", metrics=test_metrics)

    return trainer

trainer = finetune(checkpoint)

tokenizer_config.json:   0%|          | 0.00/793 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Map:   0%|          | 0/3911 [00:00<?, ? examples/s]

Map:   0%|          | 0/978 [00:00<?, ? examples/s]

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


ValueError: Cannot handle batch sizes > 1 if no padding token is defined.

# Flushing CUDA

!pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

free_gpu_cache()

# Inference

In [None]:
device = torch.cuda.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
tokenizer, data_collator, train_set, eval_set = set_tokenizer(checkpoint)


In [None]:
def predict(inputs):
    # Tokenizing inputs
    test_sample = tokenizer(inputs, return_tensors='pt', padding=True, truncation=True)
    inputs = Dataset.from_dict({key: value.to(model.device) for key, value in test_sample.items()})

    # Predicting & decoding inputs
    preds = trainer.predict(test_dataset=inputs)
    preds = labelEncoder.inverse_transform(y=np.ravel(np.argmax(preds.predictions[0], axis=-1)))
    
    return preds

In [None]:
test_sample = """
class Solution:
    def topKFrequent(self, nums: List[int], k: int) -> List[int]:
        count = {}
        for num in nums:
            count[num] = 1 + count.get(num, 0)

        arr = []
        for num, cnt in count.items():
            arr.append([cnt, num])
        arr.sort()

        res = []
        while len(res) < k:
            res.append(arr.pop()[1])
        return res
        """

predict(test_sample)