In [9]:
!pip install evaluate



In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset, Dataset
from sklearn.preprocessing import LabelEncoder
import numpy as np
import evaluate
import torch

ModuleNotFoundError: No module named 'datasets'

# Dataset uploading

In [None]:
train_set = load_dataset("csv", data_files="/kaggle/input/python-codes-time-complexity/train_set.csv")
test_set = load_dataset("csv", data_files="/kaggle/input/python-codes-time-complexity/test_set.csv")

train_labels = train_set['train']['complexity']
test_labels = test_set['train']['complexity']

In [None]:
train_set

# Checkpoint

In [1]:
checkpoint = "Salesforce/codet5p-770m-py"

# Tokenizing

## Label tokenizing

In [5]:
labelEncoder = LabelEncoder()
labelEncoder.fit(train_labels)

## Feature tokenizing

In [6]:
def tokenize_data(samples, tokenizer):
    tokenized = tokenizer(samples['code'], truncation=True, max_length=512)
    tokenized['labels'] = labelEncoder.transform(samples['complexity'])
    return tokenized


def set_tokenizer(checkpoint):
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    X_train = train_set.map(lambda x: tokenize_data(x, tokenizer), batched=True)
    X_eval = test_set.map(lambda x: tokenize_data(x, tokenizer), batched=True)

    # Collator for batch padding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    return tokenizer, data_collator, X_train, X_eval

# Model

In [2]:
def set_model(checkpoint, lora=None):
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=7) if not lora else lora
    return model

# Hyperparameters

In [8]:
def set_training_args(checkpoint, batch_size=16):
    training_args = TrainingArguments(output_dir=f"training_results/{checkpoint}/", 
                                      eval_strategy="epoch",
                                      save_strategy="epoch",
                                      fp16=True,
                                      report_to='none',
                                      num_train_epochs=3, 
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=batch_size,
                                      gradient_accumulation_steps = 4,
                                      logging_steps=100,
                                      load_best_model_at_end=True,
                                      #label_names=['complexity']
                                     )
    return training_args

# Evaluating

In [9]:
def compute_metrics(eval_preds):
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits[0], axis=-1) if isinstance(logits, tuple) else np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# LoRA

In [3]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, 
    lora_alpha=16,
    target_modules = ["q_proj", "v_proj"] # Not sure about this
    lora_dropout=0.1,
    bias='none',
    modules_to_save=['classifier'] # Not sure about this one either
)

model_lora = get_peft_model(model=set_model("Salesforce/codet5p-770m-py"), peft_config=config)
model_lora.print_trainable_parameters()

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3909250928.py, line 6)

# Trainer 

In [None]:
def train(checkpoint):
    tokenizer, data_collator, train_set, eval_set = set_tokenizer(checkpoint)
    model = set_model(model_lora)
    training_args = set_training_args(checkpoint=checkpoint, batch_size=2)

    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set['train'],
    eval_dataset=eval_set['train'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,)
    
    trainer.train()


mass_train_models(checkpoint)

# Flushing CUDA

In [None]:
!pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()                           

Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=422dc88337d0f9d865bf23d8227a89228015376ec3c628b3972e8f6970568db7
  Stored in directory: /root/.cache/pip/wheels/a9/8a/bd/81082387151853ab8b6b3ef33426e98f5cbfebc3c397a9d4d0
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0
Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% | 85% |
|  1 |  0% | 76% |


# Inference

In [None]:
print(torch.cuda.device_count())

In [None]:
device = torch.cuda.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def predict(inputs):
    # Tokenizing inputs
    test_sample = tokenizer(inputs, return_tensors='pt')
    inputs = Dataset.from_dict({key: value.to(model.device) for key, value in test_sample.items()})

    # Predicting & decoding inputs
    preds = trainer.predict(test_dataset=inputs)
    preds = labelEncoder.inverse_transform(y=np.ravel(np.argmax(preds.predictions, axis=-1)))
    
    return preds

In [None]:
test_sample = """
class Solution:
    def isValid(self, s: str) -> bool:
        bracketMap = {"(": ")", "[": "]", "{": "}"}
        openSet = set(["(", "[", "{"])
        stack = []
        for char in s:
            if char in openSet:
                stack.append(char)
            elif stack and char == bracketMap[stack[-1]]:
                stack.pop()
            else:
                return False
        return stack == []
        """

predict(test_sample)