In [30]:
!pip install evaluate



In [31]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset, Dataset
from sklearn.preprocessing import LabelEncoder
import numpy as np
import evaluate
import torch

# Dataset uploading

In [32]:
train_set = load_dataset("csv", data_files="/kaggle/input/python-codes-time-complexity/train_set.csv")
test_set = load_dataset("csv", data_files="/kaggle/input/python-codes-time-complexity/test_set.csv")

train_labels = train_set['train']['complexity']
test_labels = test_set['train']['complexity']

# Checkpoint pool

In [33]:
checkpoints = ["microsoft/codebert-base", "neulab/codebert-python", "microsoft/graphcodebert-base", 
              "Salesforce/codet5-base", "Salesforce/codet5-base-codexglue-sum-python", "Salesforce/codet5p-220m-py",
              "Salesforce/codet5-base-multi-sum", "microsoft/unixcoder-base"]

# Tokenizing

## Label tokenizing

In [34]:
labelEncoder = LabelEncoder()
labelEncoder.fit(train_labels)

## Feature tokenizing

In [61]:
def tokenize_data(samples, tokenizer):
    tokenized = tokenizer(samples['code'], truncation=True)
    tokenized['labels'] = labelEncoder.transform(samples['complexity']) if samples['complexity'] else None
    return tokenized


def set_tokenizer(checkpoint):
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    X_train = train_set.map(lambda x: tokenize_data(x, tokenizer), batched=True)
    X_eval = test_set.map(lambda x: tokenize_data(x, tokenizer), batched=True)

    # Collator for batch padding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    return tokenizer, data_collator, X_train, X_eval

# Model

In [62]:
def set_model(checkpoint):
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=7)
    return model

# Hyperparameters

In [63]:
def set_training_args(checkpoint):
    training_args = TrainingArguments(output_dir=f"{checkpoint}/training_results/", eval_strategy="epoch",
                                  fp16=True, report_to='none', num_train_epochs=3,
                                 per_device_train_batch_size=32, per_device_eval_batch_size=32)
    return training_args

# Evaluating

In [64]:
def compute_metrics(eval_preds):
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Trainer 

In [65]:
def mass_train_models(checkpoins):
    for checkpoint in checkpoints:
        
        tokenizer, data_collator, train_set, eval_set = set_tokenizer(checkpoint)
        model = set_model(checkpoint)
        training_args = set_training_args(checkpoint)
    
        trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set['train'],
        eval_dataset=eval_set['train'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,)
        
        print(f"CHECKPOINT: {checkpoint}")
        trainer.train()
        


mass_train_models(checkpoints)

Map:   0%|          | 0/3911 [00:00<?, ? examples/s]

Map:   0%|          | 0/978 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


CHECKPOINT: microsoft/codebert-base


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.950118,0.667689
2,No log,0.73041,0.745399
3,No log,0.666263,0.777096


TypeError: save_metrics() missing 2 required positional arguments: 'split' and 'metrics'

In [66]:
trainer.save_metrics("all", compute_metrics)

FileNotFoundError: [Errno 2] No such file or directory: 'training_results/all_results.json'

In [16]:
device = torch.cuda.device("cuda" if torch.cuda.is_available() else "cpu")

In [64]:
def predict(inputs):
    # Tokenizing inputs
    test_sample = tokenizer(inputs, return_tensors='pt')
    inputs = Dataset.from_dict({key: value.to(model.device) for key, value in test_sample.items()})

    # Predicting & decoding inputs
    preds = trainer.predict(test_dataset=inputs)
    preds = labelEncoder.inverse_transform(y=np.ravel(np.argmax(preds.predictions, axis=-1)))
    
    return preds

In [86]:
test_sample = """
class Solution:
    def isValid(self, s: str) -> bool:
        bracketMap = {"(": ")", "[": "]", "{": "}"}
        openSet = set(["(", "[", "{"])
        stack = []
        for char in s:
            if char in openSet:
                stack.append(char)
            elif stack and char == bracketMap[stack[-1]]:
                stack.pop()
            else:
                return False
        return stack == []
        """

predict(test_sample)

array(['linear'], dtype='<U9')