In [8]:
import datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer,Trainer,TrainingArguments,DataCollatorWithPadding
import pandas 
import torch
from evaluate import load
# Single source of truth for the checkpoint: the model and the tokenizer are
# both loaded from this identifier so they can never drift apart.
modelName = "huggingface/CodeBERTa-small-v1"

# Pretrained encoder with a 4-way sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(modelName, num_labels=4)
tokenizer = AutoTokenizer.from_pretrained(modelName)

# Collator that pads the examples of a batch with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

# Optional experiment: flip to True to freeze the encoder and train only the
# classification head. Left off, every parameter is fine-tuned.
freezeEncoder = False
if freezeEncoder:
    for param in model.base_model.parameters():
        param.requires_grad = False
    for param in model.classifier.parameters():
        param.requires_grad = True

print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable parameters: 83453956


In [2]:

def _read_parquet_split(path):
    """Load one parquet file and return the "train" split it is exposed under."""
    return datasets.load_dataset("parquet", data_files=path)["train"]


# Train / test / validation each live in their own parquet file.
trainds = _read_parquet_split("task_c/task_c_training_set_1.parquet")
testds = _read_parquet_split("task_c/task_c_test_set_sample.parquet")
valds = _read_parquet_split("task_c/task_c_validation_set.parquet")


In [3]:
from collections import Counter

# Rows per label in the training split; shown as the cell's output.
labelCounts = Counter(trainds['label'])
labelCounts


Counter({1: 210471, 0: 485483, 2: 85520, 3: 118526})

In [4]:

# Balance the training split by downsampling every class to the size of the
# rarest one. Both the set of labels and the per-class sample count are derived
# from the data instead of being hard-coded, so the cell keeps working (and
# keeps balancing correctly) if the training file changes.
labelValues = sorted(set(trainds["label"]))

# One sub-dataset per label. `input_columns` hands only the label value to the
# predicate, so the other columns are not materialised for every row. The
# default argument pins the label at definition time rather than relying on the
# loop variable.
perClass = [
    trainds.filter(
        lambda label, wanted=labelValue: label == wanted,
        input_columns=["label"],
    )
    for labelValue in labelValues
]

# Size of the smallest class = number of rows kept from every class.
samplesPerClass = min(len(classDs) for classDs in perClass)

# Shuffle inside each class (fixed seed) before truncating so the kept rows are
# a random sample rather than the first rows of the file.
collection = [
    classDs.shuffle(seed=36).select(range(samplesPerClass))
    for classDs in perClass
]

downsampledDS = datasets.concatenate_datasets(collection)

# Concatenation leaves the classes in contiguous runs; shuffle once more so
# they are interleaved. NOTE: `trainds` now refers to the balanced subset.
trainds = downsampledDS.shuffle(seed=42)


In [7]:
def tokenFunction(row):
    """Tokenize one dataset row and attach its label.

    The text given to the tokenizer is the row's ``code``, ``generator`` and
    ``language`` fields joined by the literal `` </s> `` marker. Sequences are
    truncated to at most 256 tokens but are intentionally NOT padded here:
    the ``DataCollatorWithPadding`` passed to the Trainer pads each batch, so
    padding every row to 256 up front only inflates the stored dataset and the
    per-batch compute.

    Args:
        row: A single example providing ``code``, ``generator``, ``language``
            and ``label``.

    Returns:
        The tokenizer output for the joined text with an extra ``labels``
        entry copied from ``row["label"]``.
    """
    # NOTE(review): `generator` is fed to the model as an input feature —
    # confirm it is not a proxy for the label and is available at inference.
    # NOTE(review): `code` comes first, so for long snippets truncation
    # presumably cuts off the trailing generator/language fields — confirm
    # this is intended.
    text = f"{row['code']} </s> {row['generator']} </s> {row['language']}"
    tokens = tokenizer(text, truncation=True, max_length=256)
    tokens["labels"] = row["label"]
    return tokens
# Run the same per-row tokenization over the train, test and validation splits
# (in that order).
tokenizedtrain, tokenizedtest, tokenizedval = (
    split.map(tokenFunction) for split in (trainds, testds, valds)
)


In [9]:
# Hyper-parameters for the fine-tuning run; checkpoints and logs are written
# under ./results/SemEvalC.
trainingArgs = TrainingArguments(
    output_dir="./results/SemEvalC",
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Mixed precision only when a CUDA device is present, so the notebook still
    # runs (in full precision) on a CPU-only machine instead of failing while
    # the arguments are validated.
    fp16=torch.cuda.is_available(),
    num_train_epochs=4,
    learning_rate=1e-5,
    per_device_eval_batch_size=8,
    logging_steps=10000,
)

# Metric objects from the `evaluate` library, loaded once and reused by the
# compute-metrics callback on every evaluation pass.
f1_func, acc_func, recall_func = (
    load(metricName) for metricName in ("f1", "accuracy", "recall")
)
def f1_clac(preds):
    """Compute accuracy, macro recall and macro F1 for an evaluation pass.

    Args:
        preds: Object exposing ``predictions`` (per-class scores; the argmax
            over the last axis is taken as the predicted class) and
            ``label_ids`` (reference labels).

    Returns:
        Dict with the keys ``accuracy``, ``recall`` and ``f1``.
    """
    scores = preds.predictions
    gold = preds.label_ids
    predicted = scores.argmax(axis=-1)

    # Arguments shared by all three metric calls.
    shared = {"predictions": predicted, "references": gold}

    accuracy = acc_func.compute(**shared)['accuracy']
    recall = recall_func.compute(average="macro", **shared)['recall']
    f1 = f1_func.compute(average="macro", **shared)['f1']

    return {"accuracy": accuracy, "recall": recall, "f1": f1}

# Wire model, arguments, data, metrics callback and collator into a Trainer.
# The tokenizer is passed as `processing_class`, the replacement for the
# deprecated `tokenizer=` keyword of `Trainer.__init__`.
# NOTE(review): requires a transformers release that accepts
# `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=trainingArgs,
    processing_class=tokenizer,
    compute_metrics=f1_clac,
    train_dataset=tokenizedtrain,
    eval_dataset=tokenizedval,
    data_collator=data_collator,
)
# Fine-tune, then score the test split. The metrics come back under
# "eval_"-prefixed keys, which is what the report below reads.
trainer.train()
test_metrics = trainer.evaluate(tokenizedtest)

# Store the model and the tokenizer in the same directory.
finalModelDir = "./results/SemEvalC/final_model"
trainer.save_model(finalModelDir)
tokenizer.save_pretrained(finalModelDir)

print(f"Test Loss: {test_metrics['eval_loss']:.4f}, F1: {test_metrics['eval_f1']:.4f}")

  trainer =Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Recall,F1
1,0.3779,0.272468,0.893255,0.857476,0.837018
2,0.3293,0.290983,0.897105,0.8691,0.845562
3,0.2877,0.343864,0.89733,0.868062,0.845225
4,0.2489,0.394302,0.89921,0.869472,0.847187


Test Loss: 0.3744, F1: 0.8474


In [10]:
import json

# Export a second copy of the model and tokenizer under a separate directory.
fullModelDir = "./results/SemEvalC/final_small_model_full"
trainer.save_model(fullModelDir)
tokenizer.save_pretrained(fullModelDir)

# Keep the test metrics next to the exported weights.
with open("./results/SemEvalC/final_small_model_full/metrics.json", "w") as metricsFile:
    json.dump(test_metrics, metricsFile, indent=4)
