In [13]:
import warnings
warnings.simplefilter("ignore")

In [14]:
import comet_ml
comet_ml.init(project_name="CourtlyCrafter")

In [15]:
import pandas as pd
from datasets import Dataset

In [16]:
train = pd.read_csv("./dataset/train.csv")
val = pd.read_csv("./dataset/val.csv")
test = pd.read_csv("./dataset/test.csv")

In [17]:
merge_df = pd.concat([train, val, test])
merge_df = merge_df.reset_index(drop=True)

dataset = Dataset.from_pandas(merge_df)

In [18]:
dataset.train_test_split(test_size=0.2, shuffle=True)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 121394
    })
    test: Dataset({
        features: ['text'],
        num_rows: 30349
    })
})

In [19]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-mini")

In [20]:
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

In [21]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/151743 [00:00<?, ? examples/s]

In [22]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [26]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def get_example(index):
    return dataset[index]["text"]

def compute_metrics(pred):
    experiment = comet_ml.config.get_global_experiment()

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    precision, recall, f1 = precision_recall_fscore_support(
        labels, 
        preds, 
        average="macro"
    )

    acc = accuracy_score(labels, preds)

    if experiment:
        epoch = int(experiment.curr_epoch) if experiment.curr_epoch is not None else 0
        experiment.set_epoch(epoch)

        experiment.log_confusion_matrix(
            y_true = labels,
            y_predicted = preds,

            filename = f"confusion-matrix-epoch-{epoch}.json",
            index_to_example_function=get_example,
    )
    
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [27]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

In [28]:
model = AutoModelForCausalLM.from_pretrained("prajjwal1/bert-mini")

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


In [29]:
model = model.to("cuda")

In [30]:
%env COMET_MODE=ONLINE
%env COMET_LOG_ASSETS=TRUE

env: COMET_MODE=ONLINE
env: COMET_LOG_ASSETS=TRUE


In [31]:
training_args = TrainingArguments(
    output_dir="./output/CourtlyCrafter",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    report_to=["comet_ml"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [32]:
trainer.train()

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/kaifshaikhhhh/courtlycrafter/2f70a07c0d51402e9d49f5591b846e6b



  0%|          | 0/56904 [00:00<?, ?it/s]

IndexError: Invalid key: 122206 is out of bounds for size 0

In [None]:
experiment.end()