In [None]:
import sys
import os

# Resolve the project's src folder relative to the current working directory
# (the notebook is expected to be started from a sibling folder of src).
src_path = os.path.abspath(os.path.join(os.getcwd(), "../src"))

# Register src on sys.path only once: re-running this cell in the same kernel
# must not pile up duplicate entries.
if src_path not in sys.path:
    sys.path.append(src_path)
    print(f"Added {src_path} to sys.path")
else:
    print(f"{src_path} is already on sys.path")

In [None]:
from comet_ml import start
from dotenv import load_dotenv
from datasets import load_from_disk

# Read environment settings from the project-level settings.env file.
load_dotenv(dotenv_path='./../settings.env')

In [None]:
import comet_ml

# Comet project that this notebook logs in against.
COMET_PROJECT_NAME = "multiclass-text-classification"

comet_ml.login(project_name=COMET_PROJECT_NAME)

In [None]:
# Open a Comet experiment only for the duration of the artifact download.
experiment = start()
try:
    artifact = experiment.get_artifact("news_dataset_hugging_face")
    artifact.download("./../data/processed/")
finally:
    # Always close the experiment, even when fetching or downloading fails,
    # so that no half-finished experiment is left running in the kernel.
    experiment.end()

In [None]:
# On-disk location of the saved dataset (under the same ./../data/processed/
# folder that the artifact download cell writes to).
dataset_path = "./../data/processed/news_dataset_hugging_face/"

# Load the saved dataset object back from disk.
tokenized_dataset_dict = load_from_disk(dataset_path=dataset_path)

In [None]:
from config import config
# Fixed seed shared by the dataset shuffles and the training arguments below.
SEED = 42

# Checkpoint identifier taken from the project config module.
pre_trained_bert_model = config.pre_trained_bert_model

In [None]:
# Class names listed in class-id order; enumerate() assigns ids 0..3.
id2label = dict(enumerate([
    'Business',
    'Science & Technology',
    'Entertainment',
    'Health',
]))

# Reverse lookup: class name -> class id.
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Display the loaded dataset object to inspect its splits before sub-sampling.
tokenized_dataset_dict

In [None]:
# Number of examples kept per split for this quick experiment.
SUBSET_SIZE = 200


def _sample_split(split_name, size=SUBSET_SIZE):
    """Return a seeded random subset of one split of ``tokenized_dataset_dict``.

    Args:
        split_name: Key of the split to sample ("train", "validation", "test").
        size: Maximum number of rows to keep.

    Returns:
        The shuffled split truncated to at most ``size`` rows. The cap on the
        split length avoids an out-of-range selection when the split holds
        fewer than ``size`` rows.
    """
    shuffled = tokenized_dataset_dict[split_name].shuffle(seed=SEED)
    return shuffled.select(range(min(size, len(shuffled))))


train_dataset = _sample_split("train")
validation_dataset = _sample_split("validation")
test_dataset = _sample_split("test")

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Tokenizer matching the configured pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pre_trained_bert_model)

# Collator built around that tokenizer; handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification, set_seed

# Any weights missing from the checkpoint (typically the new classification
# head) are randomly initialised at load time, i.e. before the Trainer applies
# its own seed. Seed here so repeated runs start from the same weights.
set_seed(SEED)

# Derive the class count from the label mapping and store both mappings in the
# model config, so predictions and the saved model carry the real class names.
model = AutoModelForSequenceClassification.from_pretrained(
    pre_trained_bert_model,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Display the sub-sampled training split to check its columns and row count.
train_dataset

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def get_example(index):
    """Return the ``title_prepared`` field of the validation row at ``index``.

    Passed to the confusion-matrix logging call as its
    ``index_to_example_function``.
    """
    row = validation_dataset[index]
    return row["title_prepared"]


def compute_metrics(pred):
    """
    Compute macro-averaged classification metrics and log a confusion matrix.

    Args:
        pred: Evaluation output exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is used as the predicted class).

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall``, where the
        last three are macro-averaged over the classes.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # zero_division=0 yields the same score as the default for a class that is
    # never predicted, but without a warning on every evaluation step.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # Logging is best-effort: skip it when no Comet experiment is running.
    experiment = comet_ml.get_running_experiment()
    if experiment:
        epoch = int(experiment.curr_epoch) if experiment.curr_epoch is not None else 0
        experiment.set_epoch(epoch)
        experiment.log_confusion_matrix(
            y_true=labels,
            y_predicted=preds,
            file_name=f"confusion-matrix-epoch-{epoch}.json",
            # Class names in class-id order, taken from the shared mapping so
            # the matrix cannot drift from the labels used by the model.
            labels=[id2label[class_id] for class_id in sorted(id2label)],
            index_to_example_function=get_example,
        )

    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [None]:
from transformers import AutoTokenizer, Trainer, TrainingArguments

In [None]:
%env COMET_MODE=ONLINE
%env COMET_LOG_ASSETS=TRUE

training_args = TrainingArguments(
    seed=SEED,
    output_dir="./../models",
    overwrite_output_dir=True,
    num_train_epochs=1,
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_total_limit=10,
    save_steps=25,
    per_device_train_batch_size=8,
    report_to=["comet_ml"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)
trainer.train()

In [None]:
# Close the experiment opened for training, if there is one. The lookup can
# come back empty (e.g. when this cell is re-run), so guard before calling end().
running_experiment = comet_ml.get_running_experiment()
if running_experiment is not None:
    running_experiment.end()

In [None]:
# Upload the trained model through the Trainer.
# NOTE(review): TrainingArguments above sets no hub_model_id and the Trainer is
# not given the tokenizer; confirm the target repository name and whether the
# tokenizer files need to be pushed separately.
trainer.push_to_hub()