In [1]:
# train_vanilla.py
import inspect
import os, json, random
from dataclasses import dataclass, asdict
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# # =======================
# # USER CONFIG (edit this)
# # =======================
# @dataclass
# class Config:
#     data: str = "/workspaces/artist_predicition-using-BERT/ubernew/data/lyrics_dataset.json"          # JSON/JSONL with fields: text, label
#     model: str = "roberta-base"                              # e.g. "roberta-base", "distilbert-base-uncased"
#     out: str = "results/roberta-base.vanilla"                # fresh output dir
#     batch: int = 16
#     epochs: int = 4
#     lr: float = 5e-5
#     warmup_ratio: float = 0.06
#     weight_decay: float = 0.01
#     seed: int = 42
#     val_ratio: float = 0.10                                  # from the remaining train split
#     test_ratio: float = 0.10                                 # carved from full dataset
#     max_length: int = 256
#     logging_steps: int = 50
#     num_workers: int = 2
#     report_to_tb: bool = False                               # True → logs to TensorBoard

# CONFIG = Config()


In [3]:

@dataclass
class Config:
    data: str = "/workspaces/artist_predicition-using-BERT/ubernew/data/lyrics_dataset.json"          # JSON/JSONL with fields: text, label
    model: str = "roberta-base"
    out: str = "results/roberta-base.vanilla"
    batch: int = 16
    epochs: int = 4
    lr: float = 5e-5
    warmup_ratio: float = 0.06
    weight_decay: float = 0.01
    seed: int = 42
    val_ratio: float = 0.10
    test_ratio: float = 0.10
    max_length: int = 256
    logging_steps: int = 50
    num_workers: int = 2
    report_to_tb: bool = False

CONFIG = Config()


In [None]:

def seed_everything(seed: int = 42):
    random.seed(seed); np.random.seed(seed)

def compute_metrics(eval_pred):
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=-1)
    acc = accuracy_score(y_true, y_pred)
    p, r, f1_w, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted", zero_division=0)
    _, _, f1_m, _ = precision_recall_fscore_support(y_true, y_pred, average="macro", zero_division=0)
    return {"accuracy": acc, "f1_weighted": f1_w, "f1_macro": f1_m, "precision_w": p, "recall_w": r}

def main(cfg: Config):
    seed_everything(cfg.seed)

    # 1) Load dataset and encode labels -> ClassLabel (required for stratify)
    raw = load_dataset("json", data_files=cfg.data, split="train")
    raw = raw.class_encode_column("label")  # turns string labels into ClassLabel-encoded ints

    # 2) Split with stratification on the encoded "label" column
    ds = raw.train_test_split(test_size=cfg.test_ratio, seed=cfg.seed, stratify_by_column="label")
    tmp = ds["train"].train_test_split(test_size=cfg.val_ratio, seed=cfg.seed, stratify_by_column="label")
    dds = DatasetDict(train=tmp["train"], validation=tmp["test"], test=ds["test"])

    # 3) Get label names from feature metadata (avoids manual drift)
    label_names = dds["train"].features["label"].names
    id2label = {i: name for i, name in enumerate(label_names)}
    label2id = {name: i for i, name in enumerate(label_names)}

    # 4) Tokenizer & encode
    tok = AutoTokenizer.from_pretrained(cfg.model, use_fast=True)

    def preprocess(batch):
        enc = tok(batch["text"], truncation=True, max_length=cfg.max_length)
        # batch["label"] is already a list of ints from ClassLabel
        enc["labels"] = batch["label"]
        return enc

    dds = dds.map(preprocess, batched=True, remove_columns=dds["train"].column_names)
    collator = DataCollatorWithPadding(tokenizer=tok)

    # 5) Fresh model (no resume)
    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model, num_labels=len(label_names), id2label=id2label, label2id=label2id
    )

        # 6) Training args — force a clean run
    def make_training_args(cfg):
        base_kwargs = dict(
            output_dir=cfg.out,
            overwrite_output_dir=True,
            learning_rate=cfg.lr,
            per_device_train_batch_size=cfg.batch,
            per_device_eval_batch_size=cfg.batch,
            num_train_epochs=cfg.epochs,
            weight_decay=cfg.weight_decay,
            logging_steps=cfg.logging_steps,
            seed=cfg.seed,
            dataloader_num_workers=cfg.num_workers,
            warmup_ratio=cfg.warmup_ratio,
            save_total_limit=2,
            report_to=(["none"] if getattr(cfg, "report_to_tb", False) is False else ["tensorboard"]),
        )
        sig = inspect.signature(TrainingArguments.__init__)
        allowed = set(sig.parameters.keys())
        safe_kwargs = {k: v for k, v in base_kwargs.items() if k in allowed}
        return TrainingArguments(**safe_kwargs)

    # ✅ actually create the args
    training_args = make_training_args(cfg)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dds["train"],
        eval_dataset=dds["validation"],
        tokenizer=tok,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # 7) Evaluate + persist artifacts
    os.makedirs(cfg.out, exist_ok=True)
    metrics_val = trainer.evaluate(dds["validation"])
    metrics_test = trainer.evaluate(dds["test"])
    with open(os.path.join(cfg.out, "metrics_val.json"), "w") as f:
        json.dump(metrics_val, f, indent=2)
    with open(os.path.join(cfg.out, "metrics_test.json"), "w") as f:
        json.dump(metrics_test, f, indent=2)
    with open(os.path.join(cfg.out, "labels.json"), "w") as f:
        json.dump({"id2label": id2label, "label2id": label2id}, f, indent=2)
    with open(os.path.join(cfg.out, "run_config.json"), "w") as f:
        json.dump(asdict(cfg), f, indent=2)

    print("Saved:", cfg.out)
    print("Val:", metrics_val)
    print("Test:", metrics_test)


: 

In [None]:

if __name__ == "__main__":
    main(CONFIG)


Map: 100%|██████████| 129/129 [00:00<00:00, 1577.33 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(tr

Step,Training Loss
50,2.4178


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
