In [None]:
# !pip install "torch==2.4.1" tensorboard 
# !pip install flash-attn "setuptools<71.0.0" scikit-learn 
# !pip install --upgrade torch torchvision
# !pip install wandb
# !pip install matplotlib
# 
# !pip install  --upgrade \
#   "datasets==3.1.0" \
#   "accelerate==1.2.1" \
#   "hf-transfer==0.1.8" \
#   "transformers==4.47.1" \
#  
# # ModernBERT is not yet available in an official transformers release (as of the 4.47.1 pin above), so install it from a pinned GitHub commit
# !pip install "git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1" --upgrade

In [1]:
from datasets import load_dataset

# Read the CSV of BoolQ prompts; the code below accesses the rows through the "train" split.
raw_dataset = load_dataset(
    "csv",
    data_files="datasets/modern-bert-embeddings/boolq_embeds.csv",
)

# Quick look at what was loaded: row count first, then the split's summary.
print(f"Dataset size: {raw_dataset['train'].num_rows}")
print(raw_dataset["train"])

FileNotFoundError: Unable to find '/home/woi/code/Energy-Optimal-Inferencing/datasets/modern-bert-embeddings/boolq_embeds.csv'

In [None]:
import matplotlib.pyplot as plt

# Length (in characters, via len() on each string) of every training prompt.
text_lengths = [len(row) for row in raw_dataset["train"]["input_text"]]
if not text_lengths:
    raise ValueError("Cannot compute length statistics: the 'train' split is empty")

# Sort once and reuse the ordered list for every order statistic below.
sorted_lengths = sorted(text_lengths)
num_texts = len(sorted_lengths)
mid = num_texts // 2

max_length = sorted_lengths[-1]
min_length = sorted_lengths[0]
mean_length = sum(text_lengths) / num_texts
# True median: average the two central values when the count is even.
median_length = (
    sorted_lengths[mid]
    if num_texts % 2
    else (sorted_lengths[mid - 1] + sorted_lengths[mid]) / 2
)
# int(0.95 * n) is always < n for n >= 1, so the index is in range.
percentile_95 = sorted_lengths[int(0.95 * num_texts)]

print(f"Max length: {max_length}")
print(f"Min length: {min_length}")
print(f"Mean length: {mean_length:.2f}")
print(f"Median length: {median_length}")
print(f"95th percentile: {percentile_95}")

# Histogram of lengths with the mean and median marked for reference.
plt.hist(text_lengths, bins=50, alpha=0.75, color="blue")
plt.axvline(mean_length, color="red", linestyle="dashed", linewidth=1, label="Mean")
plt.axvline(median_length, color="green", linestyle="dashed", linewidth=1, label="Median")
plt.title("Distribution of Input Text Lengths")
plt.xlabel("Text Length")
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
from random import randrange
 
# Draw one row index uniformly at random, for spot-checking a single example.
random_id = randrange(raw_dataset["train"].num_rows)
# raw_dataset["train"][random_id]  # uncomment to display the sampled row

In [None]:
# Show the loaded dataset object as this cell's output.
raw_dataset

In [None]:
from transformers import AutoTokenizer
from datasets import Value

model_id = "answerdotai/ModernBERT-base"
# model_id = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Upper bound on sequence length; `tokenize` below pads and truncates to this value.
tokenizer.model_max_length = 1024


def tokenize(batch):
    """Tokenize a batch of examples.

    Args:
        batch: Mapping with an "input_text" entry holding the texts of the batch.

    Returns:
        The tokenizer output for those texts, padded to the tokenizer's maximum
        length and truncated where longer, as PyTorch tensors.
    """
    return tokenizer(batch['input_text'], padding='max_length', truncation=True, return_tensors="pt")


# Derive new objects instead of rebinding `raw_dataset`: the original cell overwrote it
# with a renamed-and-split version, so running the cell a second time failed (the source
# column was already renamed) or split an already-split dataset. `raw_dataset` now stays
# exactly as loaded and this cell can be re-run safely.
labeled_dataset = raw_dataset["train"].rename_column(
    "meta-llama__Llama-3.2-1B-Instruct_chosen", "labels"
)
split_dataset = labeled_dataset.train_test_split(
    test_size=0.2,
    seed=42,  # fixed seed keeps the train/test partition reproducible
)

tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=["input_text"])
# Store the class labels as int64.
tokenized_dataset = tokenized_dataset.cast_column("labels", Value("int64"))

# def remap_labels(batch):
#     batch["labels"] = batch["labels"] - 1  
#     return batch

# tokenized_dataset = tokenized_dataset.map(remap_labels)

print(tokenized_dataset["train"].features.keys())

In [None]:
# from datasets import Dataset, DatasetDict
# import numpy as np
# from sklearn.model_selection import train_test_split

# def compute_embeddings_and_labels(batch):
#     embeddings = np.array(batch["input_text_modern_bert_embed"])
#     labels = np.array(batch["chosen_model"])
    
#     if embeddings.ndim == 1:
#         embeddings = embeddings[:, np.newaxis]
    
#     return {"input_ids": embeddings, "labels": labels}

# processed_dataset = raw_dataset.map(compute_embeddings_and_labels, batched=True)

# train_data, test_data = train_test_split(processed_dataset["train"].to_pandas(), test_size=0.2, random_state=42)
# final_dataset = DatasetDict({
#     "train": Dataset.from_pandas(train_data),
#     "test": Dataset.from_pandas(test_data)
# })

# print(final_dataset)
# print(final_dataset["train"].features.keys())

In [None]:
# Sanity check: Python type of the first training example's label.
print(type(tokenized_dataset["train"][0]["labels"]))

In [None]:
# Sanity check: Python type of the first training example's token ids.
print(type(tokenized_dataset["train"][0]["input_ids"]))

In [None]:
# Sanity check: value and Python type of the first training example's label.
print(tokenized_dataset["train"][0]["labels"], type(tokenized_dataset["train"][0]["labels"]))

In [None]:
from transformers import AutoModelForSequenceClassification
 
# Number of target classes for the classification head.
num_labels = 2
# Alternative 5-class setup (labels 1..5), kept for reference; pass label2id / id2label
# to from_pretrained when enabling it:
# num_labels = 5
# label2id = {str(i): i for i in range(1, num_labels + 1)}
# id2label = {i: str(i) for i in range(1, num_labels + 1)}

# Pretrained checkpoint to fine-tune.
model_id = "answerdotai/ModernBERT-base"
# model_id = "google-bert/bert-base-uncased"

# Load the checkpoint with a sequence-classification head, then move it to the GPU.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)
model = model.to("cuda")

# Record the task type on the config: one class per example.
model.config.problem_type = "single_label_classification"

In [None]:
# Freeze the pretrained backbone so its weights receive no gradient updates.
# `base_model` resolves the backbone submodule for whichever architecture was loaded,
# whereas the hard-coded `model.model` attribute only exists on some model classes
# (e.g. it is absent for the commented-out BERT alternative).
for param in model.base_model.parameters():
    param.requires_grad = False

In [None]:
import torch.nn as nn

class MLP(nn.Module):
    """Two-layer perceptron head: Linear -> ReLU -> Linear.

    Args:
        hidden_size: Size of the last dimension of the incoming features.
        num_labels: Number of output logits.
        intermediate_size: Width of the hidden layer. Defaults to 128, the value
            that was previously hard-coded.
    """

    def __init__(self, hidden_size: int, num_labels: int, intermediate_size: int = 128):
        super().__init__()
        # Kept as a single `nn.Sequential` named `model` so parameter names
        # (`model.0.*`, `model.2.*`) stay the same as before.
        self.model = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size),
            nn.ReLU(),
            nn.Linear(intermediate_size, num_labels),
        )

    def forward(self, features):
        """Map features of shape (..., hidden_size) to logits of shape (..., num_labels)."""
        return self.model(features)

In [None]:
# Swap the model's classifier for the MLP head, sized from the model's own config
# and placed on the same device as the rest of the network.
model.classifier = MLP(model.config.hidden_size, model.config.num_labels).to(model.device)

In [None]:
# Collect the names of every parameter that will still receive gradients.
trainable_params = [
    param_name
    for param_name, param in model.named_parameters()
    if param.requires_grad
]
print("Trainable parameters:", trainable_params)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute weighted F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores with classes along axis 1, ``labels`` the true class ids.

    Returns:
        dict with keys ``"f1"`` and ``"accuracy"``, both plain Python floats.

    Side effects:
        Logs the same dict through ``wandb.log``. This relies on the notebook-level
        ``wandb`` import; presumably a run must be active when it is called.
    """
    predictions, labels = eval_pred
    # Class with the highest score for each example.
    predicted_classes = np.argmax(predictions, axis=1)

    # Always cast to float: the previous `float(f1) if f1 == 1 else f1` returned a
    # Python float only for a perfect score and the library's scalar type otherwise.
    metrics = {
        "f1": float(f1_score(labels, predicted_classes, average="weighted")),
        "accuracy": float(accuracy_score(labels, predicted_classes)),
    }

    wandb.log(metrics)

    return metrics

In [None]:
import wandb
# Authenticate with Weights & Biases before any run is started.
wandb.login()

In [None]:
# `get_token` is the supported way to read the stored Hugging Face token;
# `HfFolder.get_token()` is the deprecated accessor it replaces.
from huggingface_hub import get_token
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Local checkpoint directory.
    output_dir="modernbert-llm-router",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=5,
    bf16=True,
    optim="sgd",
    # Log training metrics every 100 steps.
    logging_strategy="steps",
    logging_steps=100,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # After training, restore the checkpoint with the best "f1" from compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="wandb",
    # Upload to the Hub on every checkpoint save, using the locally stored token.
    push_to_hub=True,
    hub_strategy="every_save",
    hub_token=get_token(),
)

# Train on the "train" split, evaluate on "test", scoring with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Start the W&B run for this experiment, recording the full TrainingArguments as its config.
wandb.init(
    project="modernbert-llm-router",
    name="experiment-1",
    config=training_args.to_dict(),
)

In [None]:
# Run fine-tuning with the Trainer configured earlier in the notebook.
trainer.train()

In [None]:
# Close the active W&B run now that training is done.
wandb.finish()

In [None]:
# Save the tokenizer into the Trainer's own output directory. Deriving the path from
# `training_args.output_dir` (instead of repeating the literal folder name) keeps the
# tokenizer next to the model files even if the output directory is renamed.
tokenizer.save_pretrained(training_args.output_dir)
# Generate the model card, then upload the run to the Hub.
trainer.create_model_card()
trainer.push_to_hub()

In [None]:
from transformers import pipeline

# Run inference on the in-memory trained model rather than reloading the saved folder.
# The trained network has a custom MLP under `classifier`; reloading through the generic
# auto class would presumably rebuild the stock single-layer head and leave the MLP
# weights unused, so predictions would come from an untrained head.
# NOTE: this cell therefore needs `trainer` and `tokenizer` from the cells above.
inference_model = trainer.model.eval()  # eval mode for deterministic inference

# "text-classification" is the canonical task name ("sentiment-analysis" is an alias of it).
classifier = pipeline("text-classification", model=inference_model, tokenizer=tokenizer, device=0)

sample = "How does the structure and function of plasmodesmata affect cell-to-cell communication and signaling in plant tissues, particularly in response to environmental stresses?"

pred = classifier(sample)
print(pred)