In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import numpy as np
from datasets import Dataset, DatasetDict
import torch
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os

model_id = "answerdotai/ModernBERT-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)

sns.set_theme(
    font="Liberation Serif",
    rc={
        "figure.figsize": (7.5, 3.75),
        "font.size": 11,
        "figure.dpi": 300,
    },
)

seed = 42

## Dataset

In [2]:
# load it
ellipse_df = pd.read_csv("data/ellipse_corpus_w_splits_prompts_utf8.csv")

# set up text so it includes assignment
ellipse_df["text"] = (
    ellipse_df["full_text"] + tokenizer.sep_token + ellipse_df["assignment"]
)

ellipse_df_clean = (
    ellipse_df[["text", "Cohesion", "split"]]
    .rename(
        columns={
            "full_text": "text",
            "Cohesion": "label",
        }
    )
    .assign(label=lambda df: df.label.astype("float"))
    .reset_index(drop=True)
)

ellipse_dd = DatasetDict(
    {
        "train": Dataset.from_pandas(
            ellipse_df_clean[ellipse_df_clean["split"] == "training"].reset_index(
                drop=True
            )
        ),
        # Use a subsample to validate the notebook:
        # "train": Dataset.from_pandas(
        #     asap_df_clean[asap_df_clean["pubpriv"] == "0"]
        #     .reset_index(drop=True)
        #     .sample(200)
        # ),
        "dev": Dataset.from_pandas(
            ellipse_df_clean[ellipse_df_clean["split"] == "validation"].reset_index(
                drop=True
            )
        ),
        "test": Dataset.from_pandas(
            ellipse_df_clean[ellipse_df_clean["split"] == "testing"].reset_index(
                drop=True
            )
        ),
    }
)


def tokenize_inputs(example):
    return tokenizer(example["text"])


# tokenize text
ellipse_dd_tokenized = ellipse_dd.map(
    tokenize_inputs, batched=True, remove_columns=["text"]
)

ellipse_dd_tokenized

Map:   0%|          | 0/3732 [00:00<?, ? examples/s]

Map:   0%|          | 0/829 [00:00<?, ? examples/s]

Map:   0%|          | 0/1921 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'split', 'input_ids', 'attention_mask'],
        num_rows: 3732
    })
    dev: Dataset({
        features: ['label', 'split', 'input_ids', 'attention_mask'],
        num_rows: 829
    })
    test: Dataset({
        features: ['label', 'split', 'input_ids', 'attention_mask'],
        num_rows: 1921
    })
})

In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import (
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)
from sklearn.metrics import mean_squared_error


def model_init(trial):
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)
    return model


def compute_metrics(eval_pred):
    preds, labels = eval_pred
    mse = mean_squared_error(labels, preds)

    return {"mse": mse}

In [4]:
# This is the function we will be using for HP optimization
# Customize the trainer
score_to_predict = "cohesion"

training_args = TrainingArguments(
    output_dir="../bin",
    optim="adamw_torch",  # Specify your optimizer
    logging_dir=f"../logs/{score_to_predict}",
    load_best_model_at_end=False,
    metric_for_best_model="mse",  # We will be using mean squared error to evaluate model performance
    # evaluation_strategy='epoch', # Evaluate model performance at the end of each epoch
    save_strategy="no",  # I prefer to perform a training run separately once the best parameters are discovered.
    warmup_steps=500,
    greater_is_better=False,
    log_level="error",
    # disable_tqdm = False,
    # report_to='wandb',
    # The hyper parameters we are tuning (umber of epochs, learning rate, and batch size) are called in from the configuration dictionary
    num_train_epochs=1,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
)

# Initialize the trainer
trainer = Trainer(
    model=None,  # this is to emphasize that we are not passing the model directly
    args=training_args,
    train_dataset=ellipse_dd_tokenized["train"].select(range(10)),
    eval_dataset=ellipse_dd_tokenized["dev"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    model_init=model_init,  # we pass a function that initializes the model afresh at the start of each trial
)


# Start training loop
trainer.train()

  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avo

Step,Training Loss


TrainOutput(global_step=1, training_loss=0.0, metrics={'train_runtime': 77.5661, 'train_samples_per_second': 0.129, 'train_steps_per_second': 0.013, 'total_flos': 11153877843600.0, 'train_loss': 0.0, 'epoch': 1.0})

Process ForkProcess-7:
Process ForkProcess-10:
=(true | false)
Process ForkProcess-11:
Process ForkProcess-13:
Process ForkProcess-8:
Process ForkProcess-9:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Process ForkProcess-5:
Process ForkProcess-12:
Process ForkProcess-4:
Process ForkProcess-3:
Process ForkProcess-2:
Process ForkProcess-26:
Process ForkProcess-17:
Process ForkProcess-14:
Process ForkProcess-28:
Process ForkProcess-24:
Process ForkProcess-6:
Process ForkProcess-25:
Process ForkProcess-15:
Process ForkProcess-23:
Process ForkProcess-22:
Process ForkProcess-31:
Process ForkProcess-18:
Process ForkProcess-27:
Process ForkProcess-29:
Process ForkProcess-16:
Process ForkProcess-1:
Process ForkProcess-30:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback