In [1]:
import pandas as pd

In [2]:
# Read the DocEE train and test CSV files into DataFrames (train first, then test).
train_df, test_df = map(
    pd.read_csv,
    ("../data/docee/train_all.csv", "../data/docee/test_all.csv"),
)

In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed so the dev/test partition below is the same on every run.
RANDOM_STATE = 18091999

# Carve the loaded test frame into two equally sized parts: a dev set and a
# held-out test set.
# NOTE(review): this rebinds `test_df`, so re-running this cell on its own
# splits the already-halved frame again; re-run the loading cell first.
dev_df, test_df = train_test_split(
    test_df,
    test_size=0.5,
    random_state=RANDOM_STATE,
)

In [4]:
from src.utils import compute_metrics
from src.data import DoceeDataset
from transformers import Trainer, TrainingArguments, IntervalStrategy, SchedulerType, RobertaTokenizerFast, RobertaForSequenceClassification

import torch
from transformers import set_seed

# --- Model / data configuration ----------------------------------------------
# Use the first GPU when one is visible, otherwise fall back to CPU so the
# cell does not crash on machines without CUDA.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
MODEL_NAME_OR_PATH = "roberta-base"
CACHE_DIR = "../pretrained_models"
NUM_LABELS = 59  # presumably the number of DocEE event types - TODO confirm against the data
DEBUG_SUBSET_SIZE = 10  # rows taken from each split for this prototype run; None = use all rows

# The classifier head is not part of the `roberta-base` checkpoint and is
# randomly initialised when the model is built, which happens before the
# Trainer applies its own seed. Seed here so the initial weights are reproducible.
set_seed(RANDOM_STATE)

model = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME_OR_PATH,
    cache_dir=CACHE_DIR,
    num_labels=NUM_LABELS
).to(DEVICE)  # TODO - figure out how this was initialized

tokenizer = RobertaTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME_OR_PATH,
    cache_dir=CACHE_DIR
)

# Only a small slice of each split is used so the whole cell runs in about a minute.
train_dataset = DoceeDataset(train_df[:DEBUG_SUBSET_SIZE], tokenizer=tokenizer)
eval_dataset = DoceeDataset(dev_df[:DEBUG_SUBSET_SIZE], tokenizer=tokenizer)

# --- Training hyper-parameters -----------------------------------------------
OUTPUT_DIR="../outputs/docee_roberta_proto"
PER_DEVICE_TRAIN_BATCH_SIZE=2
PER_DEVICE_EVAL_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=1
MAX_GRAD_NORM=1
LEARNING_RATE=2e-5
WEIGHT_DECAY=0.01
NUM_TRAIN_EPOCHS=1  # for testing only
MAX_STEPS=100  # overrides NUM_TRAIN_EPOCHS, but that's fine for now
LR_SCHEDULER_TYPE=SchedulerType.LINEAR
WARMUP_RATIO=0.06  # allegedly a good value, but sweepable
LOGGING_STRATEGY=IntervalStrategy.STEPS
LOGGING_STEPS=10  # for testing purposes only
LOGGING_NAN_INF_FILTER=False
EVAL_STEPS=10
SAVE_STRATEGY=IntervalStrategy.STEPS
# Checkpoint at every evaluation. With the library default for save_steps (500)
# no checkpoint would be written within MAX_STEPS, leaving
# load_best_model_at_end with nothing to restore.
SAVE_STEPS=EVAL_STEPS
SAVE_TOTAL_LIMIT=2
DATALOADER_NUM_WORKERS=2
RUN_NAME="DocEE Roberta Prototype"
LOAD_BEST_MODEL_AT_END=True
METRIC_FOR_BEST_MODEL="f1_macro"
GREATER_IS_BETTER=True
REPORT_TO="wandb"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy=IntervalStrategy.STEPS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    max_grad_norm=MAX_GRAD_NORM,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    max_steps=MAX_STEPS,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    logging_strategy=LOGGING_STRATEGY,
    logging_steps=LOGGING_STEPS,
    logging_nan_inf_filter=LOGGING_NAN_INF_FILTER,
    save_strategy=SAVE_STRATEGY,
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    eval_steps=EVAL_STEPS,
    dataloader_num_workers=DATALOADER_NUM_WORKERS,
    run_name=RUN_NAME,
    load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=GREATER_IS_BETTER,
    report_to=REPORT_TO,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# Last expression of the cell: the returned TrainOutput is displayed as the cell result.
trainer.train()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 2
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  _warn_prf(aver

TrainOutput(global_step=100, training_loss=2.4605980205535887, metrics={'train_runtime': 57.352, 'train_samples_per_second': 3.487, 'train_steps_per_second': 1.744, 'total_flos': 49504589792952.0, 'train_loss': 2.4605980205535887, 'epoch': 20.0})