In [62]:
# cell 1: basic imports
import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction
)
import torch


In [38]:
import pandas as pd

In [39]:
# 3a. Collect every distinct label bucket found in the CSV, in sorted order
df = pd.read_csv('../data/uniform_excerpts_2.csv')
unique_labels = sorted(df['label'].unique())
# e.g. ['1400s','1500s',...,'2000s']

# Forward map (bucket -> integer id) and its inverse (integer id -> bucket);
# ids are simply each bucket's position in the sorted list.
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))


In [None]:
# Requires label2id / id2label from the label-discovery cell above.

# Read the CSV into a Hugging Face dataset; the loader files a lone CSV under
# the 'train' split, which is requested directly here.
dataset = load_dataset(
    'csv',
    data_files='../data/uniform_excerpts_2.csv',
    split='train',
)

# Encode & clean up columns
def encode_label(example):
    """Attach the integer class id for one row.

    Looks up the row's string ``'label'`` in the module-level ``label2id``
    mapping and stores the result under ``'label_id'``. The row is modified
    in place and returned. Raises ``KeyError`` if the label is not a key of
    ``label2id``.
    """
    bucket = example['label']
    example['label_id'] = label2id[bucket]
    return example

# Add the integer ids, drop the original string column, and rename the id
# column to 'labels'.
dataset = (
    dataset
      .map(encode_label, batched=False)
      .remove_columns('label')
      .rename_column('label_id','labels')
)

# Hold out 10% of the rows for evaluation. The shuffle is seeded so that the
# same rows land in train/test on every run; without a seed each kernel
# restart evaluates on a different subset and scores are not comparable.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Tokenizer & Model
# Pretrained checkpoint shared by the tokenizer and the classifier.
MODEL_CHECKPOINT = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# The classification head is sized to the number of label buckets, and both
# label maps are handed to the model loader alongside it.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

# Freeze the pretrained encoder so that only the classification head receives
# gradient updates. `base_model` resolves to the checkpoint's backbone
# (`model.bert` for BERT checkpoints), so this keeps working if
# MODEL_CHECKPOINT is switched to a different architecture.
# Remove this loop to fine-tune the whole network instead.
for param in model.base_model.parameters():
    param.requires_grad = False

# Tokenize
def preprocess(examples):
    """Tokenize a batch of rows.

    Passes the batch's ``'text'`` column to the module-level ``tokenizer``,
    truncating and padding every sequence to exactly 512 tokens, and returns
    whatever the tokenizer returns.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=512,
    )

# Tokenize both splits in batches.
tokenized = dataset.map(function=preprocess, batched=True)

# NOTE(review): `metric` is not referenced by compute_metrics in the visible
# cells (accuracy comes from sklearn there) — confirm whether it is still needed.
metric = evaluate.load("accuracy")

def compute_metrics(p: EvalPrediction):
    """Compute accuracy for one evaluation pass.

    Takes the arg-max over axis 1 of ``p.predictions`` as the predicted class
    id, compares against ``p.label_ids`` with ``accuracy_score``, and returns
    the result as ``{"accuracy": <score>}``.
    """
    scores, gold_ids = p.predictions, p.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold_ids, predicted_ids)}
# Trainer
# Evaluation, checkpointing and logging all run once per epoch, and the
# checkpoint with the best "accuracy" is reloaded when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy`
# (to `eval_strategy`) and Trainer's `tokenizer` argument (to
# `processing_class`) — confirm against the installed version.
training_args = TrainingArguments(
    # outputs
    output_dir='bert-century-classifier',
    logging_dir="./logs",
    report_to="none",
    # schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # per-epoch cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # checkpoint selection
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # run-mode flags
    do_eval=True,
    do_predict=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    compute_metrics=compute_metrics,
)

# Train! Runs the training loop configured by `training_args`
# (3 epochs, with evaluation/saving/logging set to "epoch").
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate the trained model with the Trainer's configured eval dataset
# (tokenized['test']) and keep the returned metrics dict.
results = trainer.evaluate()

In [23]:
# Show the evaluation metrics.
# NOTE(review): the output recorded below has no accuracy entry even though
# compute_metrics returns one, and this cell's execution count is lower than
# that of the earlier cells — it looks like output from a previous run.
# Re-run after training to refresh it.
print(results)

{'eval_loss': 1.6077361106872559, 'eval_runtime': 50.3754, 'eval_samples_per_second': 11.811, 'eval_steps_per_second': 1.489, 'epoch': 3.0}
