In [1]:
# cell 1: basic imports
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

In [3]:
# 3a. Discover all unique buckets
df = pd.read_csv('data/uniform_excerpts_2.csv')
unique_labels = sorted(df['label'].unique())
# e.g. ['1400s','1500s',...,'2000s']
label2id = {lab: i for i, lab in enumerate(unique_labels)}
id2label = {i: lab for lab, i in label2id.items()}


In [5]:
# … your label2id / id2label setup …

# Load CSV
dataset = load_dataset('csv', data_files='data/uniform_excerpts_2.csv')['train']

# Encode & clean up columns
def encode_label(example):
    example['label_id'] = label2id[example['label']]
    return example

dataset = (
    dataset
      .map(encode_label, batched=False)
      .remove_columns('label')
      .rename_column('label_id','label')
)

dataset = dataset.train_test_split(test_size=0.1)

# Tokenizer & Model
MODEL_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# (Optionally freeze BERT body)
for p in model.bert.parameters():
    p.requires_grad = False

# Tokenize
def preprocess(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

tokenized = dataset.map(preprocess, batched=True)

# Trainer
training_args = TrainingArguments(
    output_dir='bert-century-classifier',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    eval_strategy='epoch',
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    tokenizer=tokenizer,
)

# Train!
trainer.train()


Generating train split: 5949 examples [00:00, 112683.30 examples/s]
Map: 100%|██████████| 5949/5949 [00:00<00:00, 44031.83 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 5354/5354 [00:00<00:00, 6049.13 examples/s]
Map: 100%|██████████| 595/595 [00:00<00:00, 6672.88 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.7596,1.686007
2,1.6886,1.635299
3,1.6311,1.618957




TrainOutput(global_step=2010, training_loss=1.6810452817091301, metrics={'train_runtime': 2974.8626, 'train_samples_per_second': 5.399, 'train_steps_per_second': 0.676, 'total_flos': 4226241548611584.0, 'train_loss': 1.6810452817091301, 'epoch': 3.0})