# DistilBERT and HuggingFace

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EvalPrediction
)
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

2025-04-23 09:23:18.738206: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745421800.174257  174647 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745421800.829619  174647 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745421805.327568  174647 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745421805.327598  174647 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745421805.327601  174647 computation_placer.cc:177] computation placer alr

In [2]:
# Read the excerpt corpus and attach an integer class id for every era label.
# (Alternative source kept from the original notebook:
#  pd.read_parquet("uniform_excerpts_2.parquet"))
label_encoder = LabelEncoder()

# `assign` builds a new frame instead of mutating the freshly loaded one;
# the encoder is fitted on the "label" column as a side effect of the call.
df = pd.read_csv("uniform_excerpts_2.csv").assign(
    label_id=lambda frame: label_encoder.fit_transform(frame["label"])
)
df.head()

Unnamed: 0,text,text_number,label,label_id
0,"seen that my intention was, to shew them that ...",15469,1700s,3
1,"the spectacle. Mrs. Charmond did not see them,...",482,1800s,4
2,King and the ladies of the Court rode out to t...,467,1600s,2
3,"assembly, he had published a letter, in which ...",1346,1800s,4
4,"some moan. '""Lo, all these trophies of affecti...",1137,1500s,1


In [3]:
# (rows, columns) of the full frame, including the label_id column added above.
df.shape

(5949, 4)

In [4]:
# Stratified 80/10/10 split: hold out 20% of the rows first, then cut that
# hold-out in half to obtain the validation and test frames. Both calls
# stratify on label_id and reuse the same random_state.
train_df, val_test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label_id"],
)
val_df, test_df = train_test_split(
    val_test_df,
    test_size=0.5,
    random_state=42,
    stratify=val_test_df["label_id"],
)

In [5]:
# (rows, columns) of the training frame, i.e. what remains after the 20% hold-out.
train_df.shape

(4759, 4)

In [6]:
# (rows, columns) of the validation frame, one half of the 20% hold-out.
val_df.shape

(595, 4)

In [7]:
# (rows, columns) of the test frame, the other half of the 20% hold-out.
test_df.shape

(595, 4)

In [8]:
# Wrap each split in a Hugging Face Dataset, keeping only the text and the
# integer label (renamed to "labels").
# NOTE: tokenize_dataset() performs this same conversion internally, and the
# call to it further down rebinds these three names to the tokenized datasets.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(
        frame[["text", "label_id"]].rename(columns={"label_id": "labels"})
    )
    for frame in (train_df, val_df, test_df)
)

In [9]:
def tokenize_dataset(model_name, train_df, val_df, test_df, max_length=256, padding="max_length"):
    """Tokenize the train/validation/test frames for a DistilBERT classifier.

    Args:
        model_name: Checkpoint name or path passed to
            ``DistilBertTokenizerFast.from_pretrained``.
        train_df: pandas DataFrame with a ``text`` column and a ``label_id``
            column.
        val_df: Validation frame with the same two columns.
        test_df: Test frame with the same two columns.
        max_length: Token limit per example; longer texts are truncated.
        padding: Padding strategy forwarded to the tokenizer. The default
            ``"max_length"`` pads every example to ``max_length``. Pass
            ``False`` to leave examples unpadded when a padding collator
            (such as ``DataCollatorWithPadding``) pads each batch instead.

    Returns:
        A tuple ``(tokenizer, train_dataset, val_dataset, test_dataset)``.
        Each dataset is formatted to yield torch tensors for ``input_ids``,
        ``attention_mask`` and ``labels``.

    Raises:
        KeyError: If a frame is missing the ``text`` or ``label_id`` column.
    """
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
    required_columns = ["text", "label_id"]

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding=padding,
            max_length=max_length,
        )

    def prepare(frame):
        missing = [col for col in required_columns if col not in frame.columns]
        if missing:
            raise KeyError(f"DataFrame is missing required column(s): {missing}")

        renamed = frame[required_columns].rename(columns={"label_id": "labels"})
        # The frames arrive with a shuffled, non-default index after
        # train_test_split; preserve_index=False stops from_pandas from storing
        # that index as an extra "__index_level_0__" column.
        dataset = Dataset.from_pandas(renamed, preserve_index=False)
        dataset = dataset.map(tokenize_function, batched=True)
        dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
        return dataset

    return tokenizer, prepare(train_df), prepare(val_df), prepare(test_df)

In [10]:
def train_model(
    model_name,
    label_encoder,
    train_dataset,
    val_dataset,
    tokenizer,
    *,
    output_dir="./era_model",
    learning_rate=1e-4,
    batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    seed=42,
):
    """Fine-tune a DistilBERT sequence classifier and return the trained Trainer.

    Args:
        model_name: Checkpoint name or path passed to
            ``DistilBertForSequenceClassification.from_pretrained``.
        label_encoder: Fitted encoder; ``len(label_encoder.classes_)`` sets the
            number of output labels.
        train_dataset: Tokenized dataset used for training.
        val_dataset: Tokenized dataset evaluated at the end of every epoch.
        tokenizer: Tokenizer used for batch padding and handed to the Trainer.
        output_dir: Directory where checkpoints are written.
        learning_rate: Optimizer learning rate.
        batch_size: Per-device batch size for both training and evaluation.
        num_train_epochs: Number of training epochs.
        weight_decay: Weight decay passed to ``TrainingArguments``.
        seed: Random seed applied before the model is created and passed to
            ``TrainingArguments``.

    Returns:
        The ``Trainer`` instance after ``trainer.train()`` has completed. With
        ``load_best_model_at_end=True`` it holds the checkpoint with the best
        validation accuracy.
    """
    # Function-scope imports keep this cell self-contained.
    import inspect
    from transformers import set_seed

    # The classification head is freshly initialised by from_pretrained, so the
    # RNG must be seeded *before* the model is built for runs to be repeatable.
    set_seed(seed)
    model = DistilBertForSequenceClassification.from_pretrained(
        model_name, num_labels=len(label_encoder.classes_)
    )

    # Newer transformers releases spell this argument `eval_strategy`; older
    # ones only know `evaluation_strategy`. Use whichever is supported.
    training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = (
        "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        seed=seed,
        **{eval_strategy_key: "epoch"},
    )

    def compute_metrics(p: EvalPrediction):
        """Return validation accuracy from the argmax of the logits."""
        preds = np.argmax(p.predictions, axis=1)
        return {"accuracy": accuracy_score(p.label_ids, preds)}

    # `tokenizer=` on Trainer is deprecated in favour of `processing_class=`;
    # pick the keyword the installed version accepts to avoid the warning.
    trainer_arg_names = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )

    trainer.train()
    return trainer

In [11]:
# Checkpoint id shared by the tokenizer and the classifier; both are loaded
# with from_pretrained(model_name).
model_name = 'distilbert-base-uncased'

In [12]:
# Tokenize all three splits with the same tokenizer; this rebinds
# train_dataset / val_dataset / test_dataset to the tokenized versions.
tokenizer, train_dataset, val_dataset, test_dataset = tokenize_dataset(
    model_name=model_name,
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
)

Map:   0%|          | 0/4759 [00:00<?, ? examples/s]

Map:   0%|          | 0/595 [00:00<?, ? examples/s]

Map:   0%|          | 0/595 [00:00<?, ? examples/s]

In [13]:
# Fine-tune the classifier. train_model() calls trainer.train() before it
# returns, so this cell blocks until training has finished.
trainer = train_model(
    model_name=model_name,
    label_encoder=label_encoder,  # supplies the number of classes
    train_dataset=train_dataset,
    val_dataset=val_dataset,  # evaluated after every epoch
    tokenizer=tokenizer
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9601,0.750984,0.689076
2,0.4892,0.653614,0.759664
3,0.2235,0.890275,0.739496
4,0.0794,1.033699,0.751261


In [14]:
# Score the held-out test split. Because training used
# load_best_model_at_end=True, the metrics come from the best checkpoint.
results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", results)

Test Results: {'eval_loss': 0.7034649848937988, 'eval_accuracy': 0.773109243697479, 'eval_runtime': 18.2304, 'eval_samples_per_second': 32.638, 'eval_steps_per_second': 2.084, 'epoch': 4.0}


In [15]:
# Per-class precision / recall / F1 on the test split.
predictions = trainer.predict(test_dataset)
true_labels = predictions.label_ids
# Logits -> predicted class id (index of the largest logit in each row).
pred_labels = np.argmax(predictions.predictions, axis=1)
print(
    classification_report(
        true_labels,
        pred_labels,
        target_names=label_encoder.classes_,
    )
)

              precision    recall  f1-score   support

       1400s       0.85      0.97      0.91        95
       1500s       0.82      0.93      0.87       100
       1600s       0.71      0.58      0.64       100
       1700s       0.59      0.58      0.59       100
       1800s       0.72      0.66      0.69       100
       1900s       0.92      0.93      0.93       100

    accuracy                           0.77       595
   macro avg       0.77      0.77      0.77       595
weighted avg       0.77      0.77      0.77       595

