In [17]:
from datasets import DatasetDict, Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction

import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [2]:
#TODO
# Fix the tensors at the bottom
# Finish fine tuning

In [2]:
dataset_dict = load_dataset("csv", data_files="data/gutenberg/uniform_excerpts_2.csv")

In [3]:
df = pd.read_csv("data/gutenberg/uniform_excerpts_2.csv")

In [4]:
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label"])
df.head()


Unnamed: 0,text,text_number,label,label_id
0,"seen that my intention was, to shew them that ...",15469,1700s,3
1,"the spectacle. Mrs. Charmond did not see them,...",482,1800s,4
2,King and the ladies of the Court rode out to t...,467,1600s,2
3,"assembly, he had published a letter, in which ...",1346,1800s,4
4,"some moan. '""Lo, all these trophies of affecti...",1137,1500s,1


In [5]:
train_df, val_test_df = train_test_split(df, test_size=0.15, stratify=df["label_id"], random_state=104, shuffle=True)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, stratify=val_test_df["label_id"], random_state=104, shuffle=True)


In [6]:
train_dataset = Dataset.from_pandas(train_df[["text", "label_id"]].rename(columns={"label_id": "labels"}))
eval_dataset = Dataset.from_pandas(val_df[["text", "label_id"]].rename(columns={"label_id": "labels"}))
test_dataset = Dataset.from_pandas(test_df[["text", "label_id"]].rename(columns={"label_id": "labels"}))

In [7]:
#define pre-trained model path
model_path = "google-bert/bert-base-uncased"

#load model tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_path)

def tokenize_dataset(model_name, train_df, val_df, test_df):
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    def tokenize_function(examples):
        return tokenizer(
            examples["text"], truncation=True, padding="max_length", max_length=256
        )

    def prepare(dataset):
        dataset = Dataset.from_pandas(dataset[["text", "label_id"]].rename(columns={"label_id": "labels"}))
        dataset = dataset.map(tokenize_function, batched=True)
        dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
        return dataset

    return tokenizer, prepare(train_df), prepare(val_df), prepare(test_df)

In [11]:
#hyperparamaters
lr = 1e-4
batch_size = 16
num_epochs = 4

def train_model(model_path, label_encoder, train_dataset, eval_dataset, tokenizer):
    training_args = TrainingArguments(
        output_dir="bert-english-classifier_teacher",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy"
    )


    def compute_metrics(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return {"accuracy": accuracy_score(p.label_ids, preds)}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics
    )


    trainer.train()
    return trainer

In [12]:
#load model with binary classifcation heaed
#id2label = {0: "1400", 1: "1500", 2: "1600", 3: "1700", 4: "1800", 5: "1900"}
# label2id = {1400: "0", 1500: "1", 1600: "2", 1700: "3", 1800: "4", 1900: "5"}
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = len(label_encoder.classes_))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
'''#freeze all base model parameters
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

#unfreeze base model pooling layers
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True'''

'#freeze all base model parameters\nfor name, param in model.base_model.named_parameters():\n    param.requires_grad = False\n\n#unfreeze base model pooling layers\nfor name, param in model.base_model.named_parameters():\n    if "pooler" in name:\n        param.requires_grad = True'

In [121]:
#define text preprocessing
'''def preprocess_function(examples):
    #return tokenized text with truncation and padding
    return tokenizer(examples["text"], truncation=True, padding=True)'''

#preprocess all datasets
'''train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)'''


'train_dataset = train_dataset.map(preprocess_function, batched=True)\neval_dataset = eval_dataset.map(preprocess_function, batched=True)'

In [14]:
tokenizer, train_dataset, val_dataset, test_dataset = tokenize_dataset(model_path, train_df, val_df, test_df)


Map: 100%|██████████| 5056/5056 [00:00<00:00, 7951.36 examples/s]
Map: 100%|██████████| 446/446 [00:00<00:00, 9530.08 examples/s]
Map: 100%|██████████| 447/447 [00:00<00:00, 8597.64 examples/s]


In [None]:
#load metrics
'''accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")'''

'''def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}'''

'''def compute_metrics(eval_pred):
    #get predictions
    predictions, labels = eval_pred

    #softmax to get probabilities
    probabilities = np.exp(predictions)/ np.exp(predictions).sum(-1, keepdims=True)
    positive_class_probs = probabilities[:, 1]

    #compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)["roc_auc"], 3)

    #predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)

    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)["accuracy"], 3)

    return {"Accuracy": acc, "AUC": auc}'''

Map: 100%|██████████| 5056/5056 [00:00<00:00, 8227.79 examples/s]
Map: 100%|██████████| 446/446 [00:00<00:00, 8477.14 examples/s]
Map: 100%|██████████| 447/447 [00:00<00:00, 9645.70 examples/s]


'def compute_metrics(eval_pred):\n    #get predictions\n    predictions, labels = eval_pred\n\n    #softmax to get probabilities\n    probabilities = np.exp(predictions)/ np.exp(predictions).sum(-1, keepdims=True)\n    positive_class_probs = probabilities[:, 1]\n\n    #compute auc\n    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)["roc_auc"], 3)\n\n    #predict most probable class\n    predicted_classes = np.argmax(predictions, axis=1)\n\n    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)["accuracy"], 3)\n\n    return {"Accuracy": acc, "AUC": auc}'

In [None]:

'''#hyperparameters
lr = 1e-4
batch_size = 16
num_epochs = 4

training_args = TrainingArguments(
    output_dir = "bert-english-classifier_teacher",
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epochs,
    logging_strategy = "epoch",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)'''

In [None]:
'''#TODO: When running train_dataset and eval_dataset, there is a tensor error where the trainer cant create a tensor. From what I gather, this might be
# issue pertaining to the label in the uniform_excerpts dataset which isn't a flat integer.

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()'''
trainer = train_model(
    model_path=model_path,
    label_encoder=label_encoder,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [99]:
results = trainer.evaluate(test_dataset)
print("Test Results:", results)
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids
print(classification_report(true_labels, pred_labels, target_names=label_encoder.classes_))



ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['labels']