In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every bare expression in a cell, not only the last one.
# A later cell lists several bare expressions in a row and relies on this
# setting to display all of them.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%%HTML
<style type="text/css">
/* Force a solid black border and black text on the cells of tables that
   carry the "dataframe" class (the class used for rendered DataFrames). */
table.dataframe td,
table.dataframe th {
    border: 1px black solid !important;
    color: black !important;
}
</style>

In [3]:
import os
import pickle

import matplotlib.pyplot as plt
import torch
from datasets import load_dataset, ClassLabel
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from torch.nn.functional import cross_entropy
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
# Load the TREC class names from a pickle file; they are used further down as
# the `names` of the ClassLabel feature for the `label` column.
# NOTE(review): pickle.load can execute arbitrary code while deserializing.
# Only load this file if it comes from a trusted source.
with open("../data/TREC_classes.pkl", "rb") as file:
    TREC_classes = pickle.load(file)

In [6]:
# Glob patterns for the CSV files that make up each split.
data_files = dict(
    train='TREC_question_classification/train/*.csv',
    test='TREC_question_classification/test/*.csv',
)
TREC = load_dataset('csv', data_files=data_files)

# Re-type the `label` column as a ClassLabel built from the loaded class
# names, so the feature carries its own list of label names.
class_labels = ClassLabel(names=TREC_classes)
TREC = TREC.cast_column('label', class_labels)

# Number of target classes, read back from the label feature
# (the original author notes that this is a fifty-class task).
num_classes = len(TREC['train'].features['label'].names)

In [7]:
# Checkpoint shared by the tokenizer and the classifier.
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Load the checkpoint with a sequence-classification head sized to the number
# of classes, then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_classes,
)
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.we

In [8]:
def tokenize(batch):
    """Tokenize the `question` field of a batch.

    Args:
        batch: Mapping with a 'question' entry holding the texts to encode.

    Returns:
        The output of the module-level `tokenizer`, called with truncation
        and padding enabled.
    """
    questions = batch['question']
    return tokenizer(questions, truncation=True, padding=True)


def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing `label_ids` (reference labels) and
            `predictions` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy' and 'f1'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

In [9]:
# tokenizing the input, and setting it to tensor format
TREC_encoded = TREC.map(tokenize, batched=True, batch_size=None)
TREC_encoded.set_format('torch', columns=['label', 'input_ids', 'attention_mask'])

In [10]:
# training the model on the training data.
model_name = f'{model_ckpt}-finetuned-TREC-dataset'
batch_size = 64
logging_steps = len(TREC['train']) // batch_size
train_args = TrainingArguments(output_dir=model_name,
                               evaluation_strategy='epoch',
                               per_device_train_batch_size=batch_size,
                               per_device_eval_batch_size=batch_size,
                               learning_rate=2e-5,
                               weight_decay=0.01,
                               num_train_epochs=5,
                               log_level='error',
                               disable_tqdm=False,
                               push_to_hub=False,
                               hub_token='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'
                               )

In [11]:
# Bundle the model, training arguments, data splits, metric function and
# tokenizer into a Trainer.
# NOTE: the test split is also used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=TREC_encoded['train'],
    eval_dataset=TREC_encoded['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [12]:
# Fine-tune the model. The per-epoch table below reports the validation loss,
# accuracy and F1 measured on the evaluation split.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.208443,0.726,0.660759
2,No log,0.798866,0.83,0.807543
3,1.370300,0.64367,0.852,0.828836
4,1.370300,0.578458,0.874,0.859577
5,0.254000,0.567326,0.884,0.872305


TrainOutput(global_step=1210, training_loss=0.6970146242252067, metrics={'train_runtime': 180.2188, 'train_samples_per_second': 428.701, 'train_steps_per_second': 6.714, 'total_flos': 820255610547600.0, 'train_loss': 0.6970146242252067, 'epoch': 5.0})

In [13]:
# Run the fine-tuned model over the test split.
prediction_output = trainer.predict(TREC_encoded['test'])
# The three bare expressions below are all displayed, one output each.
prediction_output.predictions  # per-class scores, one row per example
prediction_output.label_ids    # integer reference labels
prediction_output.metrics      # dict of test loss, accuracy, F1 and timing

array([[-1.2369462 , -1.2023038 , -1.0174    , ..., -0.89801925,
         0.7977742 , -0.75918317],
       [-2.4245784 , -1.5209309 , -0.00859236, ..., -2.6680021 ,
        -1.9946532 , -2.719173  ],
       [-1.2595564 , -1.2557795 , -0.06258786, ..., -1.3581948 ,
        -2.0361462 , -1.793386  ],
       ...,
       [-2.229744  , -0.80216616, -1.1742074 , ..., -2.1869643 ,
        -1.3337531 , -2.1011848 ],
       [-2.2594566 ,  0.95140016,  3.035543  , ..., -2.9332294 ,
        -1.9684222 , -2.751257  ],
       [-1.2841662 ,  2.50552   ,  7.7424617 , ..., -2.0258007 ,
        -2.3634462 , -2.2467375 ]], dtype=float32)

array([40, 32, 28,  2, 39, 40, 29, 18,  5,  2, 32, 30, 49, 30, 39, 43, 22,
       30,  2, 39, 30,  2, 35,  2,  2, 30,  2, 35, 35, 35, 46, 34,  2, 47,
       39, 45, 35, 40, 38, 38, 18,  2,  2,  2,  6,  6, 45,  2, 38, 29,  2,
       30, 30,  2,  3,  2, 39, 32, 32, 46, 32, 30, 35, 13, 36, 28, 25,  2,
       39, 35, 43, 41, 32, 30, 35,  2, 10, 40, 49, 30,  2, 39, 30, 43, 39,
        5,  2, 30,  2, 33, 12,  2, 47, 39, 29, 35, 39,  2,  2, 17,  2, 30,
        4, 28, 32,  2, 47,  6, 39, 35,  8,  2, 40, 30, 18, 30, 39, 22, 39,
       35, 39, 40,  2, 30, 40, 30, 40, 39,  2, 45, 40, 30,  2, 19,  2, 17,
       35, 30, 31,  2,  2,  6, 35,  6,  2,  7, 30, 26,  2,  2, 25, 30,  2,
       39, 29, 35, 32,  2, 35, 36,  2,  2, 39, 35, 25, 35, 39,  2, 36, 30,
       39, 46, 39,  2, 35, 36, 35, 39,  8, 40, 35,  2, 35,  2, 22, 30, 30,
        2, 35, 22, 17, 40, 45, 39,  2, 32, 10, 30, 19, 22, 19, 32, 30,  6,
       43, 35, 30, 47,  3,  3, 39, 46, 36, 35,  2, 15,  2, 25, 38,  2,  2,
       39,  2, 17, 34, 22

{'test_loss': 0.5673259496688843,
 'test_accuracy': 0.884,
 'test_f1': 0.8723049773203582,
 'test_runtime': 0.2331,
 'test_samples_per_second': 2145.38,
 'test_steps_per_second': 34.326}

In [14]:
# cm = confusion_matrix(prediction_output.label_ids, 
#                  prediction_output.predictions.argmax(-1),
#                  normalize='true')
# fig, ax = plt.subplots(figsize=(12, 15))
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# disp.plot(cmap='Blues', ax=ax, values_format='.2f', colorbar=False)
# plt.show()

In [15]:
# error analysis
def forward_pass_with_label(batch):
    inputs = {k:v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
    outputs = model(**inputs)
    pred_labels = outputs.logits.argmax(-1)
    loss = cross_entropy(outputs.logits, batch['label'].to(device), reduction='none')
    return {'predicted_labels':pred_labels, 
           'loss':loss}

In [16]:
# Add the predicted label and the per-example loss to every test row.
TREC_encoded['test'] = TREC_encoded['test'].map(
    forward_pass_with_label,
    batched=True,
    batch_size=None,
)
# Switch the output format to pandas for the error analysis that follows.
TREC_encoded.set_format('pandas')

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [37]:
def integer2string(row):
    """Translate an integer class id into its label name.

    Args:
        row: Value passed through to `int2str` of the training split's
            `label` feature.

    Returns:
        Whatever `int2str` returns for `row`.
    """
    label_feature = TREC_encoded['train'].features['label']
    return label_feature.int2str(row)

In [40]:
# Build the error-analysis table: one row per test question with the
# reference label, the predicted label and the loss, hardest examples first.
cols = ['question', 'label', 'predicted_labels', 'loss']
df_error = (
    TREC_encoded['test'][:][cols]
    # assign() returns a new frame, so the label columns are replaced without
    # writing into a column-subset slice (which raises SettingWithCopyWarning).
    .assign(
        label=lambda frame: frame['label'].apply(integer2string),
        predicted_labels=lambda frame: frame['predicted_labels'].apply(integer2string),
    )
    # Highest loss first: these are the examples the model gets most wrong.
    .sort_values(by=['loss'], ascending=False)
)
df_error.to_csv('../data/TREC_error_analysis.csv', index=False, index_label=False)