In [1]:
import os
from tqdm import tqdm
import numpy as np
from pathlib import Path
import argparse
import logging
import torch
from torch.utils.data import DataLoader
#import wandb
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (Trainer, TrainingArguments, DataCollatorWithPadding,
                          AutoTokenizer, AutoModelForSequenceClassification)

In [7]:
# --- Experiment configuration -------------------------------------------------
# Hugging Face checkpoint name, used for both the tokenizer and the classifier.
model_name_or_path = "microsoft/mpnet-base"
# Path of the saved DatasetDict that is loaded with DatasetDict.load_from_disk.
dataset_path = '../../bin/multirc_dataset.hf'
# NOTE(review): output_dir, eval_steps, eval_accumulation_steps,
# save_total_limit, metric, entity and project_name are not referenced by any
# cell visible in this notebook (the TrainingArguments call uses its own
# literals for the output directory and checkpoint limit) — confirm whether
# they are still needed.
output_dir = 'results/hp-tuning'
# Passed to the tokenizer when it is created.
model_max_length = 512
eval_steps = 1000
eval_accumulation_steps = 2
save_total_limit = 4
# Used as both the per-device train and eval batch size.
batch_size = 32
num_epochs = 8
learning_rate = 3e-05
# Forwarded to TrainingArguments(seed=...).
seed = 42
metric = 'accuracy'
entity = 'ai-aloe'
project_name = 'short answer scoring'

# Class-index <-> label-name mappings handed to the classifier config in model_init().
id2label = {0: "incorrect_answer", 1: "correct_answer"}
label2id = {"incorrect_answer": 0, "correct_answer": 1}

# `model_max_length` is the tokenizer-init argument that sets the truncation /
# 'max_length' padding limit; a bare `max_length=` keyword is not consumed by
# the tokenizer constructor and therefore had no effect.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    model_max_length=model_max_length,
)

def model_init():
    """Build a fresh two-class sequence classifier from `model_name_or_path`.

    Passed to `Trainer(model_init=...)`, which calls it to obtain the model.
    The label mappings `id2label` / `label2id` are forwarded to the model
    configuration.
    """
    classifier_kwargs = {
        "num_labels": 2,
        "id2label": id2label,
        "label2id": label2id,
    }
    return AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path, **classifier_kwargs
    )

In [9]:
import os

def preprocess_function(example):
    """Tokenize the "text" field of one example or of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length. No padding is
    applied here: padding a single example to "the longest in the batch" is a
    no-op, and batches are padded later by the data collator.

    Args:
        example: a mapping with a "text" entry (a string, or a list of strings
            when called by `map(..., batched=True)`).

    Returns:
        The tokenizer output (e.g. `input_ids`, `attention_mask`).
    """
    return tokenizer(example["text"], truncation=True)


ds = DatasetDict.load_from_disk(dataset_path)
# batched=True hands the tokenizer many rows per call instead of one at a time.
ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/19170 [00:00<?, ? examples/s]

Map:   0%|          | 0/4080 [00:00<?, ? examples/s]

Map:   0%|          | 0/3962 [00:00<?, ? examples/s]

In [10]:
ds

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 19170
    })
    valid: Dataset({
        features: ['index', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 4080
    })
    test: Dataset({
        features: ['index', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3962
    })
})

In [11]:
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for a `(predictions, labels)` evaluation pair.

    Args:
        eval_pred: a pair whose first item holds per-class scores (one row per
            example) and whose second item holds the reference label ids.

    Returns:
        Whatever `accuracy.compute` returns for the arg-max class ids.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_ids)

Using the latest cached version of the module from /home/jovyan/project-archive/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Wed May 15 22:14:12 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


In [12]:
# Dynamic padding: each batch is padded to its own longest sequence (the
# collator default) instead of always to the tokenizer's maximum length, which
# avoids running the model over long runs of pad tokens.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
import os
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best validation accuracy when training finishes.
training_args = TrainingArguments(
    output_dir='./results/mpnet_checkpoints',
    optim='adamw_torch',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    learning_rate=learning_rate,
    logging_dir='./logs/content',
    save_total_limit=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    greater_is_better=True,
    seed=seed,
    log_level='error',
    disable_tqdm=False,
    # Replaces the deprecated WANDB_DISABLED environment variable: turns off
    # all logging integrations, as recommended by the deprecation warning.
    report_to='none',
)

# A single Trainer is built, with its full configuration. (A bare
# Trainer(model_init=model_init) was previously created first and immediately
# overwritten, which only cost an extra model load.)
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds['train'],
    eval_dataset=ds['valid'],
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train the model
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
import time

# Time a full prediction pass over the test split.
# perf_counter() is monotonic, so it is the appropriate clock for durations.
start_time = time.perf_counter()
# The third item is named `test_metrics` (not `metrics`) so it cannot clash
# with the `sklearn.metrics` module imported under that name in a later cell.
preds, labels, test_metrics = trainer.predict(ds['test'])
predictions = np.argmax(preds, axis=1)
elapsed_seconds = time.perf_counter() - start_time
print(elapsed_seconds)

In [None]:
from sklearn import metrics
from matplotlib import pyplot as plt

# Confusion matrix of the reference labels against the predicted class ids.
confusion_matrix = metrics.confusion_matrix(labels, predictions)

# Tick labels, in display order.
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=['incorrect', 'correct'],
)
cm_display.plot()

plt.title('Predicted and True Classifications of Correct and Incorrect Answers (MPnet)')
plt.show()

In [None]:
# Text report comparing `predictions` against `labels`.
report = metrics.classification_report(labels, predictions)
print(report)

In [10]:
# Save the fine-tuned model and, next to it, the tokenizer: the Trainer was
# not given the tokenizer, so save_model() alone would leave the directory
# without the tokenizer files needed to reload it as a self-contained checkpoint.
model_dir = "../bin/mpnet_classifier"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir);

In [16]:
# Import the function explicitly: a bare `import sklearn` does not load the
# `sklearn.metrics` submodule, so `sklearn.metrics.…` only worked when some
# earlier cell had already imported it.
from sklearn.metrics import cohen_kappa_score

# Agreement between reference labels and predictions.
cohen_kappa_score(labels, predictions)

0.8378378378378378

## TESTING

In [24]:
import torch
import gc
from time import perf_counter
from sklearn import metrics
from transformers import AutoModelForSequenceClassification

dataset_path = '../../bin/multirc_dataset.hf'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Reload and tokenize the dataset with the `preprocess_function` defined in the
# preprocessing cell above (it is no longer redefined here).
ds = DatasetDict.load_from_disk(dataset_path)
ds = ds.map(preprocess_function, batched=False)

# Published checkpoint under test.
model = AutoModelForSequenceClassification.from_pretrained("wesleymorris/short-answer-classification").to(device)
model.eval()  # make inference mode explicit (disables dropout)

preds = []
times = []
for text in ds['test']['text']:
    # Each timing covers tokenization + forward pass for a single example.
    start_time = perf_counter()
    # Truncate to the model's maximum length, as during training; without it
    # an over-long text would be fed to the model untruncated.
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=model_max_length,
        return_tensors='pt',
    ).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Arg-max over the class dimension of the single-row batch.
    predicted_class_id = logits.argmax(dim=-1).item()
    preds.append(model.config.id2label[predicted_class_id])
    times.append(perf_counter() - start_time)

df = ds['test'].to_pandas()
df['preds'] = preds
df['times'] = times

# Reference labels as strings, so they are comparable with `preds`.
labels = [
    'correct_answer' if x == 1 else 'incorrect_answer'
    for x in ds['test']['labels']
]

print(metrics.classification_report(labels, preds))

cuda


Map:   0%|          | 0/19170 [00:00<?, ? examples/s]

Map:   0%|          | 0/4080 [00:00<?, ? examples/s]

Map:   0%|          | 0/3962 [00:00<?, ? examples/s]

                  precision    recall  f1-score   support

  correct_answer       0.80      0.76      0.78      1722
incorrect_answer       0.82      0.85      0.84      2240

        accuracy                           0.81      3962
       macro avg       0.81      0.81      0.81      3962
    weighted avg       0.81      0.81      0.81      3962



In [17]:

# df.to_csv('mpnet-results.csv')

In [18]:
# Map the integer test labels to their string names (1 -> correct, otherwise incorrect).
labels = [
    'correct_answer' if label_id == 1 else 'incorrect_answer'
    for label_id in ds['test']['labels']
]

from sklearn import metrics
from matplotlib import pyplot as plt

# Text report comparing the string predictions `preds` against `labels`.
report = metrics.classification_report(labels, preds)
print(report)

                  precision    recall  f1-score   support

  correct_answer       0.80      0.76      0.78      1722
incorrect_answer       0.82      0.85      0.84      2240

        accuracy                           0.81      3962
       macro avg       0.81      0.81      0.81      3962
    weighted avg       0.81      0.81      0.81      3962



In [19]:
from huggingface_hub import notebook_login
# Renders the Hugging Face Hub login widget (the VBox output of this cell).
# Presumably required before the push_to_hub calls in the next cell — confirm.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
# Upload the model and its tokenizer to the same Hub repository.
# NOTE(review): when the notebook is run top to bottom, the only module-level
# `model` is the one downloaded from the Hub in the testing cell — confirm
# this is the object intended for upload.
hub_repo = "short-answer-classification"
model.push_to_hub(hub_repo)
tokenizer.push_to_hub(hub_repo)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/wesleymorris/short-answer-classification/commit/7307cc1eba077afbdfc9be2e2559910e403996bc', commit_message='Upload tokenizer', commit_description='', oid='7307cc1eba077afbdfc9be2e2559910e403996bc', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
from datasets import load_dataset
import os

# Fetch the stress-test dataset and convert its 'train' split to a DataFrame.
stresstest = load_dataset("tiedaar/question_scoring_stresstest")
dataset = stresstest['train'].to_pandas()
dataset

Unnamed: 0,subsection_num,source,question,answer,mpnet_response,bleurt_response,correct_response


In [4]:
# Display `dataset`.
# NOTE(review): the saved output of this cell is a DatasetDict, while the
# preceding cell binds `dataset` to a pandas DataFrame; the execution counts
# (4 here vs 14 there) suggest this output is stale — re-run to confirm.
dataset

DatasetDict({
    train: Dataset({
        features: ['ABC', 'Hello World', '2023-05-10 04:53:08.014230'],
        num_rows: 202
    })
})