In [13]:
#This notebook offers an example of building a multi-label text classifier on student survey responses
#Note the sample data has unique labels for each response, making this a multiclass classification problem
#However, in practice, most survey responses will be able to take more than one label, making it a multilabel classification problem
#Since the actual student data is protected by FERPA and cannot be shared, we are only using fake responses generated by ChatGPT
#we do not have enough data to train a successful model or evaluate the model, but this provides the structure of how our model was trained.

#import packages
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import numpy as np


#This is a dataset of fake student survey responses generated with ChatGPT
surveys = pd.read_excel("Topics/fake_student_survey_data.xlsx")
surveys_all = Dataset.from_pandas(surveys)

In [14]:
example = surveys_all[0]
example

{'text': '    "Create innovation hubs that support entrepreneurial ventures. Pursuing my business ideas was challenging without dedicated resources and mentorship."',
 'Academic': 0,
 'Facilities': 0,
 'Career': 1,
 'Financial': 0,
 'Diversity': 0,
 'Wellness': 0,
 'Social': 0,
 'Technology': 0,
 'Sustainability': 0,
 'Policy': 0,
 'Communication': 0,
 'Ethics': 0}

In [16]:
labels = [label for label in surveys_all.features.keys() if label not in ['text']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['Academic',
 'Facilities',
 'Career',
 'Financial',
 'Diversity',
 'Wellness',
 'Social',
 'Technology',
 'Sustainability',
 'Policy',
 'Communication',
 'Ethics']

In [5]:
model_path = "roberta-base"
#model_path = "microsoft/deberta-v3-base"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
def preprocess_data(examples):
  # get the survey responses
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [20]:
encoded_dataset = surveys_all.map(preprocess_data, batched=True, remove_columns=surveys_all.column_names)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [22]:
example = encoded_dataset[0]
print(example.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [23]:
tokenizer.decode(example['input_ids'])

'<s>    "Create innovation hubs that support entrepreneurial ventures. Pursuing my business ideas was challenging without dedicated resources and mentorship."</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [24]:
example['labels']

[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [25]:
encoded_dataset.set_format("torch")

In [26]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [28]:
from transformers import AutoModelForSequenceClassification

multi_label_model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
#Set hyperparameters and train the model
from transformers import TrainingArguments, Trainer

multi_label_training_args = TrainingArguments(
    model_path,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"
)

multi_label_trainer = Trainer(
    multi_label_model,
    multi_label_training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset, #Here we have not evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

multi_label_trainer.train()



Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.358615,0.0,0.5,0.0
2,No log,0.300925,0.0,0.5,0.0
3,No log,0.286754,0.0,0.5,0.0
4,No log,0.282374,0.0,0.5,0.0
5,No log,0.280807,0.0,0.5,0.0


TrainOutput(global_step=115, training_loss=0.35201333087423575, metrics={'train_runtime': 560.9885, 'train_samples_per_second': 0.802, 'train_steps_per_second': 0.205, 'total_flos': 29602651392000.0, 'train_loss': 0.35201333087423575, 'epoch': 5.0})