In [None]:
import torch
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
import pandas as pd
from sklearn import metrics
import numpy as np
import torch.nn.functional as F
from datasets import load_metric
import wandb
import os
import evaluate
import numpy

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

1. Define the dataset class to process with the model 

In [3]:
class TrainDataset(Dataset):
  def __init__(self, filename=None, df=None, tokenizer=None):
    super().__init__()
    if filename:
      self.df = pd.read_csv(filename, sep="\t")
    else:
      self.df = df
    self.encoding = tokenizer(self.df["text"].values.tolist(), truncation=True, padding=True, return_tensors="pt")
  def __len__(self):
    return len(self.df)
  def __getitem__(self, index):
    label = 1 if self.df["label"].iloc[index] == "MOS" else 0
    return {"input_ids": self.encoding["input_ids"][index], "attention_mask" : self.encoding["attention_mask"][index], "label" : label}
  def getlabels(self):
    return np.array([1 if label == "MOS" else 0 for label in self.df["label"].values])
  def getDataFrame(self):
    return self.df

class TestDataset(Dataset):
  def __init__(self, filename=None, df=None, tokenizer=None):
    super().__init__()
    if filename:
      self.df = pd.read_csv(filename, sep="\t")
    else:
      self.df = df
    self.encoding = tokenizer(self.df["text"].values.tolist(), truncation=True, padding=True, return_tensors="pt")
  def __len__(self):
    return len(self.df)
  def __getitem__(self, index):
    return {"input_ids": self.encoding["input_ids"][index], "attention_mask" : self.encoding["attention_mask"][index]}
  def getlabels(self):
    return np.array([1 if label == "MOS" else 0 for label in self.df["label"].values])
  def getDataFrame(self):
    return self.df


2. Initialize the model and the tokenizer 

    In this case, we have the model choice as the following:

        1. bert-base-uncased (BERT)

        2. roberta-base (RoBERTa)

        3. distilgpt2 (Distil-GPT2)

    We can run the cell below multiple times and change the model_name to try different model.  

In [None]:
model_name = "bert-base-uncased"
def model_init():
    model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2)
    # model.config.pad_token_id = model.config.eos_token_id # use this if the model is GPT
    return model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token # use this if the model is GPT

train_ds = TrainDataset("train.tsv", tokenizer=tokenizer)
validate_ds = TrainDataset("dev.tsv", tokenizer=tokenizer)
test_ds = TestDataset("test.tsv", tokenizer=tokenizer)


Hyperparameter Search

    To find the best configuration for each model, we have to do the hyperparameter search. However, trying every possible value to find the best model is computationally infeasible. We decide to use random search on 20 samples to find the best parameter config for the model based on the evaluation accuracy and the loss between training and validation.

In [None]:
sweep_config = {
    'method': 'random'
}


# hyperparameters
parameters_dict = {
    'epochs': {
        'values': [1,2,3,4,5]
        },
    'batch_size': {
        'value': 8
        },
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 1e-3
    },
    'weight_decay': {
        'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
    },
}


sweep_config['parameters'] = parameters_dict

sweep_id = wandb.sweep(sweep_config, project='annotation-project-sweeps')


the metric that we will use are accuracy, recall, precision, and F1 score. Since we would like to know every aspect of the model prediction so that we can decide the one that has the best balance in all measure later. 

In [7]:
def compute_metrics_fn(eval_preds):
  metrics_data = dict()
  
  accuracy_metric = load_metric('accuracy')
  precision_metric = load_metric('precision')
  recall_metric = load_metric('recall')
  f1_metric = load_metric('f1')


  logits = eval_preds.predictions
  labels = eval_preds.label_ids
  preds = np.argmax(logits, axis=-1)  
  
  metrics_data.update(accuracy_metric.compute(predictions=preds, references=labels))
  metrics_data.update(precision_metric.compute(predictions=preds, references=labels, average='weighted'))
  metrics_data.update(recall_metric.compute(predictions=preds, references=labels, average='weighted'))
  metrics_data.update(f1_metric.compute(predictions=preds, references=labels, average='weighted'))
  return metrics_data

In [8]:
def train(config=None):
  with wandb.init(config=config):
    # set sweep configuration
    config = wandb.config


    # set training arguments
    training_args = TrainingArguments(
        output_dir='ap-sweeps',
	      report_to='wandb',  # Turn on Weights & Biases logging
        num_train_epochs=config.epochs,
        learning_rate=config.learning_rate,
        weight_decay=config.weight_decay,
        per_device_train_batch_size=config.batch_size,
        per_device_eval_batch_size=16,
        save_strategy='epoch',
        evaluation_strategy='epoch',
        logging_strategy='epoch',
        load_best_model_at_end=True,
        remove_unused_columns=False,
        fp16=True
    )


    # define training loop
    trainer = Trainer(
        # model,
        model_init=model_init,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=validate_ds,
        compute_metrics=compute_metrics_fn
    )


    # start training loop
    trainer.train()


In [None]:
os.environ["WANDB_SILENT"] = "true"
wandb.agent(sweep_id, train, count=20)

Train model with the best config from hyperparameter search

    In the config value below, we can change to the other value based on the model that we would like to train and replicate the accuracy from the hyperparameter tuning. After training, the model will be saved with the format [model_name]-lr[learning_rate]-bs[batch_size]-ep[num_epoch]-wd[weight_decay]. We can load the model with that name for inference later on.

In [None]:
config={
    "learning_rate" : 5.315e-5,
    "weight_decay" : 0.3,
    "batch_size" : 8,
    "epoch" : 3
}
config_name ="{}-lr{}-bs{}-ep{}-wd{}".format(model_name, config["learning_rate"], config["batch_size"], config["epoch"], config["weight_decay"])
training_args = TrainingArguments(output_dir="model/", 
                                 evaluation_strategy="epoch", 
                                 per_device_train_batch_size=config["batch_size"],
                                 per_device_eval_batch_size=config["batch_size"],
                                 learning_rate=config["learning_rate"],
                                 weight_decay=config["weight_decay"],
                                 num_train_epochs=config["epoch"],
                                 lr_scheduler_type="linear",
                                 warmup_ratio=0,
                                 warmup_steps=0,
                                 log_level="passive",
                                 logging_strategy="epoch",
                                #  eval_steps=50,
                                 fp16=True,
                                 run_name=config_name,
                                 report_to="wandb"
                                 
)

trainer = Trainer(model_init=model_init,
                 args=training_args,
                 train_dataset=train_ds,
                 eval_dataset=validate_ds,
                 compute_metrics=compute_metrics
                 )
                                 
trainer.train()

trainer.save_model(config_name)

Load the model

In [None]:
train_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased-lr5e-05-ep8-wd2')
trainer = Trainer(model=train_model)

Analysis on the test set

In [24]:
y_trues = test_ds.getlabels()
predictions = torch.from_numpy(trainer.predict(test_ds).predictions).float()
y_preds = torch.argmax(F.softmax(predictions), axis=-1).cpu().detach().numpy()
confusion_matrix = metrics.confusion_matrix(y_trues, y_preds, labels=[0, 1]) # NMOS, MOS
print(confusion_matrix)

***** Running Prediction *****
  Num examples = 200
  Batch size = 8


[[107  12]
 [ 14  67]]


  y_preds = torch.argmax(F.softmax(predictions), axis=-1).cpu().detach().numpy()


We now see that the confusion matrix are the following :
|   | Predicted NMOS | Predicted MOS |
|----|--------------|----------------|
| Actual NMOS | 107 | 12 |
| Actual MOS  | 14  | 67 |

Now we can try to see the number of book that is mislabeled

In [26]:
test_df = test_ds.getDataFrame()
test_df["pred"] = y_preds
test_df["pred_label"] = test_df["pred"].apply(lambda x: "MOS" if x == 1 else "NMOS")
disagree = test_df[test_df.label != test_df.pred_label]
disagree.groupby("adjudicator_id").count()["label"]

adjudicator_id
dung          3
jtopa         4
phazarika     3
ppromthaww    9
tranguyen     7
Name: label, dtype: int64

This mean that the number of mislabeled from lowest to highest is the following : 
1. The Lost World (1912) (3)
2. The Moon Maid (1926) (3)
3. A Room with a View (4)
4. Dorothy and the Wizard in Oz (7)
4. White Fangs (9) 

We can sample the mislabeled text with the following code:

In [30]:
sample = disagree.sample(10)
sample.to_csv("sample.tsv",sep="\t")

More detail in the inference notebook.