In [1]:
import pandas as pd
import numpy as np
import datasets
import torch
import os
from datasets import load_dataset, load_from_disk, concatenate_datasets
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification
from transformers import Trainer
from sklearn.model_selection import train_test_split
import evaluate

In [2]:
dataset = load_dataset('../data/ReviewPrediction', data_files={'train': 'train_binary.csv', 'test': 'test_binary.csv', 'validation': 'validation_binary.csv'})
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', '_id', 'review', 'score', 'upvotes', 'downvotes', 'sum'],
        num_rows: 6826
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', '_id', 'review', 'score', 'upvotes', 'downvotes', 'sum'],
        num_rows: 1897
    })
    validation: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', '_id', 'review', 'score', 'upvotes', 'downvotes', 'sum'],
        num_rows: 759
    })
})

In [3]:
checkpoint = 'bert-base-german-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example['review'], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset["train"][0]

{'Unnamed: 0.1': 1893,
 'Unnamed: 0': 1893,
 '_id': '5c34e1c593ac7c001ca22f47',
 'review': 'Sehr gute Vorlesung - sehr unfaire Prüfung....',
 'score': 3,
 'upvotes': 11.0,
 'downvotes': 2.0,
 'sum': 1,
 'input_ids': [3,
  19386,
  4493,
  15428,
  27,
  26935,
  1120,
  174,
  8716,
  942,
  4185,
  26914,
  26914,
  26914,
  26914,
  4],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [4]:
tokenized_dataset = tokenized_dataset.remove_columns(["upvotes", "downvotes", "score", "Unnamed: 0", '_id', 'review'])
tokenized_dataset = tokenized_dataset.rename_column("sum", "labels")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6826
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1897
    })
    validation: Dataset({
        features: ['Unnamed: 0.1', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 759
    })
})

In [5]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
torch.cuda.is_available()

True

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Hyperparameter optimization

In [7]:
from sklearn.metrics import classification_report, accuracy_score
from torch import nn
import optuna
from transformers import EarlyStoppingCallback

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    results = {}
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    print("############      Accuracy       ##############")
    print(acc)
    results.update({'accuracy':acc})
    return results

class CustomLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.35, 0.65], device=model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def objective(trial: optuna.Trial):
    model,
    training_args = TrainingArguments(
        output_dir='./hp_optimization',
        learning_rate=trial.suggest_float('learning_rate', low=1e-6, high=0.01),
        #weight_decay=trial.suggest_float('weight_decay', 4e-5, 0.01),
        num_train_epochs=3,
        auto_find_batch_size=True,
        #per_device_train_batch_size=trial.suggest_categorical("per_device_train_batch_size", [2, 4, 8, 16]),
        #per_device_eval_batch_size=trial.suggest_categorical("per_device_eval_batch_size", [2, 4, 8, 16]),
        evaluation_strategy="epoch",
        save_strategy="epoch",
        #load_best_model_at_end=True,
        #per_device_train_batch_size=8,
        #per_device_eval_batch_size=8,
        disable_tqdm=True,
        )

    trainer = CustomLossTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['validation'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        #callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        compute_metrics=compute_metrics,
    )

    result = trainer.train()
    return result.training_loss

In [8]:
#----------------------------------------------------------------------------------------------------
#                    CREATE OPTUNA STUDY
#----------------------------------------------------------------------------------------------------

print('Triggering Optuna study')
study = optuna.create_study(study_name='hp-search-electra', direction='minimize')
study.optimize(func=objective, n_trials=20)

[I 2023-10-19 11:57:26,104] A new study created in memory with name: hp-search-electra


Triggering Optuna study


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 3.7953, 'learning_rate': 0.00932065478665541, 'epoch': 0.15}
{'loss': 2.2478, 'learning_rate': 0.00884213261874005, 'epoch': 0.29}
{'loss': 2.7576, 'learning_rate': 0.008363610450824689, 'epoch': 0.44}
{'loss': 2.6335, 'learning_rate': 0.007885088282909326, 'epoch': 0.59}
{'loss': 3.2224, 'learning_rate': 0.007406566114993966, 'epoch': 0.73}
{'loss': 3.0855, 'learning_rate': 0.006928043947078604, 'epoch': 0.88}
############      Accuracy       ##############
{'accuracy': 0.3425559947299078}
{'eval_loss': 2.093769073486328, 'eval_accuracy': {'accuracy': 0.3425559947299078}, 'eval_runtime': 11.9509, 'eval_samples_per_second': 63.51, 'eval_steps_per_second': 7.949, 'epoch': 1.0}
{'loss': 2.8603, 'learning_rate': 0.006449521779163243, 'epoch': 1.03}
{'loss': 2.3825, 'learning_rate': 0.005970999611247881, 'epoch': 1.17}
{'loss': 2.7312, 'learning_rate': 0.005492477443332519, 'epoch': 1.32}
{'loss': 2.3674, 'learning_rate': 0.005013955275417158, 'epoch': 1.46}
{'loss': 2.3168, 'lear

[I 2023-10-19 12:18:52,570] Trial 0 finished with value: 2.2721330709948253 and parameters: {'learning_rate': 0.009799176954570773}. Best is trial 0 with value: 2.2721330709948253.


{'train_runtime': 1233.9902, 'train_samples_per_second': 16.595, 'train_steps_per_second': 8.297, 'train_loss': 2.2721330709948253, 'epoch': 3.0}
{'loss': 1.2526, 'learning_rate': 0.00042385381855871665, 'epoch': 0.15}
{'loss': 1.1327, 'learning_rate': 0.0004020931748294469, 'epoch': 0.29}
{'loss': 1.0902, 'learning_rate': 0.0003803325311001771, 'epoch': 0.44}
{'loss': 1.1201, 'learning_rate': 0.0003585718873709073, 'epoch': 0.59}
{'loss': 1.1038, 'learning_rate': 0.0003368112436416376, 'epoch': 0.73}
{'loss': 1.1221, 'learning_rate': 0.00031505059991236776, 'epoch': 0.88}
############      Accuracy       ##############
{'accuracy': 0.6574440052700923}
{'eval_loss': 0.7058058977127075, 'eval_accuracy': {'accuracy': 0.6574440052700923}, 'eval_runtime': 11.948, 'eval_samples_per_second': 63.525, 'eval_steps_per_second': 7.951, 'epoch': 1.0}
{'loss': 1.0017, 'learning_rate': 0.00029328995618309805, 'epoch': 1.03}
{'loss': 1.1307, 'learning_rate': 0.00027152931245382823, 'epoch': 1.17}
{'l

[I 2023-10-19 12:40:08,709] Trial 1 finished with value: 1.0484886791617833 and parameters: {'learning_rate': 0.0004456144622879864}. Best is trial 1 with value: 1.0484886791617833.


{'train_runtime': 1226.1313, 'train_samples_per_second': 16.701, 'train_steps_per_second': 8.351, 'train_loss': 1.0484886791617833, 'epoch': 3.0}
{'loss': 1.5521, 'learning_rate': 0.002730651317132612, 'epoch': 0.15}
{'loss': 1.6969, 'learning_rate': 0.002590459751410638, 'epoch': 0.29}
{'loss': 1.6828, 'learning_rate': 0.0024502681856886635, 'epoch': 0.44}
{'loss': 1.678, 'learning_rate': 0.0023100766199666894, 'epoch': 0.59}
{'loss': 1.4225, 'learning_rate': 0.0021698850542447154, 'epoch': 0.73}
{'loss': 1.5851, 'learning_rate': 0.0020296934885227413, 'epoch': 0.88}
############      Accuracy       ##############
{'accuracy': 0.6574440052700923}
{'eval_loss': 1.1877193450927734, 'eval_accuracy': {'accuracy': 0.6574440052700923}, 'eval_runtime': 11.973, 'eval_samples_per_second': 63.393, 'eval_steps_per_second': 7.935, 'epoch': 1.0}
{'loss': 1.4175, 'learning_rate': 0.0018895019228007675, 'epoch': 1.03}
{'loss': 1.3755, 'learning_rate': 0.0017493103570787932, 'epoch': 1.17}
{'loss': 1

[I 2023-10-19 13:01:30,307] Trial 2 finished with value: 1.3229899949912902 and parameters: {'learning_rate': 0.002870842882854586}. Best is trial 1 with value: 1.0484886791617833.


{'train_runtime': 1231.4177, 'train_samples_per_second': 16.63, 'train_steps_per_second': 8.315, 'train_loss': 1.3229899949912902, 'epoch': 3.0}
{'loss': 1.3011, 'learning_rate': 0.0018826206069186442, 'epoch': 0.15}
{'loss': 1.3247, 'learning_rate': 0.0017859669152193607, 'epoch': 0.29}
{'loss': 1.3689, 'learning_rate': 0.0016893132235200772, 'epoch': 0.44}
{'loss': 1.3606, 'learning_rate': 0.0015926595318207935, 'epoch': 0.59}
{'loss': 1.1644, 'learning_rate': 0.0014960058401215102, 'epoch': 0.73}
{'loss': 1.3458, 'learning_rate': 0.0013993521484222267, 'epoch': 0.88}
############      Accuracy       ##############
{'accuracy': 0.6574440052700923}
{'eval_loss': 0.8736157417297363, 'eval_accuracy': {'accuracy': 0.6574440052700923}, 'eval_runtime': 12.0034, 'eval_samples_per_second': 63.232, 'eval_steps_per_second': 7.914, 'epoch': 1.0}
{'loss': 1.1068, 'learning_rate': 0.0013026984567229432, 'epoch': 1.03}
{'loss': 1.1736, 'learning_rate': 0.0012060447650236597, 'epoch': 1.17}
{'loss'

[W 2023-10-19 13:22:05,048] Trial 3 failed with parameters: {'learning_rate': 0.0019792742986179277} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\jorge\anaconda3\envs\pytorch2\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\jorge\AppData\Local\Temp\ipykernel_18880\1227325410.py", line 58, in objective
    result = trainer.train()
  File "C:\Users\jorge\anaconda3\envs\pytorch2\lib\site-packages\transformers\trainer.py", line 1556, in train
    return inner_training_loop(
  File "C:\Users\jorge\anaconda3\envs\pytorch2\lib\site-packages\accelerate\utils\memory.py", line 136, in decorator
    return function(batch_size, *args, **kwargs)
  File "C:\Users\jorge\anaconda3\envs\pytorch2\lib\site-packages\transformers\trainer.py", line 1816, in _inner_training_loop
    for step, inputs in enumerate(epoch_iterator):
  File "C:\Users\jorge\anaconda3\envs\pytorch2\li

## Weighted loss

In [7]:
from sklearn.metrics import classification_report, accuracy_score
from torch import nn


class CustomLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.35, 0.65], device=model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
    
def compute_metrics(eval_preds):
    results = {}
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    class_report= classification_report(y_pred=predictions, y_true=labels)
    acc = accuracy_score(y_pred=predictions, y_true=labels)
    print("############ Classification report ############")
    print(class_report)
    print("############      Accuracy       ##############")
    print(acc)
    results.update({'classification report' : class_report})
    results.update({'accuracy':acc})
    return results

training_args = TrainingArguments(
    'test-trainer', 
    auto_find_batch_size=True,
    #gradient_accumulation_steps=4,
    evaluation_strategy="epoch", 
    num_train_epochs=3,
    #per_device_train_batch_size=4,  
    #per_device_eval_batch_size=1,
    #eval_accumulation_steps=1,
    learning_rate=1e-5,
    #save_strategy="epoch",
    #load_best_model_at_end=True
)

trainer = CustomLossTrainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Classification report,Accuracy
1,0.7458,0.738326,precision recall f1-score support  0 0.70 0.83 0.76 499  1 0.50 0.32 0.39 260  accuracy 0.66 759  macro avg 0.60 0.58 0.58 759 weighted avg 0.63 0.66 0.64 759,0.657444


############ Classification report ############
              precision    recall  f1-score   support

           0       0.70      0.83      0.76       499
           1       0.50      0.32      0.39       260

    accuracy                           0.66       759
   macro avg       0.60      0.58      0.58       759
weighted avg       0.63      0.66      0.64       759

############      Accuracy       ##############
0.6574440052700923


In [9]:
predictions = trainer.predict(tokenized_dataset["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

preds = np.argmax(predictions.predictions, axis=-1)

############ Classification report ############
              precision    recall  f1-score   support

           0       0.69      0.80      0.74      1247
           1       0.45      0.31      0.37       650

    accuracy                           0.63      1897
   macro avg       0.57      0.56      0.55      1897
weighted avg       0.61      0.63      0.61      1897

############      Accuracy       ##############
0.6336320506062203
(1897, 2) (1897,)


In [27]:
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

print(accuracy_metric.compute(predictions=preds, references=predictions.label_ids))
print(precision_metric.compute(predictions=preds, references=predictions.label_ids, average='macro'))
print(recall_metric.compute(predictions=preds, references=predictions.label_ids, average='macro'))
print(f1_metric.compute(predictions=preds, references=predictions.label_ids, average='macro'))

{'accuracy': 0.6557722720084344}
{'precision': 0.5964063863648031}
{'recall': 0.5695046573314416}
{'f1': 0.5667236444516265}


In [29]:
preds

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)