In [34]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    DebertaTokenizer,
    DebertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import roc_auc_score

In [35]:
# Raw training essays. Downstream, `get_dataset` selects the `text` and
# `generated` columns from this frame, so both must be present in the CSV.
dataset = pd.read_csv(filepath_or_buffer="../../data/raw/train_essays.csv")

In [36]:
# Checkpoint identifier shared by the tokenizer below and by the classifier
# that is loaded from `model_name` in a later cell.
model_name = "microsoft/deberta-base"

# Module-level tokenizer; `tokenize_function` reads it as a global.
tokenizer = DebertaTokenizer.from_pretrained(model_name)
def get_dataset(df):
    """Wrap the model-relevant columns of a DataFrame in a `Dataset`.

    Args:
        df: pandas DataFrame; only its `text` and `generated` columns are used.

    Returns:
        The object produced by `Dataset.from_pandas` for those two columns.
    """
    wanted_columns = ['text', 'generated']
    return Dataset.from_pandas(df[wanted_columns])


def tokenize_function(examples):
    """Tokenize the `text` entry of `examples` with the global `tokenizer`.

    Used as the mapping callback in `tokenize_dataset`, where it is applied
    with `batched=True`.

    Args:
        examples: Mapping that provides a `text` entry.

    Returns:
        Whatever `tokenizer` returns when called with `padding='max_length'`
        and `truncation=True`.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded


def tokenize_dataset(dataset):
    """Tokenize a dataset and expose it in the layout the trainer consumes.

    Steps: apply `tokenize_function` in batches, rename the `generated`
    column to `labels`, then set the output format to torch for the
    `input_ids`, `attention_mask` and `labels` columns.

    Args:
        dataset: Object with `map` / `rename_column` (as returned by
            `get_dataset`) that has `text` and `generated` columns.

    Returns:
        The tokenized, renamed dataset with the torch format applied.
    """
    model_columns = ["input_ids", "attention_mask", "labels"]

    tokenized = dataset.map(tokenize_function, batched=True)
    renamed = tokenized.rename_column("generated", "labels")
    # set_format works in place on `renamed`, which is then returned.
    renamed.set_format("torch", columns=model_columns)
    return renamed

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The textbook expression evaluates ``np.exp(-x)``, which overflows (and
    emits a RuntimeWarning) for large negative ``x``. Here the exponential is
    only ever taken of a non-positive number, so it stays within ``(0, 1]``.

    Args:
        x: Scalar or array-like of real numbers (e.g. raw logits).

    Returns:
        The logistic of ``x`` with the same shape as the input; a NumPy
        scalar when ``x`` is a scalar.
    """
    x = np.asarray(x)
    if x.dtype.kind in 'biu':
        # Bool/integer input: promote so the arithmetic below is floating point.
        x = x.astype(np.float64)
    decay = np.exp(-np.abs(x))  # exp of a non-positive value: cannot overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both are the same
    # function, each written so that only `decay` appears.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Unwrap 0-d results so scalar input keeps returning a NumPy scalar.
    return result[()] if result.ndim == 0 else result



In [37]:
# Hold out 30% of the essays for evaluation. The split shuffles rows, so the
# seed is pinned: without it every kernel restart trains and evaluates on
# different rows and the reported metrics cannot be reproduced.
tokenized_datasets = tokenize_dataset(get_dataset(dataset)).train_test_split(
    test_size=0.3,
    seed=42,
)

Map: 100%|██████████| 1378/1378 [00:02<00:00, 503.19 examples/s]


In [38]:
# Show the split object to check the columns and row counts of each split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 964
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 414
    })
})

In [39]:
# Load the checkpoint named by `model_name` with a two-label classification
# head (`num_labels=2`).
model = DebertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    # NOTE(review): checkpoints are written into the current directory; the
    # recorded run logged that ./checkpoint-500 already existed and was
    # non-empty. Consider a dedicated, fresh output directory.
    output_dir="./",
    evaluation_strategy="steps",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    # Per-device batch sizes.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
)

  return torch.load(checkpoint_file, map_location=map_location)
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# Wire the model, the training configuration and both splits into a Trainer:
# the `train` split is used for fitting, the `test` split for evaluation.
trainer = Trainer(
    model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

 18%|█▊        | 131/723 [07:47<35:10,  3.57s/it]


In [41]:
# Run fine-tuning with the configuration above; as the last expression in the
# cell, the returned training summary is displayed as the cell output.
trainer.train()

 69%|██████▉   | 500/723 [04:05<01:49,  2.03it/s]

{'loss': 0.0231, 'learning_rate': 6.168741355463348e-06, 'epoch': 2.07}


                                                 
 69%|██████▉   | 500/723 [04:22<01:49,  2.03it/s]Checkpoint destination directory ./checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.012349722906947136, 'eval_runtime': 16.8503, 'eval_samples_per_second': 24.569, 'eval_steps_per_second': 12.285, 'epoch': 2.07}


100%|██████████| 723/723 [06:16<00:00,  1.92it/s]

{'train_runtime': 376.0513, 'train_samples_per_second': 7.69, 'train_steps_per_second': 1.923, 'train_loss': 0.020294050622942702, 'epoch': 3.0}





TrainOutput(global_step=723, training_loss=0.020294050622942702, metrics={'train_runtime': 376.0513, 'train_samples_per_second': 7.69, 'train_steps_per_second': 1.923, 'train_loss': 0.020294050622942702, 'epoch': 3.0})

In [42]:
# Run the fine-tuned model over the held-out `test` split. The result's
# `predictions` and `label_ids` are consumed by the ROC AUC cell below.
preds = trainer.predict(test_dataset=tokenized_datasets["test"])

100%|██████████| 207/207 [00:16<00:00, 12.62it/s]


In [43]:
# Score each essay by the model's probability of label 1. The classifier emits
# two logits per example, so P(label=1) = softmax(logits)[1], which equals
# sigmoid(logit_1 - logit_0). Applying sigmoid to logit_1 alone ignores
# logit_0 and can rank examples differently from that probability, which
# would distort a rank-based metric such as ROC AUC.
logits = np.asarray(preds.predictions)
roc_auc_score(
    preds.label_ids,
    sigmoid(logits[:, 1] - logits[:, 0])
)

1.0