In [None]:
import datasets
import pandas as pd
import csv

# Integer ids for the string class names stored in the 'label' column.
LABEL_TO_ID = {'machine': 0, 'human': 1}


def _load_split(path, **read_csv_kwargs):
    """Read one pipe-separated split and encode its 'label' column as integers.

    Args:
        path: Location of the CSV file for the split.
        **read_csv_kwargs: Extra keyword arguments forwarded to ``pd.read_csv``
            (for example ``quoting=csv.QUOTE_NONE``).

    Returns:
        The loaded DataFrame with 'label' replaced by the ids in ``LABEL_TO_ID``.

    Raises:
        ValueError: If any row carries a label that is not a key of
            ``LABEL_TO_ID``. ``Series.map`` would otherwise turn those rows
            into NaN and silently change the column to a float dtype.
    """
    frame = pd.read_csv(path, sep='|', encoding='utf-8', **read_csv_kwargs)
    encoded = frame['label'].map(LABEL_TO_ID)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(
            f"{path}: {int(unmapped.sum())} row(s) have a label outside "
            f"{sorted(LABEL_TO_ID)}: {unknown}"
        )
    frame['label'] = encoded.astype('int64')
    return frame


# NOTE(review): only the training split is read with quoting disabled; the
# test and validation splits use pandas' default quote handling. Confirm that
# this difference is intentional for the three files.
df_train = _load_split('data/training.csv', quoting=csv.QUOTE_NONE)
df_test = _load_split('data/test.csv')
df_val = _load_split('data/validation.csv')

# Bundle the three splits so they can be tokenized and indexed by name.
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_pandas(df_train),
    "test": datasets.Dataset.from_pandas(df_test),
    "val": datasets.Dataset.from_pandas(df_val),
})

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the 'xlm-roberta-base' checkpoint, requesting the
# fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    'xlm-roberta-base',
    use_fast=True,
)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"sentence"`` entry of ``examples`` with the global tokenizer.

    Args:
        examples: Mapping that provides the text(s) under the key ``"sentence"``.

    Returns:
        Whatever ``tokenizer`` returns for those texts when called with
        ``truncation=True``.
    """
    sentences = examples["sentence"]
    return tokenizer(sentences, truncation=True)

In [None]:
# Apply the tokenization function to every split, passing examples in batches.
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the 'xlm-roberta-base' checkpoint with a sequence-classification head
# sized for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=2,
)

In [None]:
# Metric name kept for reference; it is not used by the arguments below.
metric_name = "matthews_correlation"

# NOTE(review): the best checkpoint is selected by 'accuracy' even though
# `metric_name` says "matthews_correlation" -- confirm which one is intended.
args = TrainingArguments(
    'xlm-roberta-base-finetuned',  # output directory
    # Evaluate and checkpoint on the same schedule (once per epoch).
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    # Batch sizes.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Best-checkpoint selection.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and Matthews correlation for a two-class evaluation run.

    The class with the highest score in each row is taken as the predicted
    label. The metrics are computed directly with numpy, so no metric object
    has to be loaded beforehand.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds one
            row of per-class scores per example (assumed shape
            ``(n_examples, n_classes)`` -- TODO confirm for this model) and
            ``labels`` holds the integer class ids (0 or 1).

    Returns:
        dict with two float entries:
            ``"accuracy"``: fraction of examples predicted correctly
                (0.0 when there are no examples).
            ``"matthews_correlation"``: Matthews correlation coefficient with
                class 1 as the positive class (0.0 when it is undefined
                because a row or column of the confusion matrix is empty).
    """
    import numpy as np  # local import keeps this cell self-contained

    predictions, labels = eval_pred
    # Some models return a tuple whose first element holds the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predicted = np.argmax(np.asarray(predictions), axis=-1)
    labels = np.asarray(labels)

    if labels.size == 0:
        return {"accuracy": 0.0, "matthews_correlation": 0.0}

    # Confusion-matrix counts as Python ints so the products below cannot
    # overflow a fixed-width integer type.
    tp = int(np.sum((predicted == 1) & (labels == 1)))
    tn = int(np.sum((predicted == 0) & (labels == 0)))
    fp = int(np.sum((predicted == 1) & (labels == 0)))
    fn = int(np.sum((predicted == 0) & (labels == 1)))

    accuracy = float(np.mean(predicted == labels))

    denominator = float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    if denominator == 0.0:
        matthews = 0.0
    else:
        matthews = (tp * tn - fp * fn) / denominator

    return {"accuracy": accuracy, "matthews_correlation": matthews}

In [None]:
# Name of the split in `encoded_dataset` that is evaluated during training.
validation_key = "val"

# Wire the model, training arguments, tokenized splits and metric function
# together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; as the last expression of the cell, the value returned by
# train() is displayed as the cell output.
trainer.train()