In [None]:
pip install datasets

In [None]:
! pip install -U transformers
! pip install -U accelerate

In [None]:
import transformers

# Report the version of the transformers package that was actually imported.
print(transformers.__version__)

In [None]:
import datasets
import pandas as pd
import csv

# String label -> integer class id used by the classifier.
LABEL_TO_ID = {'mt': 0, 'human': 1}


def load_split(path, **read_csv_kwargs):
    """Read one pipe-delimited split and encode its string labels as integers.

    Args:
        path: Location of the CSV file to read.
        **read_csv_kwargs: Extra options forwarded to ``pd.read_csv``
            (for example ``quoting``).

    Returns:
        The loaded DataFrame with its ``labels`` column replaced by the
        integer ids defined in ``LABEL_TO_ID``.

    Raises:
        ValueError: If any label is missing or is not a key of ``LABEL_TO_ID``.
    """
    frame = pd.read_csv(path, sep='|', encoding='utf-8', **read_csv_kwargs)
    encoded = frame['labels'].map(LABEL_TO_ID)
    # Series.map yields NaN for values absent from the mapping, which would
    # silently turn the column into floats; fail loudly here instead.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = frame.loc[unmapped, 'labels'].unique().tolist()
        raise ValueError(f"{path}: unexpected label values {unknown}")
    frame['labels'] = encoded.astype('int64')
    return frame


# NOTE(review): only the training file is parsed with QUOTE_NONE; test and
# validation use pandas' default quoting. Confirm that this difference is
# intentional, since it changes how quote characters in sentences are read.
df_train = load_split('data/training.csv', quoting=csv.QUOTE_NONE)
df_test = load_split('data/test.csv')
df_val = load_split('data/validation.csv')

dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_pandas(df_train),
    "test": datasets.Dataset.from_pandas(df_test),
    "val": datasets.Dataset.from_pandas(df_val),
    })

In [None]:
from transformers import AutoTokenizer

# Load the fast tokenizer that matches the xlm-roberta-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'xlm-roberta-base',
    use_fast=True,
)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``sentence`` column of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but are not
    padded here. The Trainer in this notebook is constructed with the
    tokenizer, so batches are padded when they are collated; padding at
    tokenization time would instead pad every example to the longest
    sentence in the mapped batch.

    Args:
        examples: Batch of dataset rows; ``examples["sentence"]`` holds the
            texts to tokenize.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["sentence"], truncation=True)

In [None]:
# Tokenize all splits; batch_size=None passes each split to the function as a
# single batch.
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=None,
)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Two output classes, matching the 0/1 label encoding of the dataset.
num_labels = 2

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Pretrained encoder with a sequence-classification head, moved to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=num_labels,
)
model = model.to(device)

In [None]:
import inspect

batch_size = 32

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Pick whichever name the
# installed TrainingArguments accepts so the cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

args = TrainingArguments(
    'xlm-roberta-base-finetuned',  # output directory for checkpoints
    save_strategy='epoch',
    learning_rate=2.3e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.0003,
    load_best_model_at_end=False,
    # Evaluate once per epoch, in step with the per-epoch checkpointing above.
    **{_eval_strategy_key: 'epoch'},
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score a prediction object with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted'),
    }

In [None]:
# Wire the model, training configuration, tokenized splits and metric function
# together. The validation split is the one evaluated during training.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the settings held by `trainer`; as the last expression
# in the cell, the value returned by train() is displayed.
trainer.train()

In [None]:
import numpy as np

# Run the fine-tuned model over the held-out test split.
predictions = trainer.predict(encoded_dataset["test"])

# Take the highest-scoring class per row and display the ids as a list.
test_scores = predictions.predictions
predicted_ids = np.argmax(test_scores, axis=1)
list(predicted_ids)