In [9]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [10]:
PRETRAINED_NAME="bert-base-multilingual-cased"

In [11]:
import pandas as pd
from datasets import Dataset, DatasetDict
import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import StratifiedKFold
import numpy as np


In [12]:
metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="macro")

In [13]:
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

In [14]:
df = pd.read_csv("../data/euvsdisinfo.csv")
df["text"] = df["article_title"] + " " + df["article_text"]
df["label"] = df["class"].apply(lambda x: 0 if x == "support" else 1)
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(skf.split(df, df["label"])):
    train_df = df.iloc[train_index]
    test_df = df.iloc[test_index]
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label", "article_language"])
    break

Map: 100%|██████████| 6776/6776 [00:03<00:00, 2220.69 examples/s]
Map: 100%|██████████| 6776/6776 [00:02<00:00, 2499.47 examples/s]


In [15]:
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(skf.split(df, df["label"])):
    train_df = df.iloc[train_index]
    test_df = df.iloc[test_index]
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label", "article_language"])
    break

Map: 100%|██████████| 6776/6776 [00:03<00:00, 2208.61 examples/s]
Map: 100%|██████████| 6776/6776 [00:02<00:00, 2392.13 examples/s]


In [16]:
# define the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

# define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',
    eval_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    save_total_limit=1,
    save_steps=100,
    save_strategy="steps"
)

# define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1
100,0.2171,0.168643,0.954115
200,0.201,0.159326,0.954444
300,0.2148,0.150485,0.955402
400,0.1614,0.155827,0.954527


TrainOutput(global_step=424, training_loss=0.20458659940873677, metrics={'train_runtime': 392.6092, 'train_samples_per_second': 17.259, 'train_steps_per_second': 1.08, 'total_flos': 1782840511119360.0, 'train_loss': 0.20458659940873677, 'epoch': 1.0})

In [17]:
# make inference on the test set
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids
langs = test_dataset["article_language"]
print(metric.compute(predictions=preds, references=labels, average="macro"))

{'f1': 0.9554024815198463}


In [18]:
preds_df = pd.DataFrame({"preds": preds, "labels": labels, "langs": langs})
f1 = evaluate.load("f1")
precision_negative = evaluate.load("precision", pos_label=0)
precision_positive = evaluate.load("precision", pos_label=1)
recall_negative = evaluate.load("recall", pos_label=0)
recall_positive = evaluate.load("recall", pos_label=1)

weighted_f1 = 0
for language in set(langs):
    df = preds_df[preds_df["langs"] == language]
    l = df["labels"].tolist()
    p = df["preds"].tolist()
    f1_score = f1.compute(predictions=p, references=l, average="macro")["f1"]
    p_neg = precision_negative.compute(predictions=p, references=l, average="binary")["precision"]
    p_pos = precision_positive.compute(predictions=p, references=l, average="binary")["precision"]
    r_neg = recall_negative.compute(predictions=p, references=l, average="binary")["recall"]
    r_pos = recall_positive.compute(predictions=p, references=l, average="binary")["recall"]

    weight = len(df) / len(preds_df)
    weighted_f1 += f1_score * weight

    print(language, len(df), f1_score, p_neg, p_pos, r_neg, r_pos)

# avg_f1 /= len(set(langs))
print("Weighted F1", weighted_f1)

ru 2513 0.48829159030747304 0.9542379625945085 0.9542379625945085 1.0 1.0
cs 36 1.0 1.0 1.0 1.0 1.0
es 209 0.49759615384615385 0.9904306220095693 0.9904306220095693 1.0 1.0
az 114 1.0 1.0 1.0 1.0 1.0
de 108 0.47572815533980584 0.9158878504672897 0.9158878504672897 0.98989898989899 0.98989898989899
lt 5 1.0 0.0 0.0 0.0 0.0
pl 17 1.0 1.0 1.0 1.0 1.0
en 2355 0.811143399522877 0.9761904761904762 0.9761904761904762 0.4900398406374502 0.4900398406374502
uk 59 1.0 1.0 1.0 1.0 1.0
ar 1062 0.49976448422044273 1.0 1.0 0.9990583804143126 0.9990583804143126
mk 26 1.0 1.0 1.0 1.0 1.0
fr 154 0.4814814814814815 0.9285714285714286 0.9285714285714286 1.0 1.0
hu 66 1.0 1.0 1.0 1.0 1.0
hr 17 0.48484848484848486 0.9411764705882353 0.9411764705882353 1.0 1.0
it 35 1.0 1.0 1.0 1.0 1.0
Weighted F1 0.6292556322140321


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
