In [1]:
import pandas as pd
import numpy as np

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

from datasets import DatasetDict, Dataset

In [None]:
BASIC_PATH = './output'

In [2]:
data = pd.read_csv('./output/classification/source_title_description_bin_clf.csv', header=0)
data

Unnamed: 0,text,label
0,Source: CoinTelegraph Title: Bitcoin options d...,1
1,Source: The Daily Hodl Title: Crypto Exchange ...,0
2,Source: The Daily Hodl Title: BitKeep Wallet G...,0
3,Source: Bitcoin Magazine Title: Kazakhstan Pre...,0
4,Source: CoinTelegraph Title: The blockchain tr...,0
...,...,...
1313,Source: U.Today Title: Ethereum (ETH) Price An...,0
1314,Source: ZyCrypto Title: Digital Assets Amongst...,0
1315,"Source: The Daily Hodl Title: Cardano, Litecoi...",1
1316,Source: ZyCrypto Title: Binance plans to inves...,1


In [None]:
X, y = data['text'], data['label']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_split=0.3, random_state=42)

In [None]:
MODEL_NAME = 'distilbert-base-uncased'

In [None]:
tokenizer = AutoTokenizer.from_pretrained()

In [None]:
def tokenization(example):
    return tokenizer(
        example, 
        max_length=512, 
        padding='max_length', 
        truncation=True, 
        add_special_tokens=True,
        return_tensors='pt'
    )

In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(pd.concat([X_train, y_train])),
    'test': Dataset.from_pandas(pd.concat([X_test, y_test])),
})

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
training_args = TrainingArgs(
    output_dir=f'{BASIC_PATH}/training',
    do_train=True,
    do_eval=True,
    seed=42,
    learning_rate=2e-5,
    weight_decay=1e-2,

    # 'lr_scheduler_type': 'cosine',
    gradient_accumulation_steps= 1,
    per_device_train_batch_size=4,
    num_train_epochs=2,
    warmup_steps=0,
    max_grad_norm=1000,
    adam_epsilon=1e-6,

    log_level='debug',
    save_strategy='steps',
    disable_tqdm=False,
    logging_steps=250,
    eval_steps=250,
    save_steps=500,
    resume_from_checkpoint=True,
    per_device_eval_batch_size=8,
    evaluation_strategy=steps',
    # 'fp16': True,
    metric_for_best_model=loss',
    load_best_model_at_end=True,
    greater_is_better=False,
    
    save_total_limit=1,
    
    report_to='none',
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
)

In [None]:
results = trainer.train()

In [3]:
y_pred = np.array([1, 0, 1, 0])
y_true = np.array([1, 0, 1, 1])

In [43]:
def recall(y_pred, y_true):
    classes = np.unique(y_pred)
    results = dict()
    for cl in classes:
        pred_tmp = y_pred.copy()
        pred_tmp[y_pred == cl] = 1
        pred_tmp[y_pred != cl] = 0

        true_tmp = y_true.copy()
        true_tmp[y_true == cl] = 1
        true_tmp[y_true != cl] = 0

        # True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
        TP = np.sum(np.logical_and(pred_tmp == 1, true_tmp == 1))

        # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
        TN = np.sum(np.logical_and(pred_tmp == 0, true_tmp == 0))

        # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
        FP = np.sum(np.logical_and(pred_tmp == 1, true_tmp == 0))

        # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
        FN = np.sum(np.logical_and(pred_tmp == 0, true_tmp == 1))
        
        results[cl] = TP / (TP + FN)
    print(results)
    return np.mean(list(results.values()))

In [44]:
def precision(y_pred, y_true):
    classes = np.unique(y_pred)
    results = dict()
    for cl in classes:
        pred_tmp = y_pred.copy()
        pred_tmp[y_pred == cl] = 1
        pred_tmp[y_pred != cl] = 0

        true_tmp = y_true.copy()
        true_tmp[y_true == cl] = 1
        true_tmp[y_true != cl] = 0

        # True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
        TP = np.sum(np.logical_and(pred_tmp == 1, true_tmp == 1))

        # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
        TN = np.sum(np.logical_and(pred_tmp == 0, true_tmp == 0))

        # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
        FP = np.sum(np.logical_and(pred_tmp == 1, true_tmp == 0))

        # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
        FN = np.sum(np.logical_and(pred_tmp == 0, true_tmp == 1))
        
        results[cl] = TP / (TP + FP)
    print(results)
    return np.mean(list(results.values()))

In [47]:
def f1(y_pred, y_true):
    p = precision(y_pred, y_true)
    r = recall(y_pred, y_true)
    return 2 * p * r / (p + r)

In [45]:
precision(y_pred, y_true)

{0: 0.5, 1: 1.0}


0.75

In [46]:
recall(y_pred, y_true)

{0: 1.0, 1: 0.6666666666666666}


0.8333333333333333

In [48]:
f1(y_pred, y_true)

{0: 0.5, 1: 1.0}
{0: 1.0, 1: 0.6666666666666666}


0.7894736842105263