## imports

In [None]:
!pip install -q datasets transformers peft

In [None]:
from datasets import load_dataset
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import Trainer, TrainingArguments, pipeline
from huggingface_hub import notebook_login
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt
import torch

## prepare dataset

In [None]:
dataset = load_dataset("K-Monitor/kmdb_classification")

In [None]:
dataset = dataset.filter(lambda row: row['title'] and row['description'])

In [None]:
dataset

In [None]:
dataset = dataset.map(lambda row: {'td': row['title']+'\n'+row['description']})
# dataset = dataset.map(lambda row: {'td': row['title']+'\n'+row['description']+'\n'+'\n'.join(row['text'].split('\n')[:10])})

In [None]:
# Shuffle, then carve the single 'train' split into 80% train / 10% validation / 10% test.
# Every step uses seed 42 so the split is reproducible.
dataset = dataset.shuffle(seed=42)
split = dataset['train'].train_test_split(seed=42, test_size=0.2)

# Halve the 20% hold-out: its 'train' half becomes the validation split and
# its 'test' half stays the test split.
dataset = split['test'].train_test_split(seed=42, test_size=0.5)
dataset['validation'], dataset['train'] = dataset['train'], split['train']
dataset

In [None]:
# Load the tokenizer of the 'SZTAKI-HLT/hubert-base-cc' checkpoint.
tokenizer = BertTokenizer.from_pretrained('SZTAKI-HLT/hubert-base-cc')

In [None]:
def tokenize_function(examples):
    """Tokenize the `td` texts of a batch, truncated and padded to exactly 512 tokens."""
    texts = examples["td"]
    return tokenizer(texts, max_length=512, truncation=True, padding="max_length")


# Apply the tokenizer to every split, processing the rows in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

## load model

In [None]:
# Start from the pretrained huBERT checkpoint with a 2-label classification head.
model = BertForSequenceClassification.from_pretrained('SZTAKI-HLT/hubert-base-cc', num_labels=2)

In [None]:
# LoRA adapter configuration used to fine-tune the sequence classifier.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    # This config is used for training below, so it must NOT be in inference
    # mode: inference mode freezes the adapter weights and leaves the LoRA
    # matrices untrained.
    inference_mode=False,
    r=128,
    lora_alpha=128,
    lora_dropout=0.05,
    use_rslora=True,
    use_dora=True,
    bias="all",
    # Adapt the attention projections only.
    target_modules=["query", "key", "value"],
    # The classification head is trained and saved in full rather than adapted.
    modules_to_save=["classifier"],
)

In [None]:
# Wrap the base model with the adapters described by `lora_config`.
model = get_peft_model(model, lora_config)
# Report how many parameters are trainable compared to the total.
model.print_trainable_parameters()

## train

In [None]:
def compute_metrics(pred):
    """Return accuracy, precision and recall for one evaluation pass.

    `pred` must expose `label_ids` (the gold labels) and `predictions`
    (per-class scores). The predicted class is the argmax over the last
    axis of `predictions`. Precision and recall are computed with
    `average='binary'`.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The helper returns (precision, recall, fscore, support); only the first
    # two are reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

In [None]:
batch_size = 32

training_args = TrainingArguments(
    output_dir="hubert-classification",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Gradients are accumulated over 8 batches before each optimizer step.
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    learning_rate=3e-4,
    warmup_steps=40,
    num_train_epochs=2,
    # `logging_steps` must be given exactly once: a repeated keyword argument
    # is a SyntaxError and stops the whole cell from running.
    logging_steps=10,
    # Evaluate and checkpoint on the same 40-step schedule, so every
    # evaluation has a matching checkpoint for `load_best_model_at_end`.
    # NOTE(review): newer transformers releases spell this `eval_strategy` -
    # confirm against the installed version.
    evaluation_strategy='steps',
    eval_steps=40,
    save_strategy='steps',
    save_steps=40,
    save_total_limit=40,
    load_best_model_at_end=True,
)

# Train on the training split; the validation split drives the periodic evaluations.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

## evaluate

In [None]:
# Evaluate on the test split (the trainer's default eval set is the validation split).
trainer.evaluate(eval_dataset=tokenized_datasets['test'])

In [None]:
# Merge the adapter weights into the base model and strip the PEFT wrappers.
merged = model.merge_and_unload()

In [None]:
# Wrap the merged model in a text-classification pipeline that returns the
# score of every label instead of only the top one.
# NOTE(review): `max_length=512` is presumably forwarded to the tokenizer;
# confirm that long inputs are actually truncated (consider an explicit `truncation=True`).
classifier = pipeline("sentiment-analysis", model=merged, tokenizer=tokenizer, return_all_scores=True, max_length=512)
# Smoke test on a trivial input.
classifier('hello')

In [None]:
def _label1_score(row):
    """Return the pipeline score of the label at index 1 for the row's `td` text."""
    per_label = classifier(row['td'])[0]
    return {'score': per_label[1]['score']}


# Attach a `score` column to every test row.
dataset['test'] = dataset['test'].map(_label1_score)

In [None]:
# Precision/recall pairs over all decision thresholds, from the test labels and the `score` column.
precision, recall, thresholds = precision_recall_curve(dataset['test']['label'], dataset['test']['score'])

In [None]:
# Area under the precision-recall curve.
aupr = auc(recall, precision)

# Draw the curve, with the AUPR value shown in the legend.
plt.figure()
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.plot(recall, precision, label=f'AUPR = {aupr:.2f}', marker='.')
plt.legend()
plt.show()

In [None]:
aupr
# AUPR values noted from earlier runs (presumably of different configurations;
# which run produced which value is not recorded here):
# 0.9843052286062822
# 0.946710389734783
# 0.9865083235548402

## eval old

In [None]:
merged = BertForSequenceClassification.from_pretrained('boapps/kmdb_classification_model')
tokenizer = BertTokenizer.from_pretrained('SZTAKI-HLT/hubert-base-cc')

In [None]:
classifier = pipeline("sentiment-analysis", model=merged, tokenizer=tokenizer, return_all_scores=True, max_length=512)
classifier('hello')

In [None]:
def _label1_score(row):
    """Return the pipeline score of the label at index 1 for the row's `td` text."""
    per_label = classifier(row['td'])[0]
    return {'score': per_label[1]['score']}


# Overwrite the `score` column of the test rows with the current classifier's scores.
dataset['test'] = dataset['test'].map(_label1_score)

In [None]:
# Precision/recall pairs over all decision thresholds, from the test labels and the `score` column.
precision, recall, thresholds = precision_recall_curve(dataset['test']['label'], dataset['test']['score'])

In [None]:
# Area under the precision-recall curve.
aupr = auc(recall, precision)

# Draw the curve, with the AUPR value shown in the legend.
plt.figure()
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.plot(recall, precision, label=f'AUPR = {aupr:.2f}', marker='.')
plt.legend()
plt.show()

In [None]:
aupr
# AUPR values noted from earlier runs (presumably of different configurations;
# which run produced which value is not recorded here):
# 0.9843052286062822
# 0.946710389734783
# 0.9865083235548402
# 0.9397254130747401

## upload

In [None]:
# Authenticate with the Hugging Face Hub; needed before pushing the model.
notebook_login()

In [None]:
# Upload whatever `merged` currently refers to.
# NOTE(review): before running, confirm that `merged` is the fine-tuned,
# merged model and not a baseline model loaded for comparison.
merged.push_to_hub('K-Monitor/kmdb_classification_hubert')