In [1]:
!pip install transformers datasets torch scikit-learn


import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [2]:

# --- Load the labelled dataset ------------------------------------------------
DATASET_PATH = "credibility.csv"
df = pd.read_csv(DATASET_PATH)

# Encode the string classes as the integer ids the classifier is trained on.
label_mapping = {"high": 0, "medium": 1, "low": 2}
encoded_labels = df["label"].map(label_mapping)

# Series.map yields NaN for every value that is not a key of the mapping
# (typos, different casing, missing cells). Fail here with the offending values
# instead of letting NaN / float labels flow into the datasets built below.
unmapped = df.loc[encoded_labels.isna(), "label"]
if not unmapped.empty:
    raise ValueError(
        f"{DATASET_PATH} contains labels outside {sorted(label_mapping)}: "
        f"{unmapped.unique().tolist()[:10]}"
    )
df["label"] = encoded_labels.astype("int64")

# --- 80/20 split with a fixed seed so the split is reproducible ---------------
# NOTE(review): the split is not stratified by label.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels})

# --- Tokenizer matching the checkpoint that is fine-tuned later ---------------
MODEL_NAME = "microsoft/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    The notebook-level ``tokenizer`` is called with truncation enabled and
    ``padding="max_length"`` at ``max_length=256``.

    Args:
        examples: Batch mapping that provides the raw strings under "text".

    Returns:
        Whatever the tokenizer returns for that batch.
    """
    encoded_batch = tokenizer(
        examples["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    return encoded_batch

# Tokenize each split in batches, then drop the raw "text" column so that only
# the tokenizer outputs and the label remain.
train_dataset = train_dataset.map(preprocess_function, batched=True).remove_columns(["text"])
test_dataset = test_dataset.map(preprocess_function, batched=True).remove_columns(["text"])

# Switch both datasets to the "torch" output format. set_format is called for
# its effect on the dataset object itself, so nothing is reassigned.
for tokenized_split in (train_dataset, test_dataset):
    tokenized_split.set_format("torch")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/2245 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

In [7]:
# Pretrained checkpoint with a 3-class sequence-classification head
# (one class per entry of label_mapping).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

training_args = TrainingArguments(
    # -- where checkpoints and logs are written --
    output_dir="/content/drive/MyDrive/fact_claim_classifier",
    logging_dir="/content/logs",
    logging_steps=50,
    # -- optimisation --
    num_train_epochs=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # -- evaluate and checkpoint once per epoch; keep at most two checkpoints --
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # -- after training, reload the checkpoint with the best "accuracy" --
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)


def compute_metrics(eval_pred):
    """Turn one evaluation pass into the metrics reported by the Trainer.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use ``average="weighted"``.
    """
    raw_scores, gold = eval_pred

    # Predicted class = index of the largest logit along the last axis.
    predicted = torch.tensor(raw_scores).argmax(dim=-1).numpy()

    weighted_p, weighted_r, weighted_f1, _support = precision_recall_fscore_support(
        gold, predicted, average="weighted"
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": weighted_p,
        "recall": weighted_r,
        "f1": weighted_f1,
    }

# Wire model, hyper-parameters, data and metric callback into one Trainer.
# NOTE(review): the held-out test split is also the per-epoch evaluation set, so
# the "best checkpoint" selection configured in training_args sees the test data.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [8]:
# Run fine-tuning; keep the returned summary and show it as the cell's output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6461,0.338335,0.886121,0.891355,0.886121,0.883695
2,0.2528,0.315975,0.903915,0.90611,0.903915,0.901826
3,0.2288,0.370905,0.903915,0.90339,0.903915,0.90276
4,0.1283,0.369411,0.91637,0.916789,0.91637,0.915512
5,0.1235,0.492019,0.891459,0.900011,0.891459,0.893132
6,0.0758,0.503801,0.898577,0.898635,0.898577,0.897923
7,0.0606,0.50707,0.909253,0.908644,0.909253,0.908516
8,0.0415,0.504169,0.907473,0.907057,0.907473,0.907106


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6461,0.338335,0.886121,0.891355,0.886121,0.883695
2,0.2528,0.315975,0.903915,0.90611,0.903915,0.901826
3,0.2288,0.370905,0.903915,0.90339,0.903915,0.90276
4,0.1283,0.369411,0.91637,0.916789,0.91637,0.915512
5,0.1235,0.492019,0.891459,0.900011,0.891459,0.893132
6,0.0758,0.503801,0.898577,0.898635,0.898577,0.897923
7,0.0606,0.50707,0.909253,0.908644,0.909253,0.908516
8,0.0415,0.504169,0.907473,0.907057,0.907473,0.907106
9,0.0124,0.542591,0.905694,0.906113,0.905694,0.905754
10,0.0304,0.625995,0.902135,0.903452,0.902135,0.90205


TrainOutput(global_step=2256, training_loss=0.10797320481832304, metrics={'train_runtime': 2362.5227, 'train_samples_per_second': 15.204, 'train_steps_per_second': 0.955, 'total_flos': 4725601728307200.0, 'train_loss': 0.10797320481832304, 'epoch': 16.0})

In [9]:
# Evaluate the model currently held by the trainer on its eval_dataset.
results = trainer.evaluate()
print(f"Evaluation results: {results}")

Evaluation results: {'eval_loss': 0.3694113790988922, 'eval_accuracy': 0.9163701067615658, 'eval_precision': 0.9167889891778329, 'eval_recall': 0.9163701067615658, 'eval_f1': 0.9155115148951218, 'eval_runtime': 9.8476, 'eval_samples_per_second': 57.07, 'eval_steps_per_second': 3.656, 'epoch': 16.0}


In [10]:
# Write the model and the tokenizer files into the same directory.
EXPORT_DIR = "/content/drive/MyDrive/fact_claim_classifier"
trainer.save_model(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

('/content/drive/MyDrive/fact_claim_classifier/tokenizer_config.json',
 '/content/drive/MyDrive/fact_claim_classifier/special_tokens_map.json',
 '/content/drive/MyDrive/fact_claim_classifier/spm.model',
 '/content/drive/MyDrive/fact_claim_classifier/added_tokens.json',
 '/content/drive/MyDrive/fact_claim_classifier/tokenizer.json')