In [None]:
!pip install transformers datasets torch scikit-learn


import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [None]:

DATASET_PATH = "credibility.csv"
df = pd.read_csv(DATASET_PATH)

label_mapping = {"high": 0, "medium": 1, "low": 2}
df["label"] = df["label"].map(label_mapping)


train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels})

MODEL_NAME = "microsoft/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenization function
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# Apply tokenization
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Remove text column (keep tokenized inputs only)
train_dataset = train_dataset.remove_columns(["text"])
test_dataset = test_dataset.remove_columns(["text"])

# Convert labels to tensors
train_dataset.set_format("torch")
test_dataset.set_format("torch")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/2245 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/fact_claim_classifier",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="/content/logs",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6461,0.338335,0.886121,0.891355,0.886121,0.883695
2,0.2528,0.315975,0.903915,0.90611,0.903915,0.901826
3,0.2288,0.370905,0.903915,0.90339,0.903915,0.90276
4,0.1283,0.369411,0.91637,0.916789,0.91637,0.915512
5,0.1235,0.492019,0.891459,0.900011,0.891459,0.893132
6,0.0758,0.503801,0.898577,0.898635,0.898577,0.897923
7,0.0606,0.50707,0.909253,0.908644,0.909253,0.908516
8,0.0415,0.504169,0.907473,0.907057,0.907473,0.907106
9,0.0124,0.542591,0.905694,0.906113,0.905694,0.905754
10,0.0304,0.625995,0.902135,0.903452,0.902135,0.90205


TrainOutput(global_step=2256, training_loss=0.10797320481832304, metrics={'train_runtime': 2362.5227, 'train_samples_per_second': 15.204, 'train_steps_per_second': 0.955, 'total_flos': 4725601728307200.0, 'train_loss': 0.10797320481832304, 'epoch': 16.0})

In [None]:
results = trainer.evaluate()
print("Evaluation results:", results)

In [None]:
trainer.save_model("/content/drive/MyDrive/fact_claim_classifier")
tokenizer.save_pretrained("/content/drive/MyDrive/fact_claim_classifier")

In [1]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

# Sample corpus
corpus = """
Scientific studies show that exercise improves mental health.
Some people believe that drinking coffee prevents heart disease, but this is still debated.
Aliens built the pyramids, according to some conspiracy theories.
"""

sentences = sent_tokenize(corpus)
print(sentences)


NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> punkt


    Downloading package punkt to /root/nltk_data...
      Unzipping tokenizers/punkt.zip.



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> punkt_tab


    Downloading package punkt_tab to /root/nltk_data...
      Unzipping tokenizers/punkt_tab.zip.



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q
['\nScientific studies show that exercise improves mental health.', 'Some people believe that drinking coffee prevents heart disease, but this is still debated.', 'Aliens built the pyramids, according to some conspiracy theories.']


In [2]:
from transformers import pipeline

MODEL_PATH = "/content/drive/MyDrive/fact_claim_classifier"
classifier = pipeline("text-classification", model=MODEL_PATH, tokenizer=MODEL_PATH)

results = classifier(sentences)

#  results
for sentence, res in zip(sentences, results):
    print(f"Sentence: {sentence}\nPrediction: {res['label']} (Confidence: {res['score']:.2f})\n")


Device set to use cuda:0


Sentence: 
Scientific studies show that exercise improves mental health.
Prediction: LABEL_0 (Confidence: 0.99)

Sentence: Some people believe that drinking coffee prevents heart disease, but this is still debated.
Prediction: LABEL_1 (Confidence: 0.93)

Sentence: Aliens built the pyramids, according to some conspiracy theories.
Prediction: LABEL_1 (Confidence: 0.81)



In [3]:
import json
from collections import Counter

id = "sample_id"

# Define label mapping
label_mapping = {0: "high", 1: "medium", 2: "low"}
label_scores = {"high": 3, "medium": 2, "low": 1}  # Assign scores

# Store predictions
formatted_results = []
all_labels = []

for sentence, res in zip(sentences, results):
    label_id = int(res['label'].split("_")[-1])  # Extract label index
    real_label = label_mapping[label_id]

    formatted_results.append({
        "sentence": sentence,
        "prediction": real_label,
        "confidence": round(res["score"], 2)
    })

    all_labels.append(real_label)

# Count occurrences of each label
label_counts = Counter(all_labels)

# Compute weighted corpus credibility score
total_score = sum(label_scores[label] * count for label, count in label_counts.items())
average_score = total_score / len(all_labels)  # Normalize score

# Map average score back to a label
if average_score >= 2.5:
    corpus_label = "high"
elif average_score >= 1.5:
    corpus_label = "medium"
else:
    corpus_label = "low"

# Add overall credibility to JSON
output = {
    "sentence_results": formatted_results,
    "corpus_credibility": {
        "label": corpus_label,
        "average_score": round(average_score, 2),
        "label_counts": dict(label_counts)  # Show counts of high/medium/low
    }
}

# Convert to JSON string
json_output = json.dumps(output, indent=4)
print(json_output)

# Save results to a file
with open(f"{id}_classification_results.json", "w") as f:
    json.dump(output, f, indent=4)


{
    "sentence_results": [
        {
            "sentence": "\nScientific studies show that exercise improves mental health.",
            "prediction": "high",
            "confidence": 0.99
        },
        {
            "sentence": "Some people believe that drinking coffee prevents heart disease, but this is still debated.",
            "prediction": "medium",
            "confidence": 0.93
        },
        {
            "sentence": "Aliens built the pyramids, according to some conspiracy theories.",
            "prediction": "medium",
            "confidence": 0.81
        }
    ],
    "corpus_credibility": {
        "label": "medium",
        "average_score": 2.33,
        "label_counts": {
            "high": 1,
            "medium": 2
        }
    }
}


#Convert to Onnx

In [4]:
!pip install onnx onnxruntime transformers torch

Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cub

In [1]:
import torch
from transformers import AutoModelForSequenceClassification

# Load trained model
MODEL_PATH = "/content/drive/MyDrive/fact_claim_classifier"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
model.eval()  #  inference mode

dummy_input = {
    "input_ids": torch.randint(0, 1000, (1, 256)),  #  IDs
    "attention_mask": torch.ones((1, 256))  #  attention mask
}

# Export to ONNX
torch.onnx.export(
    model,
    (dummy_input["input_ids"], dummy_input["attention_mask"]),
    "model.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={"input_ids": {0: "batch_size"}, "attention_mask": {0: "batch_size"}, "output": {0: "batch_size"}}
)

print("Model converted to ONNX")


Model converted to ONNX


In [5]:
import onnxruntime as ort
from transformers import AutoTokenizer
import numpy as np

# Load ONNX model
MODEL_PATH = "/content/drive/MyDrive/fact_claim_classifier"
session = ort.InferenceSession("model.onnx")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Define label mapping
label_mapping = {0: "high", 1: "medium", 2: "low"}

def classify(text):
    # Tokenize input
    inputs = tokenizer(text, return_tensors="np", truncation=True, padding="max_length", max_length=256)

    # Convert to ONNX format
    ort_inputs = {
        "input_ids": inputs["input_ids"].astype(np.int64),
        "attention_mask": inputs["attention_mask"].astype(np.float32),
    }

    # inference
    logits = session.run(None, ort_inputs)[0]

    predicted_class = np.argmax(logits, axis=1)[0]

    return label_mapping[predicted_class]


high


In [6]:
from nltk.tokenize import sent_tokenize

def classify_corpus(corpus):
    sentences = sent_tokenize(corpus)  # Split
    results = [classify(sentence) for sentence in sentences]  # Batch classify
    return results

corpus = """Scientific studies show that exercise improves mental health.
Some people believe that drinking coffee prevents heart disease, but this is still debated.
Aliens built the pyramids, according to some conspiracy theories."""

print(classify_corpus(corpus))


['high', 'medium', 'medium']
