#### Imports


In [None]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    # tokenizers
    AutoTokenizer,
    DebertaV2Tokenizer,
    DistilBertTokenizer,
    BertTokenizer,
    RobertaTokenizer,
    ElectraTokenizer,
    AlbertTokenizer,
    XLNetTokenizer,
)
from torch.nn import CrossEntropyLoss
# evaluation metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from collections import Counter

import transformers

# Environment diagnostics: print the installed transformers version and the
# TrainingArguments class object, one per line.
print(transformers.__version__, transformers.TrainingArguments, sep="\n")

# Cuda
# Use the GPU when PyTorch reports one as available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### LOADING SQLITE DB WITH RECORDS



In [None]:
import sqlite3
import json
import pandas as pd

DB_FILE = "chunks.db"
OUTPUT_FILE = "exported_chunks.jsonl"

# Export every (text, label) row of the `chunks` table to a JSON Lines file:
# one JSON object per line, with the "label" key omitted when the stored label is NULL.

# Connect to the database
conn = sqlite3.connect(DB_FILE)
try:
    cur = conn.cursor()

    # Query all data from chunks table
    cur.execute("SELECT text, label FROM chunks")
    rows = cur.fetchall()

    # Write to JSONL
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for text, label in rows:
            obj = {"text": text}
            if label is not None:
                obj["label"] = label
            f.write(json.dumps(obj) + "\n")
finally:
    # Always release the SQLite handle, even if the query or the file write fails.
    conn.close()

print("Data exported to JSONL file.")


Data exported to JSONL file.


In [None]:
# Load the labeled chunks
# Filtering thresholds, named so they can be tuned in one place.
MAX_ROWS = 9000              # only the first MAX_ROWS exported records are kept
EXCLUDED_LABEL = 11          # records carrying this label are discarded
LENGTH_FILTERED_LABEL = 1    # label whose short texts are discarded
MIN_TEXT_CHARS = 100         # minimum text length for LENGTH_FILTERED_LABEL records

with open("exported_chunks.jsonl", "r", encoding="utf-8") as f:
    # Skip blank lines so a stray trailing newline does not break json.loads.
    labeled_chunks = [json.loads(line) for line in f if line.strip()]

data = pd.DataFrame(labeled_chunks)

# Get the first 9000 rows
data = data.head(MAX_ROWS)

# Records exported without a "label" key show up as NaN here. They would pass the
# `!= EXCLUDED_LABEL` filter and later break the integer cast, so drop them now.
data = data.dropna(subset=["label"])

# Remove rows with label == 11
data = data[data['label'] != EXCLUDED_LABEL]

# Print labeled count after removing label 11
labeled_count = data['label'].value_counts().to_dict()
print(f"Labeled chunks after removing label 11: {labeled_count}")

# Remove rows where label == 1 and text length < 100
is_short_filtered_label = (data['label'] == LENGTH_FILTERED_LABEL) & (data['text'].str.len() < MIN_TEXT_CHARS)
data = data[~is_short_filtered_label]

# Print final labeled count
labeled_count = data['label'].value_counts().to_dict()
print(f"Final labeled chunks: {labeled_count}")


Labeled chunks after removing label 11: {1: 8199, 0: 800}
Final labeled chunks: {1: 5384, 0: 800}


##### Splitting data


In [None]:
# Loading the data
# `data` is the result of boolean filtering, so assigning a column in place can raise
# pandas' SettingWithCopyWarning; assign() builds a new frame with the integer labels instead.
data = data.assign(label=data['label'].astype(int))

# Train-Test Split using stratified sampling
# (80/20 split that keeps the label proportions in both parts; fixed seed for repeatability)
train_df, test_df = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)

# since there is a class imbalance, we will compute class weights
# to handle this in the loss function
labels = train_df["label"].values
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print("Class weights:", class_weights)

# Converting the DataFrames to Hugging Face Datasets
# (indices are reset so the filtered pandas index is not carried along as an extra column)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

In [None]:

# ---- Tuning parameters ----
# Hyperparameters and output location shared by every model trained in this notebook.
CONFIG = dict(
    epochs=2,                      # number of training epochs
    batch_size=16,                 # per-device batch size (train and eval)
    max_length=128,                # tokenizer truncation / padding length
    learning_rate=5e-5,
    weight_decay=0.01,
    output_dir="./model_results",  # parent directory for per-model outputs
)

# ---- Model configurations ----
# One row per supported model: (short name, tokenizer class, pretrained checkpoint id).
_MODEL_SPECS = (
    ("deberta", DebertaV2Tokenizer, "microsoft/deberta-v3-small"),
    ("distilbert", DistilBertTokenizer, "distilbert-base-uncased"),
    ("bert", BertTokenizer, "bert-base-uncased"),
    ("roberta", RobertaTokenizer, "roberta-base"),
    ("electra", ElectraTokenizer, "google/electra-small-discriminator"),
    ("albert", AlbertTokenizer, "albert-base-v2"),
    ("xlnet", XLNetTokenizer, "xlnet-base-cased"),
)

# Maps each short model name to the tokenizer class and checkpoint used to load it.
MODEL_CONFIGS = {
    short_name: {
        "tokenizer_class": tokenizer_cls,
        "pretrained_model_name": checkpoint,
    }
    for short_name, tokenizer_cls, checkpoint in _MODEL_SPECS
}


In [None]:

# ---- Metric function ----
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer evaluation.

    Args:
        pred: object exposing `label_ids` (true class ids) and `predictions`
            (class scores, or a tuple/list whose first element holds them).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # predictions may arrive as a tuple when the model emits more than one output;
    # the class scores are assumed to be the first element -- TODO confirm per model.
    if isinstance(scores, (tuple, list)):
        scores = scores[0]
    preds = scores.argmax(-1)
    # zero_division=0 keeps the value sklearn already uses for undefined metrics
    # (e.g. a class that is never predicted) without emitting a warning each time.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}



In [None]:
# ---- Weighted Trainer ----
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the model logits.

    Args:
        class_weights: optional tensor of per-class weights handed to
            CrossEntropyLoss as `weight`; None gives an unweighted loss.
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
            Pass the Trainer arguments by keyword, because the first positional
            argument is taken as `class_weights`.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        """Return the weighted cross-entropy loss (and the model outputs on request).

        `num_items_in_batch` and any further keyword arguments are accepted so the
        override keeps working with Trainer versions that pass them; they are not used.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        weight = self.class_weights
        if weight is not None:
            # CrossEntropyLoss needs the weights on the same device as the logits.
            weight = weight.to(logits.device)
        loss_fct = CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss



In [None]:

# ---- Function to train and evaluate ----
def _balanced_class_weights(label_values, num_labels):
    """Return per-class weights n_samples / (n_present_classes * class_count) as a float tensor.

    Classes with no samples get weight 0 instead of triggering a division by zero.
    """
    counts = np.bincount(np.asarray(label_values, dtype=np.int64), minlength=num_labels)
    present = counts > 0
    weights = np.zeros(len(counts), dtype=np.float64)
    weights[present] = counts.sum() / (present.sum() * counts[present])
    return torch.tensor(weights, dtype=torch.float)


def train_and_evaluate(model_name, train_dataset, test_dataset, data):
    """Fine-tune one configured model with a class-weighted loss and report test metrics.

    Args:
        model_name: key into MODEL_CONFIGS selecting tokenizer class and checkpoint.
        train_dataset: dataset with "text" and integer "label" columns used for training.
        test_dataset: dataset with the same columns, used for evaluation and the final report.
        data: DataFrame whose "label" column determines the number of output classes.

    Returns:
        The metrics dict produced by `trainer.predict` on the test set.
        A classification report is also printed.
    """
    import inspect  # local import: only needed for the keyword-compatibility checks below

    print(f"\n===== Training {model_name} =====")

    # Model + tokenizer
    cfg = MODEL_CONFIGS[model_name]
    num_labels = len(data['label'].unique())
    tokenizer = cfg["tokenizer_class"].from_pretrained(cfg["pretrained_model_name"])
    model = AutoModelForSequenceClassification.from_pretrained(
        cfg["pretrained_model_name"],
        num_labels=num_labels
    )

    # Compute class weights
    # Read the labels from the raw dataset, where they are still plain integers.
    # After set_format("torch") they come back as tensors, which hash by identity,
    # so counting them with Counter yields zero for every class id.
    class_weights = _balanced_class_weights(list(train_dataset["label"]), num_labels)
    class_weights = class_weights.to("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenization
    def tokenize_fn(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=CONFIG["max_length"])

    train_enc = train_dataset.map(tokenize_fn, batched=True)
    test_enc = test_dataset.map(tokenize_fn, batched=True)
    train_enc = train_enc.rename_column("label", "labels")
    test_enc = test_enc.rename_column("label", "labels")
    train_enc.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    test_enc.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # Training args
    # Newer transformers releases call this option `eval_strategy`; older ones only know
    # `evaluation_strategy`. Use whichever name the installed version accepts.
    ta_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in ta_params else "evaluation_strategy"
    training_args = TrainingArguments(
        output_dir=f"{CONFIG['output_dir']}/{model_name}",
        save_strategy="epoch",
        learning_rate=CONFIG["learning_rate"],
        per_device_train_batch_size=CONFIG["batch_size"],
        per_device_eval_batch_size=CONFIG["batch_size"],
        num_train_epochs=CONFIG["epochs"],
        weight_decay=CONFIG["weight_decay"],
        logging_dir=f"./logs_{model_name}",
        report_to="none",
        **{eval_key: "epoch"}
    )

    # Trainer
    # Same idea for the tokenizer argument, which newer releases name `processing_class`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_enc,
        eval_dataset=test_enc,
        compute_metrics=compute_metrics,
        class_weights=class_weights,
        **{tokenizer_key: tokenizer}
    )

    # Train + evaluate
    trainer.train()
    preds = trainer.predict(test_enc)
    y_true = preds.label_ids
    y_pred = np.argmax(preds.predictions, axis=1)

    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_true, y_pred, digits=4))

    # Hand the metrics back so callers can compare models programmatically.
    return preds.metrics