In [1]:
import sqlite3
import json

DB_FILE = "chunks.db"
OUTPUT_FILE = "exported_chunks.jsonl"

# Connect to the database
conn = sqlite3.connect(DB_FILE)
cur = conn.cursor()

# Query all data from chunks table
cur.execute("SELECT text, label FROM chunks")
rows = cur.fetchall()

# Write to JSONL
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for text, label in rows:
        obj = {"text": text}
        if label is not None:
            obj["label"] = label
        f.write(json.dumps(obj) + "\n")

conn.close()

print("Data exported to JSONL file.")


Data exported to JSONL file.


In [2]:
import transformers
print(transformers.__version__)
print(transformers.TrainingArguments)



4.56.2
<class 'transformers.training_args.TrainingArguments'>


In [3]:
import json
import pandas as pd

# Load the labeled chunks
with open("exported_chunks.jsonl", "r", encoding="utf-8") as f:
    labeled_chunks = [json.loads(line) for line in f]

data = pd.DataFrame(labeled_chunks)
labeled_count = data['label'].value_counts().to_dict()

# Get the first 9000 rows
data = data.head(9000)

# Remove rows with label == 11
data = data[data['label'] != 11]

# Print labeled count after removing label 11
labeled_count = data['label'].value_counts().to_dict()
print(f"Labeled chunks after removing label 11: {labeled_count}")

# Remove rows where label == 1 and text length < 100
data = data[~((data['label'] == 1) & (data['text'].str.len() < 100))]

# Print final labeled count
labeled_count = data['label'].value_counts().to_dict()
print(f"Final labeled chunks: {labeled_count}")


Labeled chunks after removing label 11: {1: 8199, 0: 800}
Final labeled chunks: {1: 5384, 0: 800}


In [5]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import (
    TrainingArguments,
    Trainer,
    DebertaV2Tokenizer,
    AutoModelForSequenceClassification,
)
from torch.nn import CrossEntropyLoss

# Cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading the data
data['label'] = data['label'].astype(int)

# Train-Test Split using stratified sampling
train_df, test_df = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)

# since there is a class imbalance, we will compute class weights
# to handle this in the loss function
labels = train_df["label"].values
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print("Class weights:", class_weights)

# Convert ing the DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Model
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small", num_labels=len(data['label'].unique()))
model.to(device)
# Tokenization
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-small")

def tokenize(input_data):
    return tokenizer(input_data["text"], truncation=True, padding="max_length", max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset = train_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Creating a custom Trainer to handle weighted loss
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
)

# Creating the Trainer
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Training the model
trainer.train()

# Evaluating the model
preds = trainer.predict(test_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)

from sklearn.metrics import classification_report
print(classification_report(test_dataset["labels"], pred_labels, digits=4))


Class weights: tensor([3.8648, 0.5743], device='cuda:0')


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: no
- Save strategy: steps

In [None]:
# Running test on 10 random samples
sample_df = test_df.sample(10, random_state=42).reset_index(drop=True)

# Tokenize the samples
encodings = tokenizer(
    sample_df["text"].tolist(),
    truncation=True,
    padding="max_length",
    max_length=512,
    return_tensors="pt"
).to(device)

# Predict
model.eval()
with torch.no_grad():
    outputs = model(**encodings)
    probs = torch.softmax(outputs.logits, dim=1)
    predictions = torch.argmax(probs, dim=1).cpu().numpy()

# Display actual vs predicted
for i in range(10):
    print(f"\n Text {i+1}:")
    print(f"Text: {sample_df.loc[i, 'text'][:300]}{'...' if len(sample_df.loc[i, 'text']) > 300 else ''}")
    print(f"Actual label   : {sample_df.loc[i, 'label']}")
    print(f"Predicted label: {predictions[i]}")


NameError: name 'model' is not defined

In [None]:
# Testing the models generalization on new unseen data

# Random new text samples
new_texts = [
    "Photosynthesis is the process by which green plants convert sunlight into energy.",
    "Bibliography\nChapter 3\nIndex\nAcknowledgments",
    "The mitochondria is often called the powerhouse of the cell.",
    "References\n[1] Smith et al. 2022",
    "This text explains Newton’s laws of motion in detail.",
    "Appendix A: Glossary of Terms",
    "How does the water cycle work in nature?",
    "The syllabus is subject to change without notice.",
    "Cell division is crucial for reproduction in organisms.",
    "Table of contents\n1. Preface\n2. Introduction"
]

# Tokenize the new texts
encodings = tokenizer(
    new_texts,
    truncation=True,
    padding="max_length",
    max_length=512,
    return_tensors="pt"
).to(device)

# RUn the model on new texts
model.eval()
with torch.no_grad():
    outputs = model(**encodings)
    probs = torch.softmax(outputs.logits, dim=1)
    predictions = torch.argmax(probs, dim=1).cpu().numpy()

# Results 
for i, text in enumerate(new_texts):
    print(f"\nText {i+1}:")
    print(f"Text: {text[:300]}{'...' if len(text) > 300 else ''}")
    print(f"Predicted label: {predictions[i]} (1 = Relevant, 0 = Irrelevant)")


In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from huggingface_hub import HfApi, HfFolder, Repository, create_repo

repo_id = "LMForge/text_relevance_classfier"

# Create repo if it doesn’t exist
create_repo(repo_id, exist_ok=True)

# Push your saved model folder
from huggingface_hub import upload_folder
upload_folder(
    repo_id=repo_id,
    folder_path="classifier_model",  # your saved folder
    commit_message="Initial model upload"
)