In [1]:
import sqlite3
import json

DB_FILE = "chunks.db"
OUTPUT_FILE = "exported_chunks.jsonl"

# Export every row of the `chunks` table to a JSONL file, one object per line.
# The connection is closed in `finally` so it is released even when the query
# or the file write raises.
conn = sqlite3.connect(DB_FILE)
try:
    cur = conn.cursor()
    # Run the query before opening the output file, so a failed query does not
    # leave an empty/truncated export behind.
    cur.execute("SELECT text, label FROM chunks")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        # Iterate the cursor directly instead of fetchall() so the whole table
        # is never held in memory at once.
        for text, label in cur:
            obj = {"text": text}
            # Rows whose label is NULL are written without a "label" key.
            if label is not None:
                obj["label"] = label
            f.write(json.dumps(obj) + "\n")
finally:
    conn.close()

print("Data exported to JSONL file.")


Data exported to JSONL file.


In [2]:
import transformers
# Environment check: show the installed transformers version, then the
# TrainingArguments class object, one per line.
for _attr_name in ("__version__", "TrainingArguments"):
    print(getattr(transformers, _attr_name))



4.29.2
<class 'transformers.training_args.TrainingArguments'>


In [3]:
import json
import pandas as pd

# Filtering thresholds, named so they can be tuned in one place.
MAX_ROWS = 9000            # only the first MAX_ROWS exported chunks are used
EXCLUDED_LABEL = 11        # rows carrying this label are dropped
MIN_POSITIVE_CHARS = 100   # label-1 rows with shorter text are dropped

# Load the exported chunks: one JSON object per line. Blank lines are skipped
# so a stray empty line does not crash json.loads.
with open("exported_chunks.jsonl", "r", encoding="utf-8") as f:
    labeled_chunks = [json.loads(line) for line in f if line.strip()]

# Keep the first MAX_ROWS rows.
data = pd.DataFrame(labeled_chunks).head(MAX_ROWS)

# Objects written without a "label" key load as NaN. Those rows cannot be used
# for supervised training and would make a later integer cast of the column
# fail, so drop them here.
data = data.dropna(subset=["label"])

# Remove rows with the excluded label
data = data[data["label"] != EXCLUDED_LABEL]

# Print labeled count after removing the excluded label
labeled_count = data["label"].value_counts().to_dict()
print(f"Labeled chunks after removing label {EXCLUDED_LABEL}: {labeled_count}")

# Remove rows where label == 1 and the text is shorter than MIN_POSITIVE_CHARS.
# .copy() makes the result an independent frame, so later column assignments
# on `data` do not act on a slice of the unfiltered frame.
is_short_positive = (data["label"] == 1) & (data["text"].str.len() < MIN_POSITIVE_CHARS)
data = data[~is_short_positive].copy()

# Print final labeled count
labeled_count = data["label"].value_counts().to_dict()
print(f"Final labeled chunks: {labeled_count}")


Labeled chunks after removing label 11: {1: 8199, 0: 800}
Final labeled chunks: {1: 5384, 0: 800}


In [4]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import (
    TrainingArguments,
    Trainer,
    DebertaV2Tokenizer,
    AutoModelForSequenceClassification,
)
from torch.nn import CrossEntropyLoss

# Seed torch before the model is built: the sequence-classification head is
# not part of the pretrained checkpoint, so its initial weights come from the
# RNG and would otherwise differ between runs.
SEED = 42
torch.manual_seed(SEED)

# Cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cast labels to int on a new frame. Assigning a column on `data` directly can
# raise SettingWithCopyWarning because `data` is the result of boolean
# filtering in the previous cell.
data = data.assign(label=data["label"].astype(int))

# Train-Test Split using stratified sampling
train_df, test_df = train_test_split(
    data, test_size=0.2, stratify=data["label"], random_state=SEED
)

# Since there is a class imbalance, compute "balanced" class weights on the
# training split to use in the loss function.
labels = train_df["label"].values
classes = np.unique(labels)
# The weight tensor is indexed by class id inside CrossEntropyLoss, so the
# weights only line up with the labels when the ids are exactly 0..K-1.
if not np.array_equal(classes, np.arange(len(classes))):
    raise ValueError(
        f"Expected contiguous label ids 0..{len(classes) - 1}, got {classes.tolist()}"
    )
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print("Class weights:", class_weights)

# Converting the DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Model: one output logit per distinct label present in the data
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-small", num_labels=data["label"].nunique()
)
model.to(device)
# Tokenization
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-small")

def tokenize(input_data):
    """Tokenize the "text" entry of `input_data` with the module-level tokenizer.

    Every example is truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    texts = input_data["text"]
    return tokenizer(texts, **tokenizer_options)

def _to_model_inputs(dataset):
    """Tokenize a split, rename "label" to "labels", and expose torch tensors.

    Only the "input_ids", "attention_mask" and "labels" columns are included
    in the torch-formatted output.
    """
    dataset = dataset.map(tokenize, batched=True)
    # compute_loss reads the targets from the "labels" key.
    dataset = dataset.rename_column("label", "labels")
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset


# Same preparation for both splits; train first, then test.
train_dataset = _to_model_inputs(train_dataset)
test_dataset = _to_model_inputs(test_dataset)

# Custom Trainer that applies the class weights in the loss
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch mapping; must contain "labels" plus the model inputs.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted and ignored. Newer transformers
                releases pass this keyword to ``compute_loss``; without it in
                the signature those releases raise ``TypeError``.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when `return_outputs` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Put the weights on the logits' device so the loss does not depend on
        # where the global tensor was created (no-op when they already match).
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint (selected on eval_loss) when training finishes.
training_config = dict(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
)
training_args = TrainingArguments(**training_config)

# NOTE(review): eval_dataset is the same test split that is scored below, so
# best-checkpoint selection sees the data used for the final report. A separate
# validation split would give a cleaner estimate.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
trainer = WeightedLossTrainer(**trainer_config)

# Fine-tune
trainer.train()

# Score the held-out split: highest logit per row is the predicted class.
preds = trainer.predict(test_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)

from sklearn.metrics import classification_report
report = classification_report(test_dataset["labels"], pred_labels, digits=4)
print(report)


Class weights: tensor([3.8648, 0.5743], device='cuda:0')


Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]



  0%|          | 0/930 [00:00<?, ?it/s]

{'loss': 0.6752, 'learning_rate': 4.9462365591397855e-05, 'epoch': 0.03}
{'loss': 0.6024, 'learning_rate': 4.89247311827957e-05, 'epoch': 0.06}
{'loss': 0.4507, 'learning_rate': 4.8387096774193554e-05, 'epoch': 0.1}
{'loss': 0.3033, 'learning_rate': 4.78494623655914e-05, 'epoch': 0.13}
{'loss': 0.314, 'learning_rate': 4.731182795698925e-05, 'epoch': 0.16}
{'loss': 0.2666, 'learning_rate': 4.67741935483871e-05, 'epoch': 0.19}
{'loss': 0.963, 'learning_rate': 4.6236559139784944e-05, 'epoch': 0.23}
{'loss': 0.0811, 'learning_rate': 4.56989247311828e-05, 'epoch': 0.26}
{'loss': 0.3581, 'learning_rate': 4.516129032258064e-05, 'epoch': 0.29}
{'loss': 0.3529, 'learning_rate': 4.4623655913978496e-05, 'epoch': 0.32}
{'loss': 0.2746, 'learning_rate': 4.408602150537635e-05, 'epoch': 0.35}
{'loss': 0.5411, 'learning_rate': 4.3548387096774194e-05, 'epoch': 0.39}
{'loss': 0.2955, 'learning_rate': 4.301075268817205e-05, 'epoch': 0.42}
{'loss': 0.4125, 'learning_rate': 4.247311827956989e-05, 'epoch': 

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.27823910117149353, 'eval_runtime': 8.8158, 'eval_samples_per_second': 140.316, 'eval_steps_per_second': 8.848, 'epoch': 1.0}
{'loss': 0.0085, 'learning_rate': 3.279569892473118e-05, 'epoch': 1.03}
{'loss': 0.1222, 'learning_rate': 3.2258064516129034e-05, 'epoch': 1.06}
{'loss': 0.0107, 'learning_rate': 3.172043010752688e-05, 'epoch': 1.1}
{'loss': 0.2221, 'learning_rate': 3.118279569892473e-05, 'epoch': 1.13}
{'loss': 0.1973, 'learning_rate': 3.0645161290322585e-05, 'epoch': 1.16}
{'loss': 0.2167, 'learning_rate': 3.010752688172043e-05, 'epoch': 1.19}
{'loss': 0.3238, 'learning_rate': 2.9569892473118284e-05, 'epoch': 1.23}
{'loss': 0.1264, 'learning_rate': 2.9032258064516133e-05, 'epoch': 1.26}
{'loss': 0.1911, 'learning_rate': 2.8494623655913982e-05, 'epoch': 1.29}
{'loss': 0.3467, 'learning_rate': 2.7956989247311828e-05, 'epoch': 1.32}
{'loss': 0.2304, 'learning_rate': 2.7419354838709678e-05, 'epoch': 1.35}
{'loss': 0.1451, 'learning_rate': 2.6881720430107527e-05, 'ep

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.33145129680633545, 'eval_runtime': 8.859, 'eval_samples_per_second': 139.633, 'eval_steps_per_second': 8.805, 'epoch': 2.0}
{'loss': 0.1581, 'learning_rate': 1.6129032258064517e-05, 'epoch': 2.03}
{'loss': 0.0379, 'learning_rate': 1.5591397849462366e-05, 'epoch': 2.06}
{'loss': 0.0026, 'learning_rate': 1.5053763440860215e-05, 'epoch': 2.1}
{'loss': 0.0019, 'learning_rate': 1.4516129032258066e-05, 'epoch': 2.13}
{'loss': 0.3645, 'learning_rate': 1.3978494623655914e-05, 'epoch': 2.16}
{'loss': 0.1127, 'learning_rate': 1.3440860215053763e-05, 'epoch': 2.19}
{'loss': 0.1684, 'learning_rate': 1.2903225806451613e-05, 'epoch': 2.23}
{'loss': 0.0613, 'learning_rate': 1.2365591397849464e-05, 'epoch': 2.26}
{'loss': 0.293, 'learning_rate': 1.1827956989247313e-05, 'epoch': 2.29}
{'loss': 0.0443, 'learning_rate': 1.129032258064516e-05, 'epoch': 2.32}
{'loss': 0.0037, 'learning_rate': 1.0752688172043012e-05, 'epoch': 2.35}
{'loss': 0.2623, 'learning_rate': 1.0215053763440861e-05, 'e

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.234201118350029, 'eval_runtime': 8.8628, 'eval_samples_per_second': 139.573, 'eval_steps_per_second': 8.801, 'epoch': 3.0}
{'train_runtime': 329.1049, 'train_samples_per_second': 45.095, 'train_steps_per_second': 2.826, 'train_loss': 0.19406638859060182, 'epoch': 3.0}


  0%|          | 0/78 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9470    0.8938    0.9196       160
           1     0.9843    0.9926    0.9884      1077

    accuracy                         0.9798      1237
   macro avg     0.9657    0.9432    0.9540      1237
weighted avg     0.9795    0.9798    0.9795      1237



In [7]:
# Spot-check: draw 10 random rows from the test split (fixed seed) and compare
# the model's prediction with the stored label.
sample_df = test_df.sample(10, random_state=42).reset_index(drop=True)

# Tokenize the sampled texts with the same fixed-length settings used in training
sample_texts = sample_df["text"].tolist()
encodings = tokenizer(
    sample_texts,
    truncation=True,
    padding="max_length",
    max_length=512,
    return_tensors="pt",
).to(device)

# Inference only: eval mode, no gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**encodings)
    probs = outputs.logits.softmax(dim=1)
    predictions = probs.argmax(dim=1).cpu().numpy()

# Show a preview of each text (first 300 characters) with actual vs predicted label
rows = zip(sample_df["text"], sample_df["label"], predictions)
for position, (text, actual, predicted) in enumerate(rows, start=1):
    suffix = "..." if len(text) > 300 else ""
    print(f"\n Text {position}:")
    print(f"Text: {text[:300]}{suffix}")
    print(f"Actual label   : {actual}")
    print(f"Predicted label: {predicted}")



 Text 1:
Text: The federal government alerts its contractors to CI threats
and subjects them to "awareness programs" under the
DOD's Defense Information Counter Espionage (DICE)
program. The Defense Investigative Service (DIS)
maintains a host of useful databases such as the Foreign
Ownership, Control, or Influenc...
Actual label   : 1
Predicted label: 1

 Text 2:
Text: 9. The prosperity of nations rests very largely on the
      six inches of soil between the surface and the subsoil
      of the territory.
Actual label   : 1
Predicted label: 1

 Text 3:
Text: _Accept_ means _to receive_. _Except_ as a verb means _to exclude_; as a
preposition it means _with the exception of_. Insert the correct form in
the following:
Actual label   : 1
Predicted label: 1

 Text 4:
Text: How the West lost the East. The economics, the politics, the geopolitics, the
conspiracies, the corruption, the old and the new, the plough and the internet - it is all
here, in colourful and provocative prose.
Actual

In [8]:
# Check how the model behaves on hand-written texts that are not in the dataset

# Hand-written samples
new_texts = [
    "Photosynthesis is the process by which green plants convert sunlight into energy.",
    "Bibliography\nChapter 3\nIndex\nAcknowledgments",
    "The mitochondria is often called the powerhouse of the cell.",
    "References\n[1] Smith et al. 2022",
    "This text explains Newton’s laws of motion in detail.",
    "Appendix A: Glossary of Terms",
    "How does the water cycle work in nature?",
    "The syllabus is subject to change without notice.",
    "Cell division is crucial for reproduction in organisms.",
    "Table of contents\n1. Preface\n2. Introduction",
]

# Tokenize with the same fixed-length settings used in training
tokenizer_options = {
    "truncation": True,
    "padding": "max_length",
    "max_length": 512,
    "return_tensors": "pt",
}
encodings = tokenizer(new_texts, **tokenizer_options).to(device)

# Run the model in eval mode without gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**encodings)
    probs = outputs.logits.softmax(dim=1)
    predictions = probs.argmax(dim=1).cpu().numpy()

# Print each text (first 300 characters) with its predicted label
for position, (text, predicted) in enumerate(zip(new_texts, predictions), start=1):
    suffix = "..." if len(text) > 300 else ""
    print(f"\nText {position}:")
    print(f"Text: {text[:300]}{suffix}")
    print(f"Predicted label: {predicted} (1 = Relevant, 0 = Irrelevant)")



Text 1:
Text: Photosynthesis is the process by which green plants convert sunlight into energy.
Predicted label: 1 (1 = Relevant, 0 = Irrelevant)

Text 2:
Text: Bibliography
Chapter 3
Index
Acknowledgments
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 3:
Text: The mitochondria is often called the powerhouse of the cell.
Predicted label: 1 (1 = Relevant, 0 = Irrelevant)

Text 4:
Text: References
[1] Smith et al. 2022
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 5:
Text: This text explains Newton’s laws of motion in detail.
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 6:
Text: Appendix A: Glossary of Terms
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 7:
Text: How does the water cycle work in nature?
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 8:
Text: The syllabus is subject to change without notice.
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 9:
Text: Cell division is crucial for reproduction in organisms.
Predicted lab

In [None]:
# Log this notebook session in to the Hugging Face Hub.
# Presumably required before the upload cell below can create the repo and
# push files — run this cell first.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from huggingface_hub import HfApi, HfFolder, Repository, create_repo

import os

from huggingface_hub import upload_folder

# NOTE(review): "classfier" looks like a typo for "classifier", but the string
# is the actual target repo id — confirm before renaming.
repo_id = "LMForge/text_relevance_classfier"
MODEL_DIR = "classifier_model"  # local folder that gets uploaded

# No visible cell in this notebook writes MODEL_DIR, so on a fresh
# Restart & Run All the upload would fail on a missing folder. When the folder
# is absent, save the fine-tuned model and its tokenizer there first; an
# existing folder is left untouched.
if not os.path.isdir(MODEL_DIR):
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)

# Create repo if it doesn’t exist
create_repo(repo_id, exist_ok=True)

# Push the saved model folder
upload_folder(
    repo_id=repo_id,
    folder_path=MODEL_DIR,
    commit_message="Initial model upload"
)