In [14]:
import sqlite3
import json

DB_FILE = "chunks.db"
OUTPUT_FILE = "exported_chunks.jsonl"

# Read every (text, label) row of the `chunks` table.
# The connection is closed in a `finally` block so it is released even when
# the query fails, and it is not held open while the output file is written.
conn = sqlite3.connect(DB_FILE)
try:
    cur = conn.cursor()
    cur.execute("SELECT text, label FROM chunks")
    rows = cur.fetchall()
finally:
    conn.close()

# Write one JSON object per line (JSONL).
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for text, label in rows:
        obj = {"text": text}
        # Rows whose label is NULL are exported without a "label" key.
        if label is not None:
            obj["label"] = label
        f.write(json.dumps(obj) + "\n")

print("Data exported to JSONL file.")


Data exported to JSONL file.


In [1]:
import json
import pandas as pd

# Load the labeled chunks (one JSON object per line).
# Blank lines are skipped so a stray empty line does not crash json.loads.
with open("exported_chunks.jsonl", "r", encoding="utf-8") as f:
    labeled_chunks = [json.loads(line) for line in f if line.strip()]

data = pd.DataFrame(labeled_chunks)

# Get the first 9000 rows
data = data.head(9000)

# Remove rows with label == 11
data = data[data['label'] != 11]

# Records written without a "label" key load as NaN; they cannot be used for
# supervised training and would break a later integer cast, so drop them.
# (value_counts() below ignores NaN anyway, so the printed counts are unaffected.)
data = data.dropna(subset=['label'])

# Print labeled count after removing label 11
labeled_count = data['label'].value_counts().to_dict()
print(f"Labeled chunks after removing label 11: {labeled_count}")

# Remove rows where label == 1 and text length < 100.
# .copy() makes the result an independent frame, so later column assignments
# on `data` do not raise SettingWithCopyWarning.
is_short_label_1 = (data['label'] == 1) & (data['text'].str.len() < 100)
data = data[~is_short_label_1].copy()

# Print final labeled count
labeled_count = data['label'].value_counts().to_dict()
print(f"Final labeled chunks: {labeled_count}")


Labeled chunks after removing label 11: {1: 8199, 0: 800}
Final labeled chunks: {1: 5384, 0: 800}


In [2]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from torch.nn import CrossEntropyLoss

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cast labels to int.
# A new frame is built instead of assigning the column in place, so the frame
# produced by the earlier cell is not mutated (and no SettingWithCopyWarning is
# raised when that frame is a filtered slice).
data = data.assign(label=data['label'].astype(int))

# Train-Test Split using stratified sampling
train_df, test_df = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)

# Since there is a class imbalance, compute "balanced" class weights from the
# training labels only; they are used by the weighted loss function.
# np.unique returns the classes sorted, so weight i belongs to the i-th
# smallest label value.
labels = train_df["label"].values
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print("Class weights:", class_weights)

# Converting the DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Tokenization
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(input_data):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    texts = input_data["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length", max_length=512)
    return encoded

def _tokenize_for_trainer(dataset):
    """Apply `tokenize` in batches and rename the "label" column to "labels"."""
    return dataset.map(tokenize, batched=True).rename_column("label", "labels")


train_dataset = _tokenize_for_trainer(train_dataset)
test_dataset = _tokenize_for_trainer(test_dataset)

# Expose only the model inputs and the labels, as torch tensors
for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Loading the model: pretrained DistilBERT with a 2-class classification head
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model = model.to(device)

# Creating a custom Trainer to handle weighted loss
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; must contain "labels" alongside the model inputs.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with newer
                transformers releases, which pass it as a keyword argument.
                It is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when `return_outputs` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits so the loss does
        # not fail if the model lives on a different device than `class_weights`.
        weights = class_weights.to(logits.device)
        loss_fct = CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

from sklearn.metrics import classification_report

# Training arguments: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the lowest eval_loss when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and the Trainer `tokenizer=` argument to `processing_class=`
# — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
)

# Creating the Trainer
# NOTE(review): the test split is also the eval_dataset, so the "best"
# checkpoint is selected on the same data the report below is computed on.
# The reported metrics are therefore optimistic; a separate validation split
# would give an unbiased test score.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Training the model
trainer.train()

# Evaluating the model
preds = trainer.predict(test_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)

# Use the labels returned by predict(): they are aligned with
# preds.predictions by construction and are already a NumPy array.
print(classification_report(preds.label_ids, pred_labels, digits=4))


Class weights: tensor([3.8648, 0.5743], device='cuda:0')




Map:   0%|          | 0/4947 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.

  0%|          | 0/930 [00:00<?, ?it/s]

{'loss': 0.6134, 'learning_rate': 4.9462365591397855e-05, 'epoch': 0.03}
{'loss': 0.5157, 'learning_rate': 4.89247311827957e-05, 'epoch': 0.06}
{'loss': 0.3569, 'learning_rate': 4.8387096774193554e-05, 'epoch': 0.1}
{'loss': 0.1959, 'learning_rate': 4.78494623655914e-05, 'epoch': 0.13}
{'loss': 0.3643, 'learning_rate': 4.731182795698925e-05, 'epoch': 0.16}
{'loss': 0.3029, 'learning_rate': 4.67741935483871e-05, 'epoch': 0.19}
{'loss': 0.3797, 'learning_rate': 4.6236559139784944e-05, 'epoch': 0.23}
{'loss': 0.1163, 'learning_rate': 4.56989247311828e-05, 'epoch': 0.26}
{'loss': 0.3242, 'learning_rate': 4.516129032258064e-05, 'epoch': 0.29}
{'loss': 0.3069, 'learning_rate': 4.4623655913978496e-05, 'epoch': 0.32}
{'loss': 0.2747, 'learning_rate': 4.408602150537635e-05, 'epoch': 0.35}
{'loss': 0.3707, 'learning_rate': 4.3548387096774194e-05, 'epoch': 0.39}
{'loss': 0.3371, 'learning_rate': 4.301075268817205e-05, 'epoch': 0.42}
{'loss': 0.493, 'learning_rate': 4.247311827956989e-05, 'epoch':

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.2780472934246063, 'eval_runtime': 4.2225, 'eval_samples_per_second': 292.953, 'eval_steps_per_second': 18.472, 'epoch': 1.0}
{'loss': 0.0078, 'learning_rate': 3.279569892473118e-05, 'epoch': 1.03}
{'loss': 0.1706, 'learning_rate': 3.2258064516129034e-05, 'epoch': 1.06}
{'loss': 0.0075, 'learning_rate': 3.172043010752688e-05, 'epoch': 1.1}
{'loss': 0.2529, 'learning_rate': 3.118279569892473e-05, 'epoch': 1.13}
{'loss': 0.2801, 'learning_rate': 3.0645161290322585e-05, 'epoch': 1.16}
{'loss': 0.1653, 'learning_rate': 3.010752688172043e-05, 'epoch': 1.19}
{'loss': 0.3258, 'learning_rate': 2.9569892473118284e-05, 'epoch': 1.23}
{'loss': 0.0556, 'learning_rate': 2.9032258064516133e-05, 'epoch': 1.26}
{'loss': 0.159, 'learning_rate': 2.8494623655913982e-05, 'epoch': 1.29}
{'loss': 0.3066, 'learning_rate': 2.7956989247311828e-05, 'epoch': 1.32}
{'loss': 0.1938, 'learning_rate': 2.7419354838709678e-05, 'epoch': 1.35}
{'loss': 0.1355, 'learning_rate': 2.6881720430107527e-05, 'epo

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.31458428502082825, 'eval_runtime': 4.317, 'eval_samples_per_second': 286.539, 'eval_steps_per_second': 18.068, 'epoch': 2.0}
{'loss': 0.144, 'learning_rate': 1.6129032258064517e-05, 'epoch': 2.03}
{'loss': 0.0032, 'learning_rate': 1.5591397849462366e-05, 'epoch': 2.06}
{'loss': 0.0022, 'learning_rate': 1.5053763440860215e-05, 'epoch': 2.1}
{'loss': 0.0023, 'learning_rate': 1.4516129032258066e-05, 'epoch': 2.13}
{'loss': 0.1042, 'learning_rate': 1.3978494623655914e-05, 'epoch': 2.16}
{'loss': 0.1075, 'learning_rate': 1.3440860215053763e-05, 'epoch': 2.19}
{'loss': 0.1433, 'learning_rate': 1.2903225806451613e-05, 'epoch': 2.23}
{'loss': 0.0425, 'learning_rate': 1.2365591397849464e-05, 'epoch': 2.26}
{'loss': 0.0042, 'learning_rate': 1.1827956989247313e-05, 'epoch': 2.29}
{'loss': 0.0051, 'learning_rate': 1.129032258064516e-05, 'epoch': 2.32}
{'loss': 0.0344, 'learning_rate': 1.0752688172043012e-05, 'epoch': 2.35}
{'loss': 0.1863, 'learning_rate': 1.0215053763440861e-05, '

  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.23098085820674896, 'eval_runtime': 4.3805, 'eval_samples_per_second': 282.389, 'eval_steps_per_second': 17.806, 'epoch': 3.0}
{'train_runtime': 176.4869, 'train_samples_per_second': 84.091, 'train_steps_per_second': 5.27, 'train_loss': 0.1687075160493854, 'epoch': 3.0}


  0%|          | 0/78 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9172    0.9000    0.9085       160
           1     0.9852    0.9879    0.9866      1077

    accuracy                         0.9766      1237
   macro avg     0.9512    0.9440    0.9475      1237
weighted avg     0.9764    0.9766    0.9765      1237



In [3]:
# Spot-check the model on 10 random rows of the test split
sample_df = test_df.sample(10, random_state=42).reset_index(drop=True)
sample_texts = sample_df["text"].tolist()

# Tokenize the samples (same truncation/padding settings as training)
encodings = tokenizer(
    sample_texts,
    truncation=True,
    padding="max_length",
    max_length=512,
    return_tensors="pt",
).to(device)

# Predict without tracking gradients
model.eval()
with torch.no_grad():
    outputs = model(**encodings)
    probs = outputs.logits.softmax(dim=1)
    predictions = probs.argmax(dim=1).cpu().numpy()

# Display actual vs predicted, showing at most the first 300 characters of each text
for position, (text, actual, predicted) in enumerate(
    zip(sample_texts, sample_df["label"], predictions), start=1
):
    ellipsis = "..." if len(text) > 300 else ""
    print(f"\n Text {position}:")
    print(f"Text: {text[:300]}{ellipsis}")
    print(f"Actual label   : {actual}")
    print(f"Predicted label: {predicted}")



 Text 1:
Text: The federal government alerts its contractors to CI threats
and subjects them to "awareness programs" under the
DOD's Defense Information Counter Espionage (DICE)
program. The Defense Investigative Service (DIS)
maintains a host of useful databases such as the Foreign
Ownership, Control, or Influenc...
Actual label   : 1
Predicted label: 1

 Text 2:
Text: 9. The prosperity of nations rests very largely on the
      six inches of soil between the surface and the subsoil
      of the territory.
Actual label   : 1
Predicted label: 1

 Text 3:
Text: _Accept_ means _to receive_. _Except_ as a verb means _to exclude_; as a
preposition it means _with the exception of_. Insert the correct form in
the following:
Actual label   : 1
Predicted label: 1

 Text 4:
Text: How the West lost the East. The economics, the politics, the geopolitics, the
conspiracies, the corruption, the old and the new, the plough and the internet - it is all
here, in colourful and provocative prose.
Actual

In [None]:
# Testing the model's generalization on new, unseen data

# Hand-written short samples
new_texts = [
    "Photosynthesis is the process by which green plants convert sunlight into energy.",
    "Bibliography\nChapter 3\nIndex\nAcknowledgments",
    "The mitochondria is often called the powerhouse of the cell.",
    "References\n[1] Smith et al. 2022",
    "This text explains Newton’s laws of motion in detail.",
    "Appendix A: Glossary of Terms",
    "How does the water cycle work in nature?",
    "The syllabus is subject to change without notice.",
    "Cell division is crucial for reproduction in organisms.",
    "Table of contents\n1. Preface\n2. Introduction",
]

# Tokenize the new texts (same truncation/padding settings as training)
tokenizer_options = {
    "truncation": True,
    "padding": "max_length",
    "max_length": 512,
    "return_tensors": "pt",
}
encodings = tokenizer(new_texts, **tokenizer_options).to(device)

# Run the model on the new texts without tracking gradients
model.eval()
with torch.no_grad():
    outputs = model(**encodings)
    probs = outputs.logits.softmax(dim=1)
    predictions = probs.argmax(dim=1).cpu().numpy()

# Results: show at most the first 300 characters of each text with its predicted label
for number, (text, predicted) in enumerate(zip(new_texts, predictions), start=1):
    ellipsis = "..." if len(text) > 300 else ""
    print(f"\nText {number}:")
    print(f"Text: {text[:300]}{ellipsis}")
    print(f"Predicted label: {predicted} (1 = Relevant, 0 = Irrelevant)")



Text 1:
Text: Photosynthesis is the process by which green plants convert sunlight into energy.
Predicted label: 1 (1 = Relevant, 0 = Irrelevant)

Text 2:
Text: Bibliography
Chapter 3
Index
Acknowledgments
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 3:
Text: The mitochondria is often called the powerhouse of the cell.
Predicted label: 1 (1 = Relevant, 0 = Irrelevant)

Text 4:
Text: References
[1] Smith et al. 2022
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 5:
Text: This text explains Newton’s laws of motion in detail.
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 6:
Text: Appendix A: Glossary of Terms
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 7:
Text: How does the water cycle work in nature?
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 8:
Text: The syllabus is subject to change without notice.
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 9:
Text: Cell division is crucial for reproduction in organisms.
Predicted lab

In [None]:
# Save the fine-tuned model and its tokenizer side by side
model_save_path = "classifier_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

# Reload both from disk; from here on `tokenizer` and `model` are the reloaded copies
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
tokenizer = DistilBertTokenizer.from_pretrained(model_save_path)
model = DistilBertForSequenceClassification.from_pretrained(model_save_path)
model = model.to(device)

# Testing the saved model on long texts
long_texts = [
    "In this chapter, we explore the foundational principles of quantum mechanics, including the concept of wave-particle duality, uncertainty, and quantum entanglement. These phenomena have been experimentally verified and form the basis for quantum computing and quantum encryption. Students will learn how to apply Schrödinger’s equation and analyze quantum states in one-dimensional potential wells, along with real-world implications.",
    "Table of Contents\nPreface\nChapter 1: Introduction to Biology\nChapter 2: The Cell\nChapter 3: Genetics\nChapter 4: Evolution\nChapter 5: Ecology\nChapter 6: Human Anatomy\nChapter 7: The Immune System\nGlossary\nIndex\nAcknowledgements\nAuthor Biography\nDisclaimer: This textbook is provided as-is without warranty.",
    "Photosynthesis is a biochemical process that converts light energy into chemical energy in plants, algae, and some bacteria. The process involves two major stages: the light-dependent reactions and the Calvin cycle. In the light-dependent reactions, sunlight is absorbed by chlorophyll and used to split water molecules, releasing oxygen and generating ATP and NADPH. The Calvin cycle then uses these energy molecules to fix carbon dioxide into glucose, which serves as an energy source for the plant.",
    "Acknowledgements\nThis work would not have been possible without the support of our research assistants, editorial team, and the generous funding provided by the National Science Foundation and other partners. We also thank the numerous reviewers and contributors who helped shape the final version of this manuscript. All errors, however, remain the responsibility of the authors.",
]

# Tokenize the texts (same truncation/padding settings as training)
tokenizer_options = {
    "truncation": True,
    "padding": "max_length",
    "max_length": 512,
    "return_tensors": "pt",
}
encodings = tokenizer(long_texts, **tokenizer_options).to(device)

# Predicting using the saved model, without tracking gradients
model.eval()
with torch.no_grad():
    outputs = model(**encodings)
    probs = outputs.logits.softmax(dim=1)
    predictions = probs.argmax(dim=1).cpu().numpy()

# Show at most the first 300 characters of each text with its predicted label
for number, (text, predicted) in enumerate(zip(long_texts, predictions), start=1):
    ellipsis = "..." if len(text) > 300 else ""
    print(f"\nText {number}:")
    print(f"Text: {text[:300]}{ellipsis}")
    print(f"Predicted label: {predicted} (1 = Relevant, 0 = Irrelevant)")





Text 1:
Text: In this chapter, we explore the foundational principles of quantum mechanics, including the concept of wave-particle duality, uncertainty, and quantum entanglement. These phenomena have been experimentally verified and form the basis for quantum computing and quantum encryption. Students will learn ...
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 2:
Text: Table of Contents
Preface
Chapter 1: Introduction to Biology
Chapter 2: The Cell
Chapter 3: Genetics
Chapter 4: Evolution
Chapter 5: Ecology
Chapter 6: Human Anatomy
Chapter 7: The Immune System
Glossary
Index
Acknowledgements
Author Biography
Disclaimer: This textbook is provided as-is without warr...
Predicted label: 0 (1 = Relevant, 0 = Irrelevant)

Text 3:
Text: Photosynthesis is a biochemical process that converts light energy into chemical energy in plants, algae, and some bacteria. The process involves two major stages: the light-dependent reactions and the Calvin cycle. In the light-dependent reaction