# Extracting Based on Chunk Size using LangChain

In [12]:
import fitz  
import json
import os
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter


def extract_chunks_langchain(pdf_path, chunk_size_words, overlap_words):
    doc = fitz.open(pdf_path)
    full_text = "".join(page.get_text("text") for page in doc)

    # Estimate characters per word if you want word-based approximation
    avg_chars_per_word = 5  
    chunk_size = chunk_size_words * avg_chars_per_word
    overlap = overlap_words * avg_chars_per_word

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    return splitter.split_text(full_text)
    

pdf_files = [
    "Documents/Legal Aspects of Corporate Management and Finance.pdf",
    "Documents/PrinciplesofFinance-WEB.pdf",
    "Documents/Financial-Management-for-Small-Businesses-2nd-OER-Edition-1627674276.pdf",
    "Documents/International Finance - Theory and Policy.pdf",
]

chunk_sizes = [128]
overlap_ratio = 0.15  

overlap = int(128 * overlap_ratio) 
output_data = {}

for pdf_file in pdf_files:
    if os.path.exists(pdf_file):
        print(f"Extracting {128}-size chunks (overlap: {overlap}) from {pdf_file}...")
        chunks = extract_chunks_langchain(pdf_file, 128, overlap)
        output_data[pdf_file] = chunks
    else:
        print(f"File not found: {pdf_file}")

os.makedirs("Results", exist_ok=True)
output_filename = f"Results/extracted_chunk_{128}_overlap.json"
with open(output_filename, "w", encoding="utf-8") as json_file:
    json.dump(output_data, json_file, indent=4, ensure_ascii=False)

print(f"Extraction complete. Data saved to {output_filename}")


Extracting 128-size chunks (overlap: 19) from Documents/Legal Aspects of Corporate Management and Finance.pdf...
Extracting 128-size chunks (overlap: 19) from Documents/PrinciplesofFinance-WEB.pdf...
Extracting 128-size chunks (overlap: 19) from Documents/Financial-Management-for-Small-Businesses-2nd-OER-Edition-1627674276.pdf...
Extracting 128-size chunks (overlap: 19) from Documents/International Finance - Theory and Policy.pdf...
Extraction complete. Data saved to Results/extracted_chunk_128_overlap.json


In [None]:
labeling_output = f"Results/chunks_for_labeling.jsonl"

with open(labeling_output, "w", encoding="utf-8") as f:
    for pdf_file, chunks in output_data.items():
        for idx, chunk in enumerate(chunks):
            record = {
                "pdf_file": pdf_file,
                "chunk_id": f"{os.path.basename(pdf_file)}_chunk_{idx}",
                "text": chunk,
                "label": None  # To be filled during manual labeling
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"Labeling dataset saved to {labeling_output}")




Labeling dataset saved to Results/chunks_for_labeling_128.jsonl


In [6]:
import json
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split

import transformers
print(transformers.__version__)


# loading labeled chunks
data = []
with open("Results/labeled_chunks.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        if record["label"] is not None:
            data.append({"text": record["text"], "label": int(record["label"])})

print(f"Loaded {len(data)} labeled chunks")

# Ensure labels are integers
for record in data:
    if isinstance(record["label"], str):
        record["label"] = int(record["label"])  

# Ensure text is a string
for record in data:
    if not isinstance(record["text"], str):
        record["text"] = str(record["text"])  # Convert to string if not already

# class distribution
from collections import Counter
label_counts = Counter(record["label"] for record in data)
print("Label distribution:", label_counts)

# Label distribution: Counter({1: 1117, 0: 137})
# Ensure the dataset is balanced
if label_counts[0] < label_counts[1]:
    print("Warning: The dataset is imbalanced. Consider balancing it before training.")



# Train-test split with balanced classes
train_data, test_data = train_test_split(data, test_size=0.2, stratify=[record["label"] for record in data], random_state=42)

# Loading dataset
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

# Loading tokenizer and model (DistilBERT)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove text field for training
train_dataset = train_dataset.remove_columns(["text"])
test_dataset = test_dataset.remove_columns(["text"])

# Load model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir="./chunk_classifier_distilbert_results",
   # evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Training
trainer.train()

# Evaluating
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Save model
model.save_pretrained("./chunk_classifier_distilbert")
tokenizer.save_pretrained("./chunk_classifier_distilbert")

print("Model and tokenizer saved to ./chunk_classifier_distilbert")


4.51.3
Loaded 1254 labeled chunks
Label distribution: Counter({1: 1117, 0: 137})


Map:   0%|          | 0/1003 [00:00<?, ? examples/s]

Map:   0%|          | 0/251 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluation results: {'eval_loss': 0.09059876948595047, 'eval_runtime': 0.9197, 'eval_samples_per_second': 272.925, 'eval_steps_per_second': 34.795, 'epoch': 3.0}
Model and tokenizer saved to ./chunk_classifier_distilbert


In [7]:
# Load your fine-tuned model and tokenizer
model_path = "./chunk_classifier_distilbert"

tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.eval()  # set to evaluation mode

# Send to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:
def classify_chunk(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()

    return "Content" if prediction == 1 else "Non-content"


In [14]:
example_1 = """**Table of Contents**

Introduction.................................................... 1
    1.1 Background............................................ 3
    1.2 Objectives............................................ 5
Chapter 1: Methodology......................................... 1"""
example_2 = "By default: Evaluation happens only at the end of training You won’t see eval loss after each epoch, but training still works fine If you want evaluation during training and evaluation_strategy still breaks, let me know — I can show how to manually call trainer.evaluate() after each epoch."
print("Example 1:", classify_chunk(example_1))
print("Example 2:", classify_chunk(example_2))


Example 1: Content
Example 2: Content
