# Extracting Text Chunks from PDFs by Chunk Size Using LangChain

In [12]:
import fitz  
import json
import os
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter


def extract_chunks_langchain(pdf_path, chunk_size_words, overlap_words):
    """Split the text of a PDF into overlapping chunks.

    The plain text of every page is extracted with PyMuPDF, concatenated
    without any separator, and split with LangChain's
    ``RecursiveCharacterTextSplitter``. The splitter works on characters, so
    the word-based arguments are converted with a fixed estimate of five
    characters per word; the resulting chunk sizes are therefore approximate.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size_words: Target chunk size, in words.
        overlap_words: Target overlap between consecutive chunks, in words.

    Returns:
        The chunks produced by ``splitter.split_text`` for the whole document.
    """
    # Open the document as a context manager so the underlying file handle is
    # released as soon as the text has been read, even if extraction fails.
    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text("text") for page in doc)

    # Estimate characters per word to approximate word-based sizes.
    avg_chars_per_word = 5
    chunk_size = chunk_size_words * avg_chars_per_word
    overlap = overlap_words * avg_chars_per_word

    # Separators are tried in order: paragraph, line, sentence, word, character.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    return splitter.split_text(full_text)
    

# PDFs to process; paths are relative to the notebook's working directory.
pdf_files = [
    "Documents/Legal Aspects of Corporate Management and Finance.pdf",
    "Documents/PrinciplesofFinance-WEB.pdf",
    "Documents/Financial-Management-for-Small-Businesses-2nd-OER-Edition-1627674276.pdf",
    "Documents/International Finance - Theory and Policy.pdf",
]

# Chunking configuration. The chunk size is read from ``chunk_sizes`` instead
# of being repeated as a literal, so changing it here updates the log
# messages, the extraction call and the output file name together.
chunk_sizes = [128]
overlap_ratio = 0.15

chunk_size = chunk_sizes[0]
overlap = int(chunk_size * overlap_ratio)

# Maps each processed PDF path to its list of chunks.
output_data = {}

for pdf_file in pdf_files:
    if os.path.exists(pdf_file):
        print(f"Extracting {chunk_size}-size chunks (overlap: {overlap}) from {pdf_file}...")
        chunks = extract_chunks_langchain(pdf_file, chunk_size, overlap)
        output_data[pdf_file] = chunks
    else:
        # Missing files are reported and skipped rather than aborting the run.
        print(f"File not found: {pdf_file}")

os.makedirs("Results", exist_ok=True)
output_filename = f"Results/extracted_chunk_{chunk_size}_overlap.json"
with open(output_filename, "w", encoding="utf-8") as json_file:
    # ensure_ascii=False keeps non-ASCII characters readable in the JSON file.
    json.dump(output_data, json_file, indent=4, ensure_ascii=False)

print(f"Extraction complete. Data saved to {output_filename}")


Extracting 128-size chunks (overlap: 19) from Documents/Legal Aspects of Corporate Management and Finance.pdf...
Extracting 128-size chunks (overlap: 19) from Documents/PrinciplesofFinance-WEB.pdf...
Extracting 128-size chunks (overlap: 19) from Documents/Financial-Management-for-Small-Businesses-2nd-OER-Edition-1627674276.pdf...
Extracting 128-size chunks (overlap: 19) from Documents/International Finance - Theory and Policy.pdf...
Extraction complete. Data saved to Results/extracted_chunk_128_overlap.json


In [None]:
# JSONL file handed over for manual labeling: one JSON record per chunk.
# A plain string is used because the path contains no placeholders.
labeling_output = "Results/chunks_for_labeling.jsonl"

# Create the target directory so this cell does not depend on an earlier cell
# having created it.
os.makedirs(os.path.dirname(labeling_output), exist_ok=True)

with open(labeling_output, "w", encoding="utf-8") as f:
    for pdf_file, chunks in output_data.items():
        # The chunk id combines the file name (without directories) with the
        # chunk's position inside that file.
        file_name = os.path.basename(pdf_file)
        for idx, chunk in enumerate(chunks):
            record = {
                "pdf_file": pdf_file,
                "chunk_id": f"{file_name}_chunk_{idx}",
                "text": chunk,
                "label": None  # To be filled during manual labeling
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"Labeling dataset saved to {labeling_output}")




Labeling dataset saved to Results/chunks_for_labeling_128.jsonl


In [25]:
import json
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split

# Load the manually labeled chunks, keeping only records that carry a label.
data = []
with open("Results/labeled_chunks.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Hand-edited JSONL files often contain blank lines (e.g. a trailing
        # newline); json.loads would raise on them, so skip them.
        if not line.strip():
            continue
        record = json.loads(line)
        # A record with no "label" key is treated like an unlabeled one.
        label = record.get("label")
        if label is not None:
            data.append({"text": record["text"], "label": int(label)})

print(f"Loaded {len(data)} labeled chunks")

# DistilBERT tokenizer used to encode every chunk
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of examples, padding or truncating each text to 512 tokens."""
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)


# Hold out 20% of the labeled chunks for evaluation (fixed seed for repeatability)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Build each dataset, tokenize it in batches, then drop the raw text column,
# which is not needed for training
train_dataset = (
    Dataset.from_list(train_data)
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
)
test_dataset = (
    Dataset.from_list(test_data)
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
)

# Load DistilBERT with a two-class sequence-classification head
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Object passed by ``Trainer`` exposing ``predictions``
            (the model logits) and ``label_ids`` (the reference labels).

    Returns:
        dict: ``{"accuracy": <fraction of correctly classified examples>}``.
    """
    logits = eval_pred.predictions
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = logits.argmax(axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


# Training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./chunk_classifier_distilbert_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer; compute_metrics makes every evaluation report accuracy in addition
# to the loss, which on its own says little about classifier quality.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Training
trainer.train()

# Evaluating on the held-out split
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Save the model together with its tokenizer so both can be reloaded from
# the same directory
model.save_pretrained("./chunk_classifier_distilbert")
tokenizer.save_pretrained("./chunk_classifier_distilbert")

print("Model and tokenizer saved to ./chunk_classifier_distilbert")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\ctngweru\AppData\Local\anaconda3\envs\llm-forge-Copy\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\ctngweru\AppData\Local\anaconda3\envs\llm-forge-Copy\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\ctngweru\AppData\Local\anaconda3\envs\llm-forge-Copy\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\ctngweru\AppData\Local\anaconda3\envs\llm-forge-

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import