In [13]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.4.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251230-py3-none-any.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
import os
# Project root and its data sub-folder (paths look like a mounted-Drive
# layout — TODO confirm the drive is mounted before this cell runs).
dir_path = "/content/drive/MyDrive/projects/chu_chat_bot"
data_path = "/content/drive/MyDrive/projects/chu_chat_bot/data"

# Create both folders; exist_ok=True makes the cell safe to re-run.
for folder in (dir_path, data_path):
  os.makedirs(folder, exist_ok=True)



In [27]:
import pathlib as path
import pdfplumber


def pdf_to_pages(file_path):
  """Extract the text of every page of a PDF.

  Args:
    file_path: location of the PDF; it is wrapped in ``path.Path`` so the
      file name can be read from it.

  Returns:
    A list with one dict per page, in document order, with the keys
    ``"source"`` (the PDF's file name), ``"page"`` (1-based page number)
    and ``"text"`` (the extracted text, or ``""`` when extraction yields
    nothing).
  """
  pdf_file = path.Path(file_path)
  with pdfplumber.open(str(pdf_file)) as document:
    # Numbering starts at 1 so the stored number matches the page a reader sees.
    return [
        {
            "source": pdf_file.name,
            "page": number,
            "text": sheet.extract_text() or "",
        }
        for number, sheet in enumerate(document.pages, start=1)
    ]





In [28]:
import re #cleaning biblio

import re

def text_cleaning(s):
    """Normalise whitespace and line-break artefacts in extracted text.

    Falsy input (``None`` or ``""``) yields ``""``. Otherwise, in order:
    null bytes, non-breaking spaces and tabs become a single space; a hyphen
    directly followed by a newline is removed so a word split across lines is
    re-joined; runs of three or more newlines shrink to two; runs of two or
    more spaces shrink to one; finally leading/trailing whitespace is
    stripped.
    """
    if not s:
        return ""

    # Characters that should simply read as one blank.
    for unwanted in ("\x00", "\xa0", "\t"):
        s = s.replace(unwanted, " ")

    # (pattern, replacement) pairs, applied in this exact order.
    rewrites = (
        (r"-\n", ""),          # "bigwo-\nrd" -> "bigword"
        (r"\n{3,}", "\n\n"),   # at most one blank line between paragraphs
        (r"[ ]{2,}", " "),     # collapse repeated spaces
    )
    for pattern, replacement in rewrites:
        s = re.sub(pattern, replacement, s)

    return s.strip()


In [29]:
def load_and_clean(pdf_path , min_char = 50):
  """Read a PDF page by page, clean each page's text and drop near-empty pages.

  Args:
    pdf_path: PDF location, forwarded to ``pdf_to_pages``.
    min_char: a page is kept only when its cleaned text is strictly longer
      than this many characters.

  Returns:
    The page dicts produced by ``pdf_to_pages`` whose ``"text"`` has been
    replaced by the ``text_cleaning`` result, minus the pages filtered out.
  """
  cleaned_pages = []
  for entry in pdf_to_pages(pdf_path):
    entry["text"] = text_cleaning(entry["text"])
    cleaned_pages.append(entry)

  # Keep only pages with more than min_char characters of cleaned text.
  return [entry for entry in cleaned_pages if len(entry["text"]) > min_char]


In [32]:
def chunk_text(text , length = 1200 , overlap = 200) :
  """Split ``text`` into overlapping character windows.

  Each window covers at most ``length`` characters and starts ``overlap``
  characters before the end of the previous one. Windows are stripped of
  surrounding whitespace, and windows that are empty after stripping are
  skipped.

  Args:
    text: the string to split.
    length: maximum number of characters per window; must be positive.
    overlap: number of characters shared by consecutive windows; must be
      smaller than ``length``.

  Returns:
    The list of non-empty stripped chunks, in order (``[]`` for empty text).

  Raises:
    ValueError: if ``length <= 0`` or ``overlap >= length``. With such
      values the window start can never move forward, which previously
      made the loop spin forever on any text longer than one window.
  """
  if length <= 0:
    raise ValueError("length must be a positive number of characters")
  if overlap >= length:
    raise ValueError("overlap must be smaller than length")

  chunks = []
  total = len(text)
  start = 0
  while start < total:
    end = min(start + length, total)
    chunk = text[start:end].strip()
    if chunk:
      chunks.append(chunk)
    if end == total:
      break
    # Here end == start + length, so with overlap < length the next start is
    # strictly greater than the current one (and never negative).
    start = end - overlap
  return chunks


In [33]:
def load_and_clean_and_chunk(pdf_path,tier: int = 2, doc_date: str = None):
  """Turn one PDF into a flat list of chunk records.

  The PDF is loaded and cleaned with ``load_and_clean``; every surviving
  page is split with ``chunk_text`` (using its default window settings) and
  one record is emitted per chunk.

  Args:
    pdf_path: PDF location, forwarded to ``load_and_clean``.
    tier: value stored unchanged under the ``"tier"`` key of every record.
    doc_date: value stored unchanged under the ``"date"`` key of every record.

  Returns:
    A list of dicts with the keys ``"source"``, ``"page"``, ``"chunk"``
    (1-based index of the chunk within its page), ``"chunk_id"``
    (``"<source>_p<page>_c<chunk>"``), ``"tier"``, ``"date"`` and ``"text"``
    (the chunk string).
  """
  chunks = []
  for page in load_and_clean(pdf_path):
    for i, chunk in enumerate(chunk_text(page["text"]), start=1):
      chunks.append({
          "source": page["source"],
          "page": page["page"],
          "chunk": i,
          "chunk_id": f"{page['source']}_p{page['page']}_c{i}",  # unique id
          "tier": tier,
          "date": doc_date,
          # Store the chunk string itself. The previous code stored the
          # chunk_text function object here, which dropped the text and is
          # not JSON-serialisable.
          "text": chunk,
      })
  return chunks

In [34]:
def apply_to_all_pdfs(pdf_dir_path) :
  """Chunk every ``*.pdf`` file found directly inside a directory.

  Args:
    pdf_dir_path: directory to scan, given as a ``str`` or a path object.
      It is wrapped in ``path.Path`` so a plain string works as well.

  Returns:
    The concatenation of ``load_and_clean_and_chunk`` results for each PDF
    (``[]`` when the directory holds no PDF). Files are processed in sorted
    path order so repeated runs produce the records in the same order.
  """
  chunks = []
  # glob() yields entries in an arbitrary, filesystem-dependent order.
  for pdf_path in sorted(path.Path(pdf_dir_path).glob("*.pdf")):
    chunks.extend(load_and_clean_and_chunk(pdf_path))
  return chunks


In [38]:
import json
def save_jsonl(chunks,data_path):
  """Write chunk records to ``<data_path>/chunks.jsonl``, one JSON object per line.

  Args:
    chunks: iterable of JSON-serialisable records (dicts).
    data_path: output directory, given as a ``str`` or a path object. It is
      created, together with any missing parent directories, if absent.

  The file is opened in ``"w"`` mode, so an existing ``chunks.jsonl`` is
  overwritten. Text is written as UTF-8 and non-ASCII characters are kept
  as-is (``ensure_ascii=False``).
  """
  out_dir = path.Path(data_path)
  out_dir.mkdir(parents=True, exist_ok=True)
  # Build the file path with the Path operator: string concatenation would
  # raise TypeError when data_path is a Path rather than a str.
  with open(out_dir / "chunks.jsonl", "w", encoding="utf-8") as f:
    for chunk in chunks:
      f.write(json.dumps(chunk, ensure_ascii=False) + "\n")


In [39]:
def full_pipe_line(pdf_dir_path,data_path):
  """Run the whole pipeline: chunk every PDF in a folder, then persist the result.

  Args:
    pdf_dir_path: directory handed to ``apply_to_all_pdfs``.
    data_path: output location handed to ``save_jsonl``.

  Returns:
    The list of chunk records that was passed to ``save_jsonl``.
  """
  all_chunks = apply_to_all_pdfs(pdf_dir_path)
  save_jsonl(all_chunks, data_path)
  return all_chunks