In [5]:
import os
import hashlib
import requests
from io import BytesIO
from typing import List, Tuple, Dict

from pdf2image import convert_from_path
from pypdf import PdfReader
from PIL import Image
import base64


In [7]:
# Directory that is scanned for guideline PDFs by the call further down in this cell.
# NOTE(review): absolute, machine-specific path — confirm it exists before running.
file = "/datasets/cc-20250630151645/data_process/Guidelines/"

def get_cpic_pdf_images_texts(
    directory: str,
    dpi: int = 200,
) -> List[Dict[str, List]]:
    """
    Read every PDF in a directory and return its page images and page texts.

    Entries are visited in sorted filename order. Only regular files whose
    name ends with ".pdf" (case-insensitive) are read; everything else,
    including a sub-directory that happens to carry a ".pdf" name, is skipped.
    The scan is not recursive.

    Args:
        directory: Folder to scan.
        dpi: Rendering resolution forwarded to ``convert_from_path``. The
            default of 200 equals pdf2image's own default, so callers that
            do not pass it get the same images as before.

    Returns:
        One dict per PDF:
          {
            'path': full filepath (str),
            'name': filename (str),
            'images': [PIL.Image ...], one per page,
            'texts': [str ...], one per page ("" when nothing is extracted)
          }
        Note that 'path' and 'name' are plain strings even though the
        annotation says ``Dict[str, List]``.

    Raises:
        RuntimeError: if the number of rendered images differs from the
            number of pages reported by ``PdfReader`` for a file.
    """
    results: List[Dict[str, List]] = []
    for fname in sorted(os.listdir(directory)):
        if not fname.lower().endswith(".pdf"):
            continue
        full_path = os.path.join(directory, fname)

        # A directory or dangling symlink can also be named "*.pdf"; passing
        # it to the PDF readers would abort the scan of the whole folder.
        if not os.path.isfile(full_path):
            continue

        reader = PdfReader(full_path)
        # extract_text() may yield nothing for a page; normalise that to "".
        texts: List[str] = [pg.extract_text() or "" for pg in reader.pages]
        images = convert_from_path(full_path, dpi=dpi)

        # Both lists are indexed by page, so they must line up one-to-one.
        if len(images) != len(texts):
            raise RuntimeError(
                f"Page count mismatch in {fname}: "
                f"{len(images)} images vs {len(texts)} texts"
            )

        results.append({
            "path": full_path,
            "name": fname,
            "images": images,
            "texts": texts,
        })

    return results
# Load every PDF under `file`. All pages are rendered to images and kept in memory,
# so this call may be slow for a large folder.
cpic_test = get_cpic_pdf_images_texts(file)


In [17]:
# Show the per-page extracted text of the first loaded PDF.
cpic_test[0]["texts"]

['CPIC GUIDELINE\nCLINICAL PHARMACOLOGY & THERAPEUTICS | VOLUME 106 NUMBER 4 | OCTOBER 2019726\nClinical Pharmacogenetics Implementation \nConsortium (CPIC) Guideline for CYP2B6 and \nEfavirenz- Containing Antiretroviral Therapy\nZeruesenay Desta1, Roseann S. Gammal2,3, Li Gong4, Michelle Whirl-Carrillo4, Aditya H. Gaur5, \nChonlaphat Sukasem6,7, Jennifer Hockings8, Alan Myers9, Marelize Swart1, Rachel F. Tyndale10, \nCollen\xa0Masimirembwa11, Otito F. Iwuchukwu12, Sanika Chirwa13, Jeffrey Lennox14, Andrea Gaedigk15,  \nT eri E.\xa0Klein4 and David W . Haas13,16,*\nThe HIV type-  1 nonnucleoside reverse transcriptase \ninhibitor, efavirenz, is widely used to treat HIV type-1 \ninfection. Efavirenz is predominantly metabolized into \ninactive metabolites by cytochrome P450 (CYP)2B6, and \npatients with certain CYP2B6  genetic variants may be \nat increased risk for adverse effects, particularly central \nnervous system toxicity and treatment discontinuation. We \nsummarize the evidence 