In [1]:
import sys
import os
# Resolve the project root only once per kernel. The root is the parent of the
# directory the kernel started in; because this cell also chdir()s into it,
# recomputing it from os.getcwd() on a re-run would climb one level higher
# every time the cell is executed.
if "root_dir" not in globals():
    script_dir = os.getcwd()
    root_dir = os.path.dirname(os.path.abspath(script_dir))
# Make project-local packages importable without stacking duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)
# Relative paths used by later cells are resolved against the project root.
os.chdir(root_dir)

In [None]:
import json
import pandas as pd
import numpy as np
import requests
import faiss
from openai import OpenAI
from utils.embedding import index_context_db
from dotenv import load_dotenv
from utils.check_db_version import get_local_version
from utils.flatten_statement import flatten_statements
from tqdm import tqdm
from tika import parser
# Local database version string; every versioned path in later cells embeds it.
_VERSION=get_local_version()
# Bare expression so the notebook displays the resolved version.
_VERSION

'2025-09-04'

In [4]:
# JSON is UTF-8 by specification; without an explicit encoding open() falls
# back to the platform locale and can mis-decode non-ASCII text.
with open(f"data/latest_db/fda_statements__{_VERSION}.json", "r", encoding="utf-8") as f:
    fda_statements = json.load(f)
# Number of statements loaded.
len(fda_statements)

642

## Access and download FDA labels

In [5]:
# Per-version folder that the downloaded label files are written into.
download_path=f"data/latest_db/fda_labels__{_VERSION}"
# exist_ok=True keeps this cell re-runnable once the folder exists.
os.makedirs(download_path, exist_ok=True)

In [6]:
# Unique label URLs, taken from the first "reportedIn" entry of each statement.
# `or [{}]` also covers a present-but-empty (or None) "reportedIn": the default
# passed to .get() only applies when the key is missing entirely. Statements
# without a URL are dropped so the download loop never receives None.
_first_sources = ((stmt.get("reportedIn") or [{}])[0] for stmt in fda_statements)
fda_label_urls = {src["url"] for src in _first_sources if src.get("url")}
len(fda_label_urls)

102

In [91]:
# Browser-like request headers; the labels are requested as PDF.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/116.0.0.0 Safari/537.36",
    "Accept": "application/pdf"
}

# (url, reason) pairs for labels that could not be fetched, so that a failed
# download is visible instead of silently producing a bogus file.
failed_downloads = []

# sorted() gives a reproducible download order; falsy entries are skipped.
for url in tqdm(sorted(u for u in fda_label_urls if u)):
    # Flatten the URL into a single file name inside download_path.
    filename = os.path.join(download_path, url.replace("https://","").replace("/","_"))
    try:
        response = requests.get(url, headers=headers, timeout=20)
        # Without this check an HTTP error page would be saved as if it were
        # the label and later fed to the text extractor.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: remember the failure and keep downloading the rest.
        failed_downloads.append((url, str(exc)))
        continue
    with open(filename, "wb") as f:
        f.write(response.content)

# Displayed as the cell result; an empty list means every label was fetched.
failed_downloads

100%|██████████| 102/102 [00:23<00:00,  4.28it/s]


## Extract the Indications and Usage section

### Functions

In [11]:
def extract_section(
    file_path, 
    start_text="INDICATIONS AND USAGE",
    end_text="DOSAGE AND ADMINISTRATION"
    ):
    """Return the text between two headers of a document parsed with Tika.

    Args:
        file_path: Path of the file handed to ``parser.from_file``.
        start_text: Header that opens the section (case-sensitive match).
        end_text: Header that closes the section (case-sensitive match); it is
            not included in the result.

    Returns:
        The text from the first ``start_text`` up to the first ``end_text``
        found after it, with blank lines removed and surrounding whitespace
        stripped. ``None`` when the parser produced no text, when
        ``start_text`` is absent, or when no ``end_text`` follows it.
    """
    raw = parser.from_file(file_path)
    # Guard against a missing or empty "content" entry: calling splitlines()
    # on None would raise instead of reporting "no section found".
    section = (raw or {}).get("content")
    if not section:
        return None

    # 1) remove blank lines (letter case is left untouched)
    section = os.linesep.join(line for line in section.splitlines() if line.strip())

    # 2) locate the opening header, then the first closing header AFTER it.
    #    Searching for end_text from the beginning of the document would give
    #    an empty slice whenever end_text also occurs before start_text.
    start_ind = section.find(start_text)
    if start_ind == -1:
        return None
    end_ind = section.find(end_text, start_ind + len(start_text))
    if end_ind == -1:
        return None
    return section[start_ind:end_ind].strip()

# Try the extractor on a single downloaded label before running it on all files.
file_path = f"data/latest_db/fda_labels__{_VERSION}/www.accessdata.fda.gov_drugsatfda_docs_label_2023_211723s004lbl.pdf"
final_chunk = extract_section(file_path)
# print() rather than a bare expression so line breaks render as separate lines.
print(final_chunk)

INDICATIONS AND USAGE --------------------------
TAZVERIK is a methyltransferase inhibitor indicated for the treatment of: 
 Adults and pediatric patients aged 16 years and older with metastatic or 
locally advanced epithelioid sarcoma not eligible for complete resection. 
(1.1) 
 Adult patients with relapsed or refractory follicular lymphoma whose 
tumors are positive for an EZH2 mutation as detected by an 
FDA-approved test and who have received at least 2 prior systemic 
therapies. (1.2) 
 Adult patients with relapsed or refractory follicular lymphoma who have 
no satisfactory alternative treatment options. (1.2) 
These indications are approved under accelerated approval based on overall 
response rate and duration of response. Continued approval for these 
indications may be contingent upon verification and description of clinical 
benefit in a confirmatory trial(s). 
-----------------------


In [22]:
import re
import unicodedata

def clean_fda_text(text, section_name="INDICATIONS AND USAGE"):
    """Flatten an extracted label section into one lower-cased line of text.

    Args:
        text: Raw section text, possibly spanning several lines. ``None`` or an
            empty string is accepted (``extract_section`` returns ``None`` when
            it finds nothing).
        section_name: Section header; when the first line starts with it
            (compared case-insensitively) that line is dropped.

    Returns:
        The remaining lines joined with single spaces and lower-cased, or an
        empty string when there is nothing to clean.
    """
    if not text:
        return ""

    # Compare upper-case against upper-case so a lower/mixed-case section_name
    # still matches the header line.
    header = section_name.upper()
    cleaned_lines = []

    for i, line in enumerate(text.splitlines()):
        line = line.strip()
        #1) skip blank lines and separator lines made only of dashes
        if not line or re.fullmatch(r'-{3,}', line):
            continue
        #2) skip the section header line itself (only expected on the first line)
        if i == 0 and line.upper().startswith(header):
            continue
        #3) normalize unicode (compatibility decomposition) and remove
        #   non-printable characters
        line = unicodedata.normalize("NFKD", line)
        line = "".join(c for c in line if c.isprintable())
        cleaned_lines.append(line)

    return " ".join(cleaned_lines).lower()

# Preview the cleaned, single-line form of the sample section in final_chunk.
clean_fda_text(final_chunk)

'tazverik is a methyltransferase inhibitor indicated for the treatment of:  adults and pediatric patients aged 16 years and older with metastatic or locally advanced epithelioid sarcoma not eligible for complete resection. (1.1)  adult patients with relapsed or refractory follicular lymphoma whose tumors are positive for an ezh2 mutation as detected by an fda-approved test and who have received at least 2 prior systemic therapies. (1.2)  adult patients with relapsed or refractory follicular lymphoma who have no satisfactory alternative treatment options. (1.2) these indications are approved under accelerated approval based on overall response rate and duration of response. continued approval for these indications may be contingent upon verification and description of clinical benefit in a confirmatory trial(s).'

### Extract for all PDFs

In [28]:
fda_label_path = f"data/latest_db/fda_labels__{_VERSION}"

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the extracted contexts reproducible between runs.
all_files = sorted(os.listdir(fda_label_path))

unstructured_contexts = []
# Files for which no usable section text was obtained.
skipped_files = []
for label_file in all_files:
    section_chunk = extract_section(os.path.join(fda_label_path, label_file))
    # extract_section returns None when the headers are not found; record the
    # file and move on instead of passing None to the cleaner.
    cleaned_chunk = clean_fda_text(section_chunk) if section_chunk else ""
    if not cleaned_chunk:
        skipped_files.append(label_file)
        continue
    unstructured_contexts.append(cleaned_chunk)

# Displayed as the cell result; an empty list means every file yielded a section.
skipped_files
    

In [31]:
# Number of distinct contexts; compare with the file count to spot duplicates.
len(set(unstructured_contexts))

102

## Save and index context

In [33]:
# load_dotenv() runs before the environment lookup so that the key does not
# have to be hard-coded in the notebook.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is not set
_CLIENT = OpenAI(api_key=api_key)
# Embedding model name; also passed to _cache_paths to name the cache files.
_MODEL_EMBED = "text-embedding-3-small"

def _cache_paths(output_dir: str, embed_name: str, name: str, version: str = "v1"):
    os.makedirs(output_dir, exist_ok=True)
    return (
        f"{output_dir}/{embed_name}_{name}__{version}.faiss",
        f"{output_dir}/{embed_name}_{name}__{version}.json",
    )
    
# Versioned locations for the FAISS index and the matching list of contexts.
index_path, ctx_path = _cache_paths(
    output_dir="data/latest_db/indexes",
    embed_name=_MODEL_EMBED,
    name="unstructured_context",
    version=_VERSION,
)

# Build the index from the cleaned contexts and write it to disk.
_INDEX = index_context_db(unstructured_contexts, _CLIENT, _MODEL_EMBED)
faiss.write_index(_INDEX, index_path)

# Store the contexts next to the index.
# NOTE(review): presumably index rows follow the order of this list — confirm in index_context_db.
with open(ctx_path, "w") as ctx_file:
    json.dump(unstructured_contexts, ctx_file)