In [2]:
import sys
import os
# Use the parent of the current working directory as the project root:
# it is added to the import path and becomes the new working directory.
script_dir = os.getcwd()
root_dir = os.path.dirname(os.path.abspath(script_dir))
os.chdir(root_dir)
sys.path.append(root_dir)

In [16]:
import json
import pandas as pd
import numpy as np
import requests
import faiss
from openai import OpenAI
from utils.embedding import index_context_db
from dotenv import load_dotenv
from utils.check_db_version import get_local_version
from utils.flatten_statement import flatten_statements
from tqdm import tqdm
from tika import parser
# Snapshot identifier returned by get_local_version(); it is interpolated
# into the data/latest_db file and folder names used throughout this notebook.
_VERSION=get_local_version()
_VERSION

'2025-10-03'

In [3]:
# Read the statement records for the current snapshot and show how many there are.
statements_file = f"data/latest_db/fda_statements__{_VERSION}.json"
with open(statements_file, "r") as fh:
    fda_statements = json.loads(fh.read())
len(fda_statements)

651

## Access and download FDA labels

In [4]:
# Destination folder for the downloaded label files of this snapshot.
download_path=f"data/latest_db/fda_labels__{_VERSION}"
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(download_path, exist_ok=True)

In [5]:
# Collect the distinct label URLs referenced by the statements.
# Only the first "reportedIn" entry of each statement is consulted. Statements
# with a missing or empty "reportedIn" list, or without a "url", are skipped so
# that the set never contains None (which cannot be downloaded) and an empty
# list cannot raise IndexError.
fda_label_urls = set()
for stmt in fda_statements:
    reported_in = stmt.get("reportedIn") or [{}]
    url = reported_in[0].get("url")
    if url:
        fda_label_urls.add(url)
len(fda_label_urls)

106

In [6]:
# Request headers sent with every download: a desktop Chrome User-Agent string
# and an Accept header asking for a PDF.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/116.0.0.0 Safari/537.36",
    "Accept": "application/pdf"
}

# Download every label. A URL that cannot be fetched (connection error,
# timeout, or HTTP error status) is recorded in failed_downloads instead of
# being written to disk, so an error page is never saved under a label's file
# name and one bad URL does not abort the remaining downloads.
failed_downloads = []
for url in tqdm(fda_label_urls):
    # File name = URL without the scheme, with "/" replaced by "_".
    filename = os.path.join(download_path, url.replace("https://","").replace("/","_"))
    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
    except requests.RequestException as exc:
        failed_downloads.append((url, str(exc)))
        continue
    with open(filename, "wb") as f:
        f.write(response.content)

if failed_downloads:
    print(f"{len(failed_downloads)} label(s) could not be downloaded:")
    for failed_url, reason in failed_downloads:
        print(f"  {failed_url}: {reason}")

100%|██████████| 106/106 [01:38<00:00,  1.07it/s]


## Extract the Indications and Usage section

### Functions

In [7]:
def extract_section(
    file_path,
    start_text="INDICATIONS AND USAGE",
    end_text="DOSAGE AND ADMINISTRATION"
    ):
    """Return the text of a document that lies between two section headers.

    The file is parsed with ``parser.from_file``, blank lines are dropped, and
    the span from the first occurrence of ``start_text`` up to (not including)
    the next occurrence of ``end_text`` is returned. Matching is case-sensitive.

    Args:
        file_path: Path of the document to parse.
        start_text: Header marking the start of the section (kept in the result).
        end_text: Header marking the end of the section (not part of the result).

    Returns:
        The stripped section text, or ``None`` when the parser yields no text,
        ``start_text`` is absent, or ``end_text`` does not occur after it.
    """
    raw = parser.from_file(file_path)
    # The parsed content can be missing or None (presumably when the file has
    # no extractable text); there is nothing to search in that case.
    content = (raw or {}).get('content')
    if not content:
        return None

    #1) remove empty lines (case is left unchanged so the upper-case headers still match)
    section = os.linesep.join([s for s in content.splitlines() if s.strip()])

    #2) extract text between two headers
    start_ind = section.find(start_text)
    if start_ind == -1:
        return None
    # Search for the end header only AFTER the start header. Searching from the
    # beginning of the document could hit an earlier mention of end_text and
    # produce an empty slice.
    end_ind = section.find(end_text, start_ind + len(start_text))
    if end_ind == -1:
        return None
    return section[start_ind:end_ind].strip()

# Try the extractor on a single downloaded label and print the raw section text.
sample_label = "www.accessdata.fda.gov_drugsatfda_docs_label_2023_211723s004lbl.pdf"
file_path = f"data/latest_db/fda_labels__{_VERSION}/{sample_label}"
final_chunk = extract_section(file_path)
print(final_chunk)

2025-10-04 17:34:12,658 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2025-10-04 17:34:17,704 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


INDICATIONS AND USAGE --------------------------
TAZVERIK is a methyltransferase inhibitor indicated for the treatment of: 
 Adults and pediatric patients aged 16 years and older with metastatic or 
locally advanced epithelioid sarcoma not eligible for complete resection. 
(1.1) 
 Adult patients with relapsed or refractory follicular lymphoma whose 
tumors are positive for an EZH2 mutation as detected by an 
FDA-approved test and who have received at least 2 prior systemic 
therapies. (1.2) 
 Adult patients with relapsed or refractory follicular lymphoma who have 
no satisfactory alternative treatment options. (1.2) 
These indications are approved under accelerated approval based on overall 
response rate and duration of response. Continued approval for these 
indications may be contingent upon verification and description of clinical 
benefit in a confirmatory trial(s). 
-----------------------


In [8]:
import re
import unicodedata

def clean_fda_text(text, section_name="INDICATIONS AND USAGE"):
    """Flatten an extracted label section into a single lower-case string.

    Blank lines and separator lines made only of dashes are dropped, the
    section header is removed when it opens the text, each remaining line is
    NFKD-normalized and stripped of non-printable characters, and the lines
    are joined with single spaces.

    Args:
        text: Section text as a ``str`` (``None`` is not accepted).
        section_name: Header to drop; compared case-insensitively against the
            start of the first non-empty, non-separator line.

    Returns:
        The cleaned text in lower case (``""`` when nothing remains).
    """
    header = section_name.upper()
    header_checked = False
    cleaned_lines = []

    for raw_line in text.splitlines():
        line = raw_line.strip()
        #1) skip empty lines and separator lines made only of dashes
        if not line or re.fullmatch(r'-{3,}', line):
            continue
        #2) skip the section header when it is the first line with content
        if not header_checked:
            header_checked = True
            if line.upper().startswith(header):
                continue
        #3) normalize unicode and remove non-printable characters
        line = unicodedata.normalize("NFKD", line)
        line = "".join([c for c in line if c.isprintable()])
        #4) removing characters (e.g. a private-use bullet glyph) can expose
        #   leading whitespace or leave the line empty, so strip and re-check
        line = line.strip()
        if line:
            cleaned_lines.append(line)

    return " ".join(cleaned_lines).lower()

# Preview the cleaned, single-line form of the sample section held in final_chunk.
clean_fda_text(final_chunk)

'tazverik is a methyltransferase inhibitor indicated for the treatment of:  adults and pediatric patients aged 16 years and older with metastatic or locally advanced epithelioid sarcoma not eligible for complete resection. (1.1)  adult patients with relapsed or refractory follicular lymphoma whose tumors are positive for an ezh2 mutation as detected by an fda-approved test and who have received at least 2 prior systemic therapies. (1.2)  adult patients with relapsed or refractory follicular lymphoma who have no satisfactory alternative treatment options. (1.2) these indications are approved under accelerated approval based on overall response rate and duration of response. continued approval for these indications may be contingent upon verification and description of clinical benefit in a confirmatory trial(s).'

### Extract for all PDFs

In [9]:
fda_label_path = f"data/latest_db/fda_labels__{_VERSION}"

# Sort the listing: os.listdir returns entries in arbitrary order, and a stable
# order keeps the position of each context reproducible from run to run.
all_files = sorted(os.listdir(fda_label_path))

unstructured_contexts=[]
skipped_files=[]  # labels for which no usable section text was obtained
for label_file in all_files:
    section_chunk=extract_section(os.path.join(fda_label_path, label_file))
    # extract_section returns None when the section headers are not found;
    # clean_fda_text cannot handle None, so record the file and move on.
    if not section_chunk:
        skipped_files.append(label_file)
        continue
    cleaned_chunk=clean_fda_text(section_chunk)
    # An empty context carries no information for the index; skip it as well.
    if not cleaned_chunk:
        skipped_files.append(label_file)
        continue
    unstructured_contexts.append(cleaned_chunk)

print(f"extracted {len(unstructured_contexts)} of {len(all_files)} files; skipped {len(skipped_files)}")
    

## Save and index context

In [4]:
# Name of the embedding model passed to the indexing helper.
MODEL_EMBED = "text-embedding-3-small"

# load_dotenv() runs before the OPENAI_API_KEY lookup so the lookup sees
# whatever it adds to the environment; the client is then built from that key.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
CLIENT = OpenAI(api_key=api_key)

def _cache_paths(output_dir: str, embed_name: str, name: str, version: str = "v1"):
    """Return the ``(.faiss path, .json path)`` pair for a named cache entry.

    ``output_dir`` is created if it does not exist yet. Both paths share the
    stem ``{output_dir}/{embed_name}_{name}__{version}`` and differ only in
    their extension.
    """
    os.makedirs(output_dir, exist_ok=True)
    stem = f"{output_dir}/{embed_name}_{name}__{version}"
    return tuple(stem + extension for extension in (".faiss", ".json"))
    

In [None]:
# Build the index over the contexts with index_context_db, then write the
# index and the matching context list to their paired cache files.
index_path, ctx_path = _cache_paths(
    "data/latest_db/indexes",
    MODEL_EMBED,
    "fda_unstructured_context",
    version=_VERSION,
)

INDEX = index_context_db(unstructured_contexts, CLIENT, MODEL_EMBED)
faiss.write_index(INDEX, index_path)
with open(ctx_path, "w") as ctx_file:
    ctx_file.write(json.dumps(unstructured_contexts))

In [None]:
# Reload the saved context list; only the JSON path of the pair is used here.
_, ctx_path = _cache_paths(
    "data/latest_db/indexes",
    MODEL_EMBED,
    "fda_unstructured_context",
    version=_VERSION,
)

with open(ctx_path, "r") as ctx_file:
    unstructured_contexts = json.loads(ctx_file.read())

In [6]:
# Print the first three contexts as a quick sanity check.
for i, c in enumerate(unstructured_contexts[:3]):
    print(c)

lynparza is a poly (adp-ribose) polymerase (parp) inhibitor indicated: ovarian cancer • for the maintenance treatment of adult patients with deleterious or suspected deleterious germline or somatic brca-mutated advanced epithelial ovarian, fallopian tube or primary peritoneal cancer who are in complete or partial response to first-line platinum-based chemotherapy. select patients for therapy based on an fda-approved companion diagnostic for lynparza. (1.1, 2.1) • in combination with bevacizumab for the maintenance treatment of adult patients with advanced epithelial ovarian, fallopian tube or primary peritoneal cancer who are in complete or partial response to first-line platinum-based chemotherapy and whose cancer is associated with homologous recombination deficiency (hrd)-positive status defined by either: • a deleterious or suspected deleterious brca mutation, and/or • genomic instability. select patients for therapy based on an fda-approved companion diagnostic for lynparza. (1.2,

## Update brand-generic name mapping

In [47]:
from utils.io import load_object
# Load the previously saved brand-name -> generic-name mapping for reference.
# NOTE(review): the .pkl extension suggests load_object unpickles this file;
# unpickling can execute arbitrary code, so only load files from a trusted
# source — confirm against utils.io.
drug_names_mapping_dict=load_object(filename='data/fda_drug_names_mapping_dict.pkl')
drug_names_mapping_dict

{'bosulif': [{'bosutinib'}],
 'sprycel': [{'dasatinib'}],
 'gleevec': [{'imatinib'}],
 'scemblix': [{'asciminib'}],
 'alecensa': [{'alectinib'}],
 'xalkori': [{'crizotinib'}],
 'lorbrena': [{'lorlatinib'}],
 'zykadia': [{'ceritinib'}],
 'pemazyre': [{'pemigatinib'}],
 'truseltiq': [{'infigratinib'}],
 'balversa': [{'erdafitinib'}],
 'vitrakvi': [{'larotrectinib'}],
 'rozlytrek': [{'entrectinib'}],
 'retevmo': [{'selpercatinib'}],
 'gavreto': [{'pralsetinib'}],
 'lynparza': [{'bevacizumab', 'olaparib'},
  {'olaparib'},
  {'abiraterone', 'olaparib', 'prednisone'}],
 'tafinlar': [{'dabrafenib', 'trametinib'}, {'dabrafenib'}],
 'mekinist': [{'trametinib'}],
 'braftovi': [{'binimetinib', 'encorafenib'},
  {'cetuximab', 'encorafenib'},
  {'encorafenib'}],
 'zelboraf': [{'vemurafenib'}],
 'rubraca': [{'rucaparib'}],
 'zejula': [{'niraparib'}],
 'tagrisso': [{'osimertinib'}, {'cisplatin', 'osimertinib', 'pemetrexed'}],
 'gilotrif': [{'afatinib'}],
 'rybrevant': [{'amivantamab-vmjw'},
  {'amiva

In [17]:
# Read the statement records for the current snapshot and show their count.
stmt_path = f"data/latest_db/fda_statements__{_VERSION}.json"
with open(stmt_path, "r") as stmt_file:
    fda_statements = json.loads(stmt_file.read())
len(fda_statements)

651

In [67]:
import ast
# Load the table stored in moalmanac_fda_core_query__<version>.csv,
# using the first CSV column as the index.
moalmanac_db=pd.read_csv(f"data/latest_db/moalmanac_fda_core_query__{_VERSION}.csv", index_col=0)
# Display only the first rows.
moalmanac_db.head()

Unnamed: 0,statement_id,standardized_cancer,raw_cancer,modified_standardized_cancer,biomarker,therapy,prompt,answer
0,0,invasive breast carcinoma,early breast cancer,early invasive breast carcinoma,"['ER positive', 'HER2-negative']","['Tamoxifen', 'Abemaciclib']",if a patient with early invasive breast carcin...,if a patient with early invasive breast carcin...
1,1,invasive breast carcinoma,early breast cancer,early invasive breast carcinoma,"['PR positive', 'HER2-negative']","['Tamoxifen', 'Abemaciclib']",if a patient with early invasive breast carcin...,if a patient with early invasive breast carcin...
2,2,invasive breast carcinoma,early breast cancer,early invasive breast carcinoma,"['PR positive', 'ER positive', 'HER2-negative']","['Tamoxifen', 'Abemaciclib']",if a patient with early invasive breast carcin...,if a patient with early invasive breast carcin...
3,3,invasive breast carcinoma,advanced or metastatic breast cancer,advanced or metastatic invasive breast carcinoma,"['ER positive', 'HER2-negative']","['Anastrozole', 'Abemaciclib']",if a patient with advanced or metastatic invas...,if a patient with advanced or metastatic invas...
4,4,invasive breast carcinoma,advanced or metastatic breast cancer,advanced or metastatic invasive breast carcinoma,"['PR positive', 'HER2-negative']","['Anastrozole', 'Abemaciclib']",if a patient with advanced or metastatic invas...,if a patient with advanced or metastatic invas...


In [68]:
# Attach each statement's lower-cased brand name to the table.
# The list is assigned by position, so fda_statements must have the same
# length and order as the rows of moalmanac_db.
# A missing or non-string 'drug_name_brand' becomes None instead of raising
# AttributeError from calling .lower() on None.
moalmanac_db['brand_name']=[
    brand.lower() if isinstance(brand, str) else None
    for brand in (stmt['reportedIn'][0].get('drug_name_brand') for stmt in fda_statements)
]

In [92]:
# Unique (therapy, brand_name) pairs. The explicit .copy() makes this an
# independent frame, so reassigning its 'therapy' column afterwards does not
# trigger pandas' SettingWithCopyWarning.
moalmanac_db_drug_mapping=(
    moalmanac_db[['therapy', 'brand_name']]
    .drop_duplicates(subset=['therapy', 'brand_name'])
    .copy()
)

In [96]:
# Parse the stringified lists (e.g. "['Tamoxifen', 'Abemaciclib']") into lists.
# Values that are not strings are left untouched, so re-running this cell does
# not fail with ast.literal_eval raising on an already-parsed value.
moalmanac_db_drug_mapping['therapy']=moalmanac_db_drug_mapping['therapy'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [112]:
# Replace each row's therapy list with a one-element list holding the set of
# its lower-cased, whitespace-stripped names.
generic_list = [
    [{t.lower().strip() for t in therapy}]
    for therapy in moalmanac_db_drug_mapping['therapy']
]

moalmanac_db_drug_mapping['therapy'] = generic_list

In [128]:
# Build {brand name: [set of generic names, ...]} from the mapping table.
drug_brand_generic_mapping={}

for brand, generic in zip(
    moalmanac_db_drug_mapping['brand_name'],
    moalmanac_db_drug_mapping['therapy'],
):
    # Rows without a usable brand name cannot be keyed.
    if not isinstance(brand, str):
        continue
    # Always collect into a list owned by the dict. Storing the row's own list
    # would make a later .extend() mutate the object held in the table.
    combos = drug_brand_generic_mapping.setdefault(brand, [])
    for combo in generic:
        # The same combination can come from differently ordered therapy
        # lists; keep each one only once per brand.
        if combo not in combos:
            combos.append(combo)

In [154]:
from utils.io import save_object
# Persist the rebuilt brand-name -> generic-name mapping under data/latest_db, tagged with the snapshot version.
save_object(drug_brand_generic_mapping, f"data/latest_db/fda_drug_names_mapping_dict__{_VERSION}.pkl")