## Loads

In [None]:
# Switch the working directory to the repository root; later cells open
# relative paths such as "data/latest_db/...".
# NOTE(review): `root_dir` is assigned in the imports cell, so that cell has
# to be executed before this one.
os.chdir(root_dir)
os.getcwd()

In [20]:
import sys
import os
# The repository root is taken to be the parent of the directory the notebook
# runs from; it is appended to sys.path so that `utils` can be imported.
script_dir = os.getcwd()
root_dir = os.path.dirname(os.path.abspath(script_dir))
sys.path.append(root_dir)

import json
import pandas as pd
import numpy as np
from utils.io import load_object, save_object
from collections import Counter
import requests
import random
import math

All functions used in this notebook

In [4]:
# local db version
# JSON file (relative to the current working directory) read by
# get_local_version() and written by save_local_version().
CACHE_FILE = "db_version_cache.json"

def get_remote_version(timeout=30):
    """Return the `last_updated` value reported by the MOAlmanac API.

    Calls the `/agents` endpoint and reads `service.last_updated` from the
    JSON payload.

    Args:
        timeout: Seconds to wait for the server. `requests` applies no
            timeout by default, which can block the notebook indefinitely.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    response = requests.get('https://api.moalmanac.org/agents', timeout=timeout)
    # Surface HTTP errors here instead of failing later on an error payload.
    response.raise_for_status()
    return response.json()['service']['last_updated']

def get_local_version():
    """Return the "version" stored in CACHE_FILE, or None.

    None is returned both when the cache file does not exist and when the
    stored JSON object has no "version" key.
    """
    if not os.path.exists(CACHE_FILE):
        return None

    with open(CACHE_FILE) as cache:
        cached = json.load(cache)
    return cached.get("version")

def save_local_version(version):
    """Overwrite CACHE_FILE with ``{"version": <version>}``."""
    payload = {"version": version}
    with open(CACHE_FILE, "w") as cache:
        json.dump(payload, cache)


#split ranges for testing on small batches
def split_ranges(max_int, num_ranges, samples=None, seed=42):
    """Split ``1..max_int`` into consecutive ranges, optionally sampling them.

    Args:
        max_int: Inclusive upper bound of the interval (which starts at 1).
        num_ranges: Number of ranges requested.
        samples: If None, the ranges are returned. Otherwise the total number
            of random integers to draw, spread as evenly as possible over the
            ranges (earlier ranges receive the remainder).
        seed: Passed to ``random.seed`` so the draws are reproducible.

    Returns:
        A list of inclusive ``(start, end)`` tuples when ``samples`` is None,
        otherwise a flat list of sampled integers ordered by range. Fewer than
        ``num_ranges`` ranges are produced when ``max_int`` is too small to
        fill all of them.
    """
    random.seed(seed)
    step = math.ceil(max_int / num_ranges)

    # Rounding the step up can push the start of trailing ranges beyond
    # max_int (e.g. max_int=5, num_ranges=4 -> (7, 5)). Such empty ranges are
    # dropped; random.randint would raise ValueError on them.
    candidates = (
        (i * step + 1, min((i + 1) * step, max_int)) for i in range(num_ranges)
    )
    ranges = [(lo, hi) for lo, hi in candidates if lo <= hi]

    if samples is None:
        return ranges
    if not ranges:
        return []

    # split samples evenly across the ranges that actually exist
    n_per_range, remainder = divmod(samples, len(ranges))

    result = []
    for i, (lo, hi) in enumerate(ranges):
        count = n_per_range + (1 if i < remainder else 0)
        result.extend(random.randint(lo, hi) for _ in range(count))

    return result


# function to subset FDA statements
def subset_db_statements(statements, organization='fda'):
    """Return the statements whose first `reportedIn` document belongs to `organization`.

    The comparison is an exact match on
    ``statement['reportedIn'][0]['organization']['id']``; input order is kept.
    """
    subset = []
    for statement in statements:
        reporting_org = statement['reportedIn'][0]['organization']['id']
        if reporting_org == organization:
            subset.append(statement)
    return subset


# function to ensure list
def ensure_list(x):
    """Return `x` unchanged if it is a list, otherwise wrap it in a one-element list."""
    if isinstance(x, list):
        return x
    return [x]


# function to extract biomarker from statement
def extract_biomarker_info(stmt):
    """Summarise the biomarkers of a statement.

    Reads ``stmt['proposition']['biomarkers']``. Every biomarker must carry a
    ``name`` and a list of ``extensions`` (``{'name': ..., 'value': ...}``).
    The extension named ``present`` decides the tag appended to the name; a
    missing extension or any value not equal to True is tagged "not present".

    Returns:
        dict with
        - "str": comma-separated names, each tagged "[present]" or
          "[not present]" (empty string when there are no biomarkers);
        - "list": the bare biomarker names, in order.
    """
    biomarkers_list = []
    labelled = []
    for entry in stmt['proposition']['biomarkers']:
        name = entry["name"]
        extensions_dict = {item['name']: item['value'] for item in entry['extensions']}
        presence = extensions_dict.get('present', '')
        biomarkers_list.append(name)

        # Equality (not identity) is kept on purpose so the accepted values
        # stay exactly what they were.
        tag = "[present]" if presence == True else "[not present]"  # noqa: E712
        labelled.append(f"{name} {tag}")

    # Joining a list also covers the empty case, which previously left the
    # result string unassigned.
    extracted_info = {
        "str": ", ".join(labelled),
        "list": biomarkers_list
    }

    return extracted_info


# function to extract therapy info from statement
def extract_therapy_info(stmt):
    """Collect drug names, approach, strategies and types for a statement.

    The therapeutic object is ``stmt['proposition']['objectTherapeutic']``.
    If its ``membership_operator`` is ``'AND'`` the entries under
    ``therapies`` form a combination therapy; otherwise the object itself is
    handled as a single therapy.

    Returns:
        dict with two views of the same data: "str" (values joined with
        " + ") and "list" (the underlying lists).

    Raises:
        ValueError: If a therapy has no ``name``.
        KeyError: If a therapy lacks ``extensions`` or its
            ``therapy_strategy`` / ``therapy_type`` extension.
    """
    therapeutic = stmt.get('proposition', {}).get('objectTherapeutic', {})

    # Both cases reduce to "a list of therapy objects" plus a label.
    if therapeutic.get('membership_operator', None) == 'AND':
        approach = 'Combination therapy'
        members = therapeutic.get('therapies', [])
    else:
        approach = 'Monotherapy'
        members = [therapeutic]

    therapy_strategyList = []
    therapy_typeList = []
    for member in members:
        ext = {entry['name']: entry['value'] for entry in member['extensions']}
        therapy_strategyList.extend(ensure_list(ext['therapy_strategy']))
        therapy_typeList.extend(ensure_list(ext['therapy_type']))
    drugList = [member.get('name', None) for member in members]

    # sanity check: every therapy must be named
    if any(name is None for name in drugList):
        raise ValueError(f"Found None in drugList for statement {stmt['id']}")

    def joined(values):
        return " + ".join(value for value in values if value is not None)

    return {
        "str": {
            "drug_str": joined(drugList),
            "therapy_approach": approach,
            "therapy_strategy_str": joined(therapy_strategyList),
            "therapy_type_str": joined(therapy_typeList),
        },
        "list": {
            "drugList": drugList,
            "therapy_approach": approach,
            "therapy_strategyList": therapy_strategyList,
            "therapy_typeList": therapy_typeList,
        },
    }


# function to flatten statement into summary text to include in context
def flatten_statements(stmt: dict) -> tuple:
    """Flatten one statement into a context summary and a tabular row.

    Approval metadata is taken from the first ``reportedIn`` document only.
    Biomarker and therapy details come from ``extract_biomarker_info`` and
    ``extract_therapy_info``.

    Args:
        stmt: A single statement dict.

    Returns:
        ``(summary, row)``: ``summary`` is the multi-line text used as
        context; ``row`` is a dict holding the individual fields plus the
        summary under "context".
    """
    statement_id = stmt.get("id")

    # approval status
    # Look the source document up once. `or [{}]` also covers an empty list or
    # None, for which indexing `[0]` would fail.
    source = (stmt.get("reportedIn") or [{}])[0]
    approval_status = source.get("subtype", "None")
    approval_org = source.get("organization", {}).get("id", "Unknown organization")
    approval_url = source.get("url", "Unknown URL")
    approval_date = source.get("publication_date", "Unknown date")

    # description and indication
    description = stmt.get("description", "None")
    indication = stmt.get("indication", {}).get("indication", "None")

    # cancer type
    cancer_type = stmt.get("proposition", {}).get("conditionQualifier", {}).get("name", "Unknown cancer")

    # biomarkers
    biomarker = extract_biomarker_info(stmt)

    # therapy
    therapy_info = extract_therapy_info(stmt)

    # create summary text
    summary = (
        f"Indication: {indication}\n"
        f"Cancer type: {cancer_type}\n"
        f"Biomarkers: {biomarker['str']}\n"
        f"Therapy: {therapy_info['str']['drug_str']}\n"
        f"Therapy approach: {therapy_info['str']['therapy_approach']}\n"
        f"Therapy strategy: {therapy_info['str']['therapy_strategy_str']}\n"
        f"Therapy type: {therapy_info['str']['therapy_type_str']}\n"
        f"Description: {description}\n"
        f"Approval status: {approval_status} ({approval_org})\n"
        f"Approval url: {approval_url}\n"
        f"Publication date: {approval_date}"
    )

    # create row to add to dataframe
    row = {
        "statement_id": statement_id,
        "approval_status": approval_status,
        "approval_org": approval_org,
        "description": description,
        "indication": indication,
        "cancer_type": cancer_type,
        "biomarker": biomarker['list'],
        "therapy_drug": therapy_info['list']['drugList'],
        "therapy_approach": therapy_info['list']['therapy_approach'],
        "therapy_strategy": therapy_info['list']['therapy_strategyList'],
        "therapy_type": therapy_info['list']['therapy_typeList'],
        "approval_url": approval_url,
        "publication_date": approval_date,
        "context": summary
    }

    return summary, row

# Version recorded in the local cache file (None if nothing has been cached).
_VERSION = get_local_version()
print(_VERSION)

2025-09-04


## Examine latest DB fields

### Load latest MOAlmanac DB

In [None]:
# Download the dereferenced draft database into <root_dir>/data/latest_db.
!wget https://raw.githubusercontent.com/vanallenlab/moalmanac-db/refs/heads/main/moalmanac-draft.dereferenced.json -P $root_dir/data/latest_db

--2025-08-18 20:50:03--  https://raw.githubusercontent.com/vanallenlab/moalmanac-db/refs/heads/main/moalmanac-draft.dereferenced.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7382022 (7.0M) [text/plain]
Saving to: ‘/home/helenajun/rag-llm-cancer-paper/data/latest_db/moalmanac-draft.dereferenced.json’


2025-08-18 20:50:03 (69.5 MB/s) - ‘/home/helenajun/rag-llm-cancer-paper/data/latest_db/moalmanac-draft.dereferenced.json’ saved [7382022/7382022]



In [None]:
! wget https://github.com/vanallenlab/moalmanac-db/blob/main/molecular-oncology-almanac.json -P $root_dir/data/latest_db

--2025-08-18 20:54:12--  https://github.com/vanallenlab/moalmanac-db/blob/main/molecular-oncology-almanac.json
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘/home/helenajun/rag-llm-cancer-paper/data/latest_db/molecular-oncology-almanac.json’

molecular-oncology-     [ <=>                ]   2.18M  12.4MB/s    in 0.2s    

2025-08-18 20:54:12 (12.4 MB/s) - ‘/home/helenajun/rag-llm-cancer-paper/data/latest_db/molecular-oncology-almanac.json’ saved [2289110]



Key Fields (Brendan):
- biomarkers: statement > proposition > biomarkers > and then the "name" key for each biomarker. You can access the biomarker type from the list element "biomarker_type" in the "extensions" key for each biomarker. I am going to refactor this soonish (i hope) but also FYI there is an extension with the name "present" with the value "true" or "false", which represents if the biomarker is expected to be present or absent.
- cancer types: statement > proposition > conditionQualifier > name

Standardization:
- The names have been standardized to oncotree and, if not applicable, the NCI thesaurus. All therapy names are standardized to NCI thesaurus, too.

Database expansion:
- The indications.json in the GitHub also has Canadian ones. The EU/Irish will be on the GitHub hopefully by the end of the week (Aug 22nd)! We need to migrate the rest of the database before I leave for the second half of parental leave, but it's been low priority since we are not going to be involving those in upcoming papers

In [59]:
# Load the draft database, compare two statements and collect a few
# per-statement fields for ad-hoc inspection.
with open(f'{root_dir}/data/latest_db/moalmanac-draft.dereferenced.json') as f:
    d = json.load(f)

print(len(d['content']))

# Compare two statements key by key (ignoring "id").
entity1 = d['content'][606]
entity2 = d['content'][32]
print(entity1 == entity2)
diff_keys = [k for k in entity1 if k != "id" and entity1.get(k) != entity2.get(k)]
print(diff_keys)

# Key layout seen in this file (from earlier exploratory prints):
#   proposition:                     ['id', 'type', 'predicate', 'biomarkers', 'subjectVariant', 'conditionQualifier', 'objectTherapeutic']
#   proposition.objectTherapeutic:   ['id', 'membershipOperator', 'therapies']
#   proposition.conditionQualifier:  ['id', 'conceptType', 'name', 'mappings', 'extensions', 'primaryCoding']
# Other fields of interest: proposition.biomarkers[j] -> 'type', 'name', 'extensions';
# reportedIn[0].organization.id

fda_urls = []
prop_type = []
prop_predicate = []
prop_biomarkers_len = []
prop_conditionQualifier = []
prop_subjectVariant = []
prop_objectTherapeutic = []
for entry in d['content']:
    source = entry['reportedIn'][0]
    # NOTE: this filters on the document subtype only, not on the organization.
    if source['subtype'] == "Regulatory approval":
        fda_urls.append(source['url'])

    proposition = entry['proposition']
    prop_biomarkers_len.append(len(proposition['biomarkers']))
    prop_type.append(proposition['type'])
    prop_predicate.append(proposition['predicate'])
    prop_subjectVariant.append(proposition['subjectVariant'])
    prop_conditionQualifier.append(proposition['conditionQualifier'])
    prop_objectTherapeutic.append(proposition['objectTherapeutic'])

# Optional summaries of the collected values:
# print(len(fda_urls))
# print(Counter(prop_type))
# print(Counter(prop_predicate))
# print(np.median(prop_biomarkers_len))
# len(Counter(fda_urls).keys())  # vs. 56 fda urls in the previous version of the moalmanac db

628
False
['description', 'indication', 'proposition']


### Create and update context db

In [3]:
# function to load moalmanac db and filter for FDA-reported statements
def load_db_statements(db_path):
    """Load a MOAlmanac JSON file and return its FDA-reported statements.

    A statement is kept when the organization id of its first ``reportedIn``
    document equals "fda" (case-insensitive). Statements where that field is
    missing or malformed are skipped.

    Args:
        db_path: Path to a JSON file whose top-level "content" key holds the
            list of statements.

    Returns:
        List of statement dicts, in file order.
    """
    # JSON is read as UTF-8 explicitly; the platform default encoding may differ.
    with open(db_path, encoding="utf-8") as f:
        d = json.load(f)

    # filter for FDA-reported statements
    filtered = []
    for stmt in d.get('content', []):
        try:
            org_id = stmt['reportedIn'][0]['organization']['id']
            is_fda = org_id.lower() == 'fda'
        except (KeyError, IndexError, TypeError, AttributeError):
            # AttributeError: the id is present but not a string (e.g. None)
            continue
        if is_fda:
            filtered.append(stmt)

    return filtered

# function to ensure list
def ensure_list(x):
    """Return `x` unchanged if it is a list, otherwise wrap it in a one-element list."""
    if isinstance(x, list):
        return x
    return [x]

# function to extract biomarker from statement
def extract_biomarker_info(stmt):
    """Build the comma-separated biomarker string of a statement.

    Biomarkers are read from ``stmt['proposition']['biomarkers']``; entries
    without a name are skipped. Each remaining name is tagged "[present]"
    when its ``_present`` extension equals True and "[not present]"
    otherwise.

    Returns:
        The tagged names joined with ", " (empty string when no named
        biomarker exists).

    Raises:
        KeyError: If a named biomarker lacks ``extensions`` or the
            ``_present`` extension.
    """
    labelled = []
    for entry in stmt.get("proposition", {}).get("biomarkers", []):
        name = entry.get("name")
        if not name:
            continue
        extensions_dict = {item['name']: item['value'] for item in entry['extensions']}
        presence = extensions_dict['_present']

        # Equality (not identity) is kept on purpose so the accepted values
        # stay exactly what they were.
        tag = "[present]" if presence == True else "[not present]"  # noqa: E712
        labelled.append(f"{name} {tag}")

    # Collect-then-join works regardless of which position the first named
    # biomarker has; keying the first assignment on index 0 left the result
    # unassigned when entry 0 was skipped or the list was empty.
    return ", ".join(labelled)

# function to extract therapy info from statement
def extract_therapy_info(stmt):
    """Collect drug names, approach, strategies and types for a statement.

    The therapeutic object is ``stmt['proposition']['objectTherapeutic']``.
    If its ``membershipOperator`` is ``'AND'`` the entries under
    ``therapies`` form a combination therapy; otherwise the object itself is
    handled as a single therapy.

    Returns:
        dict with two views of the same data: "str" (values joined with
        " + ") and "list" (the underlying lists).

    Raises:
        ValueError: If a therapy has no ``name``.
        KeyError: If a therapy lacks ``extensions`` or its
            ``therapy_strategy`` / ``therapy_type`` extension.
    """
    therapeutic = stmt.get('proposition', {}).get('objectTherapeutic', {})

    # Both cases reduce to "a list of therapy objects" plus a label.
    if therapeutic.get('membershipOperator', None) == 'AND':
        approach = 'Combination therapy'
        members = therapeutic.get('therapies', [])
    else:
        approach = 'Monotherapy'
        members = [therapeutic]

    therapy_strategyList = []
    therapy_typeList = []
    for member in members:
        ext = {entry['name']: entry['value'] for entry in member['extensions']}
        therapy_strategyList.extend(ensure_list(ext['therapy_strategy']))
        therapy_typeList.extend(ensure_list(ext['therapy_type']))
    drugList = [member.get('name', None) for member in members]

    # sanity check: every therapy must be named
    if any(name is None for name in drugList):
        raise ValueError(f"Found None in drugList for statement {stmt['id']}")

    def joined(values):
        return " + ".join(value for value in values if value is not None)

    return {
        "str": {
            "drug_str": joined(drugList),
            "therapy_approach": approach,
            "therapy_strategy_str": joined(therapy_strategyList),
            "therapy_type_str": joined(therapy_typeList),
        },
        "list": {
            "drugList": drugList,
            "therapy_approach": approach,
            "therapy_strategyList": therapy_strategyList,
            "therapy_typeList": therapy_typeList,
        },
    }

# function to flatten statement into summary text to include in context
def flatten_statements(stmt: dict) -> tuple:
    """Flatten one statement into a context summary and a tabular row.

    Approval metadata is taken from the first ``reportedIn`` document only.
    Biomarker and therapy details come from ``extract_biomarker_info`` and
    ``extract_therapy_info``.

    Args:
        stmt: A single statement dict.

    Returns:
        ``(summary, row)``: ``summary`` is the multi-line text used as
        context; ``row`` is a dict holding the individual fields plus the
        summary under "context".
    """
    statement_id = stmt.get("id")

    # approval status
    # Look the source document up once. `or [{}]` also covers an empty list or
    # None, for which indexing `[0]` would fail.
    source = (stmt.get("reportedIn") or [{}])[0]
    approval_status = source.get("subtype", "None")
    approval_org = source.get("organization", {}).get("id", "Unknown organization")
    approval_url = source.get("url", "Unknown URL")
    approval_date = source.get("publication_date", "Unknown date")

    # description and indication
    description = stmt.get("description", "None")
    indication = stmt.get("indication", {}).get("indication", "None")

    # cancer type
    cancer_type = stmt.get("proposition", {}).get("conditionQualifier", {}).get("name", "Unknown cancer")

    # biomarkers
    biomarker = extract_biomarker_info(stmt)

    # therapy
    therapy_info = extract_therapy_info(stmt)

    # create summary text
    summary = (
        f"Approval status: {approval_status} ({approval_org})\n"
        f"Description: {description}\n"
        f"Indication: {indication}\n"
        f"Cancer type: {cancer_type}\n"
        f"Biomarkers: {biomarker}\n"
        f"Therapy: {therapy_info['str']['drug_str']}\n"
        f"Therapy approach: {therapy_info['str']['therapy_approach']}\n"
        f"Therapy strategy: {therapy_info['str']['therapy_strategy_str']}\n"
        f"Therapy type: {therapy_info['str']['therapy_type_str']}\n"
        f"Approval url: {approval_url}\n"
        f"Publication date: {approval_date}"
    )

    # create row to add to dataframe
    row = {
        "statement_id": statement_id,
        "approval_status": approval_status,
        "approval_org": approval_org,
        "description": description,
        "indication": indication,
        "cancer_type": cancer_type,
        "biomarker": biomarker,
        "therapy_drug": therapy_info['list']['drugList'],
        "therapy_approach": therapy_info['list']['therapy_approach'],
        "therapy_strategy": therapy_info['list']['therapy_strategyList'],
        "therapy_type": therapy_info['list']['therapy_typeList'],
        "approval_url": approval_url,
        "publication_date": approval_date,
        "context": summary
    }

    return summary, row


In [4]:
# Load the FDA-reported statements from the downloaded draft database file.
statements = load_db_statements(f'{root_dir}/data/latest_db/moalmanac-draft.dereferenced.json')

In [95]:
# Flatten every statement; only the row (second element of the returned pair)
# is kept, the summary text is already stored in it under "context".
flattened_rows = [flatten_statements(stmt)[1] for stmt in statements]

In [96]:
# Write the flattened rows to the context DB CSV, one column per row key.
import csv

csv_columns = [
    "statement_id",
    "approval_status",
    "approval_org",
    "description",
    "indication",
    "cancer_type",
    "biomarker",
    "therapy_drug",
    "therapy_approach",
    "therapy_strategy",
    "therapy_type",
    "approval_url",
    "publication_date",
    "context",
]

with open(f"{root_dir}/data/latest_db/moalmanac-draft.dereferenced.context_db.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(flattened_rows)

#### Context duplicates

In [44]:
# Read the context DB CSV back into a DataFrame and preview the first rows.
moalmanac_new_draft = pd.read_csv(f"{root_dir}/data/latest_db/moalmanac-draft.dereferenced.context_db.csv")
moalmanac_new_draft.head()

Unnamed: 0,statement_id,approval_status,approval_org,description,indication,cancer_type,biomarker,therapy_drug,therapy_approach,therapy_strategy,therapy_type,approval_url,publication_date,context
0,0,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...
1,1,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"HER2-negative [present], PR positive [present]","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...
2,2,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]...","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...
3,3,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]","['Anastrozole', 'Abemaciclib']",Combination therapy,"['Aromatase inhibition', 'CDK4/6 inhibition']","['Hormone therapy', 'Targeted therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...
4,4,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"HER2-negative [present], PR positive [present]","['Anastrozole', 'Abemaciclib']",Combination therapy,"['Aromatase inhibition', 'CDK4/6 inhibition']","['Hormone therapy', 'Targeted therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...


In [110]:
# Every row whose context text occurs more than once (all copies are kept).
has_duplicate_context = moalmanac_new_draft['context'].duplicated(keep=False)
moalmanac_new_draft_duplicated = moalmanac_new_draft[has_duplicate_context]
print(moalmanac_new_draft_duplicated.shape)
moalmanac_new_draft_duplicated.to_csv(f"{root_dir}/data/latest_db/moalmanac-draft.dereferenced.duplicated.context_db.csv", index=False)

(83, 14)


In [111]:
# Keep only the first row for each distinct context text and save the result.
moalmanac_new_draft_unique = moalmanac_new_draft.drop_duplicates(subset=['context'], keep='first')
print(moalmanac_new_draft_unique.shape)
moalmanac_new_draft_unique.to_csv(f"{root_dir}/data/latest_db/moalmanac-draft.dereferenced.unique.context_db.csv", index=False)

(584, 14)


In [109]:
# Group the duplicated rows by their context text; the last expression is the
# number of distinct context strings that occur more than once.
duplicated_rows = moalmanac_new_draft[moalmanac_new_draft['context'].duplicated(keep=False)]
grouped_duplicates = duplicated_rows.groupby('context')
# Uncomment to print each duplicated context together with its rows:
# for ctx, group in grouped_duplicates:
#     print(ctx)
#     print(group)
len(grouped_duplicates.groups)

39

#### Context statistics

In [67]:
# Tokenizer used below to measure the length of each context in tokens.
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")  # or "gpt-4", "gpt-3.5-turbo", etc.

In [112]:
def calc_iqr(chunk_size_list):
    """Return the 25th and 75th percentiles of the values as the string "<q1>-<q3>"."""
    q1, q3 = np.percentile(chunk_size_list, [25, 75])
    return f'{q1}-{q3}'

# Token count of every unique context, followed by summary statistics.
struc_context_size = [
    len(encoding.encode(context))
    for context in moalmanac_new_draft_unique['context']
]
print(f"# chunks: {len(struc_context_size)!s}")
print(f"Min: {np.min(struc_context_size)!s}")
print(f"Max: {np.max(struc_context_size)!s}")
print(f"Mean: {np.mean(struc_context_size)!s}")
print(f"Median: {np.median(struc_context_size)!s}")
print(f"IQR: {calc_iqr(struc_context_size)!s}")


# chunks: 584
Min: 176
Max: 509
Mean: 294.7157534246575
Median: 289.0
IQR: 235.5-338.25


## Load DB from API for automation

### Cache DB version

In [7]:
# JSON file (relative to the current working directory) read by
# get_local_version() and written by save_local_version().
CACHE_FILE = "db_version_cache.json"

def get_remote_version(timeout=30):
    """Return the `last_updated` value reported by the MOAlmanac API.

    Calls the `/agents` endpoint and reads `service.last_updated` from the
    JSON payload.

    Args:
        timeout: Seconds to wait for the server. `requests` applies no
            timeout by default, which can block the notebook indefinitely.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    response = requests.get('https://api.moalmanac.org/agents', timeout=timeout)
    # Surface HTTP errors here instead of failing later on an error payload.
    response.raise_for_status()
    return response.json()['service']['last_updated']

def get_local_version():
    """Return the "version" stored in CACHE_FILE, or None.

    None is returned both when the cache file does not exist and when the
    stored JSON object has no "version" key.
    """
    if not os.path.exists(CACHE_FILE):
        return None

    with open(CACHE_FILE) as cache:
        cached = json.load(cache)
    return cached.get("version")

def save_local_version(version):
    """Overwrite CACHE_FILE with ``{"version": <version>}``."""
    payload = {"version": version}
    with open(CACHE_FILE, "w") as cache:
        json.dump(payload, cache)
        
# def update_db_files():
#   update cancer type and biomarker entities
#   update context db
#   update context db faiss index

# def sync_db():
#     remote = get_remote_version()
#     local = get_local_version()
#     if local != remote:
#         update_db_files()
#         save_local_version(remote)
#     else:
#         print(f"DB is already up to date (version={local})")

# sync_db()

In [10]:
# Version currently reported by the remote API.
version=get_remote_version()
print(version)

2025-09-04


In [9]:
# Version stored in the local cache file, for comparison with the remote one.
version=get_local_version()
version

'2025-09-04'

In [11]:
# Download all statements from the API; the payload keeps them under "data".
# A timeout and an explicit status check keep the cell from hanging or from
# failing later on an error payload.
response = requests.get('https://api.moalmanac.org/statements', timeout=60)
response.raise_for_status()
statements = response.json()['data']
print(len(statements))

1784


In [14]:
# Keep only the statements whose first reportedIn document comes from the FDA.
fda_statements=subset_db_statements(statements, organization='fda')
print(len(fda_statements))

642


### Extract cancer types, biomarkers, and therapies from the latest DB

In [16]:
# Cached DB version; used below as a suffix in the data file names.
_VERSION=get_local_version()
_VERSION

'2025-09-04'

In [16]:
# Load the FDA statements saved for this DB version.
# NOTE: the path is relative, so the working directory must be the repository root.
with open(f"data/latest_db/fda_statements__{_VERSION}.json", "r") as f:
    fda_statements = json.load(f)
len(fda_statements)

642

#### Extract common clinical modifiers from raw cancer types

In [None]:
# Collect, per FDA statement, the standardized and raw cancer names plus the
# biomarker and therapy strings, then count the distinct cancer names.
standardized_cancer, raw_cancer = [], []
biomarker, therapy = [], []
for stmt in fda_statements:
    standardized_cancer_i = stmt.get("proposition", {}).get("conditionQualifier", {}).get("name", "Unknown cancer")
    raw_cancer_i = stmt['indication']['raw_cancer_type']

    standardized_cancer.append(standardized_cancer_i)
    raw_cancer.append(raw_cancer_i)
    biomarker.append(extract_biomarker_info(stmt)['str'])
    therapy.append(extract_therapy_info(stmt)['str']['drug_str'])

raw_cancer_unique_counts = Counter(raw_cancer)
standardized_cancer_unique_counts = Counter(standardized_cancer)

In [None]:
# Tabulate standardized vs. raw cancer names with the biomarker and therapy
# strings. Two files are written: the full table, and one de-duplicated on the
# (standardized_cancer, raw_cancer) pair.
standardized_to_raw_mapping = pd.DataFrame({"standardized_cancer": standardized_cancer, "raw_cancer": raw_cancer, "biomarker": biomarker, "therapy": therapy})
standardized_to_raw_mapping.to_csv("data/latest_db/standardized_to_raw_cancer_biomarker_mapping.csv", index=False)
standardized_to_raw_mapping.drop_duplicates(subset=['standardized_cancer', 'raw_cancer']).to_csv("data/latest_db/standardized_to_raw_cancer_mapping.csv", index=False)

Create a list of all modifiers from the DB

In [None]:
# Clinical modifier phrases. They are lower-cased because they are matched
# against lower-cased raw cancer type text, and saved to a JSON file named
# after the current DB version.
modifiers = [
    'early', 'advanced or metastatic', 'metastatic', 'metastatic castration-resistant', 'locally advanced or metastatic', 
    'low-risk', 'philadelphia chromosome-positive', 'non-squamous', 'unresectable or metastatic', 'systemic', 'cutaneous',
    'unresectable, recurrent, or refractory', 'chronic, accelerated, or myeloid or lymphoid blast phase ph+', 
    'primary advanced or recurrent', 'recurrent or advanced', 'relapsed or refractory', 'unresectable, locally or metastatic',
    'unresectable and/or metastatic maligant', 'unresectable locally advanced or metastatic', 'b-cell precursor',
    'metastatic or recurrent', 'transfusion-dependent anemia due to low- or intermediate-1-risk', 'early-stage',
    'chronic phase or accelerated phase ph+', 'recurrent', 'advanced', 'relapsed', 'high risk early', 
    'metastatic or unresectable, recurrent', 'locally advanced unresectable or metastatic', 'persistent, recurrent, or metastatic',
    'recurrent or metastatic', 'high-risk early-stage triple-negative', 'locally recurrent unresectable or metastatic triple-negative',
    'locally advanced, inflammatory, or early stage', 'early stage', 'b-cell', 'diffuse large b-cell', 'mature B-cell',
    'previously untreated, advanced stage, cd20-positive', 'unresectable locally advanced or metastatic triple-negative',
    'metastatic non-squamous'
]
modifiers = [mod.lower() for mod in modifiers]
with open(f"data/latest_db/disease_modifiers__{_VERSION}.json", "w") as f:
    json.dump(modifiers, f)

Append modifiers to standardized cancer names to add context

In [23]:
def extract_clinical_modifiers(raw_cancer_type, standardized_cancer_type, modifiers):
    """Return the longest modifier found in the raw cancer type, or None.

    A modifier qualifies when it is a substring of the lower-cased raw cancer
    type and is not already a substring of the lower-cased standardized name.
    When several qualify, the longest wins; among equally long ones the
    earliest in `modifiers` is returned.
    """
    raw_lower = raw_cancer_type.lower()
    standardized_lower = standardized_cancer_type.lower()

    candidates = []
    for mod in modifiers:
        if mod in raw_lower and mod not in standardized_lower:
            candidates.append(mod)

    # max() returns the first of several equally long candidates and the sole
    # element of a one-item list, so a single call covers every non-empty case.
    return max(candidates, key=len) if candidates else None

# Build the columns of the core table, one entry per FDA statement.
statement_id, standardized_cancer, raw_cancer = [], [], []
modified_standardized_cancer, biomarker, therapy = [], [], []
for stmt in fda_statements:
    standardized_cancer_i = stmt.get("proposition", {}).get("conditionQualifier", {}).get("name", "Unknown cancer")
    raw_cancer_i = stmt['indication']['raw_cancer_type']

    # Prefix the clinical modifier (when one is found) to the lower-cased
    # standardized name; the modifier is looked up once and reused.
    extracted_modifiers = extract_clinical_modifiers(raw_cancer_i, standardized_cancer_i, modifiers)
    modified_standardized_cancer_i = (
        f"{extracted_modifiers} {standardized_cancer_i.lower()}"
        if extracted_modifiers
        else standardized_cancer_i.lower()
    )

    statement_id.append(stmt.get('id'))
    standardized_cancer.append(standardized_cancer_i)
    raw_cancer.append(raw_cancer_i)
    modified_standardized_cancer.append(modified_standardized_cancer_i)
    biomarker.append(extract_biomarker_info(stmt)['list'])
    therapy.append(extract_therapy_info(stmt)['list']['drugList'])
    

Create a DB core dataframe with:
- standardized cancer name -> later used for entity matching
- raw cancer name 
- modified standardized cancer name -> later used for context DB
- biomarker -> later used for entity matching
- therapy -> later used for ground-truth mapping during validation on synthetic queries

In [24]:
# Assemble the core table from the per-statement lists built above; the
# biomarker and therapy columns hold lists of names.
standardized_to_raw_mapping = pd.DataFrame({
    "statement_id": statement_id,
    "standardized_cancer": standardized_cancer, 
    "raw_cancer": raw_cancer, 
    "modified_standardized_cancer": modified_standardized_cancer,
    "biomarker": biomarker, 
    "therapy": therapy
    })
standardized_to_raw_mapping

Unnamed: 0,statement_id,standardized_cancer,raw_cancer,modified_standardized_cancer,biomarker,therapy
0,0,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"[HER2-negative, ER positive]","[Tamoxifen, Abemaciclib]"
1,1,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"[PR positive, HER2-negative]","[Tamoxifen, Abemaciclib]"
2,2,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"[PR positive, HER2-negative, ER positive]","[Tamoxifen, Abemaciclib]"
3,3,Invasive Breast Carcinoma,advanced or metastatic breast cancer,advanced or metastatic invasive breast carcinoma,"[HER2-negative, ER positive]","[Anastrozole, Abemaciclib]"
4,4,Invasive Breast Carcinoma,advanced or metastatic breast cancer,advanced or metastatic invasive breast carcinoma,"[PR positive, HER2-negative]","[Anastrozole, Abemaciclib]"
...,...,...,...,...,...,...
637,641,Invasive Breast Carcinoma,breast cancer,invasive breast carcinoma,"[HER2-negative, ER positive]",[Datopotamab deruxtecan]
638,642,Non-Small Cell Lung Cancer,breast cancer,non-small cell lung cancer,"[PR positive, HER2-negative]",[Datopotamab deruxtecan]
639,643,Invasive Breast Carcinoma,breast cancer,invasive breast carcinoma,"[PR positive, HER2-negative, ER positive]",[Datopotamab deruxtecan]
640,644,Non-Small Cell Lung Cancer,non-small cell lung cancer (NSCLC),non-small cell lung cancer,[EGFR Exon 20 (Insertion)],[Sunvozertinib]


In [None]:
# Save the core table under a file name that carries the DB version.
standardized_to_raw_mapping.to_csv(f"data/latest_db/moalmanac_core__{_VERSION}.csv", index=False)

In [None]:
# For manual inspection of modifiers: one row per distinct
# (standardized_cancer, raw_cancer) pair.
standardized_to_raw_mapping.drop_duplicates(subset=['standardized_cancer', 'raw_cancer']).to_csv("data/latest_db/standardized_to_raw_cancer_mapping.csv", index=False)