In [146]:
import pandas as pd
import numpy as np
import os, requests
import warnings
warnings.filterwarnings("ignore")

In [147]:
# Fetch the organizations known to the Molecular Oncology Almanac API.
# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of as a KeyError on 'data'.
organizations_response = requests.get('https://api.moalmanac.org/organizations', timeout=60)
organizations_response.raise_for_status()
organizations = organizations_response.json()['data']
organizations

[{'id': 'fda',
  'name': 'Food and Drug Administration',
  'description': 'Regulatory agency that approves drugs for use in the United States.',
  'url': 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm',
  'last_updated': '2025-04-03'},
 {'id': 'ema',
  'name': 'European Medicines Agency',
  'description': 'Regulatory agency that approves medicines for use in the European Union.',
  'url': 'https://www.ema.europa.eu/en/medicines',
  'last_updated': '2024-10-20'},
 {'id': 'hse',
  'name': 'Health Service Executive',
  'description': 'Regulatory agency that approves medicines for reimbursement by the public health system in the Republic of Ireland.',
  'url': 'https://www.hse.ie/eng/services/list/5/cancer/profinfo/chemoprotocols/',
  'last_updated': '2024-10-16'},
 {'id': 'hpra',
  'name': 'Health Products Regulatory Authority',
  'description': 'National register of authorized medicines in the Republic of Ireland.',
  'url': 'http://www.hpra.ie/homepage/medicines/medicines-in

Here, we're interested only in the statements from the United States (FDA).

In [148]:
# Download every statement from the API and flatten the nested JSON records into
# one row per statement (nested keys become dotted column names).
# The timeout and raise_for_status() make network/HTTP failures explicit instead of
# hanging the kernel or failing later with a KeyError on 'data'.
statements_response = requests.get('https://api.moalmanac.org/statements', timeout=60)
statements_response.raise_for_status()
data = pd.json_normalize(statements_response.json()['data'])
data.head(1)

Unnamed: 0,id,type,description,contributions,reportedIn,direction,indication.id,indication.indication,indication.initial_approval_date,indication.initial_approval_url,...,proposition.objectTherapeutic.conceptType,proposition.objectTherapeutic.name,proposition.objectTherapeutic.primaryCoding.id,proposition.objectTherapeutic.primaryCoding.code,proposition.objectTherapeutic.primaryCoding.name,proposition.objectTherapeutic.primaryCoding.system,proposition.objectTherapeutic.primaryCoding.systemVersion,proposition.objectTherapeutic.primaryCoding.iris,proposition.objectTherapeutic.mappings,proposition.objectTherapeutic.extensions
0,0,Statement,The U.S. Food and Drug Administration (FDA) gr...,"[{'id': 0, 'type': 'Contribution', 'agent': {'...","[{'id': 'doc:fda.verzenio', 'type': 'Document'...",supports,ind:fda.verzenio:0,Verzenio is a kinase inhibitor indicated in co...,2023-03-03,https://www.accessdata.fda.gov/drugsatfda_docs...,...,,,,,,,,,,


In [149]:
def _agency_of(indication_id):
    '''
    Return the agency token of an indication id, e.g. 'ind:fda.verzenio:0' -> 'fda'.
    '''
    # The agency sits between the first ':' and the following '.'.
    after_prefix = indication_id.split(':')[1]
    return after_prefix.split('.')[0]

data['agency'] = data['indication.id'].map(_agency_of)
data['agency'].value_counts()

agency
fda     631
ema     422
hse     350
hc      332
hpra     22
Name: count, dtype: int64

In [150]:
# Keep only the statements issued by the selected agency and save the raw rows.
selected_agencies = ['fda']
is_selected = data['agency'].isin(selected_agencies)
data = data.loc[is_selected]
data.to_csv('moa_all_usa_statements.csv', index=False)

In [151]:
_THERAPIES_COL = 'proposition.objectTherapeutic.therapies'
_THERAPY_NAME_COL = 'proposition.objectTherapeutic.name'
_THERAPY_EXTENSIONS_COL = 'proposition.objectTherapeutic.extensions'


def _is_combination(row):
    '''
    True when the row carries a list of therapies (combination), False when the
    therapies cell is missing and the single therapeutic fields are used instead.
    '''
    return isinstance(row[_THERAPIES_COL], list)


def _therapy_members(row):
    '''
    Return the row's drugs as a list of dicts with 'name' and 'extensions' keys.

    Combination rows return their therapies list as-is; monotherapy rows are wrapped
    into a one-element list so both cases can be processed the same way.
    '''
    if _is_combination(row):
        return row[_THERAPIES_COL]
    extensions = row[_THERAPY_EXTENSIONS_COL]
    return [{'name': row[_THERAPY_NAME_COL],
             'extensions': extensions if isinstance(extensions, list) else []}]


def _extension_values(row, extension_name):
    '''
    Collect the 'value' of every extension called `extension_name` across the row's drugs.

    Extensions are matched by name rather than by position, so a drug whose
    extensions arrive in a different order is not silently dropped.
    '''
    return [extension['value']
            for member in _therapy_members(row)
            for extension in (member.get('extensions') or [])
            if extension.get('name') == extension_name]


# .copy() makes this an independent frame; adding columns to a bare slice of `data`
# triggers pandas' SettingWithCopyWarning and may not behave as intended.
formatted_table = data[['indication.id', 'agency',
                        'indication.description',
                        'proposition.conditionQualifier.name',
                        'proposition.biomarkers',
                        _THERAPIES_COL,
                        _THERAPY_NAME_COL,
                        _THERAPY_EXTENSIONS_COL,
                        'indication.document.url',
                        'indication.document.publication_date'
                        ]].copy()
formatted_table['therapy_name'] = formatted_table.apply(
    lambda row: [member['name'] for member in _therapy_members(row)], axis=1)
formatted_table['therapy_approach'] = formatted_table.apply(
    lambda row: 'Combination therapy' if _is_combination(row) else 'Monotherapy', axis=1)
# A therapy_strategy value is a list; only its first entry is kept.
formatted_table['therapy_strategy'] = formatted_table.apply(
    lambda row: [value[0] for value in _extension_values(row, 'therapy_strategy')], axis=1)
formatted_table['therapy_type'] = formatted_table.apply(
    lambda row: _extension_values(row, 'therapy_type'), axis=1)
formatted_table.head(1)


Unnamed: 0,indication.id,agency,indication.description,proposition.conditionQualifier.name,proposition.biomarkers,proposition.objectTherapeutic.therapies,proposition.objectTherapeutic.name,proposition.objectTherapeutic.extensions,indication.document.url,indication.document.publication_date,therapy_name,therapy_approach,therapy_strategy,therapy_type
0,ind:fda.verzenio:0,fda,The U.S. Food and Drug Administration (FDA) gr...,Invasive Breast Carcinoma,"[{'id': 2, 'name': 'HER2-negative', 'genes': [...","[{'id': 99, 'conceptType': 'Drug', 'name': 'Ab...",,,https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,"[Abemaciclib, Tamoxifen]",Combination therapy,"[CDK4/6 inhibition, Estrogen receptor inhibition]","[Targeted therapy, Hormone therapy]"


In [152]:
def format_biomarker(biomarker_list):
    '''
    Format a statement's biomarker entries as display labels.

    Parameters:
        biomarker_list: list of biomarker dicts, each with a 'name' and, optionally,
            an 'extensions' list of {'name': ..., 'value': ...} dicts. Any value that
            is not a list/tuple (for example None or a NaN cell) is treated as
            "no biomarkers".

    Returns:
        A list of strings shaped like '<name> [<presence>]'. <presence> is 'present'
        or 'not present' when the 'present' extension is a boolean, the raw value for
        anything else, and empty when the extension is absent. Entries without a
        name are skipped.
    '''
    # A missing cell arrives as NaN/None rather than a list; there is nothing to format.
    if not isinstance(biomarker_list, (list, tuple)):
        return []
    biomarkers = []
    for biomarker in biomarker_list:
        name = biomarker.get('name')
        if not name:
            continue
        extension_values = {extension['name']: extension['value']
                            for extension in (biomarker.get('extensions') or [])}
        present = extension_values.get('present', '')
        # Map both boolean states to words so False does not leak out as '[False]'.
        if isinstance(present, bool):
            present = 'present' if present else 'not present'
        biomarkers.append('{marker} [{present}]'.format(marker=name, present=present))
    return biomarkers
# Turn each row's raw biomarker records into display labels, then drop the nested
# source columns that are no longer needed.
formatted_table['biomarker'] = formatted_table['proposition.biomarkers'].map(format_biomarker)
raw_nested_columns = [
    'proposition.biomarkers',
    'proposition.objectTherapeutic.extensions',
    'proposition.objectTherapeutic.name',
    'proposition.objectTherapeutic.therapies',
]
formatted_table = formatted_table.drop(columns=raw_nested_columns)

In [153]:
# Show the remaining column names and their order.
formatted_table.columns

Index(['indication.id', 'agency', 'indication.description',
       'proposition.conditionQualifier.name', 'indication.document.url',
       'indication.document.publication_date', 'therapy_name',
       'therapy_approach', 'therapy_strategy', 'therapy_type', 'biomarker'],
      dtype='object')

In [154]:
# Rename the dotted API field names to the short names used from here on.
# Renaming through an explicit mapping (instead of assigning a positional list to
# `.columns`) keeps every label tied to its source column even if the column order
# changes upstream; columns not listed here keep their names.
column_renames = {
    'indication.id': 'indication_id',
    'indication.description': 'indication',
    'proposition.conditionQualifier.name': 'cancer_type',
    'indication.document.url': 'approval_url',
    'indication.document.publication_date': 'publication_date',
}
formatted_table = formatted_table.rename(columns=column_renames)
formatted_table.to_csv('moa_all_usa_formatted.csv', index=False)
print(formatted_table.shape)
formatted_table.head(1)

(631, 11)


Unnamed: 0,indication_id,agency,indication,cancer_type,approval_url,publication_date,therapy_name,therapy_approach,therapy_strategy,therapy_type,biomarker
0,ind:fda.verzenio:0,fda,The U.S. Food and Drug Administration (FDA) gr...,Invasive Breast Carcinoma,https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,"[Abemaciclib, Tamoxifen]",Combination therapy,"[CDK4/6 inhibition, Estrogen receptor inhibition]","[Targeted therapy, Hormone therapy]","[HER2-negative [present], ER positive [present]]"


In [155]:
def format_context(x):
    '''
    Render one formatted-table row as a multi-line plain-text context block.

    Parameters:
        x: a row (e.g. a pandas Series) exposing the attributes indication,
            cancer_type, approval_url, publication_date and therapy_approach as
            strings, and biomarker, therapy_name, therapy_strategy and therapy_type
            as lists of strings. If the row has an `agency` attribute it is used in
            the approval-status line; otherwise 'fda' is used.

    Returns:
        A string of 'Label: value' lines separated by newlines, with no trailing
        newline.
    '''
    # Take the agency from the row so the text stays correct for non-FDA rows;
    # fall back to the previous hard-coded value when the row has no agency.
    agency = getattr(x, 'agency', 'fda')
    lines = [
        'Approval status: Regulatory approval ({agency})'.format(agency=agency),
        'Indication: ' + x.indication,
        'Cancer type: ' + x.cancer_type,
        'Biomarkers: ' + ', '.join(x.biomarker),
        'Therapy: ' + ' + '.join(x.therapy_name),
        'Therapy approach: ' + x.therapy_approach,
        'Therapy strategy: ' + ' + '.join(x.therapy_strategy),
        'Therapy type: ' + ' + '.join(x.therapy_type),
        'Approval url: ' + x.approval_url,
        'Publication date: ' + x.publication_date,
    ]
    return '\n'.join(lines)
# Build the context text for every row, store it, and print the first one as a check.
context_column = formatted_table.apply(format_context, axis=1)
formatted_table['context'] = context_column
print(context_column.iloc[0])

Approval status: Regulatory approval (fda)
Indication: The U.S. Food and Drug Administration (FDA) granted approval to abemaciclib in combination with endocrine therapy (tamoxifen or an aromatase inhibitor) for the adjuvant treatment of adult patients with hormone receptor (HR)-positive, human epidermal growth factor 2 (HER2)-negative, node positive, early breast cancer at high risk of recurrence. This indication is based on the monarchE (NCT03155997) clinical trial, which was a randomized (1:1), open-label, two cohort, multicenter study. Initial endocrine therapy received by patients included letrozole (39%), tamoxifen (31%), anastrozole (22%), or exemestane (8%).
Cancer type: Invasive Breast Carcinoma
Biomarkers: HER2-negative [present], ER positive [present]
Therapy: Abemaciclib + Tamoxifen
Therapy approach: Combination therapy
Therapy strategy: CDK4/6 inhibition + Estrogen receptor inhibition
Therapy type: Targeted therapy + Hormone therapy
Approval url: https://www.accessdata.fda.

In [156]:
# Write the table (now including the context column) to disk and preview it.
formatted_csv_path = 'moa_all_usa_formatted.csv'
formatted_table.to_csv(formatted_csv_path, index=False)
print(formatted_table.shape)
formatted_table.head(n=1)

(631, 12)


Unnamed: 0,indication_id,agency,indication,cancer_type,approval_url,publication_date,therapy_name,therapy_approach,therapy_strategy,therapy_type,biomarker,context
0,ind:fda.verzenio:0,fda,The U.S. Food and Drug Administration (FDA) gr...,Invasive Breast Carcinoma,https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,"[Abemaciclib, Tamoxifen]",Combination therapy,"[CDK4/6 inhibition, Estrogen receptor inhibition]","[Targeted therapy, Hormone therapy]","[HER2-negative [present], ER positive [present]]",Approval status: Regulatory approval (fda)\nIn...


- approval status
- approval org : agency
- description : description
- indication : indication.indication
- cancer_type : proposition.conditionQualifier.name
- biomarker : proposition.biomarkers
- therapy_drug 
- therapy_approach
- therapy_strategy
- therapy_type
- approval_url : indication.document.url
- publication_date : indication.document.publication_date

---
### Compare against Helena's table

In [112]:
# Load the reference context table from CSV for comparison with formatted_table.
reference_csv_path = 'moalmanac-draft.dereferenced.context_db.csv'
json_table = pd.read_csv(reference_csv_path)
print(json_table.shape)
json_table.head(5)

(628, 12)


Unnamed: 0,approval_status,approval_org,indication,cancer_type,biomarker,therapy_drug,therapy_approach,therapy_strategy,therapy_type,approval_url,publication_date,context
0,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nIn...
1,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Invasive Breast Carcinoma,"HER2-negative [present], PR positive [present]","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nIn...
2,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]...","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nIn...
3,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]","['Anastrozole', 'Abemaciclib']",Combination therapy,"['Aromatase inhibition', 'CDK4/6 inhibition']","['Hormone therapy', 'Targeted therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nIn...
4,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Invasive Breast Carcinoma,"HER2-negative [present], PR positive [present]","['Anastrozole', 'Abemaciclib']",Combination therapy,"['Aromatase inhibition', 'CDK4/6 inhibition']","['Hormone therapy', 'Targeted therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nIn...


In [144]:
def _approval_url_counts(table):
    '''
    Return a two-column frame ('approval_url', 'count') with the number of rows per URL.

    The column names are set explicitly because the names produced by
    value_counts().reset_index() differ between pandas versions.
    '''
    return (table['approval_url']
            .value_counts()
            .rename_axis('approval_url')
            .reset_index(name='count'))


# count_x = rows per URL in the reference table, count_y = rows per URL in formatted_table.
df = _approval_url_counts(json_table)
df2 = _approval_url_counts(formatted_table)
# The outer merge keeps URLs that appear in only one table; their missing count is NaN,
# which never equals the other side, so they are listed as mismatches too.
df = df.merge(df2, on='approval_url', how='outer')
df[df['count_x'] != df['count_y']]

Unnamed: 0,approval_url,count_x,count_y
1,https://www.accessdata.fda.gov/drugsatfda_docs...,2,4
8,https://www.accessdata.fda.gov/drugsatfda_docs...,11,10
34,https://www.accessdata.fda.gov/drugsatfda_docs...,3,2
54,https://www.accessdata.fda.gov/drugsatfda_docs...,4,6
72,https://www.accessdata.fda.gov/drugsatfda_docs...,4,5
