# HEAL Monday Studies Board Update pipeline

Jupyter notebook that follows the SOP for updating the HEAL Monday Board.
The notebook can be used either for step-by-step exploration or for running from a service like Google Colab.

In [None]:
## If running on Google Colab, this cell mounts Google Drive so files stored there can be read.
## Outside Colab the `google.colab` package is not installed; the cell then reports that
## and does nothing, instead of raising ImportError and breaking "Restart Kernel -> Run All".
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive', force_remount=True)

In [None]:
## If running from Google Colab, might need to install this library
!pip install xlsxwriter

In [1]:
import sys
import logging
from pathlib import Path
sys.path.append('../scripts/')
import monday_board_update


In [2]:
## Set this to the directory where:
## 1- Monday Studies board has been exported.
## 2- All relevant tables from the HEAL MySQL database have been exported as CSV files.
##
## The directory can be supplied without editing the notebook by setting the
## HEAL_MONDAY_INPUT_DIR environment variable (e.g. "/path/to/data/dir").
## The literal path below is only the fallback used when that variable is not set.
import os

input_dir = Path(
    os.environ.get(
        "HEAL_MONDAY_INPUT_DIR",
        "/Users/hinashah/Documents/HEAL/MondayUpdate_Feb2026",
    )
).expanduser()

In [3]:
## Setup logger
## - report-log.txt inside input_dir receives every record with timestamp and level.
## - The notebook output (stdout) shows only the bare message, as before.
## Both handlers are handed to basicConfig together with force=True, so that:
##   * re-running this cell replaces the root logger's handlers instead of stacking one
##     more StreamHandler (which printed every message once per re-run), and
##   * the configuration is applied even when the root logger already has handlers,
##     a case in which basicConfig would otherwise silently do nothing.
log_file_handler = logging.FileHandler(input_dir / "report-log.txt")

console_handler = logging.StreamHandler(sys.stdout)
# Set explicitly so basicConfig does not apply the timestamped file format to the console.
console_handler.setFormatter(logging.Formatter("%(message)s"))

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[log_file_handler, console_handler],
    force=True,
)


In [4]:
# STEP 1: load the study lookup table found under input_dir. The returned table
# (gt_file) is what later cells compare the Monday board against and join other data to.
# NOTE(review): judging by the recorded output, the import also reports row counts and
# distinct/NA counts for the id columns — confirm in monday_board_update.
logging.info("---- STEP 1: Looking at Study Lookup Table")
gt_file = monday_board_update.import_study_lookup_table(input_dir)


---- STEP 1: Looking at Study Lookup Table
Number of entries in study lookup table: 2660
Number of distinct values in --appl_id--: 2626
---- NA count: 0
Number of distinct values in --xstudy_id--: 1623
---- NA count: 0
Number of distinct values in --study_hdp_id--: 1497
---- NA count: 189
Number of distinct values in --study_hdp_id_appl--: 1475
---- NA count: 189
Number of distinct values in --study_most_recent_appl--: 1602
---- NA count: 0
Number of distinct values in --study_first_appl--: 1601
---- NA count: 0
Number of distinct values in --compound_key--: 2648
---- NA count: 0


In [5]:
# STEP 2: load the exported Monday Studies board from input_dir.
# NOTE(review): the recorded output lists two .xlsx exports (one named *_Deletions);
# presumably both are read — confirm how import_monday_board combines them.
logging.info("---- STEP 2: Importing Monday Studies Board")
monday_board = monday_board_update.import_monday_board(input_dir)

---- STEP 2: Importing Monday Studies Board
[PosixPath('/Users/hinashah/Documents/HEAL/MondayUpdate_Feb2026/HEAL_Studies_1770661804_Deletions.xlsx'), PosixPath('/Users/hinashah/Documents/HEAL/MondayUpdate_Feb2026/HEAL_Studies_1770389317.xlsx')]
Index(['Name', 'Most Recent Appl_ID', 'HDP appl_ID', 'Project #', 'Archived',
       'HEAL-Related', 'Data Type', 'Research Focus', 'Research Program',
       'Research Network', 'Title', 'Contact PI', 'Contact Email',
       'Administering IC', 'NIH PO', 'NIH PO Email', 'Institution(s)', 'PI(s)',
       'Location', 'Activity Code', 'Award Type', 'Award Year', 'Total Funded',
       'Summary', 'Project Start', 'Project End', 'Reporter Link', 'SBIR/STTR',
       'Data Engagement', 'Repo per Platform', 'Platform Reg Time',
       'CEDAR Form %', 'Creation Log', 'study_type',
       '"Get the Data" Engagement Board', 'VLMD Status', 'DD Tracker',
       'Checklist Exempt', 'Do not Engage', 'link to Data Dictionary Tracker',
       '_tmp'],
      dty

In [6]:
# STEP 3: compare the lookup table with the Monday board. Two values come back:
#   mondayboard_missingin_lookup - reused by the export step and by the ad-hoc check cell
#   lookup_fields                - passed on to import_mysql_data in STEP 4
# NOTE(review): per the recorded output, the first holds Monday records that are absent
# from the lookup table and should be investigated — confirm against the module.
logging.info("---- STEP 3: Compare lookup table and Monday Board")
mondayboard_missingin_lookup, lookup_fields = monday_board_update.compare_study_loookup_monday(gt_file, monday_board)

---- STEP 3: Compare lookup table and Monday Board
Number records from Monday already in lookup table: 0
Number records from Monday that are not in lookup table (Consider these as discrepancies **Investigate**): 34
Number records from lookup table that are not on Monday (Potentially new entries): 1623
Entries in Monday that are not in lookup table
                 Name Most Recent Appl_ID HDP appl_ID  \
0            HDP01518                   -           -   
1            HDP01519                   -           -   
2            HDP01514                   -           -   
3            HDP01515                   -           -   
4            HDP01517                   -           -   
5            HDP01516                   -           -   
6            HDP01513                   -           -   
11  10428343_HDP00882                 NaN         NaN   
12  10488140_HDP00883                 NaN         NaN   
13       9673173_none                 NaN         NaN   
14       9769689_none  

In [7]:
# import pandas as pd
# def get_unique_values(df:pd.DataFrame, col_name:str='appl_id'):
#     if col_name in df.columns:
#         return df[ ~pd.isna(df[col_name])][col_name].drop_duplicates()
#     return None

# convert_dict = {'appl_id':str}
# resnet_df = pd.read_csv(input_dir/"research_networks.csv", low_memory=False, dtype=convert_dict)
# logging.info(f"Research Network table has: {len(resnet_df)} entrie, with {len(get_unique_values(resnet_df))} appl_ids")
# print(resnet_df.columns)
# appl_ids = gt_file[['appl_id', 'study_most_recent_appl']].drop_duplicates()
# print(len(appl_ids))
# resnet_added = pd.merge(appl_ids, resnet_df[['appl_id', 'res_net']], how = 'left', left_on='appl_id', right_on='appl_id' )
# print(len(resnet_added))
# resnet_most_recent_appl_id = resnet_added[~pd.isna(resnet_added.res_net)][['study_most_recent_appl', 'res_net']]
# resnet_added_updated = pd.merge(appl_ids, resnet_most_recent_appl_id, how='left', left_on='study_most_recent_appl', right_on='study_most_recent_appl')
# resnet_added_updated.to_csv("/tmp/tmp_resnet_updated.csv", index=False)

In [8]:
# STEP 4: build combined_data_ph1 from the MySQL table exports in input_dir, using the
# lookup table, the Monday board and lookup_fields from the previous steps.
# NOTE(review): the recorded output of this cell prints PI e-mail addresses; clear the
# cell output before sharing or committing the notebook.
logging.info("---- STEP 4: Importing tables from MySQL and combining relevant information")
combined_data_ph1 = monday_board_update.import_mysql_data(input_dir, gt_file, monday_board, lookup_fields)

---- STEP 4: Importing tables from MySQL and combining relevant information
Awards table has: 2439 entries, with 2439 appl_ids
*** Combining the two reporter tables
2439
489
2928
Reporter table has: 2928 entries, with 2928 appl_ids
Platform generated table has: 1680 entries, with 1608 appl_ids
Platform table has 1608 unique HDP IDs
Repo mapping table has: 1748 entries, with 1748 appl_ids
Research Network table has: 2439 entries, with 2439 appl_ids
Engagment Flags table has: 2626 entries, with 2626 appl_ids
PO Emails table has: 420 entries, with 420 appl_ids
--- Wrangling PI Emails
ALL PI emails associated with a project (identified by most_recent_appl)
      study_most_recent_appl               pi_email_latest
1                   9755001             kwatkins@rand.org
3                  10088639               damico@rand.org
4                  11195709         acfernan@med.umich.ed
6                  11196080               fqeadan@luc.edu
8                   9869480             LYNN.DEB

In [9]:
# STEP 5: fill gaps in the combined data using MDS data.
# NOTE: combined_data_ph1 is overwritten with the result, so re-running this cell on its
# own feeds the already-filled data back in; re-run STEP 4 first for a clean pass.
logging.info("---- STEP 5: Filling holes with MDS data")
combined_data_ph1 = monday_board_update.fill_in_holes_from_mds(input_dir, combined_data_ph1)

---- STEP 5: Filling holes with MDS data


In [10]:
    ## Add CTN  data
# STEP 6: collect the CTN study fields into ctn_fields_platform (merged in STEP 7).
# NOTE(review): the recorded run shows a pandas SettingWithCopyWarning raised inside
# get_ctndata_from_mds (the ctn_data['project_title'] assignment) — worth fixing in the module.
logging.info("---- STEP 6: Adding any CTN data from MDS")
ctn_fields_platform = monday_board_update.get_ctndata_from_mds(input_dir)

---- STEP 6: Adding any CTN data from MDS
Number of CTN entries found in Platform MDS 44


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctn_data['project_title'] = ctn_data['project_title'].replace('0', '')


In [11]:
# STEP 7: combine the MySQL-derived data (combined_data_ph1) with the CTN fields
# from STEP 6 into all_data.
logging.info("---- STEP 7: Combining everything together")
all_data = monday_board_update.combine_mysql_ctn(combined_data_ph1, ctn_fields_platform)


---- STEP 7: Combining everything together
------------ Preview of the final combined dataset ---------------
     Activity Code Administering IC Archived Award Type  Award Year  \
0              U01            NIAAA     live          3      2019.0   
1              R34            NIAAA     live          3      2019.0   
2              R01            NIAAA     live          3      2020.0   
3              R34            NIAAA     live          5      2025.0   
4              R01            NIAAA     live          5      2025.0   
...            ...              ...      ...        ...         ...   
1318           NaN              NaN     live          0         NaN   
1498           NaN              NaN     live          0         NaN   
1499           NaN              NaN     live          0         NaN   
1500           NaN              NaN     live          0         NaN   
1501           NaN              NaN     live          0         NaN   

      CEDAR Form %  Checklist Exempt 

In [12]:
# STEP 8: final clean-up of all_data into combined_data, the table that the export step
# and the check cells further down operate on.
logging.info("---- STEP 8: Final Manipulation of all the data to make it Monday Board ready")
combined_data = monday_board_update.prepare_for_monday(all_data)

---- STEP 8: Final Manipulation of all the data to make it Monday Board ready
Counts for study types in the final dataset
study_type
HDP           1497
APPLIDONLY     126
CTN             44
Name: count, dtype: int64
Setting empty cells to  '-' in the following colulmns:
['Activity Code', 'Administering IC', 'Award Type', 'Contact Email', 'Contact PI', 'Data Type', 'Institution(s)', 'NIH PO', 'NIH PO Email', 'PI(s)', 'Project #', 'Repo per Platform', 'Reporter Link', 'Research Focus', 'Research Network', 'Research Program', 'Summary', 'Title', 'key', 'HDP appl_ID', 'Most Recent Appl_ID']


In [13]:
# STEP 9: report the final comparison against the Monday board and export the result.
# NOTE(review): presumably the export files are written under input_dir — confirm in
# export_finaldata.
logging.info("---- STEP 9: Final numbers and Export")
monday_board_update.export_finaldata(input_dir, combined_data, mondayboard_missingin_lookup, monday_board)  

---- STEP 9: Final numbers and Export
******************* MONDAY COMPARISON  ******************************************
Number records from Monday already in final dataset: 0
Number records from Monday that are not in final dataset (Consider these as discrepancies **Investigate**): 34
Number records from final dataset that are not on Monday (Potentially new entries): 1667
****** Investigate/Delete the following entries on Monday that are not in the final dataset
[['HDP01518' '-' 'HDP']
 ['HDP01519' '-' 'HDP']
 ['HDP01514' '-' 'HDP']
 ['HDP01515' '-' 'HDP']
 ['HDP01517' '-' 'HDP']
 ['HDP01516' '-' 'HDP']
 ['HDP01513' '-' 'HDP']
 ['10428343_HDP00882' nan nan]
 ['10488140_HDP00883' nan nan]
 ['9673173_none' nan nan]
 ['9769689_none' nan nan]
 ['10934856' '10934856' 'APPLIDONLY']
 ['11129951' '11129951' 'APPLIDONLY']
 ['10881322' '10881322' 'APPLIDONLY']
 ['11009694' '11009694' 'APPLIDONLY']
 ['11044441' '11044441' 'APPLIDONLY']
 ['11046758' '11046758' 'APPLIDONLY']
 ['10936304' '10936304'

In [None]:
# Names of the entries in mondayboard_missingin_lookup whose study_type is not 'CTN'.
not_ctn = mondayboard_missingin_lookup['study_type'] != 'CTN'
mondayboard_missingin_lookup.loc[not_ctn, 'Name'].values

In [None]:
"""
If the answer to: Making sure uniqueness of key values. Do we have one row per key(HDPID/APPLID)? is False, 
Run the script here.
"""
# Number of rows per key; the cell displays only the keys that occur more than once.
counts = combined_data['key'].value_counts()
counts[counts.gt(1)]

In [None]:
# Show every row of the final dataset stored under one specific key (edit the key as needed).
combined_data.loc[combined_data['key'] == 'HDP01187']

In [None]:
import pandas as pd

def get_unique_values(df:pd.DataFrame, col_name:str='appl_id'):
    """Return the distinct non-missing values of one column.

    Args:
        df: Table to read from.
        col_name: Column whose values are wanted (defaults to 'appl_id').

    Returns:
        A Series holding the first occurrence of each non-null value, with the
        original index labels kept, or None when ``col_name`` is not a column of ``df``.
    """
    if col_name not in df.columns:
        return None
    return df[col_name].dropna().drop_duplicates()

# Read the research-network export, keeping appl_id as text.
convert_dict = {'appl_id': str}
resnet_df = pd.read_csv(input_dir / "research_networks.csv", dtype=convert_dict, low_memory=False)
logging.info(f"Research Network table has: {len(resnet_df)} entries, with {len(get_unique_values(resnet_df))} appl_ids")

# Distinct (appl_id, study_most_recent_appl) pairs from the lookup table, and the
# research-network rows that actually carry a res_net value.
appl_ids = gt_file.loc[:, ['appl_id', 'study_most_recent_appl']].drop_duplicates()
resnets = resnet_df.loc[resnet_df['res_net'].notna(), ['appl_id', 'res_net']]

# First look the network up via the study's most recent appl_id. Both sides have an
# 'appl_id' column, so pandas suffixes them; 'appl_id_x' is the lookup table's own appl_id.
resnets_merged = appl_ids.merge(resnets, how='left', left_on='study_most_recent_appl', right_on='appl_id')

# Then look it up again via the row's own appl_id, drop exact duplicate rows and export.
resnets_merged_again = (
    resnets_merged
    .merge(resnets, how='left', left_on='appl_id_x', right_on='appl_id')
    .drop_duplicates()
)
resnets_merged_again.to_csv("/tmp/resnet.csv")


In [None]:
# Propagate research networks to every appl_id of a study:
# 1. attach res_net to each appl_id that has one,
# 2. reduce that to distinct (study_most_recent_appl, res_net) pairs,
# 3. join the pairs back onto all appl_ids of the same study.
resnet_added = pd.merge(appl_ids, resnet_df[['appl_id', 'res_net']], how = 'left', left_on='appl_id', right_on='appl_id' )

# De-duplicate before joining back. Without this, a study with k appl_ids that all carry
# the same res_net contributes k identical lookup rows, and the merge below emits k
# identical copies of each of that study's rows into the exported CSV.
resnet_most_recent_appl_id = (
    resnet_added.loc[resnet_added['res_net'].notna(), ['study_most_recent_appl', 'res_net']]
    .drop_duplicates()
)

resnet_added_updated = pd.merge(appl_ids, resnet_most_recent_appl_id, how='left', left_on='study_most_recent_appl', right_on='study_most_recent_appl')
resnet_df_new = resnet_added_updated[['study_most_recent_appl', 'res_net']].drop_duplicates()
# NOTE(review): the output file name is spelled "udpated"; left unchanged in case something reads it.
resnet_added_updated.to_csv("/tmp/tmp_resnet_udpated.csv", index=False)


In [None]:
### BLOCK to compare Platform MDS list of studies vs. Monday Board studies.
# Collects the ids returned by the MDS metadata endpoint for each HEAL guid type into
# metadata_ids, which the cells below compare against the keys of the final dataset.

import requests

mds_metadata_endpoint = 'https://healdata.org/mds/metadata'
MDS_LIMIT = 2000            # Maximum number of ids requested per guid type.
MDS_TIMEOUT_SECONDS = 60    # Fail instead of hanging the notebook if the service stalls.
HEAL_STUDY_GUID_TYPES = [
    'discovery_metadata',                   # Fully registered studies.
    'unregistered_discovery_metadata'       # Studies added to the Platform MDS but without the investigator registering the study.
]

metadata_ids = []
for heal_study_guid_type in HEAL_STUDY_GUID_TYPES:
    result = requests.get(
        mds_metadata_endpoint,
        params={
            '_guid_type': heal_study_guid_type,
            'limit': MDS_LIMIT,
        },
        timeout=MDS_TIMEOUT_SECONDS,
    )
    if not result.ok:
        print(f'Could not retrieve metadata list for guid_type {heal_study_guid_type}: {result}')
        # Do not parse an error response: its body is not the expected list of ids.
        continue

    guids = result.json()
    if len(guids) >= MDS_LIMIT:
        # A full page suggests the list may have been cut off at the limit.
        print(f'WARNING: guid_type {heal_study_guid_type} returned {len(guids)} ids (limit {MDS_LIMIT}); the list may be truncated')
    metadata_ids.extend(guids)

print(len(metadata_ids))

Starting new HTTPS connection (1): healdata.org:443
https://healdata.org:443 "GET /mds/metadata?_guid_type=discovery_metadata&limit=2000 HTTP/11" 200 None
Starting new HTTPS connection (1): healdata.org:443
https://healdata.org:443 "GET /mds/metadata?_guid_type=unregistered_discovery_metadata&limit=2000 HTTP/11" 200 None
1149


In [22]:
# Rows of the final dataset with study_type HDP or CTN whose Archived value is not 'archived'.
is_hdp_or_ctn = combined_data['study_type'].isin(['HDP', 'CTN'])
is_not_archived = combined_data['Archived'].ne('archived')
hdps_platform = combined_data.loc[is_hdp_or_ctn & is_not_archived]
print(len(hdps_platform))

1128


In [28]:
# Ids returned by the MDS that have no matching key in hdps_platform, in sorted order.
# The keys are put in a set once, so each membership test is a hash lookup rather than
# a scan of the whole key array for every id.
platform_keys = set(hdps_platform['key'])
missing_hdps = sorted(k for k in metadata_ids if k not in platform_keys)
print(len(missing_hdps))
for k in missing_hdps:
    print(k)


21
HDP00881
HDP00882
HDP00883
HDP00884
HDP00885
HDP00886
HDP01513
HDP01514
HDP01515
HDP01516
HDP01517
HDP01518
HDP01519
HDP01678
HDP01679
HDP01680
HDP01681
HDP01682
HDP01683
HDP01684
HDP01685
