In [2]:
import pandas as pd
from pathlib import Path
import numpy as np
import re
from datetime import datetime

In [3]:
input_dir = Path("/Users/hinashah/Documents/HEAL/MondayBoard/")

In [4]:
def get_unique_appl_ids(df:pd.DataFrame):
    if 'appl_id' in df.columns:
        return df[ ~pd.isna(df['appl_id'])]['appl_id'].drop_duplicates()
    return None
    
def get_unique_hdp_ids(df:pd.DataFrame):
    if 'hdp_id' in df.columns:
        return df[ ~pd.isna(df['hdp_id'])]['hdp_id'].drop_duplicates()
    return None

def convert_appl_ids_tostr(df:pd.DataFrame, appl_id_col:str='appl_id'):
    if appl_id_col not in df.columns:
        print("No appl_id columns available")
        return

    if df[appl_id_col].dtypes == 'int64':
        df[appl_id_col].fillna(-1, inplace=True)
        df[appl_id_col] = df[appl_id_col].astype('str')
    elif df[appl_id_col].dtypes == 'float64':
        df[appl_id_col].fillna(-1, inplace=True)
        df[appl_id_col] = df[appl_id_col].astype('int64').astype('str')
    elif df[appl_id_col].dtypes == 'object':
        print("Nothing to do here")
    else:
        print(f"Dtype is: {df[appl_id_col].dtypes}, cannot be converted yet")
        return
    df[appl_id_col].replace("-1", np.NaN, inplace=True)

def get_missing_applids(expected: set, df:pd.DataFrame):
    appl_id_col = 'appl_id'
    return [k for k in expected if k not in df[appl_id_col].values]

In [5]:
#MATCH!
awards_df = pd.read_csv(input_dir/"awards.csv", low_memory=False)
awards_df = awards_df.dropna(how='all')
print(f"Awards table has: {len(awards_df)} entries, with {len(get_unique_appl_ids(awards_df))} appl_ids")
reporter_df = pd.read_csv(input_dir/"reporter.csv", low_memory=False)
reporter_df = reporter_df.dropna(how='all')
print(f"Reporter table has: {len(reporter_df)} entriesawards_df, with {len(get_unique_appl_ids(reporter_df))} appl_ids")
progress_tracker_df = pd.read_csv(input_dir/"progress_tracker.csv", low_memory=False)
print(f"Platform generated table has: {len(progress_tracker_df)} entries, with {len(get_unique_appl_ids(progress_tracker_df))} appl_ids")
print(f"Platform table has {len(get_unique_hdp_ids(progress_tracker_df))} unique HDP IDs")

Awards table has: 1588 entries, with 1588 appl_ids
Reporter table has: 1590 entriesawards_df, with 1590 appl_ids
Platform generated table has: 1279 entries, with 1270 appl_ids
Platform table has 1279 unique HDP IDs


In [6]:
convert_appl_ids_tostr(awards_df)
convert_appl_ids_tostr(reporter_df)
convert_appl_ids_tostr(progress_tracker_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[appl_id_col].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[appl_id_col].replace("-1", np.NaN, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [7]:
## Match
print(len(progress_tracker_df))
progress_tracker_df = progress_tracker_df.sort_values("appl_id")
appl_ids_wrong = [k for k in progress_tracker_df.appl_id if (k=="0") | ('-' in k) ]
print(appl_ids_wrong)
## Remove any rows with appl_id="0"
progress_tracker_df = progress_tracker_df[ ~progress_tracker_df.appl_id.isin(["0"]) ]
## Matched with Sabrina the project numbers that will not make into the dataset because of funky project numbers
## HDP00885
## HDP00886
## HDP00882
## HDP00881
## HDP00883
## HDP00884
progress_tracker_df['num_dashes'] = [len( [a for a in k if a=='-'] ) for k in progress_tracker_df.project_num]
# Remove anything after '_' (this also removes lower case letters, but a check should be made below)
progress_tracker_df['project_num'] = [k.split('_')[0] for k in progress_tracker_df['project_num']]
# Reset the project number to blank for anything that has more than one dashes
progress_tracker_df['project_num'] = ["" if n > 1 else k for (k,n) in progress_tracker_df[['project_num', 'num_dashes']].values]
# progress_tracker_df['project_num'] = [k.replace() for k in progress_tracker_df['project_num']]
print("Project numbers with a lowercase letter that might have been added by platform: ")
print([k for k in progress_tracker_df.project_num if re.search(r'[a-z]', k) is not None])

## Create project number components
progress_tracker_df['mds_proj_num_spl_ty_code'] = [k[0] if len(k) > 0 else k for k in progress_tracker_df['project_num']]
progress_tracker_df['mds_pproj_num_spl_act_code'] = [k[1:4] if len(k) > 0 else k for k in progress_tracker_df['project_num']]
progress_tracker_df['mds_proj_ser_num'] = [k[4:12] if len(k) > 0 else k for k in progress_tracker_df['project_num']]
progress_tracker_df['mds_proj_nm_spl_supp_yr'] = [k.split('-')[1] if len(k) > 0 else k for k in progress_tracker_df['project_num']]
progress_tracker_df['mds_proj_num_spl_sfx_code'] = [k[2:] if len(k) > 0 else k for k in progress_tracker_df['mds_proj_nm_spl_supp_yr']]

progress_tracker_df[ ['project_num'] + [k for k in progress_tracker_df.columns if k.startswith('mds')]]
progress_tracker_df.rename(columns={'project_num':'mds_project_num'}, inplace=True)
# Cound hdp_ids for appl_ids
appl_hdp = progress_tracker_df[['appl_id', 'hdp_id']].drop_duplicates()
t = appl_hdp.groupby('appl_id').size()

print("List of appl_ids for which multiple HDPID s are assigned::")
print(t[t!=1])
print(appl_hdp[ appl_hdp.appl_id.isin(t[t!=1].keys())])
progress_tracker_df['num_hdp_by_appl'] = [t[k] for k in progress_tracker_df['appl_id']]


1279
[]
Project numbers with a lowercase letter that might have been added by platform: 
[]
List of appl_ids for which multiple HDPID s are assigned::
appl_id
10267804    3
10378422    2
10378910    2
10378923    2
10391075    3
10590474    3
dtype: int64
       appl_id    hdp_id
1047  10267804  HDP01050
216   10267804  HDP00218
1048  10267804  HDP01051
38    10378422  HDP00039
1042  10378422  HDP01045
1043  10378910  HDP01046
278   10378910  HDP00280
1044  10378923  HDP01047
288   10378923  HDP00290
1046  10391075  HDP01049
1045  10391075  HDP01048
105   10391075  HDP00106
1009  10590474  HDP01012
1276  10590474  HDP01282
1277  10590474  HDP01283


In [57]:
combined_data_1[combined_data_1.appl_id.isin()]

Unnamed: 0,proj_abs,act_code,ic_code,adm_ic,adm_ic_code,adm_ic_nm,fund_ic,ic_fund_code,ic_fund_yr,fund_ic_nm,...,trms,rfa,res_prg,spcf_aims,dai_res,res_net,goal,data_src,heal_funded,data_mgmt
63,From 2009-2013 the utilization of the Schedule...,R44,NIH,NIDA,DA,National Institute on Drug Abuse,NIDA,DA,2020,National Institute on Drug Abuse,...,<Oral Administration><Oral Drug Administration...,Cross-Cutting Research,Small Business Programs,The most commonly prescribed opioids in the fi...,NO,,Cross-Cutting Research,2,Y,SBIR/STTR
350,From 2009-2013 the utilization of the Schedule...,R44,NIH,NIDA,DA,National Institute on Drug Abuse,NIDA,DA,2021,National Institute on Drug Abuse,...,<Oral Administration><Oral Drug Administration...,Cross-Cutting Research,Small Business Programs,,,,Cross-Cutting Research,2,Y,SBIR/STTR
1001,From 2009-2013 the utilization of the Schedule...,R44,NIH,NIDA,DA,National Institute on Drug Abuse,NIDA,DA,2019,National Institute on Drug Abuse,...,<Oral Administration><intraoral drug delivery>...,Cross-Cutting Research,Small Business Programs,The most commonly prescribed opioids in the fi...,NO,,Cross-Cutting Research,2,Y,SBIR/STTR


In [8]:
### MATCH!
# 1:m in Stata is the same as outer join in pandas??
print(f"Length of reporter: {len(reporter_df)}")
combined_data_1 = pd.merge(reporter_df, awards_df, how='outer', left_on='appl_id', right_on='appl_id').drop_duplicates()
print(f"# of records after merging reporter and awards: {len(combined_data_1)}")
combined_data = pd.merge(combined_data_1, progress_tracker_df, how='outer', left_on='appl_id', right_on='appl_id')
print(f"# of records after merging with platform: {len(combined_data)}")

print(f"Number of appl_ids in progress tracker that were not matched to reporter/awards: \
      {len(progress_tracker_df[~progress_tracker_df.appl_id.isin(combined_data_1.appl_id)])}")
progress_tracker_df[~progress_tracker_df.appl_id.isin(combined_data_1.appl_id)].to_csv(input_dir/"platform_applids_missing_inreporterawards.csv", index=False)

Length of reporter: 1590
# of records after merging reporter and awards: 1591
# of records after merging with platform: 1624
Number of appl_ids in progress tracker that were not matched to reporter/awards:       24


In [9]:
## This block is debugging
s = combined_data[pd.isna(combined_data.proj_num_spl_sfx_code)][['proj_num', 'proj_num_spl_sfx_code', 'proj_ser_num', 'mds_proj_num_spl_sfx_code', 'mds_proj_ser_num', 'mds_project_num']]
s.dropna(how='all', inplace=True)
s.to_csv(input_dir/"combined_data_qc.csv")

In [10]:
## MATCH
print(len(combined_data[pd.isna(combined_data.proj_ser_num)]))
#print(combined_data[pd.isna(combined_data.proj_ser_num)][['proj_num', 'proj_num_spl_sfx_code', 'proj_ser_num', 'mds_proj_num_spl_sfx_code', 'mds_proj_ser_num', 'mds_project_num', 'hdp_id']])

## Question: The following corresponds to line 157 from HEAL_MYSQL_01_ImportMerge.do., but it seems to be unnecessary. 
## Is my understanding correct that this is replacing the awards/reporter proj_ser_num with the version in mds_ when empty.
## But this seems unnecessary since it's empty.
for var in ['proj_num_spl_sfx_code', 'proj_ser_num']:
    combined_data[var] = [m if pd.isna(k) else k for (m,k) in combined_data[['mds_'+var, var]].values]
combined_data = combined_data.sort_values(by=['proj_ser_num', 'subproj_id', 'proj_num_spl_sfx_code', 'appl_id', 'hdp_id'])
print(f"Number of entries in merged dataset: {len(combined_data)}")
combined_data['proj_ser_num'].replace('', np.NaN, inplace=True)
print(f"Number of entries with empty proj_ser_num which will be dropped: {len(combined_data[pd.isna(combined_data.proj_ser_num)])}")
print(combined_data[pd.isna(combined_data.proj_ser_num)][['proj_ser_num', 'mds_proj_ser_num', 'hdp_id']])
combined_data = combined_data[(~pd.isna(combined_data.proj_ser_num))]
print(f"*** Number of entries in merged dataset AFTER removing empty proj_ser_num: {len(combined_data)}")

31
Number of entries in merged dataset: 1624
Number of entries with empty proj_ser_num which will be dropped: 7
    proj_ser_num mds_proj_ser_num    hdp_id
321          NaN                   HDP00885
322          NaN                   HDP00886
506          NaN                   HDP00882
511          NaN                   HDP00881
598          NaN                   HDP00883
603          NaN                   HDP00884
261          NaN              NaN       NaN
*** Number of entries in merged dataset AFTER removing empty proj_ser_num: 1617


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['proj_ser_num'].replace('', np.NaN, inplace=True)


In [11]:
### Assign group numbers
combined_data['proj_num_spl_sfx_code'].replace('', np.NaN, inplace=True)
combined_data['subproj_id'].replace('', np.NaN, inplace=True)
combined_data['xstudy_id_stewards'] = combined_data.groupby(by=['proj_ser_num', 'subproj_id', 'proj_num_spl_sfx_code'], dropna=False).ngroup()
combined_data['study_id'] = combined_data.groupby(by=['xstudy_id_stewards', 'hdp_id']).ngroup()
print(combined_data[ ['proj_ser_num', 'subproj_id', 'proj_num_spl_sfx_code', 'xstudy_id_stewards', 'study_id']])
combined_data[ ['proj_ser_num', 'subproj_id', 'proj_num_spl_sfx_code', 'xstudy_id_stewards', 'study_id', 'hdp_id']].to_csv(input_dir/"xstudy_id_stewards.csv")

     proj_ser_num  subproj_id proj_num_spl_sfx_code  xstudy_id_stewards  \
1264     AA021691         NaN                    S1                   0   
1123     AA025480         NaN                    S1                   1   
1243     AA025848         NaN                    S1                   2   
1288     AG067493         NaN                   NaN                   3   
248      AG067493         NaN                   NaN                   3   
...           ...         ...                   ...                 ...   
867      TR004701         NaN                   NaN                1209   
944      TR004743         NaN                   NaN                1210   
1105     TW007401         NaN                    S1                1211   
1106     TW008163         NaN                    S1                1212   
1107     TW009872         NaN                    S1                1213   

      study_id  
1264       0.0  
1123       1.0  
1243       2.0  
1288       3.0  
248        NaN

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['proj_num_spl_sfx_code'].replace('', np.NaN, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['subproj_id'].replace('', np.NaN, inplace=True)


In [12]:
 ## Matched
 ### Make a dataset of xstudy_id_stewards and # of appl_ids
xstud = combined_data[~pd.isna(combined_data['appl_id'])][ ['xstudy_id_stewards', 'appl_id']].copy(deep=True).drop_duplicates()
xstud_count_applid = xstud.groupby('xstudy_id_stewards').size()
print(xstud_count_applid)
print(max(xstud_count_applid))


xstudy_id_stewards
0       1
1       1
2       1
3       4
4       4
       ..
1209    1
1210    1
1211    1
1212    1
1213    1
Length: 1214, dtype: int64
5


In [13]:
## Match
### Make a dataset of xstudy_id_stewards and # of hdpids
xstud = combined_data[~pd.isna(combined_data['hdp_id'])][ ['xstudy_id_stewards', 'hdp_id']].copy(deep=True).drop_duplicates()
xstud_count_hdpid = xstud.groupby('xstudy_id_stewards').size()
print(xstud_count_hdpid)
print(max(xstud_count_hdpid))

xstudy_id_stewards
0       1
1       1
2       1
3       1
4       1
       ..
1209    1
1210    1
1211    1
1212    1
1213    1
Length: 1174, dtype: int64
4


In [14]:
## Combine the two counts into the combined data column:
combined_data['num_appl_by_xstudyidstewards'] = [xstud_count_applid[k] if k in xstud_count_applid else 0 for k in combined_data['xstudy_id_stewards']]
combined_data['num_hdp_by_xstudyidstewards'] = [xstud_count_hdpid[k] if k in xstud_count_hdpid else 0 for k in combined_data['xstudy_id_stewards']]

In [15]:
combined_data['valid_flag'] = [1 if (n in [0,1]) else 0 for n in combined_data['num_hdp_by_xstudyidstewards']]
### Question: These numbers look different
print(f"Got valid rows: {len(combined_data[combined_data['valid_flag']==1])}") #1408
combined_data['valid_flag'] = [1 if (v==0 and (na==nh and nha==1)) else v for (v, na, nh, nha) in combined_data[['valid_flag', 'num_appl_by_xstudyidstewards', 'num_hdp_by_xstudyidstewards', 'num_hdp_by_appl']].values]
print(f"Got valid rows: {len(combined_data[combined_data['valid_flag']==1])}") #137
combined_data['valid_flag'] = [1 if (v==0 and (na==1)) else v for (v, na) in combined_data[['valid_flag', 'num_appl_by_xstudyidstewards']].values]
print(f"Got valid rows: {len(combined_data[combined_data['valid_flag']==1])}") #15

print(f"INVALID rows: {len(combined_data[combined_data['valid_flag']==0])}") #33
combined_data[combined_data['valid_flag']==0].to_csv(input_dir/"sis_hdpid_comparison_issues.csv")

Got valid rows: 1427
Got valid rows: 1569
Got valid rows: 1584
INVALID rows: 33


In [16]:
combined_data[['study_id', 'valid_flag', 'xstudy_id_stewards', 'num_appl_by_xstudyidstewards', 'num_hdp_by_xstudyidstewards', 'appl_id', 'hdp_id', 'proj_ser_num', 'subproj_id', 'proj_num_spl_sfx_code', 'proj_num']].to_csv(input_dir/"full_data.csv", index=False)

In [17]:
### START FILLING UP STUDY IDs

hdpid0 = combined_data[ combined_data['num_hdp_by_xstudyidstewards'] == 0].copy(deep=True)
print(len(hdpid0)) ##51 :: MATCH
hdpid1 = combined_data[ combined_data['num_hdp_by_xstudyidstewards'] == 1].copy(deep=True)
print(len(hdpid1)) ##1376: MATCH

# When HDPID per stud_stewards is > 1::
ss = combined_data[ ~combined_data['num_hdp_by_xstudyidstewards'].isin([0,1])]
studyidgood1 = ss[ (ss['num_appl_by_xstudyidstewards']==ss['num_hdp_by_xstudyidstewards']) & (ss['num_hdp_by_appl']==1) ].copy(deep=True)
print(len(studyidgood1)) # 142
studyidgood2 = ss[ss['num_appl_by_xstudyidstewards']==1].copy(deep=True)
print(len(studyidgood2))


51
1376
142
15


In [18]:
### So far study_id has not been assigned to all the values, reassign:

## HDPID0
max_studyid = max(combined_data['study_id'])
hdpid0['study_id'] = hdpid0.groupby(by=['xstudy_id_stewards']).ngroup() + max_studyid+1
studyidgood3 = hdpid0

## HDPID1
hdpid1_ids = hdpid1[~pd.isna(hdpid1['study_id'])][ ['study_id', 'xstudy_id_stewards']].drop_duplicates().copy(deep=True)
hdpid1_ids.rename(columns={'study_id':'new_study_id'}, inplace=True)
hdpid1_merge = pd.merge(hdpid1, hdpid1_ids, how='left', on='xstudy_id_stewards')
hdpid1_merge.drop(columns='study_id', inplace=True)
hdpid1_merge.rename(columns={'new_study_id':'study_id'}, inplace=True)
studyidgood4 = hdpid1_merge


In [19]:
rest_studies = combined_data[ ~ (combined_data['num_hdp_by_xstudyidstewards'].isin([0,1]) |
                                ((combined_data['num_appl_by_xstudyidstewards'] == combined_data['num_hdp_by_xstudyidstewards']) & (combined_data['num_hdp_by_appl']==1)) |
                                (combined_data['num_appl_by_xstudyidstewards']==1)
                                )
                            ]
rest_studies.to_csv(input_dir/"valid_flag_0.csv")

In [20]:
ctn_studies = rest_studies[ [ ((not pd.isna(k)) and ('Clinical Trials Network' in k)) | ( ( not pd.isna(l)) and ('Clinical Trials Network' in l)) | (r == 'CTN') for (k,l,r) in rest_studies[['proj_title', 'res_prg', 'res_net']].values]]
print(f"Number of ctn studies: {len(ctn_studies)}") # 17 (MATCH)

nonctn_studies = rest_studies[ ~rest_studies['xstudy_id_stewards'].isin(ctn_studies['xstudy_id_stewards'])]
print(f"Non-CTN studies: {len(nonctn_studies)}") # 16 :: Match

nonctn_studies_hdp = nonctn_studies[ ~pd.isna(nonctn_studies['hdp_id'])].copy(deep=True)
print(len(nonctn_studies_hdp)) #10 : Match

nonctn_studies_nohdp = nonctn_studies[ pd.isna(nonctn_studies['hdp_id'])].copy(deep=True)
print(len(nonctn_studies_nohdp)) #6 : Match

## Try to get StudyID for NonCTN rows that do not have HDPID (and hence no studyid).
## These seem to be matched by ACT Code per appl_id??
merge = pd.merge(nonctn_studies_nohdp, nonctn_studies_hdp[['xstudy_id_stewards', 'act_code', 'study_id']], on='xstudy_id_stewards')

print(nonctn_studies_hdp[['study_id', 'xstudy_id_stewards', 'num_appl_by_xstudyidstewards', 'num_hdp_by_xstudyidstewards', 'appl_id', 'hdp_id']])
print("-----")
print(nonctn_studies_nohdp[['study_id', 'xstudy_id_stewards', 'num_appl_by_xstudyidstewards', 'num_hdp_by_xstudyidstewards', 'appl_id', 'hdp_id']])
print("-----")

merge_match = merge[ merge['act_code_x'] == merge['act_code_y']].copy(deep=True)
merge_match.rename(columns={'study_id_y': 'study_id', 'act_code_y':'act_code'}, inplace=True)
nonctn_studies_nohdp = merge_match.drop(columns=['act_code_x', 'study_id_x'])
print(len(nonctn_studies_nohdp))

Number of ctn studies: 17
Non-CTN studies: 16
10
6
      study_id  xstudy_id_stewards  num_appl_by_xstudyidstewards  \
888       62.0                  60                             4   
1400      61.0                  60                             4   
428      595.0                 550                             3   
1495     594.0                 550                             3   
442      600.0                 554                             3   
1502     599.0                 554                             3   
447      613.0                 565                             3   
1506     612.0                 565                             3   
433     1097.0                1043                             3   
1496    1096.0                1043                             3   

      num_hdp_by_xstudyidstewards   appl_id    hdp_id  
888                             2  10705012  HDP01284  
1400                            2   9898129  HDP00122  
428                             

In [21]:
#### Creating the key of study_id and appl_id from all the datasets created above:
keep_cols = ['study_id', 'appl_id']
studyidkey = pd.concat([studyidgood1[keep_cols], studyidgood3[keep_cols], studyidgood4[keep_cols], nonctn_studies_nohdp[keep_cols], nonctn_studies_hdp[keep_cols]])

print(len(studyidkey))
studyidkey.rename(columns={'study_id' : 'xstudy_id'}, inplace=True)
## Not using 15 from studyidgood2 because one appl_id is getting >1 HDPIDs.
#### Match 1585

1585


In [22]:
## Make sure that only one appl_id exists in the dataset
max(studyidkey.groupby('appl_id').size())

1

In [23]:
# Add these studyids to the combined dataset:
combined_data_studyid = pd.merge(combined_data, studyidkey, how='left', on='appl_id')
print(len(combined_data_studyid[pd.isna(combined_data_studyid.xstudy_id)])) ## Unmatched: 17 CTN + 15 HDPID
print(len(combined_data_studyid))

32
1617


In [24]:
combined_data_studyid['study_id'] = [ x if not pd.isna(x) else s for (s,x) in combined_data_studyid[['study_id', 'xstudy_id']].values]
print(len(combined_data[pd.isna(combined_data['study_id'])]))
print(len(combined_data_studyid[pd.isna(combined_data_studyid['study_id'])]))
## 339 study ids filled: Match!
combined_data_studyid.drop(columns='xstudy_id', inplace=True)

combined_data_studyid[['study_id', 'valid_flag', 'xstudy_id_stewards', 'num_appl_by_xstudyidstewards', 'num_hdp_by_xstudyidstewards', 'appl_id', 'hdp_id', 'proj_ser_num', 'subproj_id', 'proj_num_spl_sfx_code', 'proj_num']].to_csv(input_dir/"full_data_studyid.csv", index=False)

344
5


In [25]:
#### Finding most recent appl_id for each study
combined_data_studyid.sort_values(by=["study_id", "fisc_yr"], inplace=True)

for k in ['bgt_end', 'proj_end_date']:
    combined_data_studyid[k+'_date'] = [  np.NaN if pd.isna(d) else ( np.NaN if str(d).startswith('0000') else datetime.strptime(str(d)[0:10], "%Y-%m-%d") ) for d in  combined_data_studyid[k]  ]

latest_proj_end_dt_forstudy = combined_data_studyid.groupby('study_id')['proj_end_date_date'].max()
combined_data_studyid['latest_proj_end_dt_forstudy'] = [latest_proj_end_dt_forstudy[k] if k in latest_proj_end_dt_forstudy else np.NaN for k in combined_data_studyid.study_id]
latest_fy = combined_data_studyid.groupby('study_id')['fisc_yr'].max()
combined_data_studyid['latest_fy'] = [latest_fy[k] if k in latest_fy else np.NaN for k in combined_data_studyid.study_id]
combined_data_studyid['match_fy'] = [ (l==f) | (pd.isna(l) and pd.isna(f)) for (l,f) in combined_data_studyid[['latest_fy', 'fisc_yr']].values]

data_fiscyr_match = combined_data_studyid[ combined_data_studyid['match_fy']]
print(len(data_fiscyr_match))
### QUESTION: 1312

latest_bgt_end = data_fiscyr_match.groupby('study_id')['bgt_end_date'].max()
data_fiscyr_match['latest_bgt_end'] = [latest_bgt_end[k] if k in latest_bgt_end else np.NaN for k in data_fiscyr_match.study_id]
data_fiscyr_match['match_bgt_end'] = [ (l==f) | (pd.isna(l) and pd.isna(f)) for (l,f) in data_fiscyr_match[['latest_bgt_end', 'bgt_end_date']].values]
data_fiscyr_bgtend_match = data_fiscyr_match[data_fiscyr_match['match_bgt_end']]
print(len(data_fiscyr_bgtend_match))

count_studyids = data_fiscyr_bgtend_match.groupby('study_id').size()
print(f"Checking for any duplicates in the studyid: {max(count_studyids)}")

1313
1313
Checking for any duplicates in the studyid: 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_fiscyr_match['latest_bgt_end'] = [latest_bgt_end[k] if k in latest_bgt_end else np.NaN for k in data_fiscyr_match.study_id]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_fiscyr_match['match_bgt_end'] = [ (l==f) | (pd.isna(l) and pd.isna(f)) for (l,f) in data_fiscyr_match[['latest_bgt_end', 'bgt_end_date']].values]


In [26]:
recentapplid_key = data_fiscyr_bgtend_match[keep_cols].copy(deep=True)
recentapplid_key.rename(columns={'appl_id':'study_most_recent_appl'}, inplace=True)
print(len(recentapplid_key)) ## QUESTION: vs 1313
recentapplid_key.columns

hdpid_studyid_key = combined_data_studyid[['study_id', 'hdp_id', 'appl_id']].copy(deep=True)
hdpid_studyid_key = hdpid_studyid_key[ ~(pd.isna(hdpid_studyid_key.study_id) | pd.isna(hdpid_studyid_key.hdp_id))]
print(len(hdpid_studyid_key)) ##QUESTION: vs 1273 in 
hdpid_studyid_key.rename(columns={'appl_id':'study_hdp_id_appl', 'hdp_id':'study_hdp_id'}, inplace=True)


1313
1273


In [27]:
#### Create the final lookup table:

study_lookup_table = combined_data_studyid[~pd.isna(combined_data_studyid.study_id)][keep_cols].copy(deep=True)
print(len(study_lookup_table))
study_lookup_table = pd.merge(study_lookup_table, recentapplid_key, how='left', on='study_id' )
study_lookup_table = pd.merge(study_lookup_table, hdpid_studyid_key, how='left', on='study_id' )
study_lookup_table.drop_duplicates(inplace=True)
print(len(study_lookup_table)) ## Question: 1588 vs 1607
study_lookup_table.rename(columns={"study_id":"xstudy_id"}, inplace=True)
study_lookup_table.columns
study_lookup_table.to_csv(input_dir/"study_lookup_table_gen.csv", index=False)


1612
1612


In [28]:
## Checking
# Is Sabrina generated table the same that I generate here?
gt_file = pd.read_csv(input_dir/"study_lookup_table.csv")
convert_appl_ids_tostr(gt_file)
convert_appl_ids_tostr(gt_file, 'study_most_recent_appl')
convert_appl_ids_tostr(gt_file, 'study_hdp_id_appl')
gt_file['study_hdp_id_appl'].replace("0", np.NaN, inplace=True)


print(f"Lengths of the file generated: {len(gt_file)}")
print(f"Length of the dataset calculated in this notebook: {len(study_lookup_table)}")

print(f"Max study id in the file generated: {max(gt_file.xstudy_id)}")
print(f"Max study id in the dataset generated: {max(study_lookup_table.xstudy_id)}")

gt_file.sort_values(by='appl_id', inplace=True)
study_lookup_table.sort_values(by='appl_id', inplace=True)

columns_to_match = ['appl_id', 'study_most_recent_appl', 'study_hdp_id', 'study_hdp_id_appl']
a = gt_file[columns_to_match].rename( columns = {k: k+'_x' for k in columns_to_match}).sort_values(by=['appl_id_x', 'study_hdp_id_x']).reset_index(drop=True)
b = study_lookup_table[columns_to_match].rename( columns = {k: k+'_y' for k in columns_to_match}).sort_values(by=['appl_id_y', 'study_hdp_id_y']).reset_index(drop=True)

compare_dataset = pd.concat([a,b], axis=1)


columns_to_match = ['appl_id', 'study_most_recent_appl', 'study_hdp_id', 'study_hdp_id_appl']

for c in columns_to_match:
    compare_dataset[c+'_check'] = [(x==y) | (pd.isna(x) & pd.isna(y)) for (x,y) in compare_dataset[[c+'_x', c+'_y']].values]
    print(f"Comparing: {c}")
    print(f"Getting matches for: {len(compare_dataset[compare_dataset[c+'_check']])}")
compare_dataset['all_good'] = [ x & y & z & w for (x,y,z,w) in compare_dataset[ [k+'_check' for k in columns_to_match ]].values]
compare_dataset.to_csv(input_dir/"comparison.csv")

# Do ALL HDPIDs from platform's original table make into the table? If not - why?
missing_hdps_fromprogress_tracker = progress_tracker_df[~progress_tracker_df['hdp_id'].isin(study_lookup_table.study_hdp_id)]['hdp_id'].drop_duplicates()
print(len(missing_hdps_fromprogress_tracker))
print(missing_hdps_fromprogress_tracker)
missing_hdps_fromprogress_tracker = progress_tracker_df[~progress_tracker_df['hdp_id'].isin(gt_file.study_hdp_id)]['hdp_id'].drop_duplicates()
print(len(missing_hdps_fromprogress_tracker))
print(missing_hdps_fromprogress_tracker)

# Can we keep track of projects that are not included because of formatting inconsistencies?



Lengths of the file generated: 1612
Length of the dataset calculated in this notebook: 1612
Max study id in the file generated: 1313
Max study id in the dataset generated: 1312.0
Comparing: appl_id
Getting matches for: 1612
Comparing: study_most_recent_appl
Getting matches for: 1612
Comparing: study_hdp_id
Getting matches for: 1612
Comparing: study_hdp_id_appl
Getting matches for: 1612
6
882    HDP00885
883    HDP00886
879    HDP00882
878    HDP00881
880    HDP00883
881    HDP00884
Name: hdp_id, dtype: object
6
882    HDP00885
883    HDP00886
879    HDP00882
878    HDP00881
880    HDP00883
881    HDP00884
Name: hdp_id, dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[appl_id_col].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[appl_id_col].replace("-1", np.NaN, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [32]:
## Looking at study_most_recent_appl, and seeing which were not matchd to platform
unmatched = gt_file[pd.isna(gt_file.study_hdp_id)]
print(len(unmatched))
unmatched_most_recent = unmatched.study_most_recent_appl.drop_duplicates()
print(len(unmatched_most_recent))

51
40


In [59]:
## Is it in??
reporter_df[reporter_df.appl_id.isin(unmatched_most_recent.values)][['appl_id', 'proj_num', 'proj_ser_num', 'proj_url', 'pi', 'awd_ty']]

Unnamed: 0,appl_id,proj_num,proj_ser_num,proj_url,pi,awd_ty
46,10022491,8R44NS119770-03,NS119770,https://reporter.nih.gov/project-details/10022491,Pierre Riviere,8
63,10029002,4R44DA046316-02,DA046316,https://reporter.nih.gov/project-details/10029002,STUART J KAHN;JOHN A ZEBALA,4N
102,10131167,5R34DA046635-03,DA046635,https://reporter.nih.gov/project-details/10131167,JIAN KONG,5
107,10133699,5U24HD095254-04,HD095254,https://reporter.nih.gov/project-details/10133699,Abhik Das,5
110,10136565,5R01DA047094-03,DA047094,https://reporter.nih.gov/project-details/10136565,Rajita Sinha,5
169,10167785,5U01MH114087-05,MH114087,https://reporter.nih.gov/project-details/10167785,Brian Kenneth Ahmedani;GREGORY E. SIMON,5
197,10186827,5U19MH113135-05,MH113135,https://reporter.nih.gov/project-details/10186827,SPERO MARTIN MANSON,5
199,10197809,5U19MH121738-03,MH121738,https://reporter.nih.gov/project-details/10197809,Beth E. Waitzfelder,5
200,10197811,5R01MH120124-03,MH120124,https://reporter.nih.gov/project-details/10197811,Kara Zivin,5
224,10217075,5U01DA046430-02,DA046430,https://reporter.nih.gov/project-details/10217075,ADAM BISAGA,5


In [67]:
reporter_df[ reporter_df.proj_ser_num=='MH114087'][['appl_id', 'proj_num', 'proj_ser_num', 'pi', 'proj_title']]

Unnamed: 0,appl_id,proj_num,proj_ser_num,pi,proj_title
114,10139426,3U01MH114087-04S1,MH114087,Brian Kenneth Ahmedani,Patient perspectives on clinical approaches to...
169,10167785,5U01MH114087-05,MH114087,Brian Kenneth Ahmedani;GREGORY E. SIMON,An Evaluation of the National Zero Suicide Mod...
1020,9676620,3U01MH114087-02S2,MH114087,Brian Kenneth Ahmedani;GREGORY E. SIMON,Evaluating the Impact of Changes in Opioid Pre...


In [34]:
reporter_df.columns

Index(['proj_abs', 'act_code', 'ic_code', 'adm_ic', 'adm_ic_code', 'adm_ic_nm',
       'fund_ic', 'ic_fund_code', 'ic_fund_yr', 'fund_ic_nm',
       'fund_ic_tot_cst', 'appl_id', 'arra_fund', 'tot_fund', 'awd_not_date',
       'awd_ty', 'bgt_end', 'bgt_strt', 'cfda_code', 'cong_dist', 'ctc_pi_nm',
       'cr_pro_num', 'covid_res', 'amt_dir', 'fisc_yr', 'ful_foa',
       'sty_sec_ful_grp_code', 'sty_sec_ful_nm', 'sty_sec_ful_des_code',
       'sty_sec_ful_flex_code', 'sty_sec_ful_srg_code', 'sty_sec_ful_srg_flex',
       'fund_mech', 'indct_cst_amt', 'is_act', 'is_new', 'mech_code_dc',
       'org_dept_type', 'org_ext_id', 'org_cy', 'org_ctry', 'org_duns',
       'org_fips', 'org_ipf_code', 'org_nm', 'org_st', 'org_zip_code',
       'org_ty_code', 'org_ty_oth', 'org_ty_nm', 'phr_text', 'pref_terms',
       'pi_fst_nm', 'pi', 'pi_is_ctc', 'pi_lst_nm', 'pi_mid_nm', 'pi_prof_id',
       'pi_title', 'prg_ofc_fst_nm', 'prg_ofc', 'prg_ofc_lst_nm',
       'prg_ofc_mid_nm', 'proj_url', 'projend