# Notebook to generate AABC cumulative recruitment stats and historical HCA stats by key demographic variables (draft)

In [14]:
#load some libraries
import pandas as pd
import seaborn as sns
from ccf.box import LifespanBox
import yaml


In [36]:
# TODO: request or load the AABC REDCap API key (this cell is currently a placeholder)

In [37]:
# Read the YAML config, then use it to locate the secrets file and the HCA inventory.
filename = "./config.yml"
with open(filename, 'r') as fd:
    config = yaml.safe_load(fd)

# Secrets table named in the config (not referenced by the cells visible below).
secret = pd.read_csv(config['config_files']['secrets'])

# Fetch the inventory through Box; downloadFile's return value is handed straight
# to read_csv, so it is presumably the local path of the downloaded file.
box = LifespanBox(cache="./tmp")
pathp = box.downloadFile(config['hcainventory'])
ids = pd.read_csv(pathp)

In [38]:
# One row per subject for the recruitment stats: keep V1 and V2 visits, sort so
# that V1 rows come first, then keep each subject's first row (i.e. V1 when the
# subject has one, otherwise V2).
demo_cols = ['subject','redcap_event','event_age','site','M/F','race','ethnic_group','pedid']
forstats = (
    ids.loc[ids.redcap_event.isin(['V1','V2']), demo_cols]
    .sort_values('redcap_event')
    .drop_duplicates(subset='subject', keep='first')
    .copy()  # independent frame: columns are added to forstats later, so it must not be a derived slice of ids
)


In [39]:
# Assign every subject to a decade-wide age bin, closed on the left:
# [20, 30), [30, 40), ..., [100, 110). Ages outside that range come out of pd.cut as NaN.
bins = list(range(20, 111, 10))
forstats['AgeGroup'] = pd.cut(forstats['event_age'], bins=bins, right=False)

## Univariate HCA demographic counts (site, age group, sex, race, ethnicity)

In [40]:
# Subjects per site. size() counts the rows in each group, so a subject is still
# counted when pedid is missing (count().pedid only tallied non-null pedid values).
a = forstats.groupby('site').size().to_frame('count')
a.to_csv('HCA_Site.csv')
a

Unnamed: 0_level_0,count
site,Unnamed: 1_level_1
MGH,292
UCLA,298
UMinn,307
WashU,318


In [41]:
# Subjects per age bin. observed=False is spelled out so that empty bins (such as
# [100, 110)) stay in the table with a count of 0 whatever the pandas default for
# categorical groupers is. size() counts rows rather than non-null pedid values.
b = forstats.groupby('AgeGroup', observed=False).size().to_frame('count')
b.to_csv('HCA_Age.csv')
b

Unnamed: 0_level_0,count
AgeGroup,Unnamed: 1_level_1
"[20, 30)",11
"[30, 40)",104
"[40, 50)",234
"[50, 60)",233
"[60, 70)",210
"[70, 80)",184
"[80, 90)",194
"[90, 100)",45
"[100, 110)",0


In [42]:
# Subjects per sex. size() counts the rows in each group, so a subject is still
# counted when pedid is missing (count().pedid only tallied non-null pedid values).
c = forstats.groupby('M/F').size().to_frame('count')
c.to_csv('HCA_Sex.csv')
c

Unnamed: 0_level_0,count
M/F,Unnamed: 1_level_1
F,690
M,525


In [43]:
# Subjects per race category. size() counts the rows in each group, so a subject is
# still counted when pedid is missing (count().pedid only tallied non-null pedid values).
d = forstats.groupby('race').size().to_frame('count')
d.to_csv('HCA_Race.csv')
d

Unnamed: 0_level_0,count
race,Unnamed: 1_level_1
American Indian/Alaska Native,3
Asian,77
Black or African American,167
Hawaiian or Pacific Islander,3
More than one race,46
Unknown or not reported,23
White,896


In [44]:
# Subjects per ethnic group. size() counts the rows in each group, so a subject is
# still counted when pedid is missing (count().pedid only tallied non-null pedid values).
e = forstats.groupby('ethnic_group').size().to_frame('count')
e.to_csv('HCA_Ethnicity.csv')
e

Unnamed: 0_level_0,count
ethnic_group,Unnamed: 1_level_1
Hispanic or Latino,139
Not Hispanic or Latino,1074
unknown or not reported,2


## HCA demographics by Age

In [None]:
print('************  BY AGE  ********************')
# Each entry: (column crossed with AgeGroup, CSV file that receives the crosstab).
age_tables = [
    ('site', 'HCA_Age_x_Site.csv'),
    ('M/F', 'HCA_Age_x_Sex.csv'),
    ('race', 'HCA_Age_x_Race.csv'),
    ('ethnic_group', 'HCA_Age_x_Ethnicity.csv'),
]
for other, csv_name in age_tables:
    # size() counts rows (not non-null pedid values), which matches what crosstab
    # counts; observed=False keeps empty age bins in the printed table explicitly
    # instead of relying on the pandas default for categorical groupers.
    print(forstats.groupby(['AgeGroup', other], observed=False).size())
    pd.crosstab(forstats['AgeGroup'], forstats[other]).to_csv(csv_name, index=True)
    print('*******************************************************')

In [None]:
## HCA demographics by Site, then by Sex and by Race

In [None]:
print('************* BY SITE *********************')
# Each entry: (column crossed with site, CSV file that receives the crosstab).
site_tables = [
    ('M/F', 'HCA_Site_x_Sex.csv'),
    ('race', 'HCA_Site_x_Race.csv'),
    ('ethnic_group', 'HCA_Site_x_Ethnicity.csv'),
]
for other, csv_name in site_tables:
    # size() counts rows (not non-null pedid values), which matches what crosstab counts.
    print(forstats.groupby(['site', other]).size())
    pd.crosstab(forstats['site'], forstats[other]).to_csv(csv_name, index=True)
    print('*******************************************************')

print('************ BY SEX **********************')
# Cross-tabulate sex against race and then ethnicity, writing one CSV per pairing
# and printing a separator line after each file.
for other, csv_name in (('race', 'HCA_Sex_x_Race.csv'), ('ethnic_group', 'HCA_Sex_x_Ethnicity.csv')):
    pd.crosstab(forstats['M/F'], forstats[other]).to_csv(csv_name, index=True)
    print('*******************************************************')

print('************ BY RACE **********************')
# Race x ethnicity counts, saved with the race labels as the index column.
race_by_ethnicity = pd.crosstab(forstats['race'], forstats['ethnic_group'])
race_by_ethnicity.to_csv('HCA_Race_x_Ethnicity.csv', index=True)
print('*******************************************************')