In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from db_queries import get_outputs, get_population, get_best_model_versions
from db_queries import get_ids as get_id_table
# from get_draws.api import get_draws

!whoami
!date

ndbs
Mon Oct 12 12:29:29 PDT 2020


# Explore stunting and population data available in `db_queries`

My goal is to rank countries by number of children stunted, so we need stunting prevalence and population under 5. As I discovered below, `get_outputs` does not return prevalence for risks, so I will try `get_draws` instead, in a separate notebook. 

In order to facilitate pulling data from GBD, I am developing some functions to make it easier to work with id's in GBD. I intend to put these functions in a module for reuse once I have a better idea how they should work.


# Goal 1: Write a function to return a list of id's for specified names

In [2]:
get_id_table('measure').head()

Unnamed: 0,measure_id,measure_name
0,1,Deaths
1,2,DALYs (Disability-Adjusted Life Years)
2,3,YLDs (Years Lived with Disability)
3,4,YLLs (Years of Life Lost)
4,5,Prevalence


In [3]:
get_id_table('gbd_round')

Unnamed: 0,gbd_round_id,gbd_round
0,1,2010
1,2,2013
2,3,2015
3,4,2016
4,5,2017
5,6,2019
6,7,2020


In [4]:
get_id_table('gbd_round').dtypes

gbd_round_id     int64
gbd_round       object
dtype: object

In [5]:
def get_ids(entity, *entity_names):
    """Return a Series mapping entity names to their ids.

    Parameters
    ----------
    entity : str
        Name of the id table to fetch with `get_id_table`, e.g. 'measure'.
    *entity_names : str
        Names to keep. If no names are given, the whole table is returned.

    Returns
    -------
    pandas.Series
        The `{entity}_id` column indexed by the entity's name column. Rows
        keep the order of the id table, not the order of `entity_names`.
    """
    ids = get_id_table(entity)
    # The 'gbd_round' table keeps its names in a column called 'gbd_round',
    # not 'gbd_round_name' like the other tables used here.
    if entity in ['gbd_round']:
        entity_name_column = f'{entity}'
    else:
        entity_name_column = f'{entity}_name'
    if entity_names:
        # Compare values directly with isin() instead of pasting repr(tuple)
        # into a query string: that only works when every name's repr happens
        # to be a literal the query parser can read back.
        ids = ids[ids[entity_name_column].isin(entity_names)]
    return ids.set_index(entity_name_column)[f'{entity}_id']

In [6]:
get_ids('measure', 'Prevalence')

measure_name
Prevalence    5
Name: measure_id, dtype: int64

In [7]:
get_ids('measure', 'Prevalence','YLDs (Years Lived with Disability)')

measure_name
YLDs (Years Lived with Disability)    3
Prevalence                            5
Name: measure_id, dtype: int64

In [8]:
# Note: The db_queries functions won't accept the numpy int64 type as an argument for gbd_round_id;
# it must be converted to an ordinary Python int. Series.astype(int) does NOT do that: as this
# cell's output shows, the result still has dtype int64.
get_ids('gbd_round', '2019').astype(int)

gbd_round
2019    6
Name: gbd_round_id, dtype: int64

# Goal 2: Try to find stunting data

Figure out which decomp step we need by using `get_best_model_versions`. The returned df will be empty unless the decomp step matches the status, so if we get a nonempty result when `status='best'`, then we know we have the right decomp step (step 4 is the correct one).

https://scicomp-docs.ihme.washington.edu/db_queries/current/get_best_model_versions.html?highlight=decomp

In [9]:
# Keep the full modelable-entity table in `mod_ent_table`; a later cell looks up a row in it by index label.
mod_ent_table = get_id_table('modelable_entity')
# Case-sensitive substring match: only names containing lowercase "stunting" are shown.
mod_ent_table[mod_ent_table.modelable_entity_name.str.contains("stunting")]

Unnamed: 0,modelable_entity_id,modelable_entity_name,modelable_entity_description
2455,8831,Child stunting PAF,PAF model for risk: Childhood stunting
2600,9083,Child stunting Relative Risk,Relative Risks for risk: Childhood stunting
3312,10513,Childhood stunting SD,Use height for age data (height for age z scor...
4418,16515,"Child stunting exposure, <-3 sd, interpolated ...","Child stunting exposure, <-3 sd, interpolated ..."
4419,16516,"Child stunting exposure, -3 to -2 sd, interpo...","Child stunting exposure, -3 to -2 sd, interpo..."
4420,16517,"Child stunting exposure, -2 to -1 sd, interpo...","Child stunting exposure, -2 to -1 sd, interpo..."


In [10]:
mod_ent_table.loc[4420].values

array([16517,
       'Child stunting exposure,  -2 to -1 sd, interpolated annual results',
       'Child stunting exposure,  -2 to -1 sd, interpolated annual results'],
      dtype=object)

In [11]:
# A non-empty result for status='best' with decomp_step='step4' shows that step 4 is the
# decomp step to use for these stunting modelable entities in GBD round 6.
get_best_model_versions(entity='modelable_entity',
                        ids=[8831, 9083, 10513, 16515, 16516, 16517],
                        gbd_round_id=6,
                        status='best',
                        decomp_step='step4'
                       )

Unnamed: 0,cause_id,cause_name,rei_id,rei_name,modelable_entity_id,modelable_entity_name,model_version_id,best_start,date_inserted,description
0,,,241,Child stunting,8831,Child stunting PAF,492326,2020-02-10 18:21:26,2020-02-10 18:21:26,Interpolation of Final Results 1990-2019 (sex ...
1,,,241,Child stunting,9083,Child stunting Relative Risk,417476,2019-08-26 18:48:18,2019-08-26 18:48:15,Copy of GBD 2017 relative risks (model_version...


## Write a function to search names in an id table like we did above

Hmm, there are actually many more rows for stunting when we ignore case...

In [12]:
def search_id_table(entity, pattern, **contains_kwargs):
    """Return the rows of an id table whose name matches `pattern`.

    Parameters
    ----------
    entity : str
        Name of the id table to fetch with `get_id_table`, e.g. 'modelable_entity'.
    pattern : str
        Pattern passed to `Series.str.contains` on the entity's name column.
    **contains_kwargs
        Extra keyword arguments forwarded to `Series.str.contains`
        (e.g. `case=False`, `regex=False`). `na` defaults to False here.

    Returns
    -------
    pandas.DataFrame
        The matching rows of the id table, with all of its columns.
    """
    ids = get_id_table(entity)
    # The 'gbd_round' table keeps its names in a column called 'gbd_round',
    # not 'gbd_round_name' like the other tables used here.
    if entity in ['gbd_round']:
        entity_name_column = f'{entity}'
    else:
        entity_name_column = f'{entity}_name'
    # A missing name makes str.contains return NaN for that row, and a mask
    # containing NaN cannot be used for boolean indexing. Treat missing names
    # as non-matches unless the caller explicitly passes their own `na`.
    contains_kwargs.setdefault('na', False)
    return ids[ids[entity_name_column].str.contains(pattern, **contains_kwargs)]

search_id_table('modelable_entity', 'stunting', case=False)

Unnamed: 0,modelable_entity_id,modelable_entity_name,modelable_entity_description
2455,8831,Child stunting PAF,PAF model for risk: Childhood stunting
2529,8949,"Severe Stunting, < -3 SD (post-ensemble)",Uses STGPR estimates at thresholds and mean to...
2530,8950,Stunting Between -3 SD and -2 SD (post-ensemble),Uses STGPR estimates at thresholds and mean to...
2531,8951,Stunting Between -2 SD and -1 SD (post-ensemble),Uses STGPR estimates at thresholds and mean to...
2600,9083,Child stunting Relative Risk,Relative Risks for risk: Childhood stunting
3311,10512,Mean Stunting (post-ensemble),Uses STGPR estimates at thresholds and mean to...
3312,10513,Childhood stunting SD,Use height for age data (height for age z scor...
3346,10556,"Moderate Stunting, < -2 SD (post-ensemble)",Uses STGPR estimates at thresholds and mean to...
3347,10557,"Mild Stunting, < -1 SD (post-ensemble)",Uses STGPR estimates at thresholds and mean to...
4418,16515,"Child stunting exposure, <-3 sd, interpolated ...","Child stunting exposure, <-3 sd, interpolated ..."


## Write a function to search the id table as above, then return a map of names to id's

In [13]:
def find_ids(entity, pattern, **contains_kwargs):
    """Search an id table by name and return a name -> id Series.

    `entity`, `pattern` and any extra keyword arguments are handed to
    `search_id_table` unchanged; the matching rows are then reduced to the
    `{entity}_id` column, indexed by the entity's name column.
    """
    # 'gbd_round' is the one table whose name column has no '_name' suffix.
    name_col = f'{entity}' if entity in ['gbd_round'] else f'{entity}_name'
    id_col = f'{entity}_id'
    matches = search_id_table(entity, pattern, **contains_kwargs)
    return matches.set_index(name_col)[id_col]

find_ids('modelable_entity', 'stunting', case=False)

modelable_entity_name
Child stunting PAF                                                         8831
Severe Stunting, < -3 SD (post-ensemble)                                   8949
Stunting Between -3 SD and -2 SD (post-ensemble)                           8950
Stunting Between -2 SD and -1 SD (post-ensemble)                           8951
Child stunting Relative Risk                                               9083
Mean Stunting (post-ensemble)                                             10512
Childhood stunting SD                                                     10513
Moderate Stunting, < -2 SD (post-ensemble)                                10556
Mild Stunting, < -1 SD (post-ensemble)                                    10557
Child stunting exposure, <-3 sd, interpolated annual results              16515
Child stunting exposure,  -3 to -2 sd, interpolated annual results        16516
Child stunting exposure,  -2 to -1 sd, interpolated annual results        16517
Mean Stunting (pos

## Look up model versions again, but with *all* the id's we found for stunting

In [14]:
# Same check as before, but for every modelable entity whose name contains "stunting"
# (case-insensitive): a non-empty result for status='best' shows that decomp step 4 is the one to use.
get_best_model_versions(entity='modelable_entity',
                        ids=find_ids('modelable_entity', 'stunting', case=False).to_list(),
                        gbd_round_id=6,
                        status='best',
                        decomp_step='step4'
                       ) 

Unnamed: 0,cause_id,cause_name,rei_id,rei_name,modelable_entity_id,modelable_entity_name,model_version_id,best_start,date_inserted,description
0,,,241,Child stunting,8831,Child stunting PAF,492326,2020-02-10 18:21:26,2020-02-10 18:21:26,Interpolation of Final Results 1990-2019 (sex ...
1,,,241,Child stunting,8949,"Severe Stunting, < -3 SD (post-ensemble)",449417,2019-10-07 22:13:28,2019-10-07 22:11:31,GBD19 Final Results
2,,,241,Child stunting,8950,Stunting Between -3 SD and -2 SD (post-ensemble),449447,2019-10-07 22:28:44,2019-10-07 22:26:37,GBD19 Final Results
3,,,241,Child stunting,8951,Stunting Between -2 SD and -1 SD (post-ensemble),448088,2019-10-07 18:41:33,2019-10-07 18:39:58,GBD19 Final Results
4,,,241,Child stunting,9083,Child stunting Relative Risk,417476,2019-08-26 18:48:18,2019-08-26 18:48:15,Copy of GBD 2017 relative risks (model_version...
5,,,241,Child stunting,10556,"Moderate Stunting, < -2 SD (post-ensemble)",449411,2019-10-07 22:10:32,2019-10-07 22:07:51,GBD19 Final Results
6,,,241,Child stunting,10557,"Mild Stunting, < -1 SD (post-ensemble)",449462,2019-10-07 22:48:15,2019-10-07 22:46:14,GBD19 Final Results
7,,,241,Child stunting,23510,"Mean Stunting (post-STGPR, pre-ensemble)",446045,2019-10-08 18:27:16,2019-10-05 11:04:45,stgpr_version_id 98159: GBD 2019 Decomp 4 Results
8,,,241,Child stunting,23515,"Mild Stunting, < -1 SD (post-STGPR, pre-ensemble)",446054,2019-11-21 21:36:07,2019-10-05 11:06:01,stgpr_version_id 98270: GBD 2019 Decomp 4 Results
9,,,241,Child stunting,23516,"Moderate Stunting, < -2 SD (post-STGPR, pre-en...",446030,2019-11-21 21:37:51,2019-10-05 11:00:26,stgpr_version_id 98267: GBD 2019 Decomp 4 Results


# Goal 3: Try pulling SEV values for stunting

The only available outputs for risks from `get_outputs` are attributable Deaths, DALYs, YLDs, and YLLs for a risk-cause pair, or the Summary Exposure Value (SEV) for the risk:

https://scicomp-docs.ihme.washington.edu/db_queries/current/get_outputs/available.html#risk-factor-results

What we really want is prevalence, so we'll probably have to use `get_draws` instead, but let's see if we can do a quick calculation using SEVs.

Nope, we can't. The function call for GBD 2019 isn't working... though I could use GBD 2017 instead.

In [15]:
get_ids('rei', 'Child stunting')

rei_name
Child stunting    241
Name: rei_id, dtype: int64

In [16]:
get_ids('measure', 'Summary exposure value')

measure_name
Summary exposure value    29
Name: measure_id, dtype: int64

In [17]:
# The metric for SEVs is 'rate', according to the documentation
get_id_table('metric')

Unnamed: 0,metric_id,metric_name,metric_description
0,1,Number,Like number of deaths
1,2,Percent,Pc is percent of deaths or attributable risk f...
2,3,Rate,Rate
3,4,Rank,Ranking in GBD Compare.
4,5,Years,For those that are measured in years
5,6,p-value,Draw-level statistical significance. (number o...
6,7,MDG p-value,Draw-level statistical significance of achievi...
7,8,Probability of death,Probability of death
8,9,Index score,index score (0-100)


## For GBD 2017, we get results...

In [18]:
# Pull the child stunting SEV for GBD 2017. All ids are looked up by name with get_ids;
# gbd_round_id is unwrapped to a plain Python int because db_queries does not accept numpy int64 there.
# decomp_step and version are left commented out: the 2017 query returns results without them.
get_outputs('rei',
            rei_id=get_ids('rei', 'Child stunting').to_list(),
            measure_id=get_ids('measure', 'Summary exposure value').to_list(),
            metric_id=get_ids('metric', 'Rate').to_list(),
            gbd_round_id=int(get_ids('gbd_round', '2017').iloc[0]),
#             decomp_step='step4',
#             version='latest',
           ).head()

Unnamed: 0,age_group_id,location_id,measure_id,metric_id,rei_id,sex_id,year_id,age_group_name,expected,location_name,location_type,measure_name,metric_name,rei,rei_name,sex,val,upper,lower
0,22,1,29,3,241,3,2017,All Ages,False,Global,global,Summary exposure value,Rate,nutrition_stunting,Child stunting,Both,0.015224,0.017321,0.010703


In [19]:
get_id_table('age_group').query('age_group_id==22')

Unnamed: 0,age_group_id,age_group_name
21,22,All Ages


## ...but with the same query for GBD 2019, it says no tables are found

```
RuntimeError: No tables found for given search parameters. Tables are filtered based on measure, topic, and version arguments, so please verify those make sense. IE you will see this error if you look for SEVs with topic = cause. Given search parameters are: {'topic': 'rei', 'gbd_round_id': [6], 'compare_version_id': [7261], 'process_version_id': [14770, 14849, 14778], 'decomp_step': 'step4', 'version': 'latest', 'conn_def': 'gbd', 'year_id': [2019], 'location_set_id': [35], 'location_id': [1], 'age_group_id': [22], 'sex_id': [3], 'cause_id': [294], 'cause_set_id': [3], 'rei_set_id': [1], 'rei_id': [241], 'measure_id': [29], 'metric_id': [3]}
```

First I tried it with `version='best'`, but that said:
```
RuntimeError: no best compare version id found for gbd_round_id 6. Perhaps try version='latest' instead?
```
Using `decomp_step='step3'` and `version='latest'`, I get:
```
RuntimeError: no latest compare version id found for gbd_round_id 6
```
Most other combinations I've tried give me an error similar to the one below, saying no tables are found.

In [20]:
# This query currently fails for GBD 2019 with "No tables found" (see the notes above).
# Catch the error and print its message so the failure stays documented in the output
# without halting "Restart & Run All" before the population section below.
try:
    sev_2019 = get_outputs('rei',
                           rei_id=get_ids('rei', 'Child stunting').to_list(),
                           measure_id=get_ids('measure', 'Summary exposure value').to_list(),
                           metric_id=get_ids('metric', 'Rate').to_list(),
                           # db_queries needs a plain Python int here, not numpy int64
                           gbd_round_id=int(get_ids('gbd_round', '2019').iloc[0]),
                           decomp_step='step4',
                           version='latest',
                          )
except RuntimeError as err:
    print(f'RuntimeError: {err}')
else:
    # Only reached if the query starts working; show the same preview as the 2017 cell.
    display(sev_2019.head())

RuntimeError: No tables found for given search parameters. Tables are filtered based on measure, topic, and version arguments, so please verify those make sense. IE you will see this error if you look for SEVs with topic = cause. Given search parameters are: {'topic': 'rei', 'process_version_id': [14770, 14849, 14778], 'version': 'latest', 'gbd_round_id': [6], 'decomp_step': 'step4', 'compare_version_id': [7261], 'conn_def': 'gbd', 'year_id': [2019], 'location_id': [1], 'location_set_id': [35], 'age_group_id': [22], 'sex_id': [3], 'cause_set_id': [3], 'cause_id': [294], 'rei_set_id': [1], 'rei_id': [241], 'measure_id': [29], 'metric_id': [3]}

# Goal 4: Try looking up populations, since we want to order countries by number of children stunted

https://scicomp-docs.ihme.washington.edu/db_queries/current/get_population.html

In [21]:
find_ids('age_group', 'under', case=False)

age_group_name
Under 5    1
Name: age_group_id, dtype: int64

In [22]:
# There's no 'year_name' for the year table, apparently
get_id_table('year').head()

Unnamed: 0,year_id
0,0
1,1950
2,1951
3,1952
4,1953


In [23]:
get_id_table('year').tail()

Unnamed: 0,year_id
97,2046
98,2047
99,2048
100,2049
101,2050


In [24]:
get_id_table('year').dtypes

year_id    int64
dtype: object

In [25]:
get_id_table('location').head(10)

Unnamed: 0,location_id,location_name,location_type,location_description
0,1,Global,global,
1,6,China,admin0,admin0
2,7,Democratic People's Republic of Korea,admin0,admin0
3,8,Taiwan (Province of China),admin0,admin0
4,10,Cambodia,admin0,admin0
5,11,Indonesia,admin0,admin0
6,12,Lao People's Democratic Republic,admin0,admin0
7,13,Malaysia,admin0,admin0
8,14,Maldives,admin0,admin0
9,15,Myanmar,admin0,admin0


In [26]:
# Pull the 2019 under-5 population for all locations; `population` is reused by the cells below.
population = get_population(
    # The 'under' search matches only the 'Under 5' age group (id 1), as the previous lookup showed.
    age_group_id=find_ids('age_group', 'under', case=False).to_list(),
    location_id='all',
    year_id=2019,
    # gbd_round_id 6 is the 2019 round in the gbd_round id table.
    gbd_round_id=6,
    # step4 is the decomp step that had 'best' stunting models in Goal 2.
    decomp_step='step4',
    # The result includes `upper` and `lower` columns alongside `population`;
    # presumably that is what with_ui=True requests -- confirm in the get_population docs.
    with_ui=True,
)

# Show the number of rows/columns, then preview the first rows.
print(population.shape)
population.head()

(1082, 8)


Unnamed: 0,age_group_id,location_id,year_id,sex_id,population,upper,lower,run_id
0,1,1,2019,3,662842700.0,681974500.0,643879200.0,192
1,1,31,2019,3,27561080.0,29081740.0,25961040.0,192
2,1,32,2019,3,9572408.0,10530680.0,8656851.0,192
3,1,33,2019,3,204498.2,229297.8,179588.6,192
4,1,34,2019,3,759744.9,860372.2,661797.5,192


## Write a function to map id's to names (inverse of first function I wrote above)

In [27]:
def ids_to_names(entity, *entity_ids):
    """Return a Series mapping entity ids to their names (inverse of `get_ids`).

    Parameters
    ----------
    entity : str
        Name of the id table to fetch with `get_id_table`, e.g. 'location'.
    *entity_ids : int
        Ids to keep. If no ids are given, the whole table is returned.

    Returns
    -------
    pandas.Series
        The entity's name column indexed by `{entity}_id`. Rows keep the
        order of the id table, not the order of `entity_ids`.
    """
    ids = get_id_table(entity)
    # The 'gbd_round' table keeps its names in a column called 'gbd_round',
    # not 'gbd_round_name' like the other tables used here.
    if entity in ['gbd_round']:
        entity_name_column = f'{entity}'
    else:
        entity_name_column = f'{entity}_name'
    if entity_ids:
        # Compare values directly with isin() instead of pasting repr(tuple)
        # into a query string: numpy integer scalars, such as the values pulled
        # out of an id Series with .iloc, may have a repr like `np.int64(6)`
        # that the query parser cannot evaluate.
        ids = ids[ids[f'{entity}_id'].isin(entity_ids)]
    return ids.set_index(f'{entity}_id')[entity_name_column]

ids_to_names('location').head()

location_id
1                                    Global
6                                     China
7     Democratic People's Republic of Korea
8                Taiwan (Province of China)
10                                 Cambodia
Name: location_name, dtype: object

## Replace location id's with names using my above function and `DataFrame.replace()`

This call is rather slow; it may be faster to merge with the location id table instead.

In [28]:
population.replace({'location_id': ids_to_names('location')}).head()

Unnamed: 0,age_group_id,location_id,year_id,sex_id,population,upper,lower,run_id
0,1,Global,2019,3,662842700.0,681974500.0,643879200.0,192
1,1,"Central Europe, Eastern Europe, and Central Asia",2019,3,27561080.0,29081740.0,25961040.0,192
2,1,Central Asia,2019,3,9572408.0,10530680.0,8656851.0,192
3,1,Armenia,2019,3,204498.2,229297.8,179588.6,192
4,1,Azerbaijan,2019,3,759744.9,860372.2,661797.5,192


In [30]:
# Try with merge instead
population.merge(get_id_table('location'), on='location_id').head()

Unnamed: 0,age_group_id,location_id,year_id,sex_id,population,upper,lower,run_id,location_name,location_type,location_description
0,1,1,2019,3,662842700.0,681974500.0,643879200.0,192,Global,global,
1,1,31,2019,3,27561080.0,29081740.0,25961040.0,192,"Central Europe, Eastern Europe, and Central Asia",superregion,superregion
2,1,32,2019,3,9572408.0,10530680.0,8656851.0,192,Central Asia,region,region
3,1,33,2019,3,204498.2,229297.8,179588.6,192,Armenia,admin0,admin0
4,1,34,2019,3,759744.9,860372.2,661797.5,192,Azerbaijan,admin0,INF-5395 - Subnationals for Azerbaijan (locati...
