In [1]:
import pandas as pd, numpy as np
import db_queries as db
# import id_helper as idh

!whoami
!date

ndbs
Mon Oct 12 17:02:52 PDT 2020


In [2]:
!pwd

/ihme/homes/ndbs/vivarium_data_analysis/pre_processing/lsff_project


In [3]:
import sys, os.path

# Add the vivarium_data_analysis directory (two levels above the working directory) to the
# python path. The membership check keeps this cell idempotent: re-running it does not
# append the same entry to sys.path again.
repo_root = os.path.abspath("../..")
if repo_root not in sys.path:
    sys.path.append(repo_root)
from pre_processing import id_helper as idh

In [4]:
%load_ext autoreload
%autoreload 2

# Copy valid entities from documentation and transform into a list of strings

https://scicomp-docs.ihme.washington.edu/db_queries/current/get_ids.html

In [5]:
# Raw text pasted from the `get_ids` documentation page: one curly-quoted entity per line.
# It is kept under its own name so that `entities` is only ever bound to the parsed list.
entities_doc_text = """
“age_group”

“age_group_set”

“cause”

“cause_set”

“cause_set_version”

“covariate”

“decomp_step”

“gbd_round”

“healthstate”

“indicator_component”

“life_table_parameter”

“location”

“location_set”

“location_set_version”

“measure”

“metric”

“modelable_entity”

“sdg_indicator”

“sequela”

“sequela_set”

“sequela_set_version”

“sex”

“split”

“study_covariate”

“rei”

“rei_set”

“rei_set_version”

“year”
"""
# Split on whitespace (this also discards the blank lines) and strip the typographic quotes.
entities = [token.strip('“”') for token in entities_doc_text.split()]
entities

['age_group',
 'age_group_set',
 'cause',
 'cause_set',
 'cause_set_version',
 'covariate',
 'decomp_step',
 'gbd_round',
 'healthstate',
 'indicator_component',
 'life_table_parameter',
 'location',
 'location_set',
 'location_set_version',
 'measure',
 'metric',
 'modelable_entity',
 'sdg_indicator',
 'sequela',
 'sequela_set',
 'sequela_set_version',
 'sex',
 'split',
 'study_covariate',
 'rei',
 'rei_set',
 'rei_set_version',
 'year']

In [6]:
len(entities)

28

# Find entities for which there is no "entity_name" column

For all of these **except** `'life_table_parameter'` and `'year'`, the name column is the name of the entity itself (without `_name` appended).

For `'life_table_parameter'`, the name column is `'parameter_name'`, and for `'year'`, there is only one column, the `'year_id'` column.

Note that the "id" column is always `f'{entity}_id'`.

In [7]:
# Query each entity's id table once and keep its column Index, so the membership test and
# the stored value below share a single `db.get_ids` call per entity.
entity_columns = {entity: db.get_ids(entity).columns for entity in entities}

# Entities whose id table lacks the conventional f'{entity}_name' column, mapped to the
# columns that the table actually has.
anomalous_entity_names = {
    entity: columns
    for entity, columns in entity_columns.items()
    if f'{entity}_name' not in columns
}
anomalous_entity_names

{'cause_set_version': Index(['cause_set_version_id', 'cause_set_version',
        'cause_set_version_description'],
       dtype='object'),
 'gbd_round': Index(['gbd_round_id', 'gbd_round'], dtype='object'),
 'life_table_parameter': Index(['life_table_parameter_id', 'parameter_name', 'parameter_description'], dtype='object'),
 'location_set_version': Index(['location_set_version_id', 'location_set_version',
        'location_set_version_description'],
       dtype='object'),
 'sequela_set_version': Index(['sequela_set_version_id', 'sequela_set_version',
        'sequela_set_version_description'],
       dtype='object'),
 'sex': Index(['sex_id', 'sex'], dtype='object'),
 'study_covariate': Index(['study_covariate_id', 'study_covariate', 'study_covariate_description'], dtype='object'),
 'rei_set_version': Index(['rei_set_version_id', 'rei_set_version', 'rei_set_version_description'], dtype='object'),
 'year': Index(['year_id'], dtype='object')}

In [8]:
len(anomalous_entity_names)

9

## See what some of them look like

In [9]:
db.get_ids('cause_set_version').head()

Unnamed: 0,cause_set_version_id,cause_set_version,cause_set_version_description
0,2,CodCorrect 2013 INCORRECT,CodCorrect hierarchy used in the 2013 round. T...
1,3,CodCorrect 2015,"CodCorrect hierarchy for the 2015 round, initi..."
2,4,GBD computation 2010,Computation or analytical causes (as they were...
3,5,"GBD computation 2013_v1, mort/cod paper INCORRECT",Causes used in computation to generate results...
4,6,GBD computation 2015,"Causes for GBD computation in 2015 round, init..."


In [10]:
db.get_ids('gbd_round')

Unnamed: 0,gbd_round_id,gbd_round
0,1,2010
1,2,2013
2,3,2015
3,4,2016
4,5,2017
5,6,2019
6,7,2020


In [11]:
db.get_ids('life_table_parameter')

Unnamed: 0,life_table_parameter_id,parameter_name,parameter_description
0,1,mx,mortality rate
1,2,ax,mean years lived in age interval among those w...
2,3,qx,probability of death
3,4,lx,survivorship curve
4,5,ex,life expectancy
5,6,pred_ex,interpolated life expectancy from the theoreti...
6,7,nLx,person years lived between age x and x+n
7,8,Tx,person-years lived in and above age interval


In [12]:
db.get_ids('sex')

Unnamed: 0,sex_id,sex
0,1,Male
1,2,Female
2,3,Both
3,4,Unknown


In [13]:
db.get_ids('study_covariate').head()

Unnamed: 0,study_covariate_id,study_covariate,study_covariate_description
0,0,sex,special sex covariate used to in epi tool only
1,1,abuse_dependence,identifies results of abuse and dependence com...
2,2,acne_low,data includes cases of low grade acne
3,3,asian_datapoints,Identifies countries from Asia
4,4,adult_less_5,identifies data of adults with 4 or fewer spec...


In [14]:
db.get_ids('year').head()

Unnamed: 0,year_id
0,0
1,1950
2,1951
3,1952
4,1953


In [15]:
# The 'year' table has no name column, so look at its ids rendered as strings.
# NOTE(review): presumably a candidate stand-in for year "names" -- confirm against id_helper.
db.get_ids('year')['year_id'].astype(str).head()

0       0
1    1950
2    1951
3    1952
4    1953
Name: year_id, dtype: object

In [16]:
# 'year_id' is the table's only column, so using it as the index leaves no columns behind.
db.get_ids('year').set_index('year_id').head()

0
1950
1951
1952
1953


### List the anomalous keys so we can copy them to my module

In [17]:
anomalous_entity_names.keys()

dict_keys(['cause_set_version', 'gbd_round', 'life_table_parameter', 'location_set_version', 'sequela_set_version', 'sex', 'study_covariate', 'rei_set_version', 'year'])

# Test my `names_to_ids` function and my `ids_to_names` function, with no name or id parameters passed

In [18]:
# Pair every entity with the name -> id lookup that `names_to_ids` returns when no names are passed.
names_to_ids_dict = dict(zip(entities, map(idh.names_to_ids, entities)))
# Peek at the first few entries of one lookup.
names_to_ids_dict['age_group'].head()

age_group_name
Under 5           1
Early Neonatal    2
Late Neonatal     3
Post Neonatal     4
1 to 4            5
Name: age_group_id, dtype: int64

In [19]:
# Pair every entity with the id -> name lookup that `ids_to_names` returns when no ids are passed.
ids_to_names_dict = dict(zip(entities, map(idh.ids_to_names, entities)))
# Show the lookup for an entity whose name column is not f'{entity}_name'.
ids_to_names_dict['life_table_parameter']

life_table_parameter_id
1         mx
2         ax
3         qx
4         lx
5         ex
6    pred_ex
7        nLx
8         Tx
Name: parameter_name, dtype: object

## Check that `names_to_ids` and `ids_to_names` are inverses

In [20]:
names_to_ids_dict['life_table_parameter']

parameter_name
mx         1
ax         2
qx         3
lx         4
ex         5
pred_ex    6
nLx        7
Tx         8
Name: life_table_parameter_id, dtype: int64

In [21]:
# Check whether the two lookups are "aligned", i.e. their entries appear in the same order:
# indexing the name->id Series by the names taken from the id->name Series (in its order)
# reproduces the name->id Series only if both list their entries in the same order.
# This isn't exactly the inverse check I was trying to do, but it's the first thing I did, accidentally.
names_to_ids_dict['life_table_parameter'][ids_to_names_dict['life_table_parameter']].equals(
    names_to_ids_dict['life_table_parameter'])

True

In [22]:
# For each entity, test whether its two lookups are "aligned" (entries appear in the same order):
# re-indexing the id->name Series by the ids taken from the name->id Series must reproduce it.
ids_names_aligned = dict(
    (entity, id_to_name[name_to_id].equals(id_to_name))
    for entity, id_to_name, name_to_id in (
        (key, ids_to_names_dict[key], names_to_ids_dict[key]) for key in entities
    )
)
ids_names_aligned

{'age_group': True,
 'age_group_set': True,
 'cause': True,
 'cause_set': True,
 'cause_set_version': True,
 'covariate': True,
 'decomp_step': True,
 'gbd_round': True,
 'healthstate': True,
 'indicator_component': True,
 'life_table_parameter': True,
 'location': True,
 'location_set': True,
 'location_set_version': True,
 'measure': True,
 'metric': True,
 'modelable_entity': True,
 'sdg_indicator': True,
 'sequela': True,
 'sequela_set': True,
 'sequela_set_version': True,
 'sex': True,
 'split': True,
 'study_covariate': True,
 'rei': True,
 'rei_set': True,
 'rei_set_version': True,
 'year': True}

In [23]:
# Join the two lookups on the id: 'parameter_name_x' comes from names_to_ids and
# 'parameter_name_y' comes from ids_to_names.
# The functions are inverses if the 'parameter_name_x' column is identical to the 'parameter_name_y' column.
names_to_ids_dict['life_table_parameter'].reset_index().merge(
    ids_to_names_dict['life_table_parameter'].reset_index(), on='life_table_parameter_id')

Unnamed: 0,parameter_name_x,life_table_parameter_id,parameter_name_y
0,mx,1,mx
1,ax,2,ax
2,qx,3,qx
3,lx,4,lx
4,ex,5,ex
5,pred_ex,6,pred_ex
6,nLx,7,nLx
7,Tx,8,Tx


# Test some other functions in `id_helper`

In [24]:
idh.search_id_table('rei', 'stunting', case=False)

Unnamed: 0,rei_id,rei_name,rei
157,241,Child stunting,nutrition_stunting


In [25]:
idh.find_ids('rei', 'stunting', case=False)

241

In [26]:
idh.names_to_ids('rei', 'Child stunting')

rei_name
Child stunting    241
Name: rei_id, dtype: int64

In [27]:
idh.list_ids('rei', 'Child stunting')

241

In [28]:
# Why are the ids returned in the opposite order from the names passed in?
# NOTE(review): presumably the result follows the order of the rei id table rather than
# the argument order -- confirm in id_helper.
idh.list_ids('rei', 'Child stunting', 'Child wasting')

[240, 241]

In [29]:
# Oh -- 'Child wasting' (240) must come before 'Child stunting' (241) in the rei id table,
# so the result comes back in that order rather than in the order the names were passed.
idh.names_to_ids('rei', 'Child stunting', 'Child wasting')

rei_name
Child wasting     240
Child stunting    241
Name: rei_id, dtype: int64

In [30]:
idh.__doc__

"\nModule to facilitate using GBD id's in the shared functions.\n"

In [31]:
# NOTE(review): this count (22) differs from len(idh.entities) (28); presumably the docstring
# being parsed lists fewer entities than the online documentation -- confirm.
len(idh.get_entities_from_docstring())

22

In [32]:
len(idh.entities)

28

In [33]:
idh.find_anomalous_name_columns(idh.entities)

{'cause_set_version': Index(['cause_set_version_id', 'cause_set_version',
        'cause_set_version_description'],
       dtype='object'),
 'gbd_round': Index(['gbd_round_id', 'gbd_round'], dtype='object'),
 'life_table_parameter': Index(['life_table_parameter_id', 'parameter_name', 'parameter_description'], dtype='object'),
 'location_set_version': Index(['location_set_version_id', 'location_set_version',
        'location_set_version_description'],
       dtype='object'),
 'sequela_set_version': Index(['sequela_set_version_id', 'sequela_set_version',
        'sequela_set_version_description'],
       dtype='object'),
 'sex': Index(['sex_id', 'sex'], dtype='object'),
 'study_covariate': Index(['study_covariate_id', 'study_covariate', 'study_covariate_description'], dtype='object'),
 'rei_set_version': Index(['rei_set_version_id', 'rei_set_version', 'rei_set_version_description'], dtype='object'),
 'year': Index(['year_id'], dtype='object')}