# Country Covariates Cases

In [35]:
# import libraries
from scipy.interpolate import griddata, interp2d

from cascade.input_data.db.bundle import _get_bundle_data
from cascade.input_data.configuration.construct_country import (
    convert_gbd_ids_to_dismod_values,)
from cascade.input_data.db.country_covariates import country_covariates
from cascade.input_data.db.demographics import get_all_age_spans
from cascade.testing_utilities import make_execution_context

In [18]:
# create execution context
# The GBD round is set on the context's parameters here; the covariate
# retrieval loop further down reads it back from
# execution_context.parameters.gbd_round_id.
execution_context = make_execution_context()
execution_context.parameters.gbd_round_id = 5

In [19]:
def cov_summaries(covariate, covariate_id):
    """Print a short summary of one country covariate's data.

    Prints a header with the covariate id and short name, the frame's shape,
    and then the distinct location, age-group, sex and year ids it contains,
    each followed by a blank line.

    Args:
        covariate: DataFrame with the columns ``covariate_name_short``,
            ``location_id``, ``age_group_id``, ``sex_id`` and ``year_id``.
        covariate_id: Identifier shown in the header line.
    """
    names = covariate["covariate_name_short"]
    # Take the first row by position: a label lookup of 0 fails when the
    # frame was filtered or re-indexed, and an empty frame has no first row.
    short_name = names.iloc[0] if len(names) else "<no rows>"

    print(f"covariate {covariate_id}, {short_name}")
    print(f"    shape: {covariate.shape}")
    for column in ("location_id", "age_group_id", "sex_id", "year_id"):
        print(f"    {column}s: {covariate[column].unique()}")
        print()

In [23]:
# get covariates for USA
location_id = 102
demographics = {
    "age_group_ids": "all",
    "year_ids": "all",
    "sex_ids": "all",
    "location_ids": [location_id],
}

# age spans are needed to turn the GBD ids into dismod ranges below
age_groups = get_all_age_spans()
print("age groups", list(age_groups.columns), age_groups.shape, "", sep="\n")

# Covariates 1095 and 1985 have no "best" version for gbd_round_id=5, so
# retrieving them throws an error; they are left out of the list.
covariate_ids = [26, 14, 869, 258, 68, 842, 1194, 1195, 1241]
ccovs = {}

gbd_round_id = execution_context.parameters.gbd_round_id
midpoint_columns = (
    ("avg_age", ["age_lower", "age_upper"]),
    ("avg_time", ["time_lower", "time_upper"]),
)

for covariate_id in covariate_ids:
    ccov_data = country_covariates(covariate_id, demographics, gbd_round_id)
    ccov_ranges_df = convert_gbd_ids_to_dismod_values(ccov_data, age_groups)
    # add the midpoint of each age and time interval as its own column
    for midpoint, bounds in midpoint_columns:
        ccov_ranges_df[midpoint] = ccov_ranges_df[bounds].mean(axis=1)
    ccovs[covariate_id] = ccov_ranges_df
    cov_summaries(ccov_data, covariate_id)

print(f"{len(ccovs)} covariates retrieved for the examples")

age groups
['age_group_id', 'age_group_years_start', 'age_group_years_end']
(406, 3)

covariate 26, cum_cigs_20_yr
    shape: (1748, 7)
    location_ids: [102]

    age_group_ids: [  2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19
  20  30  31  32 235]

    sex_ids: [1 2]

    year_ids: [1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993
 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007
 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017]

covariate 14, cigarettes_pc
    shape: (58, 7)
    location_ids: [102]

    age_group_ids: [22]

    sex_ids: [3]

    year_ids: [1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973
 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987
 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001
 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
 2016 2017]

covariate 869, malaria_incidence
    shape: (874, 7)
    location_ids

In [31]:
# Case - Interpolate over age and time by sex -- the covariate is by_age,
# by_sex, and has multiple time values
#   covariate_id = 26, cum cigs 20 years
#   measurement bundle 173

ccov_26 = ccovs[26]

# rows with x_sex == -0.5 go to the female frame (_f), 0.5 to the male (_m)
is_female = ccov_26["x_sex"] == -0.5
is_male = ccov_26["x_sex"] == 0.5
ccov_26_f = ccov_26.loc[is_female]
ccov_26_m = ccov_26.loc[is_male]

for by_sex in (ccov_26_f, ccov_26_m):
    print(by_sex.shape)


(874, 13)
(874, 13)


In [48]:
# look at covariate values
# Print the male covariate value at avg_age 42.5 for two avg_time points, to
# compare against the interpolated values computed in the next cells.

for avg_time in (1980.5, 2002.5):
    at_point = (ccov_26_m["avg_age"] == 42.5) & (ccov_26_m["avg_time"] == avg_time)
    # float(<Series>) is deprecated in recent pandas; .item() extracts the
    # scalar and still raises unless exactly one row matched.
    mean_value = float(ccov_26_m.loc[at_point, "mean_value"].item())
    print(f"ccov_26_m, 42.5, {avg_time}: \n      {mean_value}")
            

ccov_26_m, 42.5, 1980.5: 
      4919.19753090102
ccov_26_m, 42.5, 2002.5: 
      1915.6339250152398


In [52]:
# check interpolation method - griddata
# Query points are (age, time) pairs. griddata fills points that fall outside
# the convex hull of the known points with nan by default, which is why the
# first and last results are nan.

age = [42, 42, 42, 42, 42, 135]
time = [1978, 1980.5, 1985, 2002, 2003, 2003]

known_points = (ccov_26_m["avg_age"], ccov_26_m["avg_time"])
known_values = ccov_26_m["mean_value"]
griddata_m = griddata(known_points, known_values, (age, time))

for shown in (type(griddata_m), griddata_m.shape, time, griddata_m):
    print(shown)

<class 'numpy.ndarray'>
(6,)
[1978, 1980.5, 1985, 2002, 2003, 2003]
[          nan 4992.91345524 4852.41067621 1958.38155127 1797.75895333
           nan]


In [50]:
# check interpolation method - interp2d
# NOTE(review): scipy.interpolate.interp2d is deprecated since SciPy 1.10 and
# removed in 1.14, so this cell only runs on older SciPy versions.

f_ccov_26_m = interp2d(
    ccov_26_m["avg_age"],
    ccov_26_m["avg_time"],
    ccov_26_m["mean_value"],
)

age = [42, 42, 42, 42]
time = [1980.5, 1985, 2002, 2003]

# evaluate one (age, time) pair per call; each call yields a 1-element array
interp2d_m = [f_ccov_26_m(x, y) for x, y in zip(age, time)]

for shown in (type(interp2d_m), len(interp2d_m), time, interp2d_m):
    print(shown)

<class 'list'>
4
[1980.5, 1985, 2002, 2003]
[array([2317.30844282]), array([2252.7845264]), array([937.74567859]), array([862.03057269])]


In [None]:
# Case - Measurement data has sex=both, but covariate only has values for sex=female and sex=male
# use covariate_id = 26, cum cigs 20 years
# use measurement bundle 173


In [None]:
# Case - Measurement data has a time point outside the range of the covariate
# use covariate_id = 26, cum cigs 20 years
# use measurement bundle 173


In [None]:
# Case - Measurement data has an age interval which does not overlap with the covariate age intervals
# check these:
# covariate_id = 258
# covariate_id = 1095, art_coverage


In [None]:
# Case - Measurement data has sex=female and sex=male, but covariate is not by_sex
# covariate_id = 14, cigs per capita
# covariate_id = 869, malaria incidence


In [None]:
# Case - Interpolate over time only, because covariate is not by_age (there is only one age group - 22 or 27)
# covariate_id = 14, cigs per capita
# covariate_id = 68, mean_bmi


In [None]:
# Case - Interpolate over age only, because covariate only has values for one year (would this ever happen?)
# make something up here


In [None]:
# Case - covariate is binary
# check these: 
# covariate_id = 842
# covariate_id = 1194
# covariate_id = 1195
# covariate_id = 1241


In [59]:
# Report the range of mean_value for covariates expected to be binary:
# a binary covariate should only span 0.0 to 1.0.
ccov_ids = [842, 1194, 1241]
ccov_842, ccov_1194, ccov_1241 = (ccovs[binary_id] for binary_id in ccov_ids)

for ccov_id in ccov_ids:
    ccov = ccovs[ccov_id]
    short_name = ccov.loc[0, 'covariate_name_short']
    lowest = ccov.mean_value.min()
    highest = ccov.mean_value.max()
    print(f"ccov {ccov_id} {short_name} (min, max): {lowest}, {highest}")


ccov 842 H5N1_epidemic_yrs_bin (min, max): 0.0, 1.0
ccov 1194 fortification_standard (min, max): 1.0, 1.0
ccov 1241 fortification_standard_iron (min, max): 0.0, 1.0
