# Code to run Curvefit across all locations for model validation exercise

In [27]:
import os
import warnings

from db_queries import get_location_metadata
import numpy as np
import pandas as pd
import yaml
import re

from covid_model_deaths import runner
from covid_model_deaths.deaths_io import InputsContext, MEASURES, Checkpoint
from covid_model_deaths.globals import COLUMNS

# Notebook display settings and blanket warning suppression for the run.
pd.options.display.max_rows = 99
pd.options.display.max_columns = 99
warnings.simplefilter('ignore')

DATA_DATE = "2020_05_29"  # Date for all data used
VALIDATION_DATE = "2020-03-27"  # Date to use data up to and including

RUN_TYPE = 'validation'
MODEL_INPUTS_VERSION = 'production-runs/' + DATA_DATE
SNAPSHOT_VERSION = 'production-runs/' + DATA_DATE
DATESTAMP_LABEL = '2020_05_23_Europe'  # Will want to change this.

PEAK_FILE = '/ihme/covid-19/deaths/mobility_inputs/2020_04_20/peak_locs_april20_.csv'
PEAK_DURATION_FILE = None
R0_FILE = None
LOCATION_SET_VERSION = 678
r0_locs = []
# Locations where no pseudo data is used
NO_PSEUDO = [
    564,  # South Dakota
    538,  # Iowa
    # Mexican subnationals
    4644, 4657, 4651, 4663, 4665, 4667, 4669,
]

# Folder name is the validation cutoff date with dashes swapped for underscores.
VALIDATION_FOLDER = VALIDATION_DATE.replace("-", "_")
CODE_DIR = os.path.abspath('../src/covid_model_deaths')
OUTPUT_DIR = f'/ihme/covid-19/deaths/{RUN_TYPE}/{DATA_DATE}/{VALIDATION_FOLDER}'
# makedirs creates every missing parent in one call and tolerates the
# directory already existing, so re-running this cell is safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)
inputs = InputsContext(f'/ihme/covid-19/model-inputs/{MODEL_INPUTS_VERSION}')
checkpoint = Checkpoint(OUTPUT_DIR)


# Draw files written by this run.
smooth_draw_path = f'{OUTPUT_DIR}/smoothed_euro_data.csv'
raw_draw_path = f'{OUTPUT_DIR}/euro_data.csv'
average_draw_path = f'{OUTPUT_DIR}/past_avg_smoothed_euro_data.csv'
# Draw files from earlier production runs, recorded in the run metadata and
# referenced by the (currently commented-out) comparison cells.
yesterday_draw_path = '/ihme/covid-19/deaths/prod/2020_05_22_Europe/smoothed_euro_data.csv'
before_yesterday_draw_path = '/ihme/covid-19/deaths/prod/2020_05_19_Europe/smoothed_euro_data.csv'
compare_average_path = '/ihme/covid-19/deaths/prod/2020_05_22_Europe/smoothed_euro_data.csv'


print(f'Writing to {OUTPUT_DIR}')
print(CODE_DIR)
print(checkpoint)



2020-06-02 16:25:53.807 | DEBUG    | covid_model_deaths.deaths_io.checkpoint:_setup_checkpoint_dir:45 - Making checkpoint directory at /ihme/covid-19/deaths/validation/2020_05_29/2020_03_27/checkpoint


Writing to /ihme/covid-19/deaths/validation/2020_05_29/2020_03_27
/ihme/code/covid-19/user/ctroeger/covid-model-deaths/src/covid_model_deaths
Checkpoint(/ihme/covid-19/deaths/validation/2020_05_29/2020_03_27/checkpoint)


In [28]:
# Scratch cell: re-derives the version strings from DATA_DATE (same values as
# the configuration cell) and displays the underscore form of VALIDATION_DATE.
MODEL_INPUTS_VERSION = f'production-runs/{DATA_DATE}'
SNAPSHOT_VERSION = f'production-runs/{DATA_DATE}'

import re
re.sub("-", "_", VALIDATION_DATE)

'2020_03_27'

# Metadata

In [29]:

# Read the metadata shipped with the model-inputs version so it can be embedded
# in this run's own metadata file.
# NOTE(review): yaml.full_load can construct non-trivial Python objects; it is
# kept as-is here, but confirm this file is always pipeline-generated.
with open(f'/ihme/covid-19/model-inputs/{MODEL_INPUTS_VERSION}/metadata.yaml') as inputs_metadata_file:
    inputs_version_metadata = yaml.full_load(inputs_metadata_file)

# Everything needed to identify how this run was configured.
metadata = {
    'inputs_version': inputs_version_metadata,
    'run_type': RUN_TYPE,
    'model_inputs_version': MODEL_INPUTS_VERSION,
    'snapshot_version': SNAPSHOT_VERSION,
    'datestamp_label': DATESTAMP_LABEL,
    'peak_file': PEAK_FILE,
    'location_set_version_id': LOCATION_SET_VERSION,
    'output_dir': OUTPUT_DIR,
    'no_pseudo': NO_PSEUDO,
    'average': {
        'yesterday': yesterday_draw_path,
        'before_yesterday': before_yesterday_draw_path,
    },
    'compare_average': compare_average_path,
}

with open(f'{OUTPUT_DIR}/metadata.yaml', 'w') as run_metadata_file:
    yaml.dump(metadata, run_metadata_file)



# Input data

In [30]:
def filter_data(data: pd.DataFrame, kind='full') -> pd.DataFrame:
    """Apply manual outlier corrections to an input dataset.

    The caller's frame is never modified; a corrected copy is returned.

    Parameters
    ----------
    data
        Input rows. Must contain 'Country/Region', 'Date' and 'location_id'.
        ``kind='full'`` also needs 'Deaths' and 'Death rate'; ``kind='deaths'``
        writes 'Days'.
    kind
        'full'   -- zero out 'Deaths' and 'Death rate' on the Iceland spike day.
        'deaths' -- drop the Iceland spike row and recompute Iceland's 'Days'
                    as days since its earliest remaining date.
        Any other value skips the Iceland correction.

    Returns
    -------
    pd.DataFrame
        Corrected data, with rows for location 60368 (labelled Catalonia in
        this code) on or after 2020-05-21 removed for every ``kind``.
    """
    # manually adjust Iceland spike (0 deaths to 5 deaths to 0 deaths in March...)
    iceland = data['Country/Region'] == 'Iceland'
    iceland_spike = iceland & (data['Date'] == pd.Timestamp('2020-03-15'))
    if kind == 'full':
        # Copy first so the assignment below cannot leak into the caller's frame.
        data = data.copy()
        data.loc[iceland_spike, ['Deaths', 'Death rate']] = 0
    elif kind == 'deaths':
        # Explicit copy: we assign into the filtered frame next.
        data = data.loc[~iceland_spike].copy()
        # Rebuild the mask against the filtered frame so it lines up row for row.
        iceland = data['Country/Region'] == 'Iceland'
        min_iceland_date = data.loc[iceland, 'Date'].min()
        data.loc[iceland, 'Days'] = (data.loc[iceland, 'Date'] - min_iceland_date).dt.days

    catalonia = data['location_id'] == 60368
    catalonia_spike = catalonia & (data['Date'] >= pd.Timestamp('2020-05-21'))
    return data.loc[~catalonia_spike]

def get_locations(location_set_version_id):
    """Build the table of most-detailed modelling locations with parent names.

    Pulls location set 111 at the given version, keeps only rows flagged
    ``most_detailed == 1``, and attaches each row's parent name.

    Returns a DataFrame with columns 'location_id', 'Location',
    'Country/Region' (the parent location's ascii name) and 'level'.
    """
    hierarchy = get_location_metadata(location_set_id=111,
                                      location_set_version_id=location_set_version_id)

    # Lookup of parent id -> parent name, taken from the full hierarchy.
    parent_names = hierarchy[['location_id', 'location_ascii_name']].rename(
        columns={'location_id': 'parent_id',
                 'location_ascii_name': 'Country/Region'}
    )

    # Keep only the most detailed locations. (A filter excluding US locations
    # used to live here but is disabled, so US locations are retained.)
    is_most_detailed = hierarchy['most_detailed'] == 1
    modeled = hierarchy.loc[
        is_most_detailed,
        ['location_id', 'location_ascii_name', 'parent_id', 'level', 'most_detailed'],
    ].rename(columns={'location_ascii_name': 'Location'})

    # 'parent_id' is the only column the two frames share.
    with_parents = modeled.merge(parent_names, on='parent_id')

    return with_parents[['location_id', 'Location', 'Country/Region', 'level']]

In [31]:
loc_df = get_locations(LOCATION_SET_VERSION)

# Load each dataset, then apply the manual outlier corrections.
input_full_df = inputs.load(MEASURES.full_data)
input_full_df = filter_data(input_full_df)
input_death_df = inputs.load(MEASURES.deaths)
input_death_df = filter_data(input_death_df, kind='deaths')

# Validation run: keep only rows dated on or before VALIDATION_DATE.
input_full_df = input_full_df.loc[lambda frame: frame['Date'] <= VALIDATION_DATE]
input_death_df = input_death_df.loc[lambda frame: frame['Date'] <= VALIDATION_DATE]

input_age_pop_df = inputs.load(MEASURES.age_pop)
input_age_death_df = inputs.load(MEASURES.age_death)

# Smoothing runs on the truncated data.
smoothed_case_df, smoothed_death_df = runner.get_smoothed(input_full_df)


2020-06-02 16:25:58.178 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading full_data.csv from /ihme/covid-19/model-inputs/2020_05_29.01.
2020-06-02 16:25:58.227 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading deaths.csv from /ihme/covid-19/model-inputs/2020_05_29.01.
2020-06-02 16:25:58.263 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading age_pop.csv from /ihme/covid-19/model-inputs/2020_05_29.01.
2020-06-02 16:25:58.270 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading age_death.csv from /ihme/covid-19/model-inputs/2020_05_29.01.


In [32]:
# Ad-hoc inspection helpers, left disabled:
#input_full_df[input_full_df['Date'] <= VALIDATION_DATE].head(50)
#input_death_df[input_death_df['location_id'] == 60382]
# Display the location table returned by get_locations.
loc_df

Unnamed: 0,location_id,Location,Country/Region,level
0,75,Austria,Austria,0
1,76,Belgium,Belgium,0
2,45,Bulgaria,Bulgaria,0
3,46,Croatia,Croatia,0
4,77,Cyprus,Cyprus,0
...,...,...,...,...
130,572,Wisconsin,United States of America,1
131,573,Wyoming,United States of America,1
132,60886,King and Snohomish Counties,Washington,2
133,3539,Spokane County,Washington,2


In [None]:
# Export the smoothed case series used by the visualisation tooling.
viz_case_columns = [COLUMNS.location_id, COLUMNS.date, 'ln(case rate)', 'population']
smoothed_case_df[viz_case_columns].to_csv(f'{OUTPUT_DIR}/smoothed_cases.csv', index=False)

# Save pops for Bobby.
pop_df = input_age_pop_df.merge(loc_df).reset_index(drop=True)
pop_columns = ['location_id', 'Location', 'age_group', 'population']
pop_df[pop_columns].to_csv(f'{OUTPUT_DIR}/pops.csv', index=False)

# Persist every prepared input so later cells can reload it from the checkpoint.
for checkpoint_key, checkpoint_value in (
    ('location', loc_df),
    ('full_data', input_full_df),
    ('deaths', input_death_df),
    ('smoothed_cases', smoothed_case_df),
    ('smoothed_deaths', smoothed_death_df),
    ('age_pop', input_age_pop_df),
    ('age_death', input_age_death_df),
):
    checkpoint.write(checkpoint_key, checkpoint_value)

In [None]:
# Can be deleted, I just want to check values
#loc_df.tail()


## prepare data for case-to-death

In [None]:
%%time
# Reload the prepared inputs from the checkpoint (same order as before).
full_df, death_df, age_pop_df, age_death_df = (
    checkpoint.load(key) for key in ('full_data', 'deaths', 'age_pop', 'age_death')
)

backcast_location_ids = runner.get_backcast_location_ids(full_df, most_detailed=False)
cases_and_backcast_deaths_df = runner.make_cases_and_backcast_deaths(
    full_df,
    death_df,
    age_pop_df,
    age_death_df,
    backcast_location_ids,
    subnat=False,
)

# Keep a CSV copy alongside the checkpoint entry.
cases_and_backcast_deaths_df.to_csv(f'{OUTPUT_DIR}/backcast_for_case_to_death.csv', index=False)
checkpoint.write('cases_and_backcast_deaths', cases_and_backcast_deaths_df)

## Impute death thresholds.

In [14]:
%%time
cases_and_backcast_deaths_df = checkpoint.load('cases_and_backcast_deaths')
loc_df = checkpoint.load('location')
# Read the death data from the checkpoint rather than the in-memory
# `input_death_df`, so this cell also works when resuming in a fresh kernel.
# The 'deaths' checkpoint entry is written from `input_death_df`.
death_df = checkpoint.load('deaths')

# loc_df needs to be updated with locations that have death data
model_run_locs = death_df['location_id'].unique()
has_death_data = cases_and_backcast_deaths_df['location_id'].isin(model_run_locs)
cases_and_backcast_deaths_df = cases_and_backcast_deaths_df[has_death_data]
loc_df = loc_df[loc_df['location_id'].isin(model_run_locs)]

# Overwrite the stored location table so downstream cells only see modelled locations.
checkpoint.write('location', loc_df)

2020-06-02 13:53:26.576 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading cases_and_backcast_deaths from in memory cache.
2020-06-02 13:53:26.578 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading location from in memory cache.


CPU times: user 9.54 ms, sys: 3.38 ms, total: 12.9 ms
Wall time: 11.4 ms


In [15]:
# Impute the death-threshold dates for the modelled locations, then store the
# result both as a CSV and in the checkpoint.
threshold_dates = runner.impute_death_threshold(cases_and_backcast_deaths_df, loc_df)

threshold_dates_path = f'{OUTPUT_DIR}/threshold_dates.csv'
threshold_dates.to_csv(threshold_dates_path, index=False)
checkpoint.write('threshold_dates', threshold_dates)

100%|██████████| 139/139 [00:07<00:00, 18.31it/s]


## Make last day data

In [16]:
smoothed_death_df, threshold_dates = (
    checkpoint.load(key) for key in ('smoothed_deaths', 'threshold_dates')
)

# Derive the date-mean table from the threshold dates, then the last-day table
# from the smoothed deaths plus that date-mean table.
date_mean_df = runner.make_date_mean_df(threshold_dates)
last_day_df = runner.make_last_day_df(smoothed_death_df, date_mean_df)

last_day_path = f'{OUTPUT_DIR}/last_day.csv'
last_day_df.to_csv(last_day_path, index=False)

checkpoint.write('date_mean', date_mean_df)
checkpoint.write('last_day', last_day_df)

2020-06-02 13:54:16.597 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading smoothed_deaths from in memory cache.
2020-06-02 13:54:16.599 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading threshold_dates from in memory cache.


## get leading indicator

In [17]:
full_df, loc_df = (checkpoint.load(key) for key in ('full_data', 'location'))

# Restrict the full dataset to the locations being modelled.
modeled_location_ids = loc_df[COLUMNS.location_id].to_list()
df_to_run = full_df.loc[full_df[COLUMNS.location_id].isin(modeled_location_ids)]

dcr_df, dhr_df, leading_indicator_df = runner.make_leading_indicator(df_to_run, SNAPSHOT_VERSION)

# Write all three outputs in full before trimming the leading indicator.
for output_frame, output_path in (
    (dcr_df, f'{OUTPUT_DIR}/lagged_death_to_case_ratios.csv'),
    (dhr_df, f'{OUTPUT_DIR}/lagged_death_to_hosp_ratios.csv'),
    (leading_indicator_df, f'{OUTPUT_DIR}/leading_indicator.csv'),
):
    output_frame.to_csv(output_path, index=False)

# The checkpoint keeps only id, date and rate, and only rows where the rate is present.
rate_present = leading_indicator_df[COLUMNS.ln_age_death_rate].notnull()
leading_indicator_df = leading_indicator_df.loc[
    rate_present,
    [COLUMNS.location_id, COLUMNS.date, COLUMNS.ln_age_death_rate],
]

checkpoint.write('leading_indicator', leading_indicator_df)

2020-06-02 13:54:21.417 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading full_data from in memory cache.
2020-06-02 13:54:21.419 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading location from in memory cache.


## store model data and covariate data, submit models

In [18]:
# Reload every stage output from the checkpoint. full_df and last_day_df are
# loaded here but are not passed to submit_models below.
(full_df, death_df, age_pop_df, age_death_df,
 date_mean_df, last_day_df, leading_indicator_df, loc_df) = (
    checkpoint.load(key)
    for key in ('full_data', 'deaths', 'age_pop', 'age_death',
                'date_mean', 'last_day', 'leading_indicator', 'location')
)

# Arguments are positional; the order must match runner.submit_models.
submodel_dict = runner.submit_models(
    death_df,
    age_pop_df,
    age_death_df,
    date_mean_df,
    leading_indicator_df,
    loc_df,
    r0_locs,
    PEAK_FILE,
    OUTPUT_DIR,
    SNAPSHOT_VERSION,
    MODEL_INPUTS_VERSION,
    R0_FILE,
    CODE_DIR,
    NO_PSEUDO,
)

checkpoint.write('submodel_dict', submodel_dict)


2020-06-02 13:54:40.197 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading full_data from in memory cache.
2020-06-02 13:54:40.200 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading deaths from in memory cache.
2020-06-02 13:54:40.202 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading age_pop from in memory cache.
2020-06-02 13:54:40.203 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading age_death from in memory cache.
2020-06-02 13:54:40.205 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading date_mean from in memory cache.
2020-06-02 13:54:40.206 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading last_day from in memory cache.
2020-06-02 13:54:40.207 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading leading_indicator from in memory cache.
2020-06-02 13:54:40.209 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading location from 

KeyboardInterrupt: 

## compile draws


In [None]:
smoothed_death_df, age_pop_df, threshold_dates, submodel_dict, loc_df = (
    checkpoint.load(key)
    for key in ('smoothed_deaths', 'age_pop', 'threshold_dates', 'submodel_dict', 'location')
)

# Observed data comes from the smoothed deaths (not full_df), restricted to the
# modelled locations.
is_modeled = smoothed_death_df.location_id.isin(loc_df.location_id)
obs_df = smoothed_death_df[is_modeled]

(draw_dfs,
 past_draw_dfs,
 models_used,
 days,
 ensemble_draws_dfs,
 failed_locs) = runner.compile_draws(loc_df, submodel_dict, obs_df, threshold_dates, age_pop_df)

# Stop early if no location-specific model contributed any draws.
if 'location' not in models_used:
    raise ValueError('No location-specific draws used, must be using wrong tag')

draw_df = pd.concat(draw_dfs)

# Drop locations reported as failed before pairing names with models_used.
succeeded = ~loc_df.location_id.isin(failed_locs)
loc_df = loc_df.loc[succeeded]
model_type_df = pd.DataFrame({
    'location': loc_df['Location'].tolist(),
    'model_used': models_used,
})

# write
draw_df.to_csv(smooth_draw_path, index=False)
model_type_df.to_csv(f'{OUTPUT_DIR}/state_models_used.csv', index=False)
ensemble_plot_path = runner.make_and_save_draw_plots(
    OUTPUT_DIR, loc_df, ensemble_draws_dfs, days, models_used, age_pop_df
)
print(ensemble_plot_path)
checkpoint.write('draw_data', draw_df)
checkpoint.write('failed_locations', failed_locs)


In [None]:
# Display the failed locations returned by runner.compile_draws.
failed_locs

#### store deaths with true past

In [None]:
# Reload everything from the checkpoint, including the failed-location list
# stored by the compile-draws cell, so this cell can run in a fresh kernel.
raw_df = checkpoint.load('full_data')
loc_df = checkpoint.load('location')
failed_locs = checkpoint.load('failed_locations')

loc_df = loc_df.loc[~loc_df.location_id.isin(failed_locs)]

# Filter first and take an explicit copy: the column added below must not be
# written onto the frame returned by the checkpoint, nor onto a slice of it.
kept_location_ids = loc_df['location_id'].to_list()
raw_df = raw_df.loc[raw_df['location_id'].isin(kept_location_ids)].copy()

# Location label: 'Province/State' where present, otherwise 'Country/Region'.
raw_df['Location'] = raw_df['Province/State']
missing_location = raw_df['Location'].isnull()
raw_df.loc[missing_location, 'Location'] = raw_df.loc[missing_location, 'Country/Region']

runner.swap_observed(OUTPUT_DIR, smooth_draw_path, raw_draw_path, raw_df)


## combine with previous predictions

In [None]:
# full_df = checkpoint.load('full_data')
# #avg_df = runner.average_draws(smooth_draw_path, yesterday_draw_path, before_yesterday_draw_path, past_avg_window=10)
# #avg_df.to_csv(average_draw_path, index=False)
# compare_average_plot_path = runner.make_and_save_compare_average_plots(OUTPUT_DIR,
#                                                                        smooth_draw_path,
#                                                                        smooth_draw_path,
#                                                                        yesterday_draw_path,
#                                                                        before_yesterday_draw_path,
#                                                                        full_df,
#                                                                        'Not United States of America')

In [None]:
# compare_to_previous_plot_path = runner.make_and_save_compare_to_previous_plots(OUTPUT_DIR,
#                                                                                smooth_draw_path,
#                                                                                compare_average_path,
#                                                                                "Not US")

In [None]:
# viz_dir = runner.send_plots_to_diagnostics(DATESTAMP_LABEL,
#                                            f'{OUTPUT_DIR}/ensemble_plot.pdf',
#                                            compare_average_plot_path,
#                                            compare_to_previous_plot_path)
# print(viz_dir)

## store point estimates, and peaks derived from them

In [None]:
# Reload the inputs for the point-estimate / peak export from the checkpoint.
loc_df, submodel_dict, draw_df, age_pop_df = (
    checkpoint.load(key) for key in ('location', 'submodel_dict', 'draw_data', 'age_pop')
)

runner.save_points_and_peaks(
    loc_df,
    submodel_dict,
    draw_df,
    age_pop_df,
    OUTPUT_DIR,
)

In [22]:
# Something from Mark
# Counts how many times the substring 'curve' appears in the `qstat` output.
# NOTE(review): presumably a proxy for the number of curve-fit jobs still
# queued or running -- confirm against the job naming used by submit_models.
import subprocess as sub
import time as time

length = sub.getoutput('qstat').count('curve')
print(length)
# Disabled polling loop: would re-check every 30 seconds until the count is 0.
#     while length != 0:
#         time.sleep(30)
#         length = sub.getoutput('qstat').count('curve')
#         print(length)

77
