In [1]:
import os
import warnings

from db_queries import get_location_metadata
import numpy as np
import pandas as pd
import yaml
import re

from covid_model_deaths import runner
from covid_model_deaths.deaths_io import InputsContext, MEASURES, Checkpoint
from covid_model_deaths.globals import COLUMNS

pd.options.display.max_rows = 99
pd.options.display.max_columns = 99
warnings.simplefilter('ignore')

In [2]:
DATA_DATE = "2020_05_29" # Date for all data used
# Completed these "2020-03-27", "2020-04-03", "2020-04-10",
# "2020-04-17" breaks, do the rest?
# date_list = ["2020-04-24","2020-05-01","2020-05-08","2020-05-15","2020-05-22","2020-05-29"]
# date_list = ["2020-03-27", "2020-04-03", "2020-04-10", "2020-04-17"]
date_list = ["2020-03-31","2020-04-03", "2020-04-10"] #, 
date_list = ["2020-04-24","2020-05-01","2020-05-08","2020-05-15","2020-05-22","2020-05-29"]
# "2020-03-27" breaks ...

In [3]:
for day in date_list:
    VALIDATION_DATE = day # Date to use data upto and including

    RUN_TYPE = 'validation'
    MODEL_INPUTS_VERSION = 'production-runs/' + DATA_DATE
    SNAPSHOT_VERSION = 'production-runs/' + DATA_DATE
    DATESTAMP_LABEL = '2020_05_23_Europe' # Will want to change this.

    PEAK_FILE = '/ihme/covid-19/deaths/mobility_inputs/2020_04_20/peak_locs_april20_.csv'
    PEAK_DURATION_FILE = None
    R0_FILE = None
    LOCATION_SET_VERSION = 678
    r0_locs = []
    # Locations where no pseudo data is used
    NO_PSEUDO = [
        564, # South Dakota
        538, # Iowa
        # Mexican subnationals
        4644, 4657, 4651, 4663, 4665, 4667, 4669
    ]

    VALIDATION_FOLDER = re.sub("-", "_", VALIDATION_DATE)
    CODE_DIR = os.path.abspath('../src/covid_model_deaths')
    OUTPUT_DIR = f'/ihme/covid-19/deaths/{RUN_TYPE}/{DATA_DATE}/{VALIDATION_FOLDER}'
    if not os.path.exists(f'/ihme/covid-19/deaths/{RUN_TYPE}/{DATA_DATE}'):
        os.mkdir(f'/ihme/covid-19/deaths/{RUN_TYPE}/{DATA_DATE}')
    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)
    inputs = InputsContext(f'/ihme/covid-19/model-inputs/{MODEL_INPUTS_VERSION}')
    checkpoint = Checkpoint(OUTPUT_DIR)

    smooth_draw_path = f'{OUTPUT_DIR}/smoothed_euro_data.csv'
    raw_draw_path = f'{OUTPUT_DIR}/euro_data.csv'
    average_draw_path = f'{OUTPUT_DIR}/past_avg_smoothed_euro_data.csv'
    yesterday_draw_path = '/ihme/covid-19/deaths/prod/2020_05_22_Europe/smoothed_euro_data.csv'
    before_yesterday_draw_path = '/ihme/covid-19/deaths/prod/2020_05_19_Europe/smoothed_euro_data.csv'
    compare_average_path = '/ihme/covid-19/deaths/prod/2020_05_22_Europe/smoothed_euro_data.csv'

    print(f'Writing to {OUTPUT_DIR}')
    print(CODE_DIR)
    print(checkpoint)

    metadata = {}
    with open(f'/ihme/covid-19/model-inputs/{MODEL_INPUTS_VERSION}/metadata.yaml') as f:
        metadata['inputs_version'] = yaml.full_load(f)

    metadata['run_type'] = RUN_TYPE
    metadata['model_inputs_version'] = MODEL_INPUTS_VERSION
    metadata['snapshot_version'] = SNAPSHOT_VERSION
    metadata['datestamp_label'] = DATESTAMP_LABEL
    metadata['peak_file'] = PEAK_FILE
    metadata['location_set_version_id'] = LOCATION_SET_VERSION
    metadata['output_dir'] = OUTPUT_DIR
    metadata['no_pseudo'] = NO_PSEUDO
    metadata['average'] = {
        'yesterday': yesterday_draw_path,
        'before_yesterday': before_yesterday_draw_path
    }
    metadata['compare_average'] = compare_average_path

    with open(f'{OUTPUT_DIR}/metadata.yaml', 'w') as f:
        yaml.dump(metadata, f)

    def filter_data(data: pd.DataFrame, kind='full') -> pd.DataFrame:
        # manually adjust Iceland spike (0 deaths to 5 deaths to 0 deaths in March...)
        iceland = data['Country/Region'] == 'Iceland'
        iceland_spike = iceland & (data['Date'] == pd.Timestamp('2020-03-15'))
        if kind == 'full':
            data.loc[iceland_spike, ['Deaths', 'Death rate']] = 0
        elif kind == 'deaths':
            data = data.loc[~iceland_spike]
            min_iceland_date = data.loc[iceland, 'Date'].min()
            data.loc[iceland, 'Days'] = (data.loc[iceland, 'Date'] - min_iceland_date).dt.days

        catalonia  = data['location_id'] == 60368
        catalonia_spike = catalonia & (data['Date'] >= pd.Timestamp('2020-05-21'))
        data = data[~catalonia_spike]

        return data

    def get_locations(location_set_version_id):
        # get locaton_ids
        loc_df = get_location_metadata(location_set_id=111,
                                       location_set_version_id=location_set_version_id)

        # Drop any locations in the US and keep only most detailed for modeling
        most_detailed = loc_df['most_detailed'] == 1
        # non_us = ~loc_df['path_to_top_parent'].str.startswith('102,')
        keep_columns = ['location_id', 'location_ascii_name', 'parent_id', 'level', 'most_detailed']

        euro_df = loc_df.loc[most_detailed, keep_columns]
        euro_df = euro_df.rename(columns={'location_ascii_name':'Location'})

        # Add parents
        loc_df = loc_df[['location_id', 'location_ascii_name']]
        loc_df = loc_df.rename(columns={'location_id':'parent_id',
                                        'location_ascii_name':'Country/Region'})
        euro_df = euro_df.merge(loc_df)

        euro_df = euro_df.loc[:, ['location_id', 'Location', 'Country/Region', 'level']]
        return euro_df # don't like the name but probably easier to NOT change it.

    loc_df = get_locations(LOCATION_SET_VERSION)
    input_full_df = filter_data(inputs.load(MEASURES.full_data))
    input_death_df = filter_data(inputs.load(MEASURES.deaths), kind='deaths')

    # Subset to just dates for validation run
    input_full_df = input_full_df[input_full_df['Date'] <= VALIDATION_DATE]
    input_death_df = input_death_df[input_death_df['Date'] <= VALIDATION_DATE]

    input_age_pop_df = inputs.load(MEASURES.age_pop)
    input_age_death_df = inputs.load(MEASURES.age_death)
    smoothed_case_df, smoothed_death_df = runner.get_smoothed(input_full_df)

    # save cases for viz
    smoothed_case_df[[COLUMNS.location_id, COLUMNS.date, 'ln(case rate)', 'population']].to_csv(
        f'{OUTPUT_DIR}/smoothed_cases.csv', index=False
    )

    # Save pops for Bobby.
    pop_df = input_age_pop_df.merge(loc_df).reset_index(drop=True)
    pop_df[['location_id', 'Location', 'age_group', 'population']].to_csv(f'{OUTPUT_DIR}/pops.csv', index=False)

    checkpoint.write('location', loc_df)
    checkpoint.write('full_data', input_full_df)
    checkpoint.write('deaths', input_death_df)
    checkpoint.write('smoothed_cases', smoothed_case_df)
    checkpoint.write('smoothed_deaths', smoothed_death_df)
    checkpoint.write('age_pop', input_age_pop_df)
    checkpoint.write('age_death', input_age_death_df)

    #%%time
    full_df = checkpoint.load('full_data')
    death_df = checkpoint.load('deaths')
    age_pop_df = checkpoint.load('age_pop')
    age_death_df = checkpoint.load('age_death')

    full_df.to_csv(f'{OUTPUT_DIR}/full_df.csv', index=False)
    death_df.to_csv(f'{OUTPUT_DIR}/death_df.csv', index=False)
    
    backcast_location_ids = runner.get_backcast_location_ids(full_df, most_detailed=False)
    cases_and_backcast_deaths_df = runner.make_cases_and_backcast_deaths(full_df, death_df, age_pop_df, 
                                                                         age_death_df, backcast_location_ids, 
                                                                         subnat=False)

    cases_and_backcast_deaths_df.to_csv(f'{OUTPUT_DIR}/backcast_for_case_to_death.csv', index=False)
    checkpoint.write('cases_and_backcast_deaths', cases_and_backcast_deaths_df)
    
    #%%time
    cases_and_backcast_deaths_df = checkpoint.load('cases_and_backcast_deaths')
    loc_df = checkpoint.load('location')

    # loc_df needs to be updated with locations that have death data
    # must have > 0/NaN deaths , seems to also need case data
    #model_run_locs = cases_and_backcast_deaths_df[cases_and_backcast_deaths_df['Deaths'].notnull() & cases_and_backcast_deaths_df['Confirmed case rate'].notnull()]
    model_run_locs = cases_and_backcast_deaths_df
    model_run_locs.to_csv(f'{OUTPUT_DIR}/model_locations.csv', index=False)
    model_run_locs = model_run_locs['location_id'].unique()
    
    stop

2020-06-08 11:23:48.255 | DEBUG    | covid_model_deaths.deaths_io.checkpoint:_setup_checkpoint_dir:45 - Making checkpoint directory at /ihme/covid-19/deaths/validation/2020_05_29/2020_04_24/checkpoint


Writing to /ihme/covid-19/deaths/validation/2020_05_29/2020_04_24
/ihme/code/covid-19/user/ctroeger/covid-model-deaths/src/src/covid_model_deaths
Checkpoint(/ihme/covid-19/deaths/validation/2020_05_29/2020_04_24/checkpoint)


2020-06-08 11:23:48.491 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading full_data.csv from /ihme/covid-19/model-inputs/2020_05_29.01.
2020-06-08 11:23:48.547 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading deaths.csv from /ihme/covid-19/model-inputs/2020_05_29.01.
2020-06-08 11:23:48.586 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading age_pop.csv from /ihme/covid-19/model-inputs/2020_05_29.01.
2020-06-08 11:23:48.594 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading age_death.csv from /ihme/covid-19/model-inputs/2020_05_29.01.
2020-06-08 11:24:03.173 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading full_data from in memory cache.
2020-06-08 11:24:03.176 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading deaths from in memory cache.
2020-06-08 11:24:03.177 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading age_pop from in memory cache.
2020-06-08 11:24:

NameError: name 'stop' is not defined

In [1]:
df = cases_and_backcast_deaths_df
df = df[df['location_id'] > 0]
t = df.groupby(df['location_id']).mean().dropna()
model_run_locs = t.index.values


NameError: name 'cases_and_backcast_deaths_df' is not defined

In [35]:
model_run_locs = cases_and_backcast_deaths_df[cases_and_backcast_deaths_df['Deaths'].notnull() & cases_and_backcast_deaths_df['Confirmed case rate'].notnull()]
cases_and_backcast_deaths_df[cases_and_backcast_deaths_df['location_id'] == 80].head()

Unnamed: 0,location_id,Province/State,Country/Region,Date,Confirmed,Confirmed case rate,Deaths,Death rate,population
3276,80,France,France,2020-03-01,191.0,3e-06,,,
3277,80,France,France,2020-03-02,212.0,3e-06,,,
3278,80,France,France,2020-03-03,285.0,4e-06,,,
3279,80,France,France,2020-03-04,423.0,6e-06,,,
3280,80,France,France,2020-03-05,613.0,9e-06,,,
