# Code to run Curvefit across all locations for model validation exercise
## All models will be run in a loop

In [1]:
import os
import re
import subprocess as sub
import time
import warnings

from db_queries import get_location_metadata
import numpy as np
import pandas as pd
import yaml

from covid_model_deaths import runner
from covid_model_deaths.deaths_io import InputsContext, MEASURES, Checkpoint
from covid_model_deaths.globals import COLUMNS

pd.options.display.max_rows = 99
pd.options.display.max_columns = 99
warnings.simplefilter('ignore')

## Set dates for loop

In [2]:
DATA_DATE = "2020_06_26" # Date for all data used

# Validation cut-off dates to loop over ('YYYY-MM-DD' strings).
#
# This cell used to assign date_list several times in a row; every assignment
# except the last was immediately overwritten, so only the final list was ever
# used. The earlier batches are kept here for reference only:
#   ["2020-04-24","2020-05-01","2020-05-08","2020-05-15","2020-05-22","2020-05-29"]
#   ["2020-03-27", "2020-04-03", "2020-04-10", "2020-04-17"]
#   ["2020-03-31","2020-03-31","2020-04-03", "2020-04-10", "2020-04-17","2020-04-24","2020-05-01"]
#   ["2020-03-27","2020-03-31","2020-04-03", "2020-04-10", "2020-04-17","2020-04-24","2020-05-01"]
#   ["2020-04-24","2020-05-01","2020-05-08","2020-05-15"]
#   ["2020-05-22","2020-05-29","2020-06-05","2020-06-12","2020-06-19"]
#
# Notes carried over from earlier runs:
#   - Completed: "2020-03-27", "2020-04-03", "2020-04-10"
#   - "2020-04-17" breaks, do the rest?
#   - "2020-03-27", "2020-04-17", breaks ...
date_list = ["2020-04-24","2020-05-01","2020-05-04","2020-05-08","2020-05-15","2020-06-01","2020-06-26"]

In [None]:
for day in date_list:
    # One complete validation run per cut-off date. This first section sets the
    # run configuration, creates the output folder, opens the inputs/checkpoint
    # handles and writes a metadata.yaml describing the run.
    VALIDATION_DATE = day # Date to use data upto and including

    RUN_TYPE = 'validation'
    MODEL_INPUTS_VERSION = 'production-runs/' + DATA_DATE
    SNAPSHOT_VERSION = 'production-runs/' + DATA_DATE
    DATESTAMP_LABEL = '2020_05_23_Europe' # Will want to change this.

    PEAK_FILE = '/ihme/covid-19/deaths/mobility_inputs/2020_04_20/peak_locs_april20_.csv'
    PEAK_DURATION_FILE = None
    R0_FILE = None
    LOCATION_SET_VERSION = 720
    r0_locs = []
    # Locations where no pseudo data is used
    NO_PSEUDO = [
        564, # South Dakota
        538, # Iowa
        # Mexican subnationals
        4644, 4657, 4651, 4663, 4665, 4667, 4669
    ]

    # Output folder name is the validation date with dashes turned into underscores.
    VALIDATION_FOLDER = re.sub("-", "_", VALIDATION_DATE)
    CODE_DIR = os.path.abspath('../src/covid_model_deaths')
    OUTPUT_DIR = f'/ihme/covid-19/deaths/{RUN_TYPE}/{DATA_DATE}/{VALIDATION_FOLDER}'
    # makedirs creates every missing parent (including the per-DATA_DATE folder)
    # and exist_ok=True makes re-runs a no-op without a check-then-create race.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    inputs = InputsContext(f'/ihme/covid-19/model-inputs/{MODEL_INPUTS_VERSION}')
    checkpoint = Checkpoint(OUTPUT_DIR)

    # Draw file locations: the first three live in this run's output folder,
    # the remaining three point at fixed earlier production runs.
    smooth_draw_path = f'{OUTPUT_DIR}/smoothed_euro_data.csv'
    raw_draw_path = f'{OUTPUT_DIR}/euro_data.csv'
    average_draw_path = f'{OUTPUT_DIR}/past_avg_smoothed_euro_data.csv'
    yesterday_draw_path = '/ihme/covid-19/deaths/prod/2020_05_22_Europe/smoothed_euro_data.csv'
    before_yesterday_draw_path = '/ihme/covid-19/deaths/prod/2020_05_19_Europe/smoothed_euro_data.csv'
    compare_average_path = '/ihme/covid-19/deaths/prod/2020_05_22_Europe/smoothed_euro_data.csv'

    print(f'Writing to {OUTPUT_DIR}')
    print(CODE_DIR)
    print(checkpoint)

    # Record the configuration of this run next to its outputs. The metadata of
    # the model-inputs version is embedded under 'inputs_version'.
    metadata = {}
    with open(f'/ihme/covid-19/model-inputs/{MODEL_INPUTS_VERSION}/metadata.yaml') as f:
        metadata['inputs_version'] = yaml.full_load(f)

    metadata['run_type'] = RUN_TYPE
    metadata['model_inputs_version'] = MODEL_INPUTS_VERSION
    metadata['snapshot_version'] = SNAPSHOT_VERSION
    metadata['datestamp_label'] = DATESTAMP_LABEL
    metadata['peak_file'] = PEAK_FILE
    metadata['location_set_version_id'] = LOCATION_SET_VERSION
    metadata['output_dir'] = OUTPUT_DIR
    metadata['no_pseudo'] = NO_PSEUDO
    metadata['average'] = {
        'yesterday': yesterday_draw_path,
        'before_yesterday': before_yesterday_draw_path
    }
    metadata['compare_average'] = compare_average_path

    with open(f'{OUTPUT_DIR}/metadata.yaml', 'w') as f:
        yaml.dump(metadata, f)

    def filter_data(data: pd.DataFrame, kind='full') -> pd.DataFrame:
        """Return a copy of ``data`` with the manual Iceland and Catalonia fixes applied.

        The caller's frame is never modified; all edits happen on a copy.

        Parameters
        ----------
        data
            Frame with at least the columns 'Country/Region', 'Date' and
            'location_id'. 'Deaths' and 'Death rate' are written when
            ``kind='full'``; 'Days' is written when ``kind='deaths'``.
        kind
            'full'   -> zero out 'Deaths' and 'Death rate' on the Iceland
                        2020-03-15 row(s).
            'deaths' -> drop the Iceland 2020-03-15 row(s) and recompute 'Days'
                        for Iceland as days since its earliest remaining date.
            Any other value skips the Iceland fix.

        Returns
        -------
        pd.DataFrame
            The adjusted frame, additionally without rows for location_id 60368
            dated 2020-05-21 or later.
        """
        # manually adjust Iceland spike (0 deaths to 5 deaths to 0 deaths in March...)
        iceland = data['Country/Region'] == 'Iceland'
        iceland_spike = iceland & (data['Date'] == pd.Timestamp('2020-03-15'))
        if kind == 'full':
            # Copy first: assigning through .loc would otherwise overwrite the
            # caller's frame.
            data = data.copy()
            data.loc[iceland_spike, ['Deaths', 'Death rate']] = 0
        elif kind == 'deaths':
            # Explicit copy so the assignment below writes to an independent
            # frame rather than to a slice of the input.
            data = data.loc[~iceland_spike].copy()
            # Rebuild the mask on the filtered frame so it lines up row for row.
            iceland = data['Country/Region'] == 'Iceland'
            min_iceland_date = data.loc[iceland, 'Date'].min()
            data.loc[iceland, 'Days'] = (data.loc[iceland, 'Date'] - min_iceland_date).dt.days

        # Drop location 60368 (named "catalonia" here) from 2020-05-21 onwards.
        catalonia = data['location_id'] == 60368
        catalonia_spike = catalonia & (data['Date'] >= pd.Timestamp('2020-05-21'))
        data = data[~catalonia_spike]

        return data

    def get_locations(location_set_version_id):
        """Build the table of most-detailed locations with their parent's name.

        Queries ``get_location_metadata`` for location set 111 at the given
        version, keeps rows flagged ``most_detailed == 1`` and attaches the
        ascii name of each row's parent as 'Country/Region'.

        Returns a frame with columns
        ['location_id', 'Location', 'Country/Region', 'level'].
        """
        hierarchy = get_location_metadata(location_set_id=111,
                                          location_set_version_id=location_set_version_id)

        # Only most-detailed locations are modelled. (A filter that excluded
        # locations under parent path '102,' used to exist but is disabled.)
        leaf_columns = ['location_id', 'location_ascii_name', 'parent_id', 'level', 'most_detailed']
        is_leaf = hierarchy['most_detailed'] == 1
        leaves = hierarchy.loc[is_leaf, leaf_columns].rename(
            columns={'location_ascii_name': 'Location'}
        )

        # Lookup of every location's name, keyed as a potential parent.
        parents = hierarchy[['location_id', 'location_ascii_name']].rename(
            columns={'location_id': 'parent_id',
                     'location_ascii_name': 'Country/Region'}
        )

        # The only column shared by both frames is 'parent_id', so the default
        # merge joins on it.
        with_parent = leaves.merge(parents)

        return with_parent.loc[:, ['location_id', 'Location', 'Country/Region', 'level']]

    # Build the location table and load the case/death inputs, applying the
    # manual fixes from filter_data to both death-related frames.
    loc_df = get_locations(LOCATION_SET_VERSION)
    input_full_df = filter_data(inputs.load(MEASURES.full_data))
    input_death_df = filter_data(inputs.load(MEASURES.deaths), kind='deaths')

    # Subset to just dates for validation run
    # (VALIDATION_DATE is a 'YYYY-MM-DD' string compared against the 'Date' column.)
    input_full_df = input_full_df[input_full_df['Date'] <= VALIDATION_DATE]
    input_death_df = input_death_df[input_death_df['Date'] <= VALIDATION_DATE]

    # Age inputs are loaded as-is; they are not subset by date here.
    input_age_pop_df = inputs.load(MEASURES.age_pop)
    input_age_death_df = inputs.load(MEASURES.age_death)
    smoothed_case_df, smoothed_death_df = runner.get_smoothed(input_full_df)

    # save cases for viz
    smoothed_case_df[[COLUMNS.location_id, COLUMNS.date, 'ln(case rate)', 'population']].to_csv(
        f'{OUTPUT_DIR}/smoothed_cases.csv', index=False
    )

    # Save pops for Bobby.
    # merge() has no explicit key, so it joins on whatever columns the two frames share.
    pop_df = input_age_pop_df.merge(loc_df).reset_index(drop=True)
    pop_df[['location_id', 'Location', 'age_group', 'population']].to_csv(f'{OUTPUT_DIR}/pops.csv', index=False)

    # Store every prepared input under a checkpoint key; later sections read
    # them back with checkpoint.load(<key>).
    checkpoint.write('location', loc_df)
    checkpoint.write('full_data', input_full_df)
    checkpoint.write('deaths', input_death_df)
    checkpoint.write('smoothed_cases', smoothed_case_df)
    checkpoint.write('smoothed_deaths', smoothed_death_df)
    checkpoint.write('age_pop', input_age_pop_df)
    checkpoint.write('age_death', input_age_death_df)

    #%%time
    # Reload the date-subset inputs from the checkpoint and also dump the two
    # main frames as CSV in the output folder.
    full_df = checkpoint.load('full_data')
    death_df = checkpoint.load('deaths')
    age_pop_df = checkpoint.load('age_pop')
    age_death_df = checkpoint.load('age_death')

    full_df.to_csv(f'{OUTPUT_DIR}/full_df.csv', index=False)
    death_df.to_csv(f'{OUTPUT_DIR}/death_df.csv', index=False)
    
    # Build the combined cases / backcast-deaths frame via the runner helpers.
    # NOTE(review): the meaning of most_detailed=False / subnat=False is defined
    # in covid_model_deaths.runner, not visible here.
    backcast_location_ids = runner.get_backcast_location_ids(full_df, most_detailed=False)
    cases_and_backcast_deaths_df = runner.make_cases_and_backcast_deaths(full_df, death_df, age_pop_df, 
                                                                         age_death_df, backcast_location_ids, 
                                                                         subnat=False)

    # Persist the result both as CSV and as a checkpoint entry.
    cases_and_backcast_deaths_df.to_csv(f'{OUTPUT_DIR}/backcast_for_case_to_death.csv', index=False)
    checkpoint.write('cases_and_backcast_deaths', cases_and_backcast_deaths_df)
    
    #%%time
    # Narrow the run to locations that have usable case data, then overwrite
    # the 'location' checkpoint with the reduced table.
    cases_and_backcast_deaths_df = checkpoint.load('cases_and_backcast_deaths')
    loc_df = checkpoint.load('location')

    # loc_df needs to be updated with locations that have death data
    # must have > 0/NaN deaths , seems to also need case data
    #model_run_locs = cases_and_backcast_deaths_df[cases_and_backcast_deaths_df['Deaths'].notnull() & cases_and_backcast_deaths_df['Confirmed case rate'].notnull()]
    #model_run_locs = cases_and_backcast_deaths_df
    df = cases_and_backcast_deaths_df

    # Let's write this as a CSV and interrogate in R
    df.to_csv(f'{OUTPUT_DIR}/case_backcast_deaths.csv', index=False)

    # Keep rows with a positive location_id and a positive confirmed case rate;
    # a location qualifies if at least one such row exists.
    df = df[df['location_id'] > 0]
    df = df[df['Confirmed case rate'] > 0]
    # Only the case-rate column is aggregated. Averaging every column is
    # unnecessary for this selection and raises on non-numeric columns in
    # recent pandas versions.
    t = df.groupby('location_id')['Confirmed case rate'].mean().dropna()
    model_run_locs = t.index.values
    model_run_locs = pd.DataFrame(model_run_locs, columns = ['location_id'])

    model_run_locs.to_csv(f'{OUTPUT_DIR}/model_locations.csv', index=False)
    # From here on model_run_locs is a plain array of location ids.
    model_run_locs = model_run_locs['location_id'].unique()

    # An alternative for locations that survive cases_and_backcast_deaths at all
    # model_run_locs = cases_and_backcast_deaths_df['location_id'].unique()
    # model_run_locs.to_csv(f'{OUTPUT_DIR}/model_locations.csv', index=False)
    # model_run_locs = model_run_locs['location_id'].unique()
    # model_run_locs

    cases_and_backcast_deaths_df = cases_and_backcast_deaths_df[cases_and_backcast_deaths_df['location_id'].isin(model_run_locs)]
    loc_df = loc_df[loc_df['location_id'].isin(model_run_locs)]

    # Later sections reload 'location' and therefore see only these locations.
    checkpoint.write('location', loc_df)

    # Compute per-location threshold dates from the filtered backcast frame and
    # store them as CSV and as a checkpoint entry.
    threshold_dates = runner.impute_death_threshold(cases_and_backcast_deaths_df,
                                                    loc_df)
    threshold_dates.to_csv(f'{OUTPUT_DIR}/threshold_dates.csv', index=False)
    checkpoint.write('threshold_dates', threshold_dates)

    smoothed_death_df = checkpoint.load('smoothed_deaths')
    threshold_dates = checkpoint.load('threshold_dates')

    # Derive the date-mean and last-day frames used by the model submission below.
    date_mean_df = runner.make_date_mean_df(threshold_dates)
    last_day_df = runner.make_last_day_df(smoothed_death_df,date_mean_df)
    last_day_df.to_csv(f'{OUTPUT_DIR}/last_day.csv', index=False)

    # this seems to be where some states are lost. Save these two files to investigate
    smoothed_death_df.to_csv(f'{OUTPUT_DIR}/smoothed_death_df.csv', index=False)
    date_mean_df.to_csv(f'{OUTPUT_DIR}/date_mean_df.csv', index=False)

    checkpoint.write('date_mean', date_mean_df)
    checkpoint.write('last_day', last_day_df)

    # Build the leading-indicator inputs for the locations that survived the
    # selection step ('location' was overwritten with the reduced table above).
    full_df = checkpoint.load('full_data')
    loc_df = checkpoint.load('location')
    
    # Keep locations we think will work (shouldn't be necessary here)
    #loc_df = loc_df[loc_df['location_id'].isin(model_run_locs)]

    df_to_run = full_df.loc[full_df[COLUMNS.location_id].isin(loc_df[COLUMNS.location_id].to_list())]
    dcr_df, dhr_df, leading_indicator_df = runner.make_leading_indicator(
        df_to_run,
        SNAPSHOT_VERSION
    )
    # All three outputs are written to CSV in full before the indicator is trimmed.
    dcr_df.to_csv(f'{OUTPUT_DIR}/lagged_death_to_case_ratios.csv', index=False)
    dhr_df.to_csv(f'{OUTPUT_DIR}/lagged_death_to_hosp_ratios.csv', index=False)
    leading_indicator_df.to_csv(f'{OUTPUT_DIR}/leading_indicator.csv', index=False)
    # The checkpointed version keeps only id, date and the ln age death rate,
    # and drops rows where that rate is null.
    leading_indicator_df = leading_indicator_df[[COLUMNS.location_id, COLUMNS.date, COLUMNS.ln_age_death_rate]]
    leading_indicator_df = leading_indicator_df.loc[~leading_indicator_df[COLUMNS.ln_age_death_rate].isnull()]

    checkpoint.write('leading_indicator', leading_indicator_df)

    # Reload everything the model submission needs from the checkpoint.
    # (full_df and last_day_df are loaded but not passed to submit_models below.)
    full_df = checkpoint.load('full_data')
    death_df = checkpoint.load('deaths')
    age_pop_df = checkpoint.load('age_pop')
    age_death_df = checkpoint.load('age_death')
    date_mean_df = checkpoint.load('date_mean')
    last_day_df = checkpoint.load('last_day')
    leading_indicator_df = checkpoint.load('leading_indicator')
    loc_df = checkpoint.load('location')

    #loc_df = loc_df[loc_df['location_id'].isin([3539, 60886, 60887])] # locations that didn't make it in round 1, but to finish all locs should be run
    #loc_df = loc_df[loc_df['location_id'].isin([523, 530, 535, 556, 555, 533])]
    
    # Submit the models for every location in loc_df; the returned mapping is
    # checkpointed so the draw compilation step can read it back.
    # NOTE(review): presumably this launches cluster jobs (the code that follows
    # polls qstat) - confirm in covid_model_deaths.runner.
    submodel_dict = runner.submit_models(death_df, age_pop_df, age_death_df, date_mean_df, leading_indicator_df,
                                         loc_df, r0_locs,
                                         PEAK_FILE, OUTPUT_DIR, 
                                         SNAPSHOT_VERSION, MODEL_INPUTS_VERSION, 
                                         R0_FILE, CODE_DIR, NO_PSEUDO)

    checkpoint.write('submodel_dict', submodel_dict)

    # Something from Mark, holds until jobs finish
    # `sub` (subprocess) and `time` come from the notebook's import cell, so
    # they are no longer re-imported on every loop iteration.
    #
    # Poll `qstat` every 30 seconds and continue once its output no longer
    # contains the substring 'curve'. After each re-poll the current count is
    # printed.
    # NOTE(review): the count covers every occurrence of 'curve' in the qstat
    # output, not only jobs from this run, and a failing qstat call presumably
    # yields a count of 0, which would end the wait immediately - confirm.
    length = sub.getoutput('qstat').count('curve')
    while length != 0:
        time.sleep(30)
        length = sub.getoutput('qstat').count('curve')
        print(length)
    
    # Collect the finished model results into draws and write them out.
    smoothed_death_df = checkpoint.load('smoothed_deaths')
    age_pop_df = checkpoint.load('age_pop')
    threshold_dates = checkpoint.load('threshold_dates')
    submodel_dict = checkpoint.load('submodel_dict')
    loc_df = checkpoint.load('location')

    # obs_df = full_df[full_df.location_id.isin(loc_df.location_id)]
    # Observed data passed to compile_draws: smoothed deaths for the modelled locations.
    obs_df = smoothed_death_df[smoothed_death_df.location_id.isin(loc_df.location_id)]

    draw_dfs, past_draw_dfs, models_used, days, ensemble_draws_dfs, failed_locs = runner.compile_draws(
        loc_df, submodel_dict, obs_df, threshold_dates, age_pop_df
    )

    # Abort the run when 'location' never appears in models_used.
    if 'location' not in models_used:
        raise ValueError('No location-specific draws used, must be using wrong tag')
    draw_df = pd.concat(draw_dfs)
    # Split the location table into failed and successful locations.
    failed_df = loc_df.loc[loc_df.location_id.isin(failed_locs)]
    loc_df = loc_df.loc[~loc_df.location_id.isin(failed_locs)]
    
    failed_df.to_csv(f'{OUTPUT_DIR}/failed_curvefit_locations.csv', index=False)
    
    # NOTE(review): this pairs the remaining location names with models_used
    # positionally; the two must have the same length and order - confirm
    # against compile_draws.
    model_type_df = pd.DataFrame({'location': loc_df['Location'].tolist(),
                                  'model_used': models_used})

    # write
    draw_df.to_csv(smooth_draw_path, index=False)
    model_type_df.to_csv(f'{OUTPUT_DIR}/state_models_used.csv', index=False)
#     ensemble_plot_path = runner.make_and_save_draw_plots(OUTPUT_DIR, loc_df,
#                                                          ensemble_draws_dfs, days, models_used, age_pop_df)
#     print(ensemble_plot_path)
    checkpoint.write('draw_data', draw_df)
    checkpoint.write('failed_locations', failed_locs)

    # Prepare the raw observed data for the successfully modelled locations and
    # hand it to runner.swap_observed together with the smoothed/raw draw paths.
    raw_df = checkpoint.load('full_data')
    loc_df = checkpoint.load('location')
    # The checkpointed table still contains failed locations, so drop them again.
    loc_df = loc_df.loc[~loc_df.location_id.isin(failed_locs)]
    # 'Location' is the province/state name, falling back to the country/region
    # name where the province/state is missing.
    raw_df['Location'] = raw_df['Province/State']
    raw_df = raw_df.loc[raw_df['location_id'].isin(loc_df['location_id'].to_list())]
    raw_df.loc[raw_df['Location'].isnull(), 'Location'] = raw_df['Country/Region']
    runner.swap_observed(OUTPUT_DIR, smooth_draw_path, raw_draw_path, raw_df)

# I don't know what this does   
#     loc_df = checkpoint.load('location')
#     submodel_dict = checkpoint.load('submodel_dict')
#     draw_df = checkpoint.load('draw_data')
#     age_pop_df = checkpoint.load('age_pop')
#     runner.save_points_and_peaks(loc_df, submodel_dict, draw_df, age_pop_df, OUTPUT_DIR)

2020-06-30 16:12:41.949 | DEBUG    | covid_model_deaths.deaths_io.checkpoint:_setup_checkpoint_dir:45 - Making checkpoint directory at /ihme/covid-19/deaths/validation/2020_06_26/2020_04_24/checkpoint
2020-06-30 16:12:41.988 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading full_data.csv from /ihme/covid-19/model-inputs/2020_06_26.01.
2020-06-30 16:12:42.063 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading deaths.csv from /ihme/covid-19/model-inputs/2020_06_26.01.
2020-06-30 16:12:42.113 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading age_pop.csv from /ihme/covid-19/model-inputs/2020_06_26.01.
2020-06-30 16:12:42.124 | DEBUG    | covid_model_deaths.deaths_io.inputs:load:40 - Loading age_death.csv from /ihme/covid-19/model-inputs/2020_06_26.01.


Writing to /ihme/covid-19/deaths/validation/2020_06_26/2020_04_24
/ihme/code/covid-19/user/ctroeger/covid-model-deaths/src/covid_model_deaths
Checkpoint(/ihme/covid-19/deaths/validation/2020_06_26/2020_04_24/checkpoint)


2020-06-30 16:12:56.424 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading full_data from in memory cache.
2020-06-30 16:12:56.426 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading deaths from in memory cache.
2020-06-30 16:12:56.427 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading age_pop from in memory cache.
2020-06-30 16:12:56.428 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading age_death from in memory cache.
 12%|█▏        | 40/324 [00:25<07:53,  1.67s/it] 

In [8]:
# Scratch cell: re-submit the models for location_ids 48 and 52 only.
# It depends on kernel state: checkpoint, r0_locs, PEAK_FILE, OUTPUT_DIR,
# SNAPSHOT_VERSION, MODEL_INPUTS_VERSION, R0_FILE, CODE_DIR and NO_PSEUDO are,
# in the visible notebook, only assigned inside the main loop cell, so this
# cell operates on whichever validation date that loop processed last.
#cases_and_backcast_deaths_df[cases_and_backcast_deaths_df['location_id']==62].tail(10)
# full_df.head()
# full_df[full_df['Country/Region'] == "Sweden"]

full_df = checkpoint.load('full_data')
death_df = checkpoint.load('deaths')
age_pop_df = checkpoint.load('age_pop')
age_death_df = checkpoint.load('age_death')
date_mean_df = checkpoint.load('date_mean')
last_day_df = checkpoint.load('last_day')
leading_indicator_df = checkpoint.load('leading_indicator')
loc_df = checkpoint.load('location')
    
# Restrict the submission to the two locations of interest.
loc_df = loc_df[loc_df['location_id'].isin([48, 52])]

submodel_dict = runner.submit_models(death_df, age_pop_df, age_death_df, date_mean_df, leading_indicator_df,
                                     loc_df, r0_locs,
                                     PEAK_FILE, OUTPUT_DIR, 
                                     SNAPSHOT_VERSION, MODEL_INPUTS_VERSION, 
                                     R0_FILE, CODE_DIR, NO_PSEUDO)

2020-06-15 12:08:01.749 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading full_data from in memory cache.
2020-06-15 12:08:01.752 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading deaths from in memory cache.
2020-06-15 12:08:01.753 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading age_pop from in memory cache.
2020-06-15 12:08:01.754 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading age_death from in memory cache.
2020-06-15 12:08:01.755 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading date_mean from in memory cache.
2020-06-15 12:08:01.756 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading last_day from in memory cache.
2020-06-15 12:08:01.757 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading leading_indicator from in memory cache.
2020-06-15 12:08:01.759 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading location from 

In [4]:
# Scratch cell: rebuild the output/checkpoint handles for one validation date
# and re-submit the model for location_id 3539 only.
# The recorded output of this cell is "NameError: name 'VALIDATION_DATE' is not
# defined": in the visible notebook VALIDATION_DATE, RUN_TYPE,
# MODEL_INPUTS_VERSION, r0_locs, PEAK_FILE, SNAPSHOT_VERSION, R0_FILE and
# NO_PSEUDO are only assigned inside the main loop cell, so that cell (or an
# explicit assignment of these names) has to run first.
VALIDATION_FOLDER = re.sub("-", "_", VALIDATION_DATE)
CODE_DIR = os.path.abspath('../src/covid_model_deaths')
OUTPUT_DIR = f'/ihme/covid-19/deaths/{RUN_TYPE}/{DATA_DATE}/{VALIDATION_FOLDER}'
if not os.path.exists(f'/ihme/covid-19/deaths/{RUN_TYPE}/{DATA_DATE}'):
    os.mkdir(f'/ihme/covid-19/deaths/{RUN_TYPE}/{DATA_DATE}')
if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)
inputs = InputsContext(f'/ihme/covid-19/model-inputs/{MODEL_INPUTS_VERSION}')
checkpoint = Checkpoint(OUTPUT_DIR)
    
# Read the previously prepared inputs back from the checkpoint of that run.
full_df = checkpoint.load('full_data')
death_df = checkpoint.load('deaths')
age_pop_df = checkpoint.load('age_pop')
age_death_df = checkpoint.load('age_death')
date_mean_df = checkpoint.load('date_mean')
last_day_df = checkpoint.load('last_day')
leading_indicator_df = checkpoint.load('leading_indicator')
loc_df = checkpoint.load('location')

# Restrict the submission to the single location of interest.
loc_df = loc_df[loc_df['location_id'].isin([3539])]

submodel_dict = runner.submit_models(death_df, age_pop_df, age_death_df, date_mean_df, leading_indicator_df,
                                     loc_df, r0_locs,
                                     PEAK_FILE, OUTPUT_DIR, 
                                     SNAPSHOT_VERSION, MODEL_INPUTS_VERSION, 
                                     R0_FILE, CODE_DIR, NO_PSEUDO)

NameError: name 'VALIDATION_DATE' is not defined

In [9]:
# Scratch cell: join the full data and the deaths data on location and date
# and show the first rows for location_id 90 (Norway in the recorded output).
# Uses the `checkpoint` object left in the kernel by an earlier cell.
full_df = checkpoint.load('full_data')
death_df = checkpoint.load('deaths')

# Columns present in both frames come back with _x (full_df) / _y (death_df) suffixes.
test_df = full_df.merge(death_df, on=['location_id','Date'])
test_df[test_df['location_id']==90].head()

2020-06-07 09:13:31.130 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading full_data from in memory cache.
2020-06-07 09:13:31.132 | INFO     | covid_model_deaths.deaths_io.checkpoint:load:30 - Loading deaths from in memory cache.


Unnamed: 0,location_id,Province/State,Country/Region_x,Date,Confirmed,Deaths_x,population_x,Confirmed case rate,Death rate_x,Hospitalizations,Location,Country/Region_y,Days,Deaths_y,Death rate_y,population_y
1618,90,,Norway,2020-03-14,1090.0,3.0,5348847.0,0.000204,5.608685e-07,,Norway,Norway,0,3.0,5.608685e-07,5348847.0
1619,90,,Norway,2020-03-15,1221.0,3.0,5348847.0,0.000228,5.608685e-07,,Norway,Norway,1,3.0,5.608685e-07,5348847.0
1620,90,,Norway,2020-03-16,1333.0,3.0,5348847.0,0.000249,5.608685e-07,,Norway,Norway,2,3.0,5.608685e-07,5348847.0
1621,90,,Norway,2020-03-17,1463.0,3.0,5348847.0,0.000274,5.608685e-07,,Norway,Norway,3,3.0,5.608685e-07,5348847.0
1622,90,,Norway,2020-03-18,1550.0,6.0,5348847.0,0.00029,1.121737e-06,,Norway,Norway,4,6.0,1.121737e-06,5348847.0


In [13]:
# Scratch cell: repeat the location-selection logic of the main loop on the
# cases_and_backcast_deaths_df left in the kernel and display the mean positive
# confirmed case rate per location.
df = cases_and_backcast_deaths_df
df = df[df['location_id'] > 0]
df = df[df['Confirmed case rate'] > 0]
t = df.groupby('location_id')['Confirmed case rate'].mean().dropna()
t

location_id
8        0.000006
10       0.000003
11       0.000009
12       0.000002
13       0.000048
           ...   
60391    0.000344
60392    0.000343
60412    0.003618
60886    0.001017
60887    0.000431
Name: Confirmed case rate, Length: 398, dtype: float64

In [17]:
# Scratch cell: check whether location_id 80 (France in the recorded output)
# is present in the location table. Relies on cases_and_backcast_deaths_df and
# loc_df left in the kernel by earlier cells.
#model_run_locs = cases_and_backcast_deaths_df[cases_and_backcast_deaths_df['Deaths'].notnull() & cases_and_backcast_deaths_df['Confirmed case rate'].notnull()]
# Not the last expression of the cell, so this result is not displayed.
cases_and_backcast_deaths_df[cases_and_backcast_deaths_df['location_id'] == 80].tail()

df = cases_and_backcast_deaths_df
df = df[df['location_id'] > 0]
# Averages every column per location and drops locations with any missing mean.
# NOTE(review): unlike the main loop there is no 'Confirmed case rate' > 0
# filter here, and .mean() over all columns may fail on non-numeric columns in
# recent pandas versions - confirm before reusing.
t = df.groupby(df['location_id']).mean().dropna()
model_run_locs = t.index.values

model_run_locs = pd.DataFrame(model_run_locs, columns = ['location_id'])
# Also not displayed (not the last expression).
model_run_locs.head()

# Only this final expression is rendered as the cell output.
loc_df[loc_df['location_id']==80]

Unnamed: 0,location_id,Location,Country/Region,level
9,80,France,France,0


In [11]:
# Scratch cell: recompute the date-mean and last-day frames from the
# threshold_dates / smoothed_death_df left in the kernel and rewrite their CSVs
# in the current OUTPUT_DIR.
date_mean_df = runner.make_date_mean_df(threshold_dates)
last_day_df = runner.make_last_day_df(smoothed_death_df,date_mean_df)
last_day_df.to_csv(f'{OUTPUT_DIR}/last_day.csv', index=False)

# this seems to be where some states are lost.
# smoothed_death_df.to_csv(f'{OUTPUT_DIR}/smoothed_death_df.csv', index=False)
date_mean_df.to_csv(f'{OUTPUT_DIR}/date_mean_df.csv', index=False)

# `stop` is not defined anywhere, so evaluating it raises NameError (see the
# recorded output) and halts execution at this point.
# NOTE(review): presumably a deliberate manual stop marker - confirm, and
# consider removing it so the notebook survives Restart & Run All.
stop

NameError: name 'stop' is not defined