### Cadê Covid - Modelagem

In [113]:
import pandas as pd

In [289]:
def load_casos(file):
    """Read the cases CSV and return its rows ordered by date.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
            The data must contain a ``date`` column.

    Returns:
        DataFrame with ``date`` converted by ``pandas.to_datetime`` and the
        rows sorted ascending by that column. Row labels are kept as read
        (the index is not reset).
    """
    casos = pd.read_csv(file)
    parsed_dates = pd.to_datetime(casos['date'])
    return casos.assign(date=parsed_dates).sort_values(by='date')


def remove_cities(df):
    """Keep only the state-level rows and relabel the IBGE code column.

    Args:
        df: DataFrame with ``place_type``, ``city`` and ``city_ibge_code``
            columns.

    Returns:
        New DataFrame holding the rows whose ``place_type`` equals
        ``'state'``, without the ``city`` column and with
        ``city_ibge_code`` renamed to ``state_ibge_code``. The input frame
        is left untouched.
    """
    state_rows = df.loc[df['place_type'] == 'state']
    without_city = state_rows.drop(columns=['city'])
    return without_city.rename(columns={'city_ibge_code': 'state_ibge_code'})


def add_state_density(df, state_density_file, ano):
    """Attach a ``population_density`` column looked up by state IBGE code.

    Args:
        df: State-level cases DataFrame; must contain ``state_ibge_code``
            and every column listed in the returned selection.
        state_density_file: Path or file-like object for a CSV that has a
            ``CODIGO_IBGE`` column and a column named by ``ano``.
        ano: Name of the density CSV column to use as the density value.

    Returns:
        DataFrame restricted to a fixed set of columns (in a fixed order),
        with ``population_density`` filled from the density CSV. States
        missing from the CSV get NaN because the join is a left join.
    """
    density_table = pd.read_csv(state_density_file)
    density_by_state = (
        density_table[['CODIGO_IBGE', ano]]
        .rename(columns={ano: 'population_density'})
        .set_index('CODIGO_IBGE')
    )
    enriched = df.join(density_by_state, on='state_ibge_code', how='left')

    # Columns kept in the output; anything else in `enriched` is dropped.
    output_columns = [
        'state_ibge_code',
        'state',
        'population_density',
        'date',
        'epidemiological_week',
        'estimated_population_2019',
        'is_last',
        'is_repeated',
        'last_available_confirmed',
        'last_available_confirmed_per_100k_inhabitants',
        'last_available_date',
        'last_available_death_rate',
        'last_available_deaths',
        'new_confirmed',
        'new_deaths',
        'order_for_place',
        'place_type',
    ]
    return enriched[output_columns]


def add_new_confirmed_mm(df, window):
    """Add ``new_confirmed_mm``: per-state rolling mean of ``new_confirmed``.

    The mean is taken over the last ``window`` rows of each state, in the
    order the rows appear in ``df``. The function does not sort, so ``df``
    is expected to already be in chronological order. The first
    ``window - 1`` rows of every state are NaN.

    The result is computed with a group-wise ``transform``, which is aligned
    one-to-one with the input rows. Joining the result back on
    ``(state_ibge_code, date)`` would multiply rows whenever that pair is
    duplicated, so no join is used.

    Args:
        df: DataFrame with ``state_ibge_code`` and ``new_confirmed`` columns.
        window: Number of rows in the rolling window.

    Returns:
        Copy of ``df`` with the extra ``new_confirmed_mm`` column appended;
        the input frame is not modified.
    """
    rolling_mean = (
        df.groupby('state_ibge_code')['new_confirmed']
          .transform(lambda per_state: per_state.rolling(window).mean())
    )
    return df.assign(new_confirmed_mm=rolling_mean)


def add_new_cofirmed_mm_is_decreasing(df):
    """Add ``new_confirmed_mm_is_dec``: did the rolling mean drop vs. the previous row?

    For every state the value of ``new_confirmed_mm`` is compared with the
    value on the previous row of the same state (row order as found in
    ``df``; no sorting is done here). The flag is ``True`` when the previous
    value is strictly greater than the current one, ``False`` otherwise, and
    NaN when either value is missing (including the first row of a state).

    The comparison is vectorised with a group-wise ``diff`` instead of a
    Python callback per rolling window, and the result is attached row by
    row, so duplicated ``(state_ibge_code, date)`` pairs cannot multiply
    rows.

    Note: the function name keeps its original spelling ("cofirmed") because
    existing callers use it.

    Args:
        df: DataFrame with ``state_ibge_code`` and ``new_confirmed_mm``.

    Returns:
        Copy of ``df`` with the extra object-dtype column
        ``new_confirmed_mm_is_dec`` (``True`` / ``False`` / NaN) appended;
        the input frame is not modified.
    """
    # current - previous within each state; NaN when either side is missing.
    step = df.groupby('state_ibge_code')['new_confirmed_mm'].diff()
    # Cast to object before masking so undefined comparisons stay NaN
    # instead of being coerced to False.
    is_decreasing = (step < 0).astype(object).where(step.notna())
    return df.assign(new_confirmed_mm_is_dec=is_decreasing)

In [290]:
# Relative path of the cases CSV that is handed to load_casos.
casos_csv = '../data_source/brasil_io_caso_full.csv'

In [240]:
# Parameters consumed by the pipeline functions.
# CSV read by add_state_density (needs a CODIGO_IBGE column).
state_density_csv = "../data_source/states_density.csv"
# Name of the density-CSV column used as population_density
# ("ano" is Portuguese for "year").
ano = '2010'
# Number of rows in the rolling mean computed by add_new_confirmed_mm.
window = 14

In [294]:
# Full preparation pipeline: load -> keep state rows -> add density ->
# rolling mean of new cases -> "is the rolling mean decreasing" flag.
# reset_index() keeps the previous row labels as an "index" column.
casos_df = (
    load_casos(casos_csv)
    .pipe(remove_cities)
    .pipe(add_state_density, state_density_csv, ano)
    .pipe(add_new_confirmed_mm, window)
    .pipe(add_new_cofirmed_mm_is_decreasing)
    .reset_index()
)

In [295]:
# Display the rows whose state column equals 'SP'.
casos_df[casos_df.state=='SP']

Unnamed: 0,index,state_ibge_code,state,population_density,date,epidemiological_week,estimated_population_2019,is_last,is_repeated,last_available_confirmed,last_available_confirmed_per_100k_inhabitants,last_available_date,last_available_death_rate,last_available_deaths,new_confirmed,new_deaths,order_for_place,place_type,new_confirmed_mm,new_confirmed_mm_is_dec
30,198092,35.0,SP,166.25,2020-06-07,24,45919049.0,False,False,143073,311.57657,2020-06-07,0.0639,9145,2524,87,104,state,,
57,193514,35.0,SP,166.25,2020-06-08,24,45919049.0,False,False,144593,314.88675,2020-06-08,0.0635,9188,1520,43,105,state,,
83,188919,35.0,SP,166.25,2020-06-09,24,45919049.0,False,False,150138,326.96235,2020-06-09,0.0634,9522,5545,334,106,state,,
111,184289,35.0,SP,166.25,2020-06-10,24,45919049.0,False,False,156316,340.41646,2020-06-10,0.0631,9862,6178,340,107,state,,
138,179612,35.0,SP,166.25,2020-06-11,24,45919049.0,False,False,162520,353.92719,2020-06-11,0.0624,10145,6204,283,108,state,,
165,174897,35.0,SP,166.25,2020-06-12,24,45919049.0,False,False,167900,365.64346,2020-06-12,0.0618,10368,5380,223,109,state,,
192,170167,35.0,SP,166.25,2020-06-13,24,45919049.0,False,False,172875,376.47774,2020-06-13,0.0612,10581,4975,213,110,state,,
219,165417,35.0,SP,166.25,2020-06-14,25,45919049.0,False,False,178202,388.07859,2020-06-14,0.06,10694,5327,113,111,state,,
246,160653,35.0,SP,166.25,2020-06-15,25,45919049.0,False,False,181460,395.17369,2020-06-15,0.0593,10767,3258,73,112,state,,
273,155867,35.0,SP,166.25,2020-06-16,25,45919049.0,False,False,190285,414.39229,2020-06-16,0.0585,11132,8825,365,113,state,,
