# Preparing LSTM data

In [1]:
from pickle import dump
import pandas as pd
from datetime import timedelta
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import adfuller

## Reading data

In [2]:
# Load the source CSV and convert the DATA column to datetimes in one chain.
df_original = (
    pd.read_csv('../data/df_original_extended.csv')
    .assign(DATA=lambda frame: pd.to_datetime(frame['DATA'], format='mixed'))
)

In [3]:
# Show the available column names of the loaded frame.
df_original.columns

Index(['DATA', 'CODIGO_MUNICIPIO_6', 'NOVOS_CASOS_SRAG', 'OBITOS_NOVOS',
       'OBITOS', 'OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS', 'TAXA_OBITOS_NOVOS',
       'TAXA_OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS', 'TAXA_OBITOS', 'casosNovos',
       'casosAcumulado', 'CASOS_NOVOS_MEDIA_MOVEL_7_DIAS', 'MUNICIPIO',
       'SIGLA_ESTADO', 'ESTADO', 'REGIAO', 'POPULACAO_2022',
       'NUMERO_REPRODUCAO_EFETIVO_SRAG_MEDIA',
       'NUMERO_REPRODUCAO_EFETIVO_SRAG_VARIANCIA',
       'NUMERO_REPRODUCAO_EFETIVO_SRAG_QUANTIL_0.025',
       'NUMERO_REPRODUCAO_EFETIVO_SRAG_MEDIANA',
       'NUMERO_REPRODUCAO_EFETIVO_SRAG_QUANTIL_0.975',
       'NUMERO_REPRODUCAO_EFETIVO_MEDIA',
       'NUMERO_REPRODUCAO_EFETIVO_VARIANCIA',
       'NUMERO_REPRODUCAO_EFETIVO_QUANTIL_0.025',
       'NUMERO_REPRODUCAO_EFETIVO_MEDIANA',
       'NUMERO_REPRODUCAO_EFETIVO_QUANTIL_0.975',
       'NUMERO_REPRODUCAO_EFETIVO_ATRASADO_MEDIA',
       'NUMERO_REPRODUCAO_EFETIVO_ATRASADO_VARIANCIA',
       'NUMERO_REPRODUCAO_EFETIVO_ATRASADO_QUANTIL

In [4]:
# Default length of each input window; used for every forecast origin that
# has no specific override in the dataset-building loop.
LAGS = 84
# Default length of each target window (number of steps to predict).
FORECASTING_HORIZON = 84

## Differencing data

## Preparing data

In [5]:
# Order rows by municipality and then by date: the windowing below takes the
# values of each municipality in row order, so they must be chronological.
df_original = df_original.sort_values(by=['MUNICIPIO','DATA'])

In [6]:
# Forecast origins, 84 days apart. For each one, the dataset-building loop
# fits on data up to the day before it.
epidemiologic_weeks = [
    pd.to_datetime(day)
    for day in (
        '2020-04-26',
        '2020-07-19',
        '2020-10-11',
        '2021-01-03',
        '2021-03-28',
        '2021-06-20',
        '2021-09-12',
        '2021-12-05',
        '2022-02-27',
    )
]

### Training data

In [7]:
def create_trainable_dataset(data, n_inputs, n_outputs):
    """Slice a series into overlapping (input, target) windows.

    Window ``i`` uses ``data[i:i + n_inputs]`` as input and the next
    ``n_outputs`` elements as target; consecutive windows are shifted by one
    position.

    Parameters
    ----------
    data : array-like
        Sequence to slice along its first axis.
    n_inputs : int
        Length of each input window.
    n_outputs : int
        Length of each target window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, Y)`` with shapes ``(n_windows, n_inputs, ...)`` and
        ``(n_windows, n_outputs, ...)``. When ``data`` is shorter than
        ``n_inputs + n_outputs`` both arrays have zero rows but keep the
        window dimension, so they can still be concatenated with arrays
        holding full windows.
    """
    data = np.asarray(data)
    n_windows = max(len(data) - n_inputs - n_outputs + 1, 0)

    if n_windows == 0:
        # np.array([]) would have shape (0,), which cannot be concatenated
        # with 2-D window arrays; build correctly shaped empty arrays instead.
        trailing_shape = data.shape[1:]
        return (
            np.empty((0, n_inputs) + trailing_shape, dtype=data.dtype),
            np.empty((0, n_outputs) + trailing_shape, dtype=data.dtype),
        )

    X = np.array([data[i:i + n_inputs] for i in range(n_windows)])
    Y = np.array([data[i + n_inputs:i + n_inputs + n_outputs] for i in range(n_windows)])
    return X, Y

# Column that is windowed into the X / y arrays.
target_column = 'TAXA_OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS'

# (lags, forecasting_horizon) overrides for the earliest forecast origins;
# every other origin falls back to LAGS / FORECASTING_HORIZON.
window_sizes_by_week = {
    pd.to_datetime('2020-04-26'): (7, 28),
    pd.to_datetime('2020-07-19'): (14, 28),
    pd.to_datetime('2020-10-11'): (21, 28),
}

# For every forecast origin: build train / validation / test windows for all
# municipalities, scale train and validation with scalers fitted on the train
# windows only, and write the arrays, the test frame and the scalers to data/.
for week_begin in epidemiologic_weeks:
    lags, forecasting_horizon = window_sizes_by_week.get(
        week_begin, (LAGS, FORECASTING_HORIZON)
    )

    # Not filled in this cell; kept so the notebook-level names stay defined.
    X_train_validation = np.empty((0, lags), dtype=float)
    y_train_validation = np.empty((0, forecasting_horizon), dtype=float)

    # Last day whose data may be used for fitting.
    max_date_to_fit = week_begin - pd.DateOffset(days=1)
    print('max_date_to_fit: ', max_date_to_fit)

    # NOTE(review): assuming one row per day, the validation span below covers
    # lags + forecasting_horizon + 2 days (three windows per municipality),
    # while the test span covers lags + forecasting_horizon days (one window).
    # Confirm the extra two days are intended.
    date_begin_y_validation = max_date_to_fit - timedelta(days=forecasting_horizon + 1)
    date_end_y_validation = max_date_to_fit
    date_begin_X_validation = date_begin_y_validation - timedelta(days=lags)

    date_begin_X_test = max_date_to_fit - timedelta(days=lags - 1)
    date_end_y_test = max_date_to_fit + timedelta(days=forecasting_horizon)

    # Collect per-municipality pieces and combine them once after the loop,
    # instead of re-copying the growing arrays / frame on every iteration.
    # The leading empty arrays fix the expected 2-D shape and float dtype.
    X_train_parts = [np.empty((0, lags), dtype=float)]
    y_train_parts = [np.empty((0, forecasting_horizon), dtype=float)]
    X_validation_parts = [np.empty((0, lags), dtype=float)]
    y_validation_parts = [np.empty((0, forecasting_horizon), dtype=float)]
    test_records = []

    for municipality in df_original['MUNICIPIO'].unique():
        df_municipality_moment = df_original.loc[
            (df_original.MUNICIPIO == municipality)
            & (df_original.DATA <= date_end_y_test)
        ]

        # Training rows: strictly before the validation targets, missing
        # target values dropped.
        df_municipality_train_data = df_municipality_moment[
            (df_municipality_moment.DATA < date_begin_y_validation)
            & df_municipality_moment[target_column].notna()
        ]
        train_data = df_municipality_train_data[target_column].values

        # NOTE(review): unlike the training rows, the validation and test rows
        # are not filtered for missing target values.
        df_municipality_validation_data = df_municipality_moment[
            (df_municipality_moment.DATA >= date_begin_X_validation)
            & (df_municipality_moment.DATA <= date_end_y_validation)
        ]
        validation_data = df_municipality_validation_data[target_column].values

        df_municipality_test_data = df_municipality_moment[
            (df_municipality_moment.DATA >= date_begin_X_test)
            & (df_municipality_moment.DATA <= date_end_y_test)
        ]
        test_data = df_municipality_test_data[target_column].values

        # Raises IndexError if the municipality has no training rows.
        municipality_id = df_municipality_train_data.CODIGO_MUNICIPIO_6.unique()[0]

        X_train_municipality, y_train_municipality = create_trainable_dataset(
            train_data, lags, forecasting_horizon
        )
        X_train_parts.append(X_train_municipality)
        y_train_parts.append(y_train_municipality)

        X_validation_municipality, y_validation_municipality = create_trainable_dataset(
            validation_data, lags, forecasting_horizon
        )
        X_validation_parts.append(X_validation_municipality)
        y_validation_parts.append(y_validation_municipality)

        X_test_municipality, y_test_municipality = create_trainable_dataset(
            test_data, lags, forecasting_horizon
        )

        # Test windows stay unscaled and are stored per municipality.
        entry_test_municipality = {
            'municipality_id': municipality_id,
            'municipality': municipality,
            'max_date_to_fit': max_date_to_fit,
            'X': X_test_municipality,
            'y': y_test_municipality,
        }
        test_records.append(entry_test_municipality)

    X_train = np.concatenate(X_train_parts)
    y_train = np.concatenate(y_train_parts)
    X_validation = np.concatenate(X_validation_parts)
    y_validation = np.concatenate(y_validation_parts)
    df_test = pd.DataFrame.from_records(test_records)

    # Fit the scalers on the training windows only, then apply them to both
    # the training and the validation windows.
    feature_transformer_train = MinMaxScaler().fit(X_train)
    target_transformer_train = MinMaxScaler().fit(y_train)

    X_train = feature_transformer_train.transform(X_train)
    y_train = target_transformer_train.transform(y_train)

    X_validation = feature_transformer_train.transform(X_validation)
    y_validation = target_transformer_train.transform(y_validation)

    # Created but not fitted in this cell; kept so the names stay defined.
    feature_transformer_train_validation = MinMaxScaler()
    target_transformer_train_validation = MinMaxScaler()

    # Shared "<max_date_to_fit>_<lags>" suffix of every output file name.
    file_tag = str(max_date_to_fit) + '_' + str(lags)

    np.savetxt('data/X_train_' + file_tag + '.csv', X_train, delimiter=',')
    np.savetxt('data/y_train_' + file_tag + '.csv', y_train, delimiter=',')
    np.savetxt('data/X_validation_' + file_tag + '.csv', X_validation, delimiter=',')
    np.savetxt('data/y_validation_' + file_tag + '.csv', y_validation, delimiter=',')
    df_test.to_csv('data/df_test_' + file_tag + '.csv', index=False)

    # Use context managers so the pickle files are flushed and closed.
    with open('data/target_transformer_train_' + file_tag + '.pkl', 'wb') as transformer_file:
        dump(target_transformer_train, transformer_file)
    with open('data/feature_transformer_train_' + file_tag + '.pkl', 'wb') as transformer_file:
        dump(feature_transformer_train, transformer_file)

max_date_to_fit:  2020-04-25 00:00:00
max_date_to_fit:  2020-07-18 00:00:00
max_date_to_fit:  2020-10-10 00:00:00
max_date_to_fit:  2021-01-02 00:00:00
max_date_to_fit:  2021-03-27 00:00:00
max_date_to_fit:  2021-06-19 00:00:00
max_date_to_fit:  2021-09-11 00:00:00
max_date_to_fit:  2021-12-04 00:00:00
max_date_to_fit:  2022-02-26 00:00:00


In [8]:
# Inspect the test slice left over from the last iteration of the
# dataset-building loop (last forecast origin, last municipality).
df_municipality_test_data

Unnamed: 0,DATA,CODIGO_MUNICIPIO_6,NOVOS_CASOS_SRAG,OBITOS_NOVOS,OBITOS,OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS,TAXA_OBITOS_NOVOS,TAXA_OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS,TAXA_OBITOS,casosNovos,...,NUMERO_REPRODUCAO_EFETIVO_QUANTIL_0.975,NUMERO_REPRODUCAO_EFETIVO_ATRASADO_MEDIA,NUMERO_REPRODUCAO_EFETIVO_ATRASADO_VARIANCIA,NUMERO_REPRODUCAO_EFETIVO_ATRASADO_QUANTIL_0.025,NUMERO_REPRODUCAO_EFETIVO_ATRASADO_MEDIANA,NUMERO_REPRODUCAO_EFETIVO_ATRASADO_QUANTIL_0.975,TAXA_FATALIDADE,EPIDEMIC_EPISODE,ONSET_NUMERO_REPRODUCAO_EFETIVO_MEDIA,ONSET_TAXA_OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS
33698,2021-12-05,317020,3.0,0.0,3059.0,0.142857,0.000000,0.020030,428.892703,24.0,...,1.052938,1.003148,0.007287,0.842847,1.000682,1.179371,0.632911,2.0,6.0,
33699,2021-12-06,317020,1.0,0.0,3059.0,0.142857,0.000000,0.020030,428.892703,18.0,...,1.045641,1.049128,0.007702,0.881506,1.046657,1.228367,0.625000,2.0,6.0,
33700,2021-12-07,317020,0.0,0.0,3059.0,0.000000,0.000000,0.000000,428.892703,28.0,...,1.036591,1.087234,0.008045,0.919265,1.084616,1.267765,0.000000,2.0,6.0,
33701,2021-12-08,317020,1.0,0.0,3059.0,0.000000,0.000000,0.000000,428.892703,10.0,...,1.036551,1.113985,0.008262,0.946848,1.111619,1.298859,0.000000,2.0,6.0,
33702,2021-12-09,317020,1.0,0.0,3059.0,0.000000,0.000000,0.000000,428.892703,19.0,...,1.034534,1.138672,0.008368,0.966238,1.136161,1.327131,0.000000,2.0,6.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33861,2022-05-17,317020,7.0,0.0,3183.0,0.571429,0.000000,0.080118,446.278350,118.0,...,1.686392,1.179936,0.003339,1.068834,1.178977,1.295891,1.777778,3.0,7.0,
33862,2022-05-18,317020,10.0,0.0,3183.0,0.285714,0.000000,0.040059,446.278350,130.0,...,1.660425,1.182694,0.003253,1.073723,1.181768,1.297460,0.796813,3.0,7.0,
33863,2022-05-19,317020,7.0,0.0,3183.0,0.285714,0.000000,0.040059,446.278350,155.0,...,1.630597,1.184634,0.003176,1.077211,1.183761,1.298646,0.900901,3.0,7.0,
33864,2022-05-20,317020,3.0,0.0,3183.0,0.285714,0.000000,0.040059,446.278350,145.0,...,1.596547,1.188817,0.003103,1.081760,1.187952,1.300462,1.010101,3.0,7.0,


In [9]:
# Inspect the test input windows left over from the last iteration of the
# dataset-building loop.
X_test_municipality

array([[0.02002955, 0.02002955, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.02002955, 0.02002955,
        0.02002955, 0.0400591 , 0.0400591 , 0.0400591 , 0.0400591 ,
        0.0400591 , 0.0400591 , 0.0400591 , 0.0400591 , 0.0400591 ,
        0.0400591 , 0.0400591 , 0.02002955, 0.0400591 , 0.0400591 ,
        0.0400591 , 0.0400591 , 0.0400591 , 0.0400591 , 0.0400591 ,
        0.0400591 , 0.0400591 , 0.02002955, 0.02002955, 0.02002955,
        0.02002955, 0.02002955, 0.02002955, 0.0400591 , 0.0400591 ,
        0.06008864, 0.08011819, 0.12017729, 0.18026593, 0.22032502,
        0.20029548, 0.20029548, 0.22032502, 0.24035457, 0.24035457,
        0.22032502, 0.24035457, 0.28041367, 0.3805614 , 0.3805614 ,
        0.34050231, 0.36053186, 0.32047276, 0.28041367, 0.36053186,
        0.32047276, 0.36053186, 0.40059095, 0.46067959, 0.48070914,
        0.48070914, 0.3805614 , 0.34050231, 0.34050231, 0.34050231,
        0.28041367, 0.30044321, 0.26038412, 0.28