# Preparing data for forecasting

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller

## Reading data

In [2]:
# Load the epidemic-episode time series; the first CSV column becomes the index.
df = pd.read_csv('../data/input/df_ts_epidemic_episodes.csv', index_col=0)
# Parse the DATA column into datetimes. `infer_datetime_format` is deprecated since
# pandas 2.0 (format inference is now the default), so it is no longer passed.
# Bracket assignment replaces the column unambiguously.
df['DATA'] = pd.to_datetime(df['DATA'])

In [3]:
# Distinct values of the MUNICIPIO column; iterated later to build one noised set per municipality.
list_municipalities = df.loc[:, 'MUNICIPIO'].unique()

In [4]:
# Base date from which the cut-off dates date_1..date_3 are derived.
date = pd.Timestamp('2021-06-20')

In [5]:
# First cut-off date: 84 days after the base `date`.
date_1 = pd.DateOffset(days=84) + date
date_1  # bare expression so the notebook displays the value

In [6]:
# Second cut-off date: 84 days after `date_1`.
date_2 = pd.DateOffset(days=84) + date_1
date_2  # bare expression so the notebook displays the value

In [7]:
# Third cut-off date: 84 days after `date_2`.
date_3 = pd.DateOffset(days=84) + date_2
date_3  # bare expression so the notebook displays the value

In [8]:
# The three cut-off dates in chronological order; the last entry is used to derive `max_date`.
epidemiologic_weeks = [
    date_1,
    date_2,
    date_3,
]

In [9]:
# Number of noised replicas generated per municipality.
NUMBER_OF_SAMPLES = 20
# Horizon length in days (used as a day count when computing `max_date`).
FORECASTING_HORIZON = 84

## Original dataset

In [10]:
# Order rows by municipality, then by date within each municipality (ascending).
sort_keys = ['MUNICIPIO', 'DATA']
df = df.sort_values(sort_keys)

In [11]:
# Display the column labels of `df` for reference.
df.columns

In [12]:
# Last day kept in the dataset: the final cut-off date plus (horizon - 1) days,
# i.e. a window of FORECASTING_HORIZON days counting the cut-off date itself.
max_date = epidemiologic_weeks[-1] + pd.Timedelta(FORECASTING_HORIZON - 1, unit='D')

In [13]:
# Keep only the rows whose DATA falls in [date_1, max_date], both ends included.
in_window = df['DATA'].between(date_1, max_date)
df_original = df.loc[in_window]

## Noised dataset

In [14]:
# Build NUMBER_OF_SAMPLES noised replicas of every municipality's series.
# Each value is perturbed by a relative amount drawn uniformly from [-1%, +1%),
# and negative results are clamped to 0.
# NOTE: no random seed is set in the visible cells, so the noise differs between runs.
target_col = 'TAXA_OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS'
noised_frames = []
for municipality in list_municipalities:
    # Build the mask from df_original itself; the original built it from `df`
    # (a different, larger frame) and relied on index alignment to make it fit.
    df_municipality = df_original.loc[df_original['MUNICIPIO'] == municipality]
    # Loop-invariant: the clean series is identical for every sample, so compute it once.
    ts_data = df_municipality.set_index('DATA')[target_col]

    for i in range(NUMBER_OF_SAMPLES):
        noise = np.random.uniform(low=-0.01, high=0.01, size=len(ts_data))
        noise_scaled = noise * ts_data
        ts_data_noised = ts_data + noise_scaled
        ts_data_noised[ts_data_noised < 0] = 0

        df_municipality_noised = df_municipality.copy()
        # `.values` drops the DATA index so the assignment is positional.
        df_municipality_noised[target_col + '_NOISED'] = ts_data_noised.values
        df_municipality_noised['AMOSTRA'] = i  # sample (replica) number

        noised_frames.append(df_municipality_noised)

# Concatenate once instead of growing the frame inside the loop (which copies all
# accumulated rows on every iteration). Fall back to an empty frame if nothing was built.
df_noised = pd.concat(noised_frames) if noised_frames else pd.DataFrame()

In [15]:
# Visual sanity check: original series vs. one noised replica (AMOSTRA == 1) for Belo Horizonte.
# Each filter is evaluated once and reused for both the x and y values.
bh_original = df_original.loc[df_original['MUNICIPIO'] == 'Belo Horizonte']
bh_noised = df_noised.loc[
    (df_noised['MUNICIPIO'] == 'Belo Horizonte') & (df_noised['AMOSTRA'] == 1)
]

fig, ax = plt.subplots()
ax.plot(
    bh_original['DATA'],
    bh_original['TAXA_OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS'],
    color='blue',
    label='original',
)
ax.plot(
    bh_noised['DATA'],
    bh_noised['TAXA_OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS_NOISED'],
    color='red',
    label='noised (AMOSTRA == 1)',
)
ax.set(
    title='Belo Horizonte: original vs. noised series',
    xlabel='DATA',
    ylabel='TAXA_OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS',
)
ax.legend()
plt.show()

In [16]:
# Difference between the original Belo Horizonte series and its noised replica (AMOSTRA == 1).
# The subtraction aligns the two Series on their row index.
bh_rows = df_original['MUNICIPIO'] == 'Belo Horizonte'
bh_sample_rows = (df_noised['MUNICIPIO'] == 'Belo Horizonte') & (df_noised['AMOSTRA'] == 1)
diff = (
    df_original.loc[bh_rows, 'TAXA_OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS']
    - df_noised.loc[bh_sample_rows, 'TAXA_OBITOS_NOVOS_MEDIA_MOVEL_7_DIAS_NOISED']
)

In [17]:
# Plot the original-minus-noised difference; the x axis is the row index of `diff`.
fig, ax = plt.subplots()
ax.plot(diff, color='blue')
ax.set(
    title='Original minus noised (Belo Horizonte, AMOSTRA == 1)',
    xlabel='row index',
    ylabel='difference',
)
plt.show()

In [18]:
# List every distinct (municipality, sample) pair present in the noised data.
df_noised.loc[:, ['MUNICIPIO', 'AMOSTRA']].drop_duplicates()

## Concatenating with basic datasets

In [19]:
# Load the previously saved "basic" datasets that the frames built above are appended to.
# DATA is parsed on read so it has the same datetime dtype as the in-memory frames;
# otherwise the concatenation mixes strings with Timestamps and the saved CSVs end up
# with inconsistent date formats.
# NOTE(review): assumes both files contain a DATA column — confirm against the saved files.
df_original_basic = pd.read_csv('data/df_original.csv', parse_dates=['DATA'])
df_noised_basic = pd.read_csv('data/df_noised.csv', parse_dates=['DATA'])

In [20]:
# Append the newly built rows after the basic ones; `ignore_index=True` already yields a
# fresh 0..n-1 index. Both names are rebound, so re-running this cell appends the basic
# rows again.
df_original = pd.concat(
    [df_original_basic, df_original],
    ignore_index=True,
)
df_noised = pd.concat(
    [df_noised_basic, df_noised],
    ignore_index=True,
)

## Saving data

In [21]:
# Write both extended datasets to CSV without the row index.
for frame, path in (
    (df_original, 'data/df_original_extended.csv'),
    (df_noised, 'data/df_noised_extended.csv'),
):
    frame.to_csv(path, index=False)

In [22]:
# Make sure DATA is a datetime column so the date arithmetic in the next cell works.
df_original['DATA'] = df_original['DATA'].pipe(pd.to_datetime)

In [23]:
# Time span between the earliest and latest DATA for São Paulo in the extended dataset.
sp_dates = df_original.loc[df_original['MUNICIPIO'] == 'São Paulo', 'DATA']
sp_dates.max() - sp_dates.min()

In [24]:
# Show the São Paulo rows of the extended dataset.
is_sao_paulo = df_original['MUNICIPIO'] == 'São Paulo'
df_original.loc[is_sao_paulo]