## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import glob
import datetime

In [54]:
import warnings

# Install a blanket "ignore" filter so no warning is printed by later cells.
warnings.filterwarnings(action="ignore")

In [2]:
# Raise both pandas display limits (rows and columns) to 500.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

## Helper Functions

In [3]:
def filter_dataframes(path, min_date):
    '''
    Return the concatenated dataframe for all the CSV files in a folder,
    keeping only the rows dated on or after ``min_date`` and dropping the
    station metadata columns.

    path: folder path prefix, used verbatim in the glob pattern
          ``{path}*.csv``, so it must end with a path separator (str)
    min_date: minimum date, inclusive (str YYYY-MM-DD)

    Raises ValueError when no CSV file matches ``{path}*.csv``.
    '''
    import os  # local import: only needed here to extract the file name

    # Parse the threshold once; it does not change between files.
    min_date = pd.to_datetime(min_date)

    appended_data = []

    # sorted() makes the concatenation order independent of the OS listing order.
    for filename in sorted(glob.glob(f'{path}*.csv')):

        # basename handles whichever separator the platform's glob returns;
        # splitting on a literal backslash only worked on Windows.
        print('reading:', os.path.basename(filename))

        df = pd.read_csv(filename)
        df['Data'] = pd.to_datetime(df['Data'])

        df = df[df['Data'] >= min_date].drop(
            columns=['index', 'state', 'station', 'station_code',
                     'latitude', 'longitude', 'height'])

        appended_data.append(df)

    if not appended_data:
        # pd.concat([]) would also raise ValueError, but without naming the path.
        raise ValueError(f'no CSV files found matching {path}*.csv')

    return pd.concat(appended_data)

## Reading Weather Data

In [4]:
# Load every regional CSV under original_bases/, keeping rows from 2017-01-01 onwards.
df_weather = filter_dataframes(path='original_bases/', min_date='2017-01-01')

reading: central_west.csv
reading: north.csv
reading: northeast.csv
reading: south.csv
reading: southeast.csv


In [5]:
# Treating invalid information for columns:
# for every column except Data/Hora/region, drop the rows holding that
# column's current minimum value, then report the new minimum.
# NOTE(review): this presumes the minimum of each column is an invalid
# placeholder; a column without one would lose its genuine minimum rows - confirm.

measurement_columns = df_weather.drop(columns=['Data', 'Hora', 'region']).columns

for column in measurement_columns:

    print(f'treating data for {column}', end=' ')
    lowest = df_weather[column].min()

    keep_rows = df_weather[column].ne(lowest)
    df_weather = df_weather.loc[keep_rows]

    print(f'New min value = {df_weather[column].min()}')

treating data for PRECIPITAÇÃO TOTAL, HORÁRIO (mm) New min value = 0.0
treating data for PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB) New min value = 720.1
treating data for PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB) New min value = 720.3
treating data for PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB) New min value = 720.0
treating data for RADIACAO GLOBAL (Kj/m²) New min value = 0
treating data for TEMPERATURA DO AR - BULBO SECO, HORARIA (°C) New min value = -9.0
treating data for TEMPERATURA DO PONTO DE ORVALHO (°C) New min value = -42.5
treating data for TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C) New min value = -8.2
treating data for TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C) New min value = -8.6
treating data for TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C) New min value = -35.0
treating data for TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C) New min value = -35.0
treating data for UMIDADE REL. MAX. NA HORA ANT. (AUT) (%) New min value = 3
treating data for 

In [6]:
# Fixing Dtype for Hour column: parse 'Hora' as a datetime and keep only the hour component.

df_weather = df_weather.assign(Hora=pd.to_datetime(df_weather['Hora']).dt.hour)

In [7]:
# Collapse the data to one row per (Data, Hora): every numeric column is
# averaged, except precipitation, which is summed; the two aggregates are
# then joined back together on the grouping keys.
df_weather = pd.merge(
    df_weather
    .drop(columns=['PRECIPITAÇÃO TOTAL, HORÁRIO (mm)'])
    .groupby(['Data', 'Hora'])
    .mean(numeric_only=True)
    .reset_index(),
    df_weather
    .groupby(['Data', 'Hora'])[['PRECIPITAÇÃO TOTAL, HORÁRIO (mm)']]
    .sum(numeric_only=True)
    .reset_index(),
    how='inner',
    on=['Data', 'Hora'],
)

### Read Population Data

In [8]:
# Load the per-municipality population and total it per year ("ano").
df_pop = (
    pd.read_csv('original_bases/populacao_municipio.csv')
    .groupby("ano")[["populacao"]]
    .sum()
    .reset_index()
)

In [9]:
# Pair each year with the previous year's total through a self-merge
# (right-hand columns get the "_passado" suffix), keep 2017 onwards and
# compute the year-over-year difference.
df_pop = df_pop.assign(last_year=df_pop["ano"] - 1)
df_pop = pd.merge(
    df_pop,
    df_pop,
    how='inner',
    left_on='last_year',
    right_on='ano',
    suffixes=('', '_passado'),
)
df_pop = df_pop.drop(columns=["last_year", "ano_passado", "last_year_passado"])
df_pop = df_pop.loc[df_pop["ano"] >= 2017]
df_pop = df_pop.assign(variacao=df_pop["populacao"] - df_pop["populacao_passado"])

In [10]:
# One row per distinct weather date, tagged with its calendar year.
df_data = df_weather[["Data"]].drop_duplicates()
df_data = df_data.assign(year=df_data["Data"].dt.year)

In [11]:
# Attach every date of a year to that year's population row, then drop
# the helper "year" column used only as the join key.
df_pop = pd.merge(
    df_pop,
    df_data,
    how='inner',
    left_on='ano',
    right_on='year',
    suffixes=('', ''),
).drop(columns=["year"])

In [12]:
# Linear interpolation of the population for each date: start from the
# previous year's total and add the share of the yearly variation that
# corresponds to the day of the year.
df_pop["day_of_year"] = df_pop["Data"].dt.day_of_year
# Divide by the real length of the year (366 in leap years): with a fixed
# 365, day 366 of a leap year would add more than the full yearly variation.
df_pop["populacao_dia"] = (
    df_pop["populacao_passado"]
    + df_pop["day_of_year"] * df_pop["variacao"]
    / (365 + df_pop["Data"].dt.is_leap_year.astype(int))
)

### Read Industry Production Data

In [142]:
# Load the semicolon-separated industry file and move its index into a regular column.
df_industry = pd.read_csv('original_bases/industry_ibge.csv', sep=';')
df_industry = df_industry.reset_index()

In [143]:
# Flip rows and columns, renumber the rows from 0 and discard the column labelled 4.
df_industry = df_industry.T
df_industry = df_industry.reset_index(drop=True)
df_industry = df_industry.drop(columns=4)

In [144]:
# Name the four positional columns, skip the first two rows and renumber the rest.
df_industry = (
    df_industry
    .rename(columns={
        0: 'Data',
        1: 'industria_geral',
        2: 'industria_extrativa',
        3: 'industria_transformacao',
    })
    .iloc[2:]
    .reset_index(drop=True)
)

In [145]:
# Split each 'Data' label on single spaces; the second token becomes
# 'ano' and the first token becomes 'mes'.
df_industry['Data'] = df_industry['Data'].map(lambda label: label.split(' '))
df_industry['ano'] = df_industry['Data'].map(lambda tokens: tokens[1])
df_industry['mes'] = df_industry['Data'].map(lambda tokens: tokens[0])

In [146]:
# Translate Portuguese month names to English through a lookup table;
# any value not listed in the table is left unchanged.
month_pt_to_en = {
    'janeiro': 'january',
    'fevereiro': 'february',
    'março': 'march',
    'abril': 'april',
    'maio': 'may',
    'junho': 'june',
    'julho': 'july',
    'agosto': 'august',
    'setembro': 'september',
    'outubro': 'october',
    'novembro': 'november',
    'dezembro': 'december',
}
df_industry['mes'] = df_industry['mes'].apply(lambda x: month_pt_to_en.get(x, x))

In [147]:
# Rebuild 'Data' as "<mes>/<ano>" text and parse it into a datetime.
df_industry['Data'] = pd.to_datetime(df_industry['mes'] + '/' + df_industry['ano'])

In [148]:
# Overwrite 'ano' and 'mes' with the year and month taken from the parsed date.
df_industry = df_industry.assign(
    ano=df_industry['Data'].dt.year,
    mes=df_industry['Data'].dt.month,
)

### Acquiring target information

In [114]:
# Load the hourly energy demand file (the target data).
df_target = pd.read_csv(filepath_or_buffer='original_bases/energy_demand_hourly_brazil.csv')

In [115]:
# Parse the full timestamp text in 'index' into a datetime column 'Data'.
df_target = df_target.assign(Data=pd.to_datetime(df_target['index']))

In [116]:
# Keep only the first whitespace-separated token of 'index'.
df_target['index'] = df_target['index'].map(lambda stamp: stamp.split()[0])

In [117]:
# Extract the hour of day from the parsed timestamp.
df_target = df_target.assign(Hora=df_target['Data'].dt.hour)

In [118]:
# Re-parse 'Data' from the current contents of 'index'.
df_target = df_target.assign(Data=pd.to_datetime(df_target['index']))

In [119]:
# Remove the 'index' column from the target frame.
df_target = df_target.drop(columns=['index'])

## Joining bases

In [166]:
# Inner-join target and weather rows that share the same date and hour.
df_final = df_target.merge(df_weather, how='inner', on=['Data', 'Hora'])

In [167]:
# Add the interpolated daily population, matched on date.
df_final = df_final.merge(
    df_pop.loc[:, ['Data', 'populacao_dia']],
    how='inner',
    on=['Data'],
)

In [168]:
# Derive the year and month columns used as keys for the industry join.
df_final = df_final.assign(
    ano=df_final['Data'].dt.year,
    mes=df_final['Data'].dt.month,
)

In [169]:
# Add the industry columns matched on (mes, ano); the industry 'Data'
# column is dropped first so it does not collide with the one in df_final.
df_final = df_final.merge(
    df_industry.drop(columns=['Data']),
    how='inner',
    on=['mes', 'ano'],
)

In [170]:
# Write the joined dataset to df_final.csv without the row index.
df_final.to_csv(path_or_buf='df_final.csv', index=False)