## Import Libraries

In [1]:
import glob
import os

import numpy as np
import pandas as pd

In [2]:
# Let wide/long frames render without truncation (up to 500 rows and 500 columns).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

## Helper Functions

In [3]:
def filter_dataframes(path, min_date):
    '''
    Return the concatenated dataframe for all the CSV files in a folder, keeping
    only the rows on or after a minimum date and dropping the station metadata columns.

    path: folder path, with or without a trailing separator (str)
    min_date: minimum date, inclusive (str YYYY-MM-DD)

    Returns a DataFrame with the filtered rows of every file stacked in file-name
    order; the per-file row index is kept as is (it is not reset).

    Raises ValueError when the folder contains no CSV file, and KeyError when a
    file lacks the 'Data' column or any of the columns that are dropped.
    '''

    dropped_columns = ['index', 'state', 'station', 'station_code', 'latitude', 'longitude', 'height']

    # Parse the threshold once; it does not change between files.
    min_date = pd.to_datetime(min_date)

    # os.path.join copes with a missing trailing separator; sorting makes the
    # row order of the result independent of the file system's listing order.
    csv_files = sorted(glob.glob(os.path.join(path, '*.csv')))

    if not csv_files:
        raise ValueError(f'No CSV files found in {path!r}')

    appended_data = []

    for filename in csv_files:

        # glob already returns a usable path, so read it directly and derive the
        # display name with the platform's own path rules instead of splitting on "\\".
        print('reading:', os.path.basename(filename))

        df = pd.read_csv(filename)
        df['Data'] = pd.to_datetime(df['Data'])

        df = df[df['Data'] >= min_date].drop(columns=dropped_columns)

        appended_data.append(df)

    return pd.concat(appended_data)

## Reading Data

In [5]:
# Load every regional CSV under original_bases/, keeping rows dated 2017-01-01 or later.
df = filter_dataframes(path='original_bases/', min_date='2017-01-01')

reading: central_west.csv
reading: north.csv
reading: northeast.csv
reading: south.csv
reading: southeast.csv


In [6]:
# Treating invalid information for columns
#
# For every column except 'Data', 'Hora' and 'region', all rows holding that
# column's current minimum are discarded. Columns are processed one at a time,
# so each minimum is computed on the rows that survived the previous columns.
# NOTE(review): presumably the minimum is a missing-data placeholder - confirm;
# for a column without one, its genuine lowest readings are removed instead.
# Re-running this cell strips a further layer of minimum values, so run it once.

value_columns = df.columns.drop(['Data', 'Hora', 'region'])

for col in value_columns:

    print(f'treating data for {col}', end=' ')
    min_value = df[col].min()

    rows_to_keep = df[col] != min_value
    df = df[rows_to_keep]

    print(f'New min value = {df[col].min()}')

treating data for PRECIPITAÇÃO TOTAL, HORÁRIO (mm) New min value = 0.0
treating data for PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB) New min value = 720.1
treating data for PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB) New min value = 720.3
treating data for PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB) New min value = 720.0
treating data for RADIACAO GLOBAL (Kj/m²) New min value = 0
treating data for TEMPERATURA DO AR - BULBO SECO, HORARIA (°C) New min value = -9.0
treating data for TEMPERATURA DO PONTO DE ORVALHO (°C) New min value = -42.5
treating data for TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C) New min value = -8.2
treating data for TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C) New min value = -8.6
treating data for TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C) New min value = -35.0
treating data for TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C) New min value = -35.0
treating data for UMIDADE REL. MAX. NA HORA ANT. (AUT) (%) New min value = 3
treating data for 

In [7]:
# Fixing Dtype for Hour column
# The raw 'Hora' values are parsed as datetimes and replaced by their hour component.
# NOTE(review): run once per freshly loaded `df` - on a second run 'Hora' already
# holds integers, which pd.to_datetime would presumably read as epoch offsets.

hora_parsed = pd.to_datetime(df['Hora'])
df['Hora'] = hora_parsed.dt.hour

In [8]:
# Collapse the rows to one per (Data, Hora): every numeric column is averaged,
# except precipitation, which is summed. The two aggregates are then joined back
# on the same keys, which leaves the precipitation column last.
hour_keys = ['Data', 'Hora']
rain_column = 'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)'

hourly_means = (
    df.drop(columns=[rain_column])
    .groupby(hour_keys)
    .mean(numeric_only=True)
    .reset_index()
)
hourly_rain = (
    df.groupby(hour_keys)[[rain_column]]
    .sum(numeric_only=True)
    .reset_index()
)

df = hourly_means.merge(hourly_rain, how='inner', on=hour_keys)
del hourly_means, hourly_rain

In [9]:
# Preview the first five aggregated hourly rows.
df.head(n=5)

Unnamed: 0,Data,Hora,"PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)",PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB),PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB),RADIACAO GLOBAL (Kj/m²),"TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)",TEMPERATURA DO PONTO DE ORVALHO (°C),TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C),TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C),TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C),TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C),UMIDADE REL. MAX. NA HORA ANT. (AUT) (%),UMIDADE REL. MIN. NA HORA ANT. (AUT) (%),"UMIDADE RELATIVA DO AR, HORARIA (%)","VENTO, DIREÇÃO HORARIA (gr) (° (gr))","VENTO, RAJADA MAXIMA (m/s)","VENTO, VELOCIDADE HORARIA (m/s)","PRECIPITAÇÃO TOTAL, HORÁRIO (mm)"
0,2017-01-01,0,974.566667,974.758333,973.666667,2.0,23.675,21.066667,24.433333,23.425,21.616667,20.666667,86.833333,83.0,86.083333,226.25,5.808333,1.816667,22.0
1,2017-01-01,1,975.609091,975.663636,974.9,1.545455,23.472727,21.136364,24.3,23.409091,21.436364,20.818182,87.727273,82.272727,87.181818,168.181818,6.490909,2.063636,24.8
2,2017-01-01,2,960.715385,960.930769,960.276923,5.0,22.561538,20.523077,23.269231,22.430769,20.961538,20.346154,89.307692,85.846154,88.615385,203.0,6.161538,2.646154,11.2
3,2017-01-01,3,980.014286,980.342857,979.914286,93.571429,24.1,21.685714,24.371429,23.885714,22.028571,21.457143,89.0,84.571429,86.714286,243.571429,4.214286,1.757143,0.0
4,2017-01-01,4,983.5125,983.8375,983.225,18.375,23.6625,21.5875,24.075,23.55,21.95,21.525,89.625,87.375,88.625,185.125,3.7875,1.475,2.6


### Acquiring target information

In [10]:
# Hourly energy demand series; its raw timestamp column is named 'index'.
df_target = pd.read_csv(filepath_or_buffer='energy_demand_hourly_brazil.csv')

In [11]:
# Parse the raw 'index' strings into full timestamps; 'Data' holds them for now
# so that the hour of day can be read from it before it is reduced to a date.
full_timestamps = pd.to_datetime(df_target['index'])
df_target['Data'] = full_timestamps

In [12]:
# Keep only the date part (the text before the first whitespace) of each raw
# timestamp string. The vectorised .str accessor replaces a Python-level lambda:
# same result for well-formed strings, while missing or empty values become NaN
# instead of raising.
df_target['index'] = df_target['index'].str.split().str[0]

In [13]:
# Hour of day read from the full timestamps currently stored in 'Data'.
timestamp_parts = df_target['Data'].dt
df_target['Hora'] = timestamp_parts.hour

In [14]:
# 'index' has been cut down to its date part by now, so re-parsing it overwrites
# 'Data' with date-only timestamps, matching the 'Data' key of the weather frame.
date_only = pd.to_datetime(df_target['index'])
df_target['Data'] = date_only

In [15]:
# 'index' has served its purpose ('Data' and 'Hora' were derived from it).
# Reassign instead of mutating with inplace=True, so the step reads as a plain
# transformation and no other reference to the frame is silently altered.
df_target = df_target.drop(columns=['index'])

In [16]:
# Display the prepared target frame (columns at this point: hourly_demand, Data, Hora).
df_target

Unnamed: 0,hourly_demand,Data,Hora
0,34673.900,2000-01-01,0
1,33503.000,2000-01-01,1
2,32287.600,2000-01-01,2
3,31059.400,2000-01-01,3
4,30272.700,2000-01-01,4
...,...,...,...
201313,75825.344,2022-12-31,20
201314,69894.955,2022-12-31,21
201315,64612.355,2022-12-31,22
201316,61215.376,2022-12-31,23


## Joining bases

In [17]:
# Attach the hourly weather aggregates to the demand series; the inner join keeps
# only the (Data, Hora) pairs that are present in both frames.
df_final = df_target.merge(df, how='inner', on=['Data', 'Hora'])

In [20]:
# Persist the joined dataset without writing the row index as a column.
df_final.to_csv(path_or_buf='df_final.csv', index=False)