# 🧪  Recolha e Limpeza

In [1]:
# Importar bibliotecas necessárias
# !pip install pandas
import pandas as pd

In [2]:
# Load the raw hourly consumption file; fields are separated by ';'.
# `Zip Code` is declared as text explicitly: when left to type inference the column
# comes back as `object` even though its values look numeric, and on a large file
# pandas' chunk-wise inference can leave a mix of int and str values in one column
# (which makes comparisons and duplicate detection unreliable).
df = pd.read_csv(
    'datasets/consumos_horario_codigo_postal.csv',
    sep=';',
    dtype={'Zip Code': str},
)

In [3]:
# Confirm that the columns were read correctly (i.e. the ';' separator was honoured
# and the header was split into individual column names)
df.columns

Index(['Date/Time', 'Date', 'Hour', 'Zip Code', 'Active Energy (kWh)',
       'Day of the Week'],
      dtype='object')

In [4]:
# Check the data types inferred by read_csv
df.dtypes  # shows the dtype of each column

Date/Time               object
Date                    object
Hour                    object
Zip Code                object
Active Energy (kWh)    float64
Day of the Week         object
dtype: object

In [5]:
# Check the number of records
df.shape  # shows (number of rows, number of columns)

(3727439, 6)

In [6]:
# Peek at the top of the raw table
df.head(n=5)  # 5 rows is also the default

Unnamed: 0,Date/Time,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week
0,2023-03-19T00:00:00+00:00,2023-03-19,00:00,2025,4596.739709,Domingo
1,2023-09-23T12:00:00+01:00,2023-09-23,12:00,4405,14711.438243,Sábado
2,2023-02-15T05:00:00+00:00,2023-02-15,05:00,1600,21440.512557,Quarta
3,2023-02-05T11:00:00+00:00,2023-02-05,11:00,3030,16974.037997,Domingo
4,2023-02-13T09:00:00+00:00,2023-02-13,09:00,4720,7247.291623,Segunda


In [7]:
# Peek at the bottom of the raw table
df.tail(n=5)  # 5 rows is also the default

Unnamed: 0,Date/Time,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week
3727434,2023-02-09T10:00:00+00:00,2023-02-09,10:00,4405,19433.670144,Quinta
3727435,2023-02-14T10:00:00+00:00,2023-02-14,10:00,4870,2314.309648,Terça
3727436,2023-02-07T15:00:00+00:00,2023-02-07,15:00,4475,113968.770238,Terça
3727437,2023-02-23T21:00:00+00:00,2023-02-23,21:00,5350,1731.337009,Quinta
3727438,2023-02-19T12:00:00+00:00,2023-02-19,12:00,3610,2519.57506,Domingo


# Engenharia de features
| Feature    | Para que serve?                                |
|------------|------------------------------------------------|
| Day        | Captura ciclos diários                         |
| Month      | Captura sazonalidade mensal                    |
| Is_Weekend | 1 se for sábado ou domingo, senão 0            |
| Is_Night   | Útil porque o consumo é diferente de madrugada |
| Season     | Captura efeitos climáticos ou sazonais amplos  |

In [8]:
# Drop the raw ISO timestamp column. `errors='ignore'` keeps this cell re-runnable
# once the column has already been removed.
# NOTE(review): this discards the UTC offset (+00:00 / +01:00) carried by `Date/Time`;
# the `Datetime` column built below is naive wall-clock time - confirm that is intended.
df = df.drop(columns=['Date/Time'], errors='ignore')

# Build the Datetime column from the textual Date and Hour columns.
# Guarded so that a re-run (when Hour has already been converted to an integer)
# does not attempt a str + int concatenation.
if 'Datetime' not in df.columns:
    df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Hour'], format='%Y-%m-%d %H:%M')

# Hour as an integer (e.g. 23 for '23:00'), read from the timestamp parsed above
# rather than parsing the Hour strings a second time.
df['Hour'] = df['Datetime'].dt.hour

# Calendar components of the timestamp
df['Day'] = df['Datetime'].dt.day      # day of the month
df['Month'] = df['Datetime'].dt.month
df['Year'] = df['Datetime'].dt.year

# Is_Weekend: 1 when the (Portuguese) weekday label is Saturday or Sunday, else 0
df['Is_Weekend'] = df['Day of the Week'].isin(['Sábado', 'Domingo']).astype(int)

# Is_Night: 1 for hours 0 through 6 (both bounds included), else 0
df['Is_Night'] = df['Hour'].between(0, 6).astype(int)

# Helper used to derive the Season column from a month number
def get_season(month):
    """Return the Portuguese season label for a month number.

    Months 12, 1, 2 map to 'Inverno'; 3-5 to 'Primavera'; 6-8 to 'Verão'.
    Every other value (including 9-11) falls through to 'Outono'.
    """
    labelled_months = (
        ((12, 1, 2), 'Inverno'),
        ((3, 4, 5), 'Primavera'),
        ((6, 7, 8), 'Verão'),
    )
    for months, label in labelled_months:
        if month in months:
            return label
    # Anything not matched above is treated as autumn
    return 'Outono'

# Label every row with its season.
# The month -> season table is built once from get_season (12 calls) and applied
# with a vectorised map, instead of calling the Python function once per row.
# Note: a missing month would stay missing here rather than fall into get_season's
# catch-all 'Outono' branch.
season_by_month = {month: get_season(month) for month in range(1, 13)}
df['Season'] = df['Month'].map(season_by_month)

df.head()

Unnamed: 0,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week,Datetime,Day,Month,Year,Is_Weekend,Is_Night,Season
0,2023-03-19,0,2025,4596.739709,Domingo,2023-03-19 00:00:00,19,3,2023,1,1,Primavera
1,2023-09-23,12,4405,14711.438243,Sábado,2023-09-23 12:00:00,23,9,2023,1,0,Outono
2,2023-02-15,5,1600,21440.512557,Quarta,2023-02-15 05:00:00,15,2,2023,0,1,Inverno
3,2023-02-05,11,3030,16974.037997,Domingo,2023-02-05 11:00:00,5,2,2023,1,0,Inverno
4,2023-02-13,9,4720,7247.291623,Segunda,2023-02-13 09:00:00,13,2,2023,0,0,Inverno


In [9]:
 # Task: note any visible problems
# (look for columns whose dtype does not match their content, e.g. text columns
# reported as `object`)
df.dtypes  #  check that the data types make sense

Date                           object
Hour                            int32
Zip Code                       object
Active Energy (kWh)           float64
Day of the Week                object
Datetime               datetime64[ns]
Day                             int32
Month                           int32
Year                            int32
Is_Weekend                      int32
Is_Night                        int32
Season                         object
dtype: object

In [10]:
# Check for missing values and duplicated rows.
# Only the last expression of a cell is displayed, so the per-column null counts
# are printed explicitly; otherwise that result is computed and silently discarded.
print(df.isnull().sum())

# Number of fully duplicated rows (shown as the cell's output)
df.duplicated().sum()

0

In [11]:
# Put the rows in chronological order and renumber the index from 0
df = df.sort_values('Datetime', ignore_index=True)

In [12]:
# Confirm the sort worked: the earliest timestamps should now come first
df.head(n=5)

Unnamed: 0,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week,Datetime,Day,Month,Year,Is_Weekend,Is_Night,Season
0,2022-11-01,0,5000,11175.021824,Terça,2022-11-01,1,11,2022,0,1,Outono
1,2022-11-01,0,2450,7376.909868,Terça,2022-11-01,1,11,2022,0,1,Outono
2,2022-11-01,0,5230,839.171319,Terça,2022-11-01,1,11,2022,0,1,Outono
3,2022-11-01,0,3620,2849.440465,Terça,2022-11-01,1,11,2022,0,1,Outono
4,2022-11-01,0,2800,5109.779921,Terça,2022-11-01,1,11,2022,0,1,Outono


In [13]:
df.tail(n=5)  # the latest timestamps should now come last

Unnamed: 0,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week,Datetime,Day,Month,Year,Is_Weekend,Is_Night,Season
3727434,2023-09-30,23,4900,8312.043233,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono
3727435,2023-09-30,23,8300,3320.150528,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono
3727436,2023-09-30,23,4410,10211.948003,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono
3727437,2023-09-30,23,4800,5922.919257,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono
3727438,2023-09-30,23,5160,1487.604437,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono


In [14]:
# Final cleaning step + export.
# Keep only rows whose energy reading is non-negative (rows where the comparison
# is False - negative readings, and missing values if any - are dropped),
# then write the cleaned table to CSV without the index column.
has_valid_energy = df['Active Energy (kWh)'].ge(0)
df = df.loc[has_valid_energy]
df.to_csv('datasets/consumo_eredes_limpo.csv', index=False)

### Dataset Limpo
- Nome: `consumo_eredes_limpo.csv`
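- Colunas removidas: `Date/Time`
- Colunas mantidas: `Date`, `Hour` (convertida para inteiro), `Zip Code`, `Active Energy (kWh)`, `Day of the Week`
- Colunas criadas: `Datetime`, `Day`, `Month`, `Year`, `Is_Weekend`, `Is_Night`, `Season`
- Registos com `Active Energy (kWh)` negativo removidos
- Registos ordenados cronologicamente por `Datetime` (o ficheiro é gravado sem coluna de índice)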