# 🧪 Recolha e Exploração de Dados

In [323]:
# Importar bibliotecas necessárias
import pandas as pd

In [324]:
# Load the raw hourly consumption file; its fields are separated by ';'
df = pd.read_csv(
    filepath_or_buffer='datasets/consumos_horario_codigo_postal.csv',
    delimiter=';',
)

In [325]:
# Confirm that the column names were parsed correctly
df.columns

Index(['Date/Time', 'Date', 'Hour', 'Zip Code', 'Active Energy (kWh)',
       'Day of the Week'],
      dtype='object')

In [326]:
# Check the data type inferred for each column
df.dtypes  # shows one dtype per column

Date/Time               object
Date                    object
Hour                    object
Zip Code                object
Active Energy (kWh)    float64
Day of the Week         object
dtype: object

In [327]:
# Check the number of records
df.shape  # shows (number of rows, number of columns)

(3727439, 6)

In [328]:
# Preview the first five records of the raw data
df.head(5)

Unnamed: 0,Date/Time,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week
0,2023-03-19T00:00:00+00:00,2023-03-19,00:00,2025,4596.739709,Domingo
1,2023-09-23T12:00:00+01:00,2023-09-23,12:00,4405,14711.438243,Sábado
2,2023-02-15T05:00:00+00:00,2023-02-15,05:00,1600,21440.512557,Quarta
3,2023-02-05T11:00:00+00:00,2023-02-05,11:00,3030,16974.037997,Domingo
4,2023-02-13T09:00:00+00:00,2023-02-13,09:00,4720,7247.291623,Segunda


In [329]:
# Preview the last five records of the raw data
df.tail(5)

Unnamed: 0,Date/Time,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week
3727434,2023-02-09T10:00:00+00:00,2023-02-09,10:00,4405,19433.670144,Quinta
3727435,2023-02-14T10:00:00+00:00,2023-02-14,10:00,4870,2314.309648,Terça
3727436,2023-02-07T15:00:00+00:00,2023-02-07,15:00,4475,113968.770238,Terça
3727437,2023-02-23T21:00:00+00:00,2023-02-23,21:00,5350,1731.337009,Quinta
3727438,2023-02-19T12:00:00+00:00,2023-02-19,12:00,3610,2519.57506,Domingo


In [330]:
# Build a single Datetime column from the separate Date and Hour text columns,
# then remove the three source columns (Date/Time, Date, Hour).
# Only Date and Hour are used, so the UTC offset carried by Date/Time is not kept.
#
# The guard keeps this cell re-runnable: once the source columns have been
# dropped, executing the cell again is a no-op instead of raising KeyError.
# assign/drop return a new frame, so nothing is mutated in place.
source_columns = ['Date/Time', 'Date', 'Hour']
if 'Datetime' not in df.columns:
    df = df.assign(
        Datetime=pd.to_datetime(df['Date'] + ' ' + df['Hour'], format='%Y-%m-%d %H:%M')
    ).drop(columns=source_columns)

df.head()

Unnamed: 0,Zip Code,Active Energy (kWh),Day of the Week,Datetime
0,2025,4596.739709,Domingo,2023-03-19 00:00:00
1,4405,14711.438243,Sábado,2023-09-23 12:00:00
2,1600,21440.512557,Quarta,2023-02-15 05:00:00
3,3030,16974.037997,Domingo,2023-02-05 11:00:00
4,4720,7247.291623,Segunda,2023-02-13 09:00:00


In [331]:
# Extract year and month from Datetime (currently disabled)
#df['Year'] = df['Datetime'].dt.year
#df['Month'] = df['Datetime'].dt.month

In [332]:
# Count the rows for each (Year, Month) pair to see which periods are present
# (currently disabled; it relies on the Year and Month columns)
#df.groupby(['Year', 'Month']).size()

In [333]:
 # Task: note any visible problems
#df.columns # Check that the columns were read correctly (currently disabled)

In [334]:
 # Task: note any visible problems
df.dtypes  #  Check that the data types make sense

Zip Code                       object
Active Energy (kWh)           float64
Day of the Week                object
Datetime               datetime64[ns]
dtype: object

In [335]:
# Check for missing values and duplicated rows.
# Only the last expression of a cell is displayed, so the per-column count of
# missing values is printed explicitly; otherwise it is computed and discarded.
print(df.isnull().sum())

# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

np.int64(0)

In [336]:
# Order the rows chronologically and renumber the index from 0
df = df.sort_values('Datetime', ignore_index=True)

In [337]:
# Check the ordering by looking at the first five rows
df.head(5)

Unnamed: 0,Zip Code,Active Energy (kWh),Day of the Week,Datetime
0,5000,11175.021824,Terça,2022-11-01
1,2450,7376.909868,Terça,2022-11-01
2,5230,839.171319,Terça,2022-11-01
3,3620,2849.440465,Terça,2022-11-01
4,2800,5109.779921,Terça,2022-11-01


In [338]:
# Look at the last five rows
df.tail(5)

Unnamed: 0,Zip Code,Active Energy (kWh),Day of the Week,Datetime
3727434,4900,8312.043233,Sábado,2023-09-30 23:00:00
3727435,8300,3320.150528,Sábado,2023-09-30 23:00:00
3727436,4410,10211.948003,Sábado,2023-09-30 23:00:00
3727437,4800,5922.919257,Sábado,2023-09-30 23:00:00
3727438,5160,1487.604437,Sábado,2023-09-30 23:00:00


In [339]:
# Persist the cleaned dataset as a .csv file; the row index is not written
df.to_csv(path_or_buf='datasets/consumo_eredes_limpo.csv', index=False)

### Dataset Limpo
- Nome: `consumo_eredes_limpo.csv`
- Colunas mantidas: `Zip Code`, `Active Energy (kWh)`, `Day of the Week`
- Colunas criadas: `Datetime`
- Registos ordenados cronologicamente por `Datetime`; o índice foi reiniciado e não é gravado no ficheiro (`index=False`)