# 🧪  Recolha e Limpeza

In [5]:
# Importar bibliotecas necessárias
import pandas as pd

In [6]:
# Abrir o ficheiro com ; como separador
df = pd.read_csv('datasets/consumos_horario_codigo_postal.csv', sep=';')

In [7]:
# Confirmar a leitura correta das colunas
df.columns

Index(['Date/Time', 'Date', 'Hour', 'Zip Code', 'Active Energy (kWh)',
       'Day of the Week'],
      dtype='object')

In [8]:
# Verificar tipos de dados
df.dtypes  # mostra o tipo de cada coluna

Date/Time               object
Date                    object
Hour                    object
Zip Code                object
Active Energy (kWh)    float64
Day of the Week         object
dtype: object

In [9]:
# Verificar n.º de registos
df.shape  # mostra (número de linhas, número de colunas)

(3727439, 6)

In [10]:
# Visualizar primeiras linhas
df.head()  # primeiros 5 registos

Unnamed: 0,Date/Time,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week
0,2023-09-28T17:00:00+01:00,2023-09-28,17:00,2645,13458.564418,Quinta
1,2022-12-29T01:00:00+00:00,2022-12-29,01:00,6185,277.57079,Quinta
2,2022-11-06T11:00:00+00:00,2022-11-06,11:00,4750,18590.29298,Domingo
3,2022-11-06T19:00:00+00:00,2022-11-06,19:00,4940,2751.529461,Domingo
4,2022-11-07T11:00:00+00:00,2022-11-07,11:00,5450,7265.089752,Segunda


In [11]:
# Visualizar últimas linhas
df.tail()  # últimos 5 registos

Unnamed: 0,Date/Time,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week
3727434,2022-11-01T14:00:00+00:00,2022-11-01,14:00,4585,7878.397224,Terça
3727435,2022-11-14T15:00:00+00:00,2022-11-14,15:00,4820,20667.292046,Segunda
3727436,2022-11-15T14:00:00+00:00,2022-11-15,14:00,4605,2516.984028,Terça
3727437,2022-11-23T10:00:00+00:00,2022-11-23,10:00,1675,4378.91039,Quarta
3727438,2022-11-21T00:00:00+00:00,2022-11-21,00:00,2090,2748.289623,Segunda


# Engenharia de features
| Feature    | Para que serve?                                |
|------------|------------------------------------------------|
| Day        | Captura ciclos diários                         |
| Month      | Captura sazonalidade mensal                    |
| Is_Weekend | 1 se for sábado ou domingo, senão 0            |
| Is_Night   | Útil porque o consumo e diferente de madrugada |
| Season     | 	Captura efeitos climáticos ou sazonais amplos |

In [12]:
# Eliminar a coluna Date/Time
df.drop(columns=['Date/Time'], inplace=True)

#  Criar a coluna Datetime
df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Hour'], format='%Y-%m-%d %H:%M')

# Extrair a hora como número inteiro (ex: 23 de '23:00')
df['Hour'] = pd.to_datetime(df['Hour'], format='%H:%M').dt.hour

#  Criar a coluna Day
df['Day'] = df['Datetime'].dt.day

#  Criar a coluna Month
df['Month'] = df['Datetime'].dt.month

#  Criar a coluna Year
df['Year'] = df['Datetime'].dt.year

#  Criar a coluna Is_Weekend
df['Is_Weekend'] = df['Day of the Week'].isin(['Sábado', 'Domingo']).astype(int)

#  Criar a coluna Is_Night
df['Is_Night'] = df['Hour'].between(0, 6).astype(int)

#  Criar a coluna Season
def get_season(month):
    if month in [12, 1, 2]:
        return 'Inverno'
    elif month in [3, 4, 5]:
        return 'Primavera'
    elif month in [6, 7, 8]:
        return 'Verão'
    else:
        return 'Outono'

df['Season'] = df['Month'].apply(get_season)


df.head()

Unnamed: 0,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week,Datetime,Day,Month,Year,Is_Weekend,Is_Night,Season
0,2023-09-28,17,2645,13458.564418,Quinta,2023-09-28 17:00:00,28,9,2023,0,0,Outono
1,2022-12-29,1,6185,277.57079,Quinta,2022-12-29 01:00:00,29,12,2022,0,1,Inverno
2,2022-11-06,11,4750,18590.29298,Domingo,2022-11-06 11:00:00,6,11,2022,1,0,Outono
3,2022-11-06,19,4940,2751.529461,Domingo,2022-11-06 19:00:00,6,11,2022,1,0,Outono
4,2022-11-07,11,5450,7265.089752,Segunda,2022-11-07 11:00:00,7,11,2022,0,0,Outono


In [13]:
 # Tarefa: Anotar problemas visíveis
df.dtypes  #  Verifica se os tipos de dados fazem sentido

Date                           object
Hour                            int32
Zip Code                       object
Active Energy (kWh)           float64
Day of the Week                object
Datetime               datetime64[ns]
Day                             int32
Month                           int32
Year                            int32
Is_Weekend                      int32
Is_Night                        int32
Season                         object
dtype: object

In [14]:
# Verifica valores ausentes ou duplicados
df.isnull().sum()
df.duplicated().sum()

0

In [15]:
# ordenar os dados pela coluna Datetime
df = df.sort_values(by='Datetime').reset_index(drop=True)

In [16]:
# Verifica se está ordenado corretamente
df.head()  # Ver primeiros valores

Unnamed: 0,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week,Datetime,Day,Month,Year,Is_Weekend,Is_Night,Season
0,2022-11-01,0,5350,797.992437,Terça,2022-11-01,1,11,2022,0,1,Outono
1,2022-11-01,0,3780,10290.2904,Terça,2022-11-01,1,11,2022,0,1,Outono
2,2022-11-01,0,7595,422.903665,Terça,2022-11-01,1,11,2022,0,1,Outono
3,2022-11-01,0,2950,19649.659297,Terça,2022-11-01,1,11,2022,0,1,Outono
4,2022-11-01,0,7320,974.728282,Terça,2022-11-01,1,11,2022,0,1,Outono


In [17]:
df.tail()  # Ver últimos valores

Unnamed: 0,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week,Datetime,Day,Month,Year,Is_Weekend,Is_Night,Season
3727434,2023-09-30,23,4560,9738.156651,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono
3727435,2023-09-30,23,2845,7462.400572,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono
3727436,2023-09-30,23,3025,3889.286949,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono
3727437,2023-09-30,23,5340,2527.463359,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono
3727438,2023-09-30,23,7780,34065.882374,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono


In [18]:
# Guardar versão limpa do dataset como .csv
df = df[df['Active Energy (kWh)'] >= 0]
df.to_csv('datasets/consumo_eredes_limpo.csv', index=False)

### Dataset Limpo
- Nome: `consumo_eredes_limpo.csv`
- Colunas mantidas: `Zip Code`, `Active Energy (kWh)`, `Day of the Week`
- Colunas criadas: `Datetime`
- Index ordenado cronologicamente