# EDA

## Imports

In [4]:
import pandas as pd

## Load data

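The index is set to `data_block_id` for every table that has this column, because that is how the data is expected to be received. The weather-station-to-county mapping is loaded without it.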

In [5]:
# Directory that holds the competition CSV files; change it here to relocate the data.
DATA_DIR = "predict-energy-behavior-of-prosumers"

# Every table except the station-to-county mapping is indexed by `data_block_id`,
# and its date/time columns are parsed into datetimes on load.
client = pd.read_csv(
    f"{DATA_DIR}/client.csv",
    index_col="data_block_id",
    parse_dates=["date"],
)
electricity_prices = pd.read_csv(
    f"{DATA_DIR}/electricity_prices.csv",
    index_col="data_block_id",
    parse_dates=["forecast_date", "origin_date"],
)
forecast_weather = pd.read_csv(
    f"{DATA_DIR}/forecast_weather.csv",
    index_col="data_block_id",
    parse_dates=["forecast_datetime", "origin_datetime"],
)
gas_prices = pd.read_csv(
    f"{DATA_DIR}/gas_prices.csv",
    index_col="data_block_id",
    parse_dates=["forecast_date", "origin_date"],
)
historical_weather = pd.read_csv(
    f"{DATA_DIR}/historical_weather.csv",
    index_col="data_block_id",
    parse_dates=["datetime"],
)
train = pd.read_csv(
    f"{DATA_DIR}/train.csv",
    index_col="data_block_id",
    parse_dates=["datetime"],
)
# Loaded with the default integer index and no date parsing.
weather_station_to_county_mapping = pd.read_csv(
    f"{DATA_DIR}/weather_station_to_county_mapping.csv"
)

In [14]:
# Summary statistics for the forecast weather table; a `count` below the row
# total flags a column that contains missing values.
forecast_weather.describe()

Unnamed: 0,latitude,longitude,origin_datetime,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,forecast_datetime,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation
count,3424512.0,3424512.0,3424512,3424512.0,3424512.0,3424512.0,3424512.0,3424512.0,3424512.0,3424512.0,3424512.0,3424512.0,3424512,3424512.0,3424510.0,3424512.0,3424512.0
mean,58.65,24.95,2022-07-16 01:32:18.461540352,24.5,5.743913,2.411946,0.3946654,0.4346453,0.3590693,0.6819927,1.255446,0.725011,2022-07-17 02:02:18.461538304,151.1882,110.7642,2.533923e-05,7.863859e-05
min,57.6,21.7,2021-09-01 02:00:00,1.0,-27.4994,-29.68357,0.0,0.0,0.0,0.0,-17.57718,-22.11612,2021-09-01 03:00:00,-0.7733333,-0.3258333,-3.814697e-06,-1.529098e-05
25%,58.125,23.2,2022-02-07 01:00:00,12.75,0.2606445,-2.364355,0.0,0.0003356934,0.0,0.2648926,-1.466691,-1.978108,2022-02-07 19:45:00,0.0,0.0,0.0,0.0
50%,58.65,24.95,2022-07-16 02:00:00,24.5,4.872705,1.835596,0.08866882,0.2305453,0.1011963,0.977267,1.468681,0.9433203,2022-07-17 02:30:00,0.0,0.6044444,0.0,0.0
75%,59.175,26.7,2022-12-22 01:00:00,36.25,11.14639,7.302026,0.9784851,0.9994587,0.9000854,1.0,3.813533,3.507629,2022-12-23 07:15:00,212.8447,144.1723,0.0,2.765656e-05
max,59.7,28.2,2023-05-30 02:00:00,48.0,31.81069,23.68057,1.000008,1.000008,1.000008,1.000008,22.5732,19.31437,2023-06-01 02:00:00,954.4222,848.7144,0.004832983,0.01651621
std,0.6873865,2.015565,,13.8534,7.844206,7.121432,0.4440425,0.4386346,0.4201556,0.4009629,3.9953,4.223752,,256.5069,187.4444,0.000122284,0.000278088


### Missing values

Columns with missing values:
- `forecast_weather`: `surface_solar_radiation_downwards` has 2 missing values
- `train`: `target` has 528 missing values

In [15]:
# Print the per-column NaN count of each table, labelled with its source file.
tables_by_file = {
    "client.csv": client,
    "electricity_prices.csv": electricity_prices,
    "forecast_weather.csv": forecast_weather,
    "gas_prices.csv": gas_prices,
    "historical_weather.csv": historical_weather,
    "train.csv": train,
}

for position, (file_name, frame) in enumerate(tables_by_file.items()):
    # A blank line separates consecutive tables; the first header has none.
    header = file_name if position == 0 else "\n" + file_name
    print(header)
    print(frame.isna().sum())

client.csv
product_type          0
county                0
eic_count             0
installed_capacity    0
is_business           0
date                  0
dtype: int64

electricity_prices.csv
forecast_date    0
euros_per_mwh    0
origin_date      0
dtype: int64

forecast_weather.csv
latitude                             0
longitude                            0
origin_datetime                      0
hours_ahead                          0
temperature                          0
dewpoint                             0
cloudcover_high                      0
cloudcover_low                       0
cloudcover_mid                       0
cloudcover_total                     0
10_metre_u_wind_component            0
10_metre_v_wind_component            0
forecast_datetime                    0
direct_solar_radiation               0
surface_solar_radiation_downwards    2
snowfall                             0
total_precipitation                  0
dtype: int64

gas_prices.csv
forecast_date           

#### Train missing values

The missing targets all fall at 03:00 on the days when daylight saving time (DST) changes. Since the data is provided hourly, Enefit's system might have recorded either no target value at 03:00 when DST starts, or two values when DST ends, either of which could plausibly result in an NA value.

In [25]:
# Count, per timestamp, the rows of `train` whose target is missing.
target_is_missing = train["target"].isna()
train.loc[target_is_missing, "datetime"].value_counts()

datetime
2022-10-30 03:00:00    136
2022-03-27 03:00:00    134
2023-03-26 03:00:00    132
2021-10-31 03:00:00    126
Name: count, dtype: int64

#### Forecast weather missing values

This may be a random technical error. Both missing values belong to the same grid point (latitude 59.7, longitude 23.7), which is located in the sea, so they will be filtered out when the weather data is mapped to counties.

In [27]:
# Show the forecast rows that lack a surface solar radiation value.
radiation_is_missing = forecast_weather["surface_solar_radiation_downwards"].isna()
forecast_weather.loc[radiation_is_missing]

Unnamed: 0_level_0,latitude,longitude,origin_datetime,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,forecast_datetime,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation
data_block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
345,59.7,23.7,2022-08-11 02:00:00,3,19.043604,16.849023,0.908295,0.0,0.0,0.908295,5.913838,7.62013,2022-08-11 05:00:00,17.096667,,0.0,0.0
345,59.7,23.7,2022-08-11 02:00:00,4,18.796777,16.994287,0.844788,0.0,0.0,0.844788,5.421923,8.103373,2022-08-11 06:00:00,206.41375,,0.0,0.0
