In [1]:
## import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# date and time utilities from the standard library
from datetime import datetime, date, timedelta
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
# library to suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# custom library 
# import enefit

In [2]:
# root folder of the competition dataset; every input file is read relative to this path
DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers"

In [3]:
## print the full path of every file below DATA_DIR
# os.walk yields one (directory, sub-directories, files) triple per directory;
# the sub-directory list is not needed here
for root, _subdirs, files in os.walk(DATA_DIR):
    for name in files:
        # directory path + file name = full path of the file
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/predict-energy-behavior-of-prosumers/client.csv
/kaggle/input/predict-energy-behavior-of-prosumers/gas_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/electricity_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/weather_station_to_county_mapping.csv
/kaggle/input/predict-energy-behavior-of-prosumers/public_timeseries_testing_util.py
/kaggle/input/predict-energy-behavior-of-prosumers/historical_weather.csv
/kaggle/input/predict-energy-behavior-of-prosumers/county_id_to_name_map.json
/kaggle/input/predict-energy-behavior-of-prosumers/train.csv
/kaggle/input/predict-energy-behavior-of-prosumers/forecast_weather.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/sample_submission.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/client.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/gas_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/electricity

In [4]:
# load the competition CSV files from DATA_DIR, one DataFrame per file;
# the names on the left line up one-to-one, in order, with the file names below
train, gas_df, electricity_df, client_df, fw_df, hw_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in (
        "train.csv",
        "gas_prices.csv",
        "electricity_prices.csv",
        "client.csv",
        "forecast_weather.csv",
        "historical_weather.csv",
    )
)

# a file that lives outside DATA_DIR (read currently disabled)
# locations = pd.read_csv("/kaggle/input/fabiendaniels-mapping-locations-and-county-codes/county_lon_lats.csv")

In [5]:
# preview the first five rows of the training data
train.head(n=5)

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
1,0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0
2,0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1
3,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
4,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2


In [6]:
# summary statistics of train, transposed so that each described column becomes a row
train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
county,2018352.0,7.297034,4.78099,0.0,3.0,7.0,11.0,15.0
is_business,2018352.0,0.5368261,0.498642,0.0,0.0,1.0,1.0,1.0
product_type,2018352.0,1.898927,1.081766,0.0,1.0,2.0,3.0,3.0
target,2017824.0,274.8556,909.502378,0.0,0.378,31.133,180.2062,15480.274
is_consumption,2018352.0,0.5,0.5,0.0,0.0,0.5,1.0,1.0
data_block_id,2018352.0,321.8746,182.634314,0.0,166.0,323.0,479.0,637.0
row_id,2018352.0,1009176.0,582648.179597,0.0,504587.75,1009175.5,1513763.0,2018351.0
prediction_unit_id,2018352.0,33.04538,19.590594,0.0,16.0,33.0,50.0,68.0


In [7]:
# preview the first five rows of the gas price data
gas_df.head(n=5)

Unnamed: 0,forecast_date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,data_block_id
0,2021-09-01,45.23,46.32,2021-08-31,1
1,2021-09-02,45.62,46.29,2021-09-01,2
2,2021-09-03,45.85,46.4,2021-09-02,3
3,2021-09-04,46.3,46.8,2021-09-03,4
4,2021-09-05,46.3,46.58,2021-09-04,5


In [8]:
# preview the first five rows of the electricity price data
electricity_df.head(n=5)

Unnamed: 0,forecast_date,euros_per_mwh,origin_date,data_block_id
0,2021-09-01 00:00:00,92.51,2021-08-31 00:00:00,1
1,2021-09-01 01:00:00,88.9,2021-08-31 01:00:00,1
2,2021-09-01 02:00:00,87.35,2021-08-31 02:00:00,1
3,2021-09-01 03:00:00,86.88,2021-08-31 03:00:00,1
4,2021-09-01 04:00:00,88.43,2021-08-31 04:00:00,1


In [9]:
# preview the first five rows of the forecast weather data
fw_df.head(n=5)

Unnamed: 0,latitude,longitude,origin_datetime,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,data_block_id,forecast_datetime,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation
0,57.6,21.7,2021-09-01 02:00:00,1,15.655786,11.553613,0.904816,0.019714,0.0,0.905899,-0.411328,-9.106137,1,2021-09-01 03:00:00,0.0,0.0,0.0,0.0
1,57.6,22.2,2021-09-01 02:00:00,1,13.003931,10.689844,0.886322,0.004456,0.0,0.886658,0.206347,-5.355405,1,2021-09-01 03:00:00,0.0,0.0,0.0,0.0
2,57.6,22.7,2021-09-01 02:00:00,1,14.206567,11.671777,0.729034,0.005615,0.0,0.730499,1.451587,-7.417905,1,2021-09-01 03:00:00,0.0,0.0,0.0,0.0
3,57.6,23.2,2021-09-01 02:00:00,1,14.844507,12.264917,0.336304,0.074341,0.000626,0.385468,1.090869,-9.163999,1,2021-09-01 03:00:00,0.0,0.0,0.0,0.0
4,57.6,23.7,2021-09-01 02:00:00,1,15.293848,12.458887,0.102875,0.088074,1.5e-05,0.17659,1.268481,-8.975766,1,2021-09-01 03:00:00,0.0,0.0,0.0,0.0


In [10]:
# preview the first five rows of the historical weather data
hw_df.head(n=5)

Unnamed: 0,datetime,temperature,dewpoint,rain,snowfall,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,latitude,longitude,data_block_id
0,2021-09-01 00:00:00,14.2,11.6,0.0,0.0,1015.9,31,31,0,11,7.083333,8,0.0,0.0,0.0,57.6,21.7,1.0
1,2021-09-01 00:00:00,13.9,11.5,0.0,0.0,1010.7,33,37,0,0,5.111111,359,0.0,0.0,0.0,57.6,22.2,1.0
2,2021-09-01 00:00:00,14.0,12.5,0.0,0.0,1015.0,31,34,0,0,6.333333,355,0.0,0.0,0.0,57.6,22.7,1.0
3,2021-09-01 00:00:00,14.6,11.5,0.0,0.0,1017.3,0,0,0,0,8.083333,297,358.0,277.0,81.0,57.6,23.2,1.0
4,2021-09-01 00:00:00,15.7,12.9,0.0,0.0,1014.0,22,25,0,0,8.416667,5,0.0,0.0,0.0,57.6,23.7,1.0


Preprocessing Data

In [11]:
# NOTE: the preprocessing draft below is disabled. It is wrapped in a bare
# triple-quoted string, so none of it runs; the cell only echoes the text.
# Things to resolve before re-enabling it:
#   * it uses the names data / electricity / client / locations /
#     forecast_weather / hist_weather, while the load cell binds train /
#     electricity_df / client_df / fw_df / hw_df and leaves the `locations`
#     read commented out;
#   * step 2 of the hist_weather section is empty, so hist_weather_datetime and
#     hist_weather_datetime_county are used without being assigned in this draft.
'''
################## data
# drop missing values of target 
data.dropna(subset=['target'], inplace=True)

# convert datetime to UTC
data['datetime'] = pd.to_datetime(data['datetime'], utc=True)

# add year column in train data
data['year'] = data['datetime'].dt.year

# add month column
data['month'] = data['datetime'].dt.month

# add hour column
data['hour'] = data['datetime'].dt.hour

# add day of week column
data['dayofweek'] = data['datetime'].dt.dayofweek

# add day of year column
data['dayofyear'] = data['datetime'].dt.dayofyear


################## electricity
# rename 'forecast_date' column to 'datetime' for consistency before merging
electricity = electricity.rename(columns={'forecast_date': 'datetime'}) 

# convert datetime to UTC
electricity['datetime'] = pd.to_datetime(electricity['datetime'], utc=True)


################## client
# reduce 'block_id' in the 'client' DataFrame by 2 to match 'train' DataFrame
client['data_block_id'] -= 2


################## location
# remove the 'Unnamed: 0' column from the 'locations' DataFrame
locations.drop('Unnamed: 0', axis=1, inplace=True)


################## forecast_weather
# round 'latitude' and 'longitude' to 1 decimal place
forecast_weather[['latitude', 'longitude']] = forecast_weather[['latitude', 'longitude']].astype(float).round(1)

# merge 'forecast_weather' with 'locations' on coordinates to add county information
forecast_weather = forecast_weather.merge(locations, how='left', on=['longitude', 'latitude'])

# drop missing values
forecast_weather.dropna(axis=0, inplace=True)

# convert the 'county' column to integer data type
forecast_weather['county'] = forecast_weather['county'].astype('int64')

# drop the specified columns as they are not needed for further analysis
forecast_weather.drop(['origin_datetime', 'latitude', 'longitude', 'hours_ahead', 'data_block_id'], axis=1, inplace=True)

# rename the 'forecast_datetime' column to 'datetime' for consistency with other datasets
forecast_weather.rename(columns={'forecast_datetime': 'datetime'}, inplace=True)

# convert the 'datetime' column to UTC
forecast_weather['datetime'] = pd.to_datetime(forecast_weather['datetime'], utc=True)

# group by the hour component of 'datetime' and calculate the mean of all other numeric columns
forecast_weather_datetime = forecast_weather.groupby(
    forecast_weather['datetime'].dt.to_period('h')
)[list(forecast_weather.drop(['county', 'datetime'], axis=1).columns)].mean().reset_index()


################## hist_weather
# round 'latitude' and 'longitude' to 1 decimal place
hist_weather[['latitude', 'longitude']] = hist_weather[['latitude', 'longitude']].astype(float).round(1)

# add county information to 'hist_weather' based on matching coordinates
hist_weather = hist_weather.merge(locations, how='left', on=['longitude', 'latitude'])

# remove rows with any missing values
hist_weather.dropna(axis=0, inplace=True)

# remove 'latitude' and 'longitude' columns as they are no longer needed
hist_weather.drop(['latitude', 'longitude'], axis=1, inplace=True)

# change the 'county' column to integer datatype
# (dtype given as the string 'int64', same as for forecast_weather; a bare int64 name is undefined)
hist_weather['county'] = hist_weather['county'].astype('int64')

# convert 'datetime' column to UTC
hist_weather['datetime'] = pd.to_datetime(hist_weather['datetime'], utc=True)

################## hist_
# 1. convert the 'datetime' column to a period indexed by hour and assign it to a new variable
hour_period = hist_weather['datetime'].dt.to_period('h')

# 2. drop the columns we don't want to include in the mean calculation


# drop duplicated rows
hist_weather_datetime.drop_duplicates(inplace=True)

hist_weather_datetime_county.drop_duplicates(inplace=True)

# drop datetime column
hist_weather_datetime.drop('datetime', axis=1, inplace=True)

hist_weather_datetime_county.drop('datetime', axis=1, inplace=True)
'''

"\n################## data\n# drop missing values of target \ndata.dropna(subset=['target'], inplace=True)\n\n# convert datetime to UTC\ndata['datetime'] = pd.to_datetime(data['datetime'], utc=True)\n\n# add year column in train data\ndata['year'] = data['datetime'].dt.year\n\n# add month column\ndata['month'] = data['datetime'].dt.month\n\n# add hour column\ndata['hour'] = data['datetime'].dt.hour\n\n# add day of week column\ndata['dayofweek'] = data['datetime'].dt.dayofweek\n\n# add day of year column\ndata['dayofyear'] = data['datetime'].dt.dayofyear\n\n\n################## electricity\n# rename 'forecast_date' column to 'datetime' for consistency before merging\nelectricity = electricity.rename(columns={'forecast_date': 'datetime'}) \n\n# convert datetime to UTC\nelectricity['datetime'] = pd.to_datetime(electricity['datetime'], utc=True)\n\n\n################## client\n# reduce 'block_id' in the 'client' DataFrame by 2 to match 'train' DataFrame\nclient['data_block_id'] -= 2\n\n\n#

In [12]:
# def preTrain(data, client, histweather)