In [1]:
import pandas as pd

train = pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/train.csv')
test = pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/test.csv')

In [2]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2018352 entries, 0 to 2018351
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   county              int64  
 1   is_business         int64  
 2   product_type        int64  
 3   target              float64
 4   is_consumption      int64  
 5   datetime            object 
 6   data_block_id       int64  
 7   row_id              int64  
 8   prediction_unit_id  int64  
dtypes: float64(1), int64(7), object(1)
memory usage: 138.6+ MB


In [3]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12480 entries, 0 to 12479
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   county               12480 non-null  int64 
 1   is_business          12480 non-null  int64 
 2   product_type         12480 non-null  int64 
 3   is_consumption       12480 non-null  int64 
 4   prediction_datetime  12480 non-null  object
 5   data_block_id        12480 non-null  int64 
 6   row_id               12480 non-null  int64 
 7   prediction_unit_id   12480 non-null  int64 
 8   currently_scored     12480 non-null  bool  
dtypes: bool(1), int64(7), object(1)
memory usage: 792.3+ KB


In [4]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [5]:
# Convert datetime column to numerical format for training
train['datetime'] = pd.to_datetime(train['datetime'])
train['datetime'] = train['datetime'].astype(int) / 10**9  # Convert to seconds since epoch

In [6]:
test['prediction_datetime'] = pd.to_datetime(test['prediction_datetime'])
test['prediction_datetime'] = test['prediction_datetime'].astype(int) / 10**9  # Convert to seconds since epoch

In [7]:
# Prepare the training data
X = train.drop(columns=['target'])
y = train['target']

In [8]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import numpy as np

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Initialize the model
model = xgb.XGBRegressor(objective='reg:squarederror')

mae_scores = []

for train_index, val_index in kf.split(X_train):
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Scale the features
    X_train_fold_scaled = scaler.fit_transform(X_train_fold)
    X_val_fold_scaled = scaler.transform(X_val_fold)

    # Train the model
    model.fit(X_train_fold_scaled, y_train_fold)

    # Make predictions on the validation set
    y_val_pred = model.predict(X_val_fold_scaled)

    # Evaluate MAE
    mae = mean_absolute_error(y_val_fold, y_val_pred)
    mae_scores.append(mae)

# Calculate the average MAE
average_mae = np.mean(mae_scores)
average_mae

116.96176464684001

In [10]:
import pickle

# Save the trained model into a pickle file
with open('enefit_predict_energy_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [11]:
X_test = test.drop(columns=['currently_scored'])

In [14]:
# Predict on the test set
y_test_pred = model.predict(X_test)

In [15]:
# Identify prosumers (producers and consumers) based on the predictions

# Add predictions to the test dataset
test['predicted_target'] = y_test_pred

# Define a threshold to identify prosumers (e.g., based on predicted target values)
# Setup prosumers with predicted target values between 10 and 100
prosumers = test[(test['predicted_target'] > 10) & (test['predicted_target'] < 100)]

# Calculate the potential reduction in energy imbalance costs
# Assuming the cost reduction is proportional to the number of prosumers
cost_reduction = len(prosumers) * 10  # Example: $10 reduction per prosumer

In [16]:
prosumers

Unnamed: 0,county,is_business,product_type,is_consumption,prediction_datetime,data_block_id,row_id,prediction_unit_id,currently_scored,predicted_target
6,0,1,0,0,1.685232e+09,634,2005878,3,False,67.722549
7,0,1,0,1,1.685232e+09,634,2005879,3,False,67.722549
14,1,0,1,0,1.685232e+09,634,2005886,6,False,85.983429
15,1,0,1,1,1.685232e+09,634,2005887,6,False,85.983429
20,2,0,1,0,1.685232e+09,634,2005892,62,False,17.239307
...,...,...,...,...,...,...,...,...,...,...
12461,14,0,1,1,1.685574e+09,637,2018333,53,False,17.239307
12470,15,0,1,0,1.685574e+09,637,2018342,57,False,17.239307
12471,15,0,1,1,1.685574e+09,637,2018343,57,False,17.239307
12474,15,1,0,0,1.685574e+09,637,2018346,64,False,15.764130


In [17]:
cost_reduction

36480

- The potential reduction in energy imbalance costs is $36,480