In [197]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from gplearn.genetic import SymbolicRegressor

In [198]:
autumn_data = pd.read_csv('../autumn_data.csv')
autumn_data.shape

(2201, 4)

In [199]:
autumn_data.head()

Unnamed: 0,datetime,active_electricity,outdoor_temperature,indoor_temperature
0,01-10-22 3:00,693.1875,27.769607,21.416667
1,01-10-22 4:00,460.0375,27.924395,21.083333
2,01-10-22 5:00,448.0125,28.079184,21.25
3,01-10-22 6:00,462.2875,28.233972,21.583333
4,01-10-22 7:00,449.25,28.388761,21.916667


In [200]:
def preprocess_dates_string(dataframe, date_column_name):
    for i in range(len(dataframe)):
        if len(dataframe.iloc[i][0].split(' ')[-1].split(':')[0]) < 2:
            dataframe.at[i, date_column_name] = dataframe.iloc[i][0].split(' ')[0] + " 0" + \
                                                dataframe.iloc[i][0].split(' ')[-1]
    return dataframe

In [201]:
dataset = preprocess_dates_string(autumn_data, 'datetime')

  if len(dataframe.iloc[i][0].split(' ')[-1].split(':')[0]) < 2:
  dataframe.at[i, date_column_name] = dataframe.iloc[i][0].split(' ')[0] + " 0" + \
  dataframe.iloc[i][0].split(' ')[-1]


In [202]:
dataset = dataset[:len(dataset) - 24 * 2]
date_time = pd.to_datetime(dataset.datetime, format='%d-%m-%y %H:%M')

In [203]:
holiday_dates = ['2022-10-28', '2022-11-11', '2022-11-17', '2022-12-25', '2022-12-26']
holiday_cal = pd.DatetimeIndex(holiday_dates)
holidays = [1 if dt.date() in holiday_cal.date or dt.dayofweek == 5 or dt.dayofweek == 6 else 0 for dt in date_time]

In [204]:
work_hour_range = range(8, 16)
work_hours = [1 if dt.time().hour in work_hour_range and holidays[num] != 1 else 0 for num, dt in enumerate(date_time)]
dataset['is_work_hour'] = work_hours

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['is_work_hour'] = work_hours


In [205]:
dataset = dataset[dataset['is_work_hour'] == 1].drop(['is_work_hour'], axis=1)

In [206]:
dataset['datetime'] = pd.to_datetime(dataset.datetime, format='%d-%m-%y %H:%M')

In [207]:
dataset.head()

Unnamed: 0,datetime,active_electricity,outdoor_temperature,indoor_temperature
53,2022-10-03 08:00:00,549.55,28.513594,23.25
54,2022-10-03 09:00:00,629.4,28.996465,23.583333
55,2022-10-03 10:00:00,732.05,28.892873,23.916667
56,2022-10-03 11:00:00,1566.0,28.587663,24.25
57,2022-10-03 12:00:00,3061.65,28.182514,24.583333


In [208]:
def prepare_timeseries_data(data):
    data['past_electricity'] = data.shift(8)['active_electricity'].dropna()
    return data.dropna()

In [209]:
dataset = prepare_timeseries_data(dataset)

In [210]:
def add_ids(dataframe):
    ids = range(1, len(dataframe)+1)
    dataframe['id'] = ids
    return dataframe

In [211]:
dataset = add_ids(dataset)

In [212]:
dataset.head()

Unnamed: 0,datetime,active_electricity,outdoor_temperature,indoor_temperature,past_electricity,id
77,2022-10-04 08:00:00,378.125,26.889868,18.5,549.55,1
78,2022-10-04 09:00:00,687.1625,27.265097,20.5,629.4,2
79,2022-10-04 10:00:00,618.7375,27.271529,22.5,732.05,3
80,2022-10-04 11:00:00,1016.6875,27.142694,23.5,1566.0,4
81,2022-10-04 12:00:00,1224.05,27.01386,24.166667,3061.65,5


In [213]:
dataset.set_index(['id', 'datetime'], inplace=True)

In [214]:
dataset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,active_electricity,outdoor_temperature,indoor_temperature,past_electricity
id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2022-10-04 08:00:00,378.125,26.889868,18.5,549.55
2,2022-10-04 09:00:00,687.1625,27.265097,20.5,629.4
3,2022-10-04 10:00:00,618.7375,27.271529,22.5,732.05
4,2022-10-04 11:00:00,1016.6875,27.142694,23.5,1566.0
5,2022-10-04 12:00:00,1224.05,27.01386,24.166667,3061.65


In [215]:
X = dataset.copy().drop(['active_electricity'], axis=1)
y = dataset.copy()['active_electricity']

In [216]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [217]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(401, 3) (401,)
(71, 3) (71,)


In [218]:
regressor = SymbolicRegressor(
    feature_names = X_train.columns,
    random_state = 0
)

gp_model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', regressor)
])

In [219]:
gp_model_pipeline.fit(X_train.values, y_train.values)

In [220]:
y_pred = gp_model_pipeline.predict(X_test.values)

In [221]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

def compute_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred, squared=True)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100

    return np.round(mse, 2), np.round(rmse, 2), np.round(mae, 2), np.round(mape, 2)

In [222]:
print(regressor._program)

add(sub(add(sub(add(0.005, add(mul(add(add(add(0.561, outdoor_temperature), div(0.941, 0.005)), mul(sub(div(0.941, 0.005), mul(indoor_temperature, add(div(sub(div(indoor_temperature, past_electricity), add(outdoor_temperature, -0.107)), add(0.431, past_electricity)), div(0.941, 0.005)))), sub(0.805, -0.514))), past_electricity), add(0.431, past_electricity))), add(sub(mul(outdoor_temperature, outdoor_temperature), mul(div(0.941, 0.005), sub(0.805, -0.514))), add(mul(outdoor_temperature, past_electricity), mul(0.928, -0.781)))), add(sub(0.805, -0.514), div(0.941, 0.005))), add(sub(mul(outdoor_temperature, outdoor_temperature), add(add(mul(div(0.941, 0.005), sub(0.805, -0.514)), div(0.941, 0.005)), add(sub(div(0.941, 0.005), add(add(past_electricity, -0.873), 0.941)), add(div(sub(div(indoor_temperature, past_electricity), add(outdoor_temperature, -0.107)), sub(add(outdoor_temperature, outdoor_temperature), add(sub(div(0.941, 0.005), mul(indoor_temperature, 0.498)), add(div(sub(div(indoor

In [223]:
metric = compute_metrics(y_test, y_pred)
print('MSE:',metric[0], ' RMSE:',metric[1],' MAE:',metric[2], ' MAPE%:',metric[3])

MSE: 665552.52  RMSE: 815.81  MAE: 578.27  MAPE%: 39.5




In [224]:
import pickle as pkl

X_train['y'] = y_train
X_train.to_csv("energy_train_data.csv")

X_test['y'] = y_test
X_test.to_csv("energy_test_data.csv")

with open("energy_gp_model.pkl", "wb") as f:
    pkl.dump(gp_model_pipeline, f)