In [None]:
import pandas as pd
import json
import datetime
import time
import os
import numpy as np
import lightgbm as lgb
from config import DATA_CONSUMPTION_PROCESSED_FILE, DATA_WEATHER_PROCESSED_FILE, DATA_CLUSTERS_FILE,  DATA_METADATA_PROCESSED_FILE, DATA_HOLIDAYS_PROCESSED_FILE, DATA_ISO_CONSUMPTION_PROCESSED_FILE, DATA_ENTHALPY_GRADIENTS_PROCESSED_FILE, DATA_SOLAR_GAINS_PROCESSED_FILE, DATA_GBM_CONSUMPTION_PROCESSED_FILE
from lgbm_imputer import imputer
from lgbm_consumption_module import LGBM_regression
import matplotlib
from sklearn.model_selection import GroupShuffleSplit,TimeSeriesSplit
from auxiliary import week_of_month
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Alias for the file that receives the GBM-predicted consumption.
# NOTE(review): the export cell further down writes to
# DATA_GBM_CONSUMPTION_PROCESSED_FILE directly rather than through this alias.
output_path = DATA_GBM_CONSUMPTION_PROCESSED_FILE

In [None]:
# Load every pre-processed input table. Each table that carries a 'timestamp'
# column has it parsed to datetime so that the merges below compare like with like.
real_consumption_df = pd.read_csv(DATA_CONSUMPTION_PROCESSED_FILE)
real_consumption_df['timestamp'] = pd.to_datetime(real_consumption_df['timestamp'])
print(real_consumption_df.shape[0])  # number of rows in the consumption table

weather_data_df = pd.read_csv(DATA_WEATHER_PROCESSED_FILE)
weather_data_df['timestamp'] = pd.to_datetime(weather_data_df['timestamp'])

# Only the listed columns of the 'SENSORS' worksheet are kept.
# `sheet_name` is the read_excel keyword that selects a worksheet; `sheets=` is
# not a read_excel parameter, so the requested sheet was never actually selected.
metadata_df = pd.read_excel(DATA_METADATA_PROCESSED_FILE, sheet_name='SENSORS')[
    ['smapee', 'ID_CEA', 'INTERVENTION', 'BEDROOMS', 'OCCUPATION', 'SALARY', 'TREATMENT']
]

holidays_df = pd.read_csv(DATA_HOLIDAYS_PROCESSED_FILE)
holidays_df['timestamp'] = pd.to_datetime(holidays_df['timestamp'])

gradients_df = pd.read_csv(DATA_ENTHALPY_GRADIENTS_PROCESSED_FILE)
gradients_df['timestamp'] = pd.to_datetime(gradients_df['timestamp'])

solar_gains_df = pd.read_csv(DATA_SOLAR_GAINS_PROCESSED_FILE)
solar_gains_df['timestamp'] = pd.to_datetime(solar_gains_df['timestamp'])

iso_consumption_df = pd.read_csv(DATA_ISO_CONSUMPTION_PROCESSED_FILE)
iso_consumption_df['timestamp'] = pd.to_datetime(iso_consumption_df['timestamp'])

In [None]:
# Build the modelling table by joining the consumption readings with the
# household metadata and then with each explanatory table. No `how` is given,
# so every join uses pandas' default merge type.
df = real_consumption_df.merge(metadata_df, on=['smapee', 'INTERVENTION'])

# Tables keyed by timestamp only.
for timestamp_table in (weather_data_df, holidays_df, gradients_df):
    df = df.merge(timestamp_table, on='timestamp')

# Tables keyed by timestamp and ID_CEA.
for building_table in (solar_gains_df, iso_consumption_df):
    df = df.merge(building_table, on=['timestamp', 'ID_CEA'])

In [None]:
# Remove repeated (consumption, sensor, timestamp) rows and renumber the index;
# the de-duplication is needed for the time-series split used for validation.
df = (
    df.drop_duplicates(subset=['consumption_kWh', 'smapee', 'timestamp'])
      .reset_index(drop=True)
)

# Calendar features derived from the timestamp.
df['dayofweek'] = np.asarray(df['timestamp'].dt.dayofweek, dtype=np.uint8)
# 1 for dayofweek 0-4, 0 for 5-6 (the column is unsigned, so only the upper bound matters).
df['weekday'] = (df['dayofweek'] < 5).astype('int64')
df['calendar_wom'] = df['timestamp'].apply(week_of_month)
df

In [None]:
# First and last day of the period before the intervention, and a daily
# DatetimeIndex spanning those two dates.
pre_intervention_period = ['2018-01-01', '2018-04-20']
period_start, period_end = pre_intervention_period
range_pre_intervention_period = pd.date_range(start=period_start, end=period_end, freq='D')

In [None]:
# Training set: every row whose TREATMENT is 'CONTROL'.
# (Alternative kept for reference: restrict to the pre-intervention dates instead,
#  i.e. df[df['timestamp'].isin(range_pre_intervention_period)].reset_index(drop=True).)
# .copy() makes data_train an independent frame, so the column assignments made
# on it in the training cell do not target a slice of df (SettingWithCopyWarning).
data_train = df[df['TREATMENT']=='CONTROL'].copy()
data_train

In [None]:
%load_ext autoreload
%autoreload 2
target_feature_name = 'consumption_kWh'
numerical_features_list = ['DEG_C_kJperKg',
                           'DEG_DEHUM_kJperKg',
                           'solar_gain_Whm2',
                           'ISO_consumption_Whm2',
                           'Wind_ms']
categorical_features_list = ['smapee',
                             'holiday',
                             'school_holiday',
                             'teaching_time',
                             'dayofweek',
                             'SALARY',
                             'BEDROOMS',
                             'calendar_wom',
                             'weekday'
                            ]
get_best_parameters = False
# {'boosting': 'dart', 'learning_rate': 0.01, 'max_depth': -1, 'min_data_in_leaf': 100, 'num_iterations': 20000, 'num_leaves': 16}
# -0.4012485277365735
# Fold  5 RMSLE: 0.458409 iwth scaler in X
# Full RMSLE score 1.035892
for c in categorical_features_list:
    data_train[c] = data_train[c].astype('category')
params = {'learning_rate': 0.01,
          'num_leaves': 4,
          'max_depth': -1,
          'min_data_in_leaf': 100,
          'num_iterations': 20000,
          'boosting': 'gbdt',
          'objective': 'rmse',
          'metric': 'rmse'}
models, errors_train, errors_test = LGBM_regression(df=data_train,
                            params=params,
                            target_feature_name = target_feature_name,
                            numerical_features_list = numerical_features_list,
                            categorical_features_list = categorical_features_list,
                            get_best_parameters =get_best_parameters,
                            groupby='smapee')

In [None]:
# Display the two error results returned by LGBM_regression.
errors_train, errors_test

In [None]:
# Number of fitted models returned by LGBM_regression.
splits = len(models)
font = {'family' : 'Arial',
        'size'   : 18}
# Apply the font settings before any figure is created: rc settings are picked
# up when a plot is drawn, so setting them after plotting (as before) could not
# affect the figure that had just been produced.
matplotlib.rc('font', **font)
# One feature-importance chart per fitted model.
for model in models:
    ax = lgb.plot_importance(model)

In [None]:
# Predict with the first of the fitted models.
# NOTE(review): the choice of models[0] over the others is not justified here — confirm.
final_model = models[0]
features = numerical_features_list + categorical_features_list
# Cast the categorical feature columns of the full table to 'category', as was
# done for the training frame.
for c in categorical_features_list:
    df[c] = df[c].astype('category')
# expm1(x) equals exp(x) - 1 but avoids the precision loss of the explicit
# subtraction when x is close to zero.
# NOTE(review): this inverse transform assumes the model predicts
# log1p(consumption) — confirm inside LGBM_regression.
df['GBM_consumption_kWh'] = np.expm1(final_model.predict(df[features]))
df

In [None]:
# Measured vs. GBM-predicted consumption of sensor ID28096 between
# 2018-01-01 and 2018-04-20.
font = {'family' : 'Arial',
        'size'   : 18}
# Set the font before plotting so it applies to this figure.
matplotlib.rc('font', **font)
visual = df.set_index(pd.to_datetime(df['timestamp']))
visual = visual[visual['smapee'] == "ID28096"]
visual = visual.loc['2018-01-01':'2018-04-20']
ax = visual[['consumption_kWh','GBM_consumption_kWh']].plot()
ax.set_xlabel("")
ax.set_ylabel("Daily Energy Consumption [kWh]")
# Two series are drawn, so two labels are needed (in plotting order). With a
# single label the measured series was captioned "GBM Energy Consumption".
ax.legend(["Measured Energy Consumption", "GBM Energy Consumption"]);

In [None]:
# Measured vs. GBM-predicted consumption of sensor ID28096 over every
# timestamp present in df.
font = {'family' : 'Arial',
        'size'   : 18}
# Set the font before plotting so it applies to this figure.
matplotlib.rc('font', **font)
visual = df.set_index(pd.to_datetime(df['timestamp']))
visual = visual[visual['smapee'] == "ID28096"]
ax = visual[['consumption_kWh','GBM_consumption_kWh']].plot()
ax.set_xlabel("")
ax.set_ylabel("Daily Energy Consumption [kWh]")
# Two series are drawn, so two labels are needed (in plotting order). With a
# single label the measured series was captioned "GBM Energy Consumption".
ax.legend(["Measured Energy Consumption", "GBM Energy Consumption"]);

In [None]:
# GBM-predicted consumption only, for sensor ID28096.
font = {'family' : 'Arial',
        'size'   : 18}
# Set the font before plotting so it applies to this figure even when the cell
# is run on its own.
matplotlib.rc('font', **font)
visual = df.set_index(pd.to_datetime(df['timestamp']))
visual = visual[visual['smapee'] == "ID28096"]
ax = visual['GBM_consumption_kWh'].plot(color = ['C4'])
ax.set_xlabel("")
ax.set_ylabel("Daily Energy Consumption [kWh]")
ax.legend(["GBM Energy Consumption"]);

In [None]:
# Write the predicted consumption, together with the columns identifying each
# row, to the processed-data file, then display what was written.
export_columns = ['timestamp', 'GBM_consumption_kWh', 'smapee', 'INTERVENTION']
gbm_export = df[export_columns]
gbm_export.to_csv(DATA_GBM_CONSUMPTION_PROCESSED_FILE, index=False)
gbm_export

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt

# Correlation matrix of all model features.
# The numeric encoding is done on a copy so that df keeps its categorical
# columns (the previous in-place cast overwrote them with float32).
corr_input = df[features].copy()
for c in categorical_features_list:
    column = corr_input[c]
    try:
        corr_input[c] = column.astype('float32')
    except (TypeError, ValueError):
        # Categories that are not numeric (e.g. 'smapee' holds IDs such as
        # "ID28096") cannot be cast to float; fall back to their integer
        # category codes, keeping missing values (code -1) as NaN.
        codes = column.astype('category').cat.codes
        corr_input[c] = codes.where(codes >= 0).astype('float32')
corrMatrix = corr_input.corr()
plot = sn.heatmap(corrMatrix, annot=False)
# Called after the heatmap is drawn, as in the original cell order.
sn.set(font_scale=1)
plt.show()

In [None]:
# Same correlation heatmap, smaller, with short symbolic tick labels.
plt.figure(figsize =(6,4))
plot = sn.heatmap(corrMatrix, annot=False)
sn.set(font_scale=0.8)
# NOTE(review): which feature each symbol stands for is not recorded in this
# notebook — confirm the mapping before relying on these labels.
tick_labels = [r'$x_{15}$',r'$x_{16}$',r'$x_{14}$',r'$x_{17}$', r'$x_{5}$',
               r'$x_{1}$', r'$x_{11}$',r'$x_{12}$',r'$x_{13}$',r'$x_{7}$']
# The label list is hard-coded, so apply it only when it matches the number of
# ticks on both axes; a mismatch would otherwise mislabel the axes or raise in
# matplotlib. On a mismatch the default labels from seaborn are kept.
if len(tick_labels) == len(plot.get_yticks()) == len(plot.get_xticks()):
    plot.set(yticklabels=tick_labels)
    plot.set_xticklabels(tick_labels, rotation=0)
else:
    print('Custom tick labels skipped: {} labels for a {}x{} correlation matrix'.format(
        len(tick_labels), corrMatrix.shape[0], corrMatrix.shape[1]))
plt.show()

In [None]:
# Display the working table df.
df