In [14]:
import pandas as pd
import numpy as np
import datetime
import os

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost

from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import PolynomialFeatures

import joblib

%config Completer.use_jedi = False
%matplotlib inline  

daily_data_path = "Data/London/daily_dataset/daily_dataset/"
daily_weather_path = 'Data/London/weather_daily_darksky.csv'
f_energy_all_name = "energy_all.csv"
f_energy_clean_name = "energy_clean"
f_energy_avg_name = "energy_avg"
f_energy_avg_all_file = "energy_avg_all.csv"

mode_path = "Modes/"

In [2]:
dtypes_in = {'avg_energy':'float32', 'day': 'int8', 'month': 'int8', 'temperatureMax': 'float16'}
energy_data = pd.read_csv(f_energy_avg_name+'.csv', dtype=dtypes_in)

energy = energy_data.drop('avg_energy', axis=1)
energy_labels = energy_data['avg_energy'].copy()

In [3]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_house_income_cat=True): # no *args or **kargs
        self.add_house_income_cat = add_house_income_cat
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        return X
    
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [4]:
pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', AttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

energy_tr = pipeline.fit_transform(energy)

# Testing Modes

In [7]:
tree_reg = DecisionTreeRegressor()
scores = cross_val_score(tree_reg, energy_tr, energy_labels, scoring="neg_mean_squared_error", cv=10)
tree_reg_rmse_scores = np.sqrt(-scores)
display_scores(tree_reg_rmse_scores)

Scores: [1.52808466 1.29914454 0.606339   0.50835662 1.34880589 1.84902667
 0.84112869 0.345187   0.78815926 1.81021472]
Mean: 1.0924447054610258
Standard deviation: 0.517439775255962


In [10]:
gb_reg = GradientBoostingRegressor()
scores = cross_val_score(gb_reg, energy_tr, energy_labels, scoring="neg_mean_squared_error", cv=10)
gb_reg_rmse_scores = np.sqrt(-scores)
display_scores(gb_reg_rmse_scores)

Scores: [1.37048352 1.0598776  0.51230995 0.33497726 0.75321174 1.38626535
 0.70434715 0.32221306 0.50975959 1.67342381]
Mean: 0.8626869032256481
Standard deviation: 0.4564140365417824


In [17]:
energy_train, energy_val, labels_train, labels_val = train_test_split(energy_tr, energy_labels)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(energy_train, labels_train)

errors = [mean_squared_error(labels_val, label_pred) for label_pred in gbrt.staged_predict(energy_val)]
bst_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators)
gbrt_best.fit(energy_train, labels_train)

GradientBoostingRegressor(max_depth=2, n_estimators=45)

In [18]:
scores = cross_val_score(gbrt_best, energy_tr, energy_labels, scoring="neg_mean_squared_error", cv=10)
gb_reg_rmse_scores = np.sqrt(-scores)
display_scores(gb_reg_rmse_scores)

Scores: [1.3868569  1.27830277 0.58934648 0.40661581 0.69926826 0.96493387
 0.83910302 0.45111377 0.56701631 1.6411979 ]
Mean: 0.88237550782808
Standard deviation: 0.40335336313011466


In [22]:
xgb_reg = xgboost.XGBRegressor(early_stopping_rounds=2)
xgb_reg.fit(energy_train, labels_train,
            eval_set=[(energy_val, labels_val)])
y_pred = xgb_reg.predict(energy_val)

[0]	validation_0-rmse:7.25088
[1]	validation_0-rmse:5.12850
[2]	validation_0-rmse:3.66541
[3]	validation_0-rmse:2.61963
[4]	validation_0-rmse:1.89319
[5]	validation_0-rmse:1.39998
[6]	validation_0-rmse:1.06276
[7]	validation_0-rmse:0.86298
[8]	validation_0-rmse:0.75724
[9]	validation_0-rmse:0.69308
[10]	validation_0-rmse:0.67022
[11]	validation_0-rmse:0.65239
[12]	validation_0-rmse:0.64915
[13]	validation_0-rmse:0.64531
[14]	validation_0-rmse:0.64723


In [25]:
np.sqrt(mean_squared_error(y_pred,labels_val))

0.6453072

In [36]:
xgb2_reg = xgboost.XGBRegressor(early_stopping_rounds=2)
scores = cross_val_score(xgb2_reg, energy_tr, energy_labels, scoring="neg_mean_squared_error", cv=10,
                         fit_params={'eval_set':[(energy_val, labels_val)], 'verbose': False})
gb_reg_rmse_scores = np.sqrt(-scores)
display_scores(gb_reg_rmse_scores)

Scores: [1.39821026 1.19031928 0.55932612 0.39950404 0.93197082 1.36573885
 0.74720067 0.28701079 0.6009082  1.68567778]
Mean: 0.9165866798259218
Standard deviation: 0.44945368758127063


In [29]:
forest_reg = RandomForestRegressor()
scores = cross_val_score(forest_reg, energy_tr, energy_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)

Scores: [1.4526763  1.13905048 0.57173363 0.37364241 0.94185528 1.38959616
 0.7663779  0.32125161 0.6522414  1.69755652]
Mean: 0.9305981695111145
Standard deviation: 0.4505315109485508


In [30]:
xgb_reg.feature_importances_

array([0.02541236, 0.36590272, 0.6086849 ], dtype=float32)

AttributeError: `feature_names_in_` is defined only when `X` has feature names that are all strings.