In [135]:
import pandas as pd
import numpy as np
import datetime
import os

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import PolynomialFeatures

import joblib

%config Completer.use_jedi = False
%matplotlib inline  

In [2]:
daily_data_path = "Data/London/daily_dataset/daily_dataset/"
daily_weather_path = 'Data/London/weather_daily_darksky.csv'
f_energy_all_name = "energy_all.csv"
f_energy_clean_name = "energy_clean"
f_energy_avg_name = "energy_avg"
f_energy_avg_all_file = "energy_avg_all.csv"

mode_path = "Modes/"

In [55]:
dtypes_in = {'avg_energy':'float32', 'day': 'int8', 'month': 'int8', 'temperatureMax': 'float16'}
energy_data = pd.read_csv(f_energy_avg_name+'.csv', dtype=dtypes_in)

energy = energy_data.drop('avg_energy', axis=1)
energy_labels = energy_data['avg_energy'].copy()

In [3]:
energy_data = pd.read_csv(f_energy_avg_all_file)

energy = energy_data.drop(['avg_energy','energy_sum','date'], axis=1)
energy_labels = energy_data['avg_energy'].copy()

In [4]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_house_income_cat=True): # no *args or **kargs
        self.add_house_income_cat = add_house_income_cat
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        return X

In [5]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [56]:
pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', AttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

energy_tr = pipeline.fit_transform(energy)

# Testing models

In [133]:
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, energy_tr, energy_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)
display_scores(lin_rmse_scores)

Scores: [1.58406821 1.17030978 1.02256062 0.70777959 0.96801592 0.58796843
 1.20534667 0.93116969 0.55262087 1.48791163]
Mean: 1.0217751399227326
Standard deviation: 0.33274689851535155


In [132]:
sgd_reg = SGDRegressor(max_iter=100000, tol=1e-3, penalty=None, eta0=0.1, early_stopping=True , n_iter_no_change=500)
scores = cross_val_score(sgd_reg, energy_tr, energy_labels, scoring="neg_mean_squared_error", cv=10)
sgd_rmse_scores = np.sqrt(-scores)
display_scores(sgd_rmse_scores)

Scores: [1.58430113 1.16210123 1.02798446 0.72995153 0.98518538 0.58682618
 1.17903091 0.89409437 0.53998211 1.50122794]
Mean: 1.0190685240374258
Standard deviation: 0.33388679136299965


In [131]:
ridge_reg = Ridge(alpha=1.0,fit_intercept=True,normalize=False,copy_X=True,max_iter=None,tol=0.001,solver='auto',random_state=None)
scores = cross_val_score(ridge_reg, energy_tr, energy_labels, scoring="neg_mean_squared_error", cv=10)
ridge_rmse_scores = np.sqrt(-scores)
display_scores(ridge_rmse_scores)

Scores: [1.58445468 1.16942383 1.02343299 0.70849391 0.97106838 0.58759688
 1.20450552 0.92801152 0.5517625  1.48786382]
Mean: 1.021661404988929
Standard deviation: 0.33286192868397824


In [134]:
lasso_reg = Lasso()
scores = cross_val_score(lasso_reg, energy_tr, energy_labels, scoring="neg_mean_squared_error", cv=10)
lasso_rmse_scores = np.sqrt(-scores)
display_scores(lasso_rmse_scores)

Scores: [2.44980533 1.23348184 1.58990887 1.38786823 1.27259931 1.81367139
 1.38096029 1.87173692 0.78318593 1.49839123]
Mean: 1.5281609344201879
Standard deviation: 0.4245201357548615


In [136]:
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
scores = cross_val_score(elastic_net, energy_tr, energy_labels, scoring="neg_mean_squared_error", cv=10)
elastic_net_rmse_scores = np.sqrt(-scores)
display_scores(elastic_net_rmse_scores)

Scores: [1.67968347 1.17279994 1.0331664  0.71886383 0.988555   0.62019574
 1.20648126 0.83387765 0.55388513 1.45672759]
Mean: 1.0264236004235308
Standard deviation: 0.34456014218742886


In [76]:
sgd_reg.get_params().keys()

dict_keys(['alpha', 'average', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter_no_change', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [79]:
param_grid = {
     'tol': [1e-2,1e-3,1e-4],
     'eta0': [0.2, 0.1, 0.05, 0.01, 0.005],
     'n_iter_no_change': [10,100,500,1000],
     'penalty': [None, 'l1', 'l2']
}
  
sgd_reg = SGDRegressor(max_iter=1e5,early_stopping=True)

grid_search_sgd = GridSearchCV(sgd_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)
grid_search_sgd.fit(energy_tr, energy_labels)

GridSearchCV(cv=5,
             estimator=SGDRegressor(early_stopping=True, max_iter=100000.0),
             n_jobs=-1,
             param_grid={'eta0': [0.2, 0.1, 0.05, 0.01, 0.005],
                         'n_iter_no_change': [10, 100, 500, 1000],
                         'penalty': [None, 'l1', 'l2'],
                         'tol': [0.01, 0.001, 0.0001]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [101]:
print('Best Score: %s' % np.sqrt(-grid_search_sgd.best_score_))
print('Best Hyperparameters: %s' % grid_search_sgd.best_params_)

Best Score: 1.0523780409327754
Best Hyperparameters: {'eta0': 0.2, 'n_iter_no_change': 10, 'penalty': 'l2', 'tol': 0.01}


In [103]:
cvres = grid_search_sgd.cv_results_
#cvres["mean_test_score"] = np.sqrt(cvres["mean_test_score"])

arr=sorted(zip(cvres["mean_test_score"], cvres["params"]), key=lambda x: x[0])
arr


[(nan, {'eta0': 0.2, 'n_iter_no_change': 10, 'penalty': None, 'tol': 0.01}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 10, 'penalty': None, 'tol': 0.001}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 10, 'penalty': None, 'tol': 0.0001}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 10, 'penalty': 'l1', 'tol': 0.01}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 10, 'penalty': 'l1', 'tol': 0.001}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 10, 'penalty': 'l1', 'tol': 0.0001}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 10, 'penalty': 'l2', 'tol': 0.01}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 10, 'penalty': 'l2', 'tol': 0.001}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 10, 'penalty': 'l2', 'tol': 0.0001}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 100, 'penalty': None, 'tol': 0.01}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 100, 'penalty': None, 'tol': 0.001}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 100, 'penalty': None, 'tol': 0.0001}),
 (nan, {'eta0': 0.2, 'n_iter_no_change': 100, 'penalty': 'l1'

In [70]:
param_grid = {
    'solver':['svd', 'cholesky', 'lsqr', 'sag'],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
    'fit_intercept':[True, False],
    'normalize':[True, False]
}

ridge_reg = Ridge(copy_X=True,max_iter=None,tol=0.001,random_state=None)

grid_search_ridge = GridSearchCV(ridge_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)
grid_search_ridge.fit(energy_tr, energy_labels)

GridSearchCV(cv=5, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                         'fit_intercept': [True, False],
                         'normalize': [True, False],
                         'solver': ['svd', 'cholesky', 'lsqr', 'sag']},
             return_train_score=True, scoring='neg_mean_squared_error')

In [74]:
print('Best Score: %s' % np.sqrt(-grid_search_ridge.best_score_))
print('Best Hyperparameters: %s' % grid_search_ridge.best_params_)

Best Score: 1.0948967964741332
Best Hyperparameters: {'alpha': 0.001, 'fit_intercept': True, 'normalize': False, 'solver': 'sag'}


# Polynomial

In [143]:
poly_features = PolynomialFeatures(degree=2, include_bias=False)
energy_poly = poly_features.fit_transform(energy_tr)

In [145]:
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, energy_poly, energy_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)
display_scores(lin_rmse_scores)

Scores: [1.16485528 1.19532836 0.5104307  0.38627117 0.80472213 0.63674971
 0.61189358 0.35946428 0.42945679 1.6719289 ]
Mean: 0.7771100903496759
Standard deviation: 0.41144590296943656


In [59]:
sgd_reg = SGDRegressor(max_iter=10000, tol=1e-3, penalty='l2', eta0=0.1, early_stopping=True , n_iter_no_change=300)
scores = cross_val_score(sgd_reg, energy_poly, energy_labels, scoring="neg_mean_squared_error", cv=10)
sgd_rmse_scores = np.sqrt(-scores)
display_scores(sgd_rmse_scores)

Scores: [1.27188517 1.19949987 0.53477156 0.39147488 0.80282682 0.61923395
 0.59714114 0.40542909 0.41874184 1.59384503]
Mean: 0.783484934930716
Standard deviation: 0.40272288104591075


In [144]:
ridge_reg = Ridge(alpha=0.001,fit_intercept=True,normalize=False,copy_X=True,max_iter=None,tol=0.001,solver='auto',random_state=None)
scores = cross_val_score(ridge_reg, energy_poly, energy_labels, scoring="neg_mean_squared_error", cv=10)
sgd_rmse_scores = np.sqrt(-scores)
display_scores(sgd_rmse_scores)

Scores: [1.1644505  1.19440449 0.51082894 0.38631916 0.80521953 0.63610081
 0.61128051 0.36123454 0.42935372 1.67399922]
Mean: 0.7773191426032275
Standard deviation: 0.4116139277643332


In [142]:
lasso_reg = Lasso()
scores = cross_val_score(lasso_reg, energy_poly, energy_labels, scoring="neg_mean_squared_error", cv=10)
lasso_rmse_scores = np.sqrt(-scores)
display_scores(lasso_rmse_scores)

Scores: [2.3871998  1.29624157 1.80196017 1.58162771 1.40030141 1.60511922
 1.62830576 1.83898466 1.0473273  1.5976387 ]
Mean: 1.6184706287394455
Standard deviation: 0.3395496746363513


In [147]:
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.2)
scores = cross_val_score(elastic_net, energy_poly, energy_labels, scoring="neg_mean_squared_error", cv=10)
elastic_net_rmse_scores = np.sqrt(-scores)
display_scores(elastic_net_rmse_scores)

Scores: [1.27328968 1.17726604 0.55911766 0.43878771 0.7873851  0.64072227
 0.66082049 0.47711346 0.40945156 1.55933526]
Mean: 0.7983289214264154
Standard deviation: 0.378560005124695


In [126]:
param_grid = {
    'alpha': [1e-6,1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
    'fit_intercept':[True, False],
    'normalize':[True, False]
}

ridge_reg = Ridge(solver='auto',copy_X=True,max_iter=None,tol=0.001,random_state=None,fit_intercept=True,normalize=False)

grid_search_ridge = GridSearchCV(ridge_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)
grid_search_ridge.fit(energy_poly, energy_labels)

GridSearchCV(cv=5, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1,
                                   10, 100],
                         'fit_intercept': [True, False],
                         'normalize': [True, False]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [123]:
print('Best Score: %s' % np.sqrt(-grid_search_ridge.best_score_))
print('Best Hyperparameters: %s' % grid_search_ridge.best_params_)

Best Score: 0.89257741131697
Best Hyperparameters: {'alpha': 1e-05, 'fit_intercept': True, 'normalize': False}
