### Import libraries and load data

In [1]:
%matplotlib inline

# Import Libraries
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#importing Statsmodels
import statsmodels.api as sm
from statsmodels.tools.eval_measures import rmse
from scipy.stats import chisquare

#Scikitlearn libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error 
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('home_train_test.csv')
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Solar [kW],503909.0,0.076229,0.128427,0.0,0.003367,0.004283,0.083917,0.613883
temperature [C],503909.0,10.412179,10.618791,-24.8,2.094444,10.177778,19.033333,34.288889
humidity,503909.0,0.664085,0.194389,0.13,0.51,0.68,0.84,0.98
visibility,503909.0,9.253443,1.611188,0.27,9.42,10.0,10.0,10.0
pressure,503909.0,1016.301637,7.895188,986.4,1011.29,1016.53,1021.48,1042.46
windSpeed,503909.0,6.649928,3.982716,0.0,3.66,5.93,8.94,22.91
windBearing,503909.0,202.356533,106.520351,0.0,148.0,208.0,295.0,359.0
precipIntensity,503909.0,0.002598,0.011257,0.0,0.0,0.0,0.0,0.191
dewPoint,503909.0,38.693993,19.087953,-27.24,24.6,39.03,54.79,75.49
precipProbability,503909.0,0.056453,0.165836,0.0,0.0,0.0,0.0,0.84


Set X and y 

In [3]:
X = df.drop(['Gen [kw]'], axis=1)
y = df['Gen [kw]']

Feature Scaling

In [5]:
sc = StandardScaler()
X = sc.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Hyperparameter Tuning

Using Grid_CV to find best hyperparameter for each model

Ridge

In [6]:
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model1 = Ridge()
grid = GridSearchCV(estimator=model1, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 0.e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
0.9999996141108766
0.1


Lasso

In [7]:
# create and fit a lasso regression model, testing each alpha
model2 = Lasso()
grid = GridSearchCV(estimator=model2, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)

  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 0.e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
0.9999982184093008
0.0001


Random Forest

In [13]:
model3 = RandomForestRegressor()
grid = GridSearchCV(estimator=model3, param_grid={
            'max_depth': range(3,7),
            'n_estimators': (10, 50, 100, 1000)},
                cv=5)
grid.fit(X_test, y_test)
# summarize the results of the grid search
print(grid.best_score_)
print(grid_result.best_params_)

KeyboardInterrupt: 

### Kfold Cross Validation - Model Selection

Splitting X and y

In [8]:
# Split-out validation dataset
validation_size = 0.20
seed = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=validation_size, random_state=seed)

Fit different regression models and evaluate it's Rsquare scores

In [15]:
# Spot Check Algorithms
models = []
models.append(('Linear', LinearRegression()))
models.append(('Ridge', Ridge(alpha=0.01)))
models.append(('Lasso', Lasso(alpha=0.0001)))
models.append(('DecisionTree', DecisionTreeRegressor(max_depth=4)))
models.append(('SVM', SVR(kernel='rbf', degree=3, gamma='auto',shrinking=True)))
#models.append(('RandomForest', RandomForestRegressor()))
models.append(('AdaBoostRegressor', AdaBoostRegressor()))
models.append(('BaggingRegressor', BaggingRegressor()))
models.append(('GradientBoosting', GradientBoostingRegressor()))

# evaluate each model in turn
training_results = []
testing_results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10, random_state=42)
    train_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2')
    test_results = cross_val_score(model, X_test, y_test, cv=kfold, scoring='r2')
    training_results.append(train_results)
    testing_results.append(test_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, train_results.mean(),test_results.mean() )
    print(msg)

Linear: 1.000000 (1.000000)
Ridge: 1.000000 (1.000000)
Lasso: 0.999998 (0.999998)
DecisionTree: 0.964105 (0.963847)
SVM: 0.762947 (0.757317)
AdaBoostRegressor: 0.967856 (0.971058)
BaggingRegressor: 0.999980 (0.999910)
GradientBoosting: 0.999541 (0.999505)
