In [2]:
run src/preprocessing-dataset2.py

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [4]:
import warnings
# NOTE(review): blanket filter — this silences every warning category from here on,
# so any convergence or deprecation warnings emitted later in this kernel will not be shown.
warnings.filterwarnings('ignore')

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from sklearn.svm import LinearSVR

Reference: [scikit-learn — choosing the right estimator](http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)

<img src="https://www.evernote.com/l/AAEeKAnYmjFJX7GnDLsoUChT8idXE1JrFiIB/image.png">

### Most Appropriate Estimators

In [6]:
# Hyper-parameter grids for the first round of (linear) regressors.
# Regularisation strengths span seven decades: 1e-1, 1e0, ..., 1e5.
gs_param_lasso = dict(alpha=np.logspace(-1, 5, num=7))

gs_param_ridge = dict(alpha=np.logspace(-1, 5, num=7))

# SGD additionally searches over the penalty type.
gs_param_sgd = dict(
    penalty=['l1', 'l2'],
    alpha=np.logspace(-1, 5, num=7),
)

# LinearSVR is tuned through C; seven log-spaced values from 1e-5 to 1e5.
gs_param_linear_svr = dict(C=np.logspace(-5, 5, num=7))

In [7]:
# Sanity check: shapes of the intermediate frames, in the order
# categorical / numeric / numeric-PCA / target.
tuple(
    frame.shape
    for frame in (
        categorical_encoded_out_rem_df,
        numeric_log_std_sc_out_rem_df,
        numeric_log_std_sc_out_rem_pca_df,
        target_log_std_sc_out_rem_df,
    )
)

((1444, 359), (1444, 23), (1444, 8), (1444,))

In [8]:
# Shapes of the design matrix and target that every search below is fitted on.
(dataset_2.shape, target_2.shape)

((1444, 390), (1444,))

In [9]:
# One exhaustive grid search per linear model. No `cv` or `scoring` is passed, so
# GridSearchCV falls back to its default cross-validation and the estimator's own score().
lasso_grid_search     = GridSearchCV(Lasso(), param_grid=gs_param_lasso, n_jobs=-1)
ridge_grid_search     = GridSearchCV(Ridge(), param_grid=gs_param_ridge, n_jobs=-1)
# Fix: this search used to receive gs_param_ridge, so the 'penalty' choices declared in
# gs_param_sgd were never explored. Use the grid written for SGDRegressor.
sgd_grid_search       = GridSearchCV(SGDRegressor(), param_grid=gs_param_sgd, n_jobs=-1)
linearsvr_grid_search = GridSearchCV(LinearSVR(), param_grid=gs_param_linear_svr, n_jobs=-1)

In [10]:
# Run the four linear-model searches on the same data, in construction order.
# The final fit() is the cell's last expression, so its repr is what gets displayed.
lasso_grid_search.fit(X=dataset_2, y=target_2)
ridge_grid_search.fit(X=dataset_2, y=target_2)
sgd_grid_search.fit(X=dataset_2, y=target_2)
linearsvr_grid_search.fit(X=dataset_2, y=target_2)

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([  1.00000e-05,   4.64159e-04,   2.15443e-02,   1.00000e+00,
         4.64159e+01,   2.15443e+03,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
# Pull the winning estimator out of each fitted search.
best_lasso, best_ridge, best_sgd, best_linearsvr = (
    fitted_search.best_estimator_
    for fitted_search in (
        lasso_grid_search,
        ridge_grid_search,
        sgd_grid_search,
        linearsvr_grid_search,
    )
)

In [12]:
# Best mean cross-validated score per search, in the order Lasso / Ridge / SGD / LinearSVR.
tuple(
    fitted_search.best_score_
    for fitted_search in (
        lasso_grid_search,
        ridge_grid_search,
        sgd_grid_search,
        linearsvr_grid_search,
    )
)

(0.89403825833193273,
 0.88839091976836726,
 0.84664091472483538,
 0.8802934949410639)

In [20]:
# Display the tuned Lasso to see which hyper-parameters the grid search selected.
best_lasso

Lasso(alpha=100.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

### Next Level

In [23]:
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor

In [24]:
# Hyper-parameter grids for the second round of (non-linear / ensemble) regressors.

# RBF-kernel SVR: seven log-spaced C values from 1e-5 to 1e5.
gs_param_svr = dict(
    kernel=['rbf'],
    C=np.logspace(-5, 5, num=7),
)

# Intentionally empty: nothing is tuned for AdaBoost.
gs_param_adaboost = dict()

gs_param_gradboost = dict(
    max_depth=list(range(1, 6)),
    max_features=['sqrt', 'auto', 'log2'],
)

gs_param_random_forest = dict(
    n_estimators=[10, 20, 50, 100],
    max_features=['sqrt', 'auto', 'log2'],
)

In [44]:
# Second-round searches: each model is paired with its gs_param_* grid and uses all cores.
svr_grid_search = GridSearchCV(
    estimator=SVR(),
    param_grid=gs_param_svr,
    n_jobs=-1,
)
adaboost_grid_search = GridSearchCV(
    estimator=AdaBoostRegressor(),
    param_grid=gs_param_adaboost,
    n_jobs=-1,
)
gradboost_grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    param_grid=gs_param_gradboost,
    n_jobs=-1,
)
# A second, independently constructed gradient-boosting search over the same grid.
gradboost_grid_search_4 = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    param_grid=gs_param_gradboost,
    n_jobs=-1,
)
random_forest_grid_search = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=gs_param_random_forest,
    n_jobs=-1,
)

In [26]:
# Fit the second-round searches on the same data; the last fit() is the displayed value.
svr_grid_search.fit(X=dataset_2, y=target_2)
adaboost_grid_search.fit(X=dataset_2, y=target_2)
gradboost_grid_search.fit(X=dataset_2, y=target_2)
random_forest_grid_search.fit(X=dataset_2, y=target_2)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 20, 50, 100], 'max_features': ['sqrt', 'auto', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [45]:
# Fit the second gradient-boosting search on the full dataset.
gradboost_grid_search_4.fit(X=dataset_2, y=target_2)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 2, 3, 4, 5], 'max_features': ['sqrt', 'auto', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [46]:
# Best mean cross-validated score found by the second gradient-boosting search.
gradboost_grid_search_4.best_score_

0.89928971367227428

In [39]:
import pickle

In [41]:
# `sklearn.externals.joblib` is only a vendored copy that was deprecated and then removed
# from scikit-learn; prefer the standalone package and fall back to the vendored module
# so the cell still works on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the tuned Lasso to disk so it can be reloaded without repeating the grid search.
joblib.dump(best_lasso, 'best_lasso.p')

In [43]:
# Reload the persisted model. joblib.load unpickles the file, so only use it on files
# this notebook (or another trusted source) produced.
lasso_model = joblib.load('best_lasso.p')
# Echo the restored estimator: an assignment alone displays nothing.
lasso_model

Lasso(alpha=100.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [27]:
# Pull the winning estimator out of each fitted second-round search.
best_svr, best_adaboost, best_gradboost, best_random_forest = (
    fitted_search.best_estimator_
    for fitted_search in (
        svr_grid_search,
        adaboost_grid_search,
        gradboost_grid_search,
        random_forest_grid_search,
    )
)

In [28]:
# Best mean cross-validated score per search, in the order
# SVR / AdaBoost / gradient boosting / random forest.
tuple(
    fitted_search.best_score_
    for fitted_search in (
        svr_grid_search,
        adaboost_grid_search,
        gradboost_grid_search,
        random_forest_grid_search,
    )
)

(0.85966755828557939,
 0.8230737895109127,
 0.90183742213517093,
 0.86273575036544237)

In [29]:
# Side-by-side comparison of the Lasso search and the gradient-boosting search.
(
    lasso_grid_search.best_score_,
    gradboost_grid_search.best_score_,
)

(0.89403509010330673, 0.90183742213517093)

### Neural Network

In [31]:
from sklearn.neural_network import MLPRegressor

In [35]:
# Grid for the MLP: three small architectures (one, two and three hidden layers)
# crossed with seven L2-penalty strengths from 1e-3 to 1e3.
gs_param_nn = dict(
    hidden_layer_sizes=[(8,), (4, 4), (2, 2, 2)],
    alpha=np.logspace(-3, 3, num=7),
)

In [36]:
# Grid search over the MLP settings in gs_param_nn, parallelised across all cores.
nn_grid_search = GridSearchCV(
    estimator=MLPRegressor(),
    param_grid=gs_param_nn,
    n_jobs=-1,
)

In [37]:
# Fit the MLP search on the same data as the other models.
nn_grid_search.fit(X=dataset_2, y=target_2)

GridSearchCV(cv=None, error_score='raise',
       estimator=MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'hidden_layer_sizes': [(8,), (4, 4), (2, 2, 2)], 'alpha': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [38]:
# Best mean cross-validated score of the MLP search. No `scoring` was passed when the
# search was built, so this is the estimator's default score (R^2 for scikit-learn regressors).
nn_grid_search.best_score_

-4.9637667598200998