In [6]:
import os
import sys
import warnings
import sklearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy

from autofeat import AutoFeatRegressor

%matplotlib inline
%load_ext autoreload
%autoreload 2

# ignore all future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
# Silence NumPy's divide-by-zero floating-point warnings for the whole kernel.
# np.seterr returns the previous error settings, which is why this cell
# displays a settings dict as its output.
np.seterr(divide = 'ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
# same interface for loading all datasets - adapt the datapath
# to where you've downloaded (and renamed) the datasets
def load_regression_dataset(name, datapath="../datasets/regression/"):
    """Load one of the supported regression datasets as ``(X, y, units)``.

    Args:
        name: Dataset identifier; ``"boston"`` or ``"diabetes"``.
        datapath: Directory for datasets stored on disk. Kept so all loaders
            share one interface; the two scikit-learn datasets handled here
            do not read from it.

    Returns:
        A tuple ``(X, y, units)`` where ``X`` and ``y`` are float NumPy arrays
        and ``units`` is a dict (empty for both datasets handled here).

    Raises:
        RuntimeError: If ``name`` is not one of the supported datasets.
    """
    units = {}
    if name == "boston":
        # sklearn boston housing dataset
        # return_X_y is passed by keyword: scikit-learn made it keyword-only,
        # so the positional form `load_boston(True)` raises TypeError there.
        X, y = load_boston(return_X_y=True)
    elif name == "diabetes":
        # sklearn diabetes dataset
        X, y = load_diabetes(return_X_y=True)
    else:
        raise RuntimeError("Unknown dataset %r" % name)
    return np.array(X, dtype=float), np.array(y, dtype=float), units

In [3]:
def test_model(X, y, model, param_grid):
    # load data
    #X, y, _ = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    
    if model.__class__.__name__ == "SVR":
        sscaler = StandardScaler()
        X_train = sscaler.fit_transform(X_train)
        X_test = sscaler.transform(X_test)
        
    # train model on train split incl cross-validation for parameter selection
    gsmodel = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test)))
    return gsmodel.best_estimator_

def test_autofeat(X, y, units, feateng_steps):
    # load data
    #X, y, units = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    # run autofeat
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=feateng_steps, units=units)
    # fit autofeat on less data, otherwise ridge reg model with xval will overfit on new features
    X_train_tr = afreg.fit_transform(X_train, y_train)
    X_test_tr = afreg.transform(X_test)
    print("autofeat new features:", len(afreg.new_feat_cols_))
    print("autofeat MSE on training data:", mean_squared_error(y_train, afreg.predict(X_train_tr)))
    print("autofeat MSE on test data:", mean_squared_error(y_test, afreg.predict(X_test_tr)))
    print("autofeat R^2 on training data:", r2_score(y_train, afreg.predict(X_train_tr)))
    print("autofeat R^2 on test data:", r2_score(y_test, afreg.predict(X_test_tr)))
    
    
    # train rreg on transformed train split incl cross-validation for parameter selection
    print("# Ridge Regression")
    rreg = Ridge()
    param_grid = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000.]}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gsmodel = GridSearchCV(rreg, param_grid, scoring='neg_mean_squared_error', cv=5)
        gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))
    
    
    print("# Random Forest")
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    param_grid = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    gsmodel = GridSearchCV(rforest, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))
    
    
    print("# SVR")
    svr = SVR(gamma="scale")
    param_grid = {"C": [1., 10., 25., 50., 100., 250.]}
    sscaler = StandardScaler()
    X_train_tr = sscaler.fit_transform(X_train_tr)
    X_test_tr = sscaler.transform(X_test_tr)

    gsmodel = GridSearchCV(svr, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))

In [4]:
# Choose the dataset once; the cells below reuse `dsname`, `X` and `y`.
dsname = 'boston'
print("####", dsname)
# The third return value (the units dict) is bound to `_`; the autofeat cells
# further down pass that `_` on as the `units` argument of test_autofeat.
X, y, _ = load_regression_dataset(dsname)
print(X.shape)

#### boston
(506, 13)


In [10]:
# Base estimator for Boruta. It is intentionally not fitted here: BorutaPy sets
# the estimator's parameters and refits it itself in every iteration, so a
# separate fit beforehand would only be discarded work.
randforreg = RandomForestRegressor(random_state=13)
# define Boruta feature selection method
feat_selector = BorutaPy(randforreg, n_estimators='auto', verbose=2, random_state=11)
# find all relevant features
# NOTE(review): selection is fitted on the full dataset, including the rows that
# test_model / test_autofeat later hold out as the test split, so the test
# scores reported below may be optimistic.
feat_selector.fit(X, y)
# call transform() on X to filter it down to selected features
XS = feat_selector.transform(X)
XS.shape

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	13
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	13
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	13
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	13
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	13
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	13
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	13
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	3
Iteration: 	9 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	3
Iteration: 	10 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	3
Iteration: 	11 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	3
Iteration: 	12 / 100
Confirmed: 	9
Tentative: 	1
Rejected: 	3
Iteration: 	13 / 100
Confirmed: 	9
Tentative: 	1
Rejected: 	3
Iteration: 	14 / 100
Confirmed: 	9
Tentative: 	1
Rejected: 	3
Iteration: 	15 / 100
Confirmed: 	9
Tentative: 	1
Rejected: 	3
Iteration: 	16 / 100
Confirmed: 	9
Tentative: 	0
Rejected: 	4


BorutaPy

(506, 9)

In [11]:
print("####", dsname)
# Ridge baseline on the Boruta-selected features; the regularisation strength
# is picked by the grid search inside test_model.
params = {
    "alpha": [
        1e-5, 1e-4, 1e-3, 1e-2, 1e-1,
        1., 2.5, 5., 10., 25., 50.,
        100., 250., 500., 1000., 2500., 5000.,
        10000., 25000., 50000., 100000.,
    ]
}
# `rreg` ends up bound to the best estimator returned by test_model
rreg = test_model(XS, y, Ridge(), params)

#### boston
best params: {'alpha': 1e-05}
best score: -27.644942558520608
MSE on training data: 24.725302258259475
MSE on test data: 21.010716640003164
R^2 on training data: 0.7091323414872319
R^2 on test data: 0.7428632906956836


In [12]:
print("####", dsname)
# Random-forest baseline on the Boruta-selected features; only the minimum
# leaf size is tuned by the grid search inside test_model.
params = {"min_samples_leaf": [1e-4, 1e-3, 1e-2, 0.05, 0.1, 0.2]}
# `rforest` ends up bound to the best estimator returned by test_model
rforest = test_model(
    XS,
    y,
    RandomForestRegressor(n_estimators=100, random_state=13),
    params,
)

#### boston
best params: {'min_samples_leaf': 0.0001}
best score: -10.402274233950614
MSE on training data: 1.4176682351485144
MSE on test data: 11.003037990196086
R^2 on training data: 0.9833225965936239
R^2 on test data: 0.8653408624928761


In [13]:
print("####", dsname)
# SVR baseline on the Boruta-selected features; test_model standardizes the
# features for this estimator and tunes the penalty C.
params = {"C": [1., 10., 25., 50., 100., 250.]}
# `svr` ends up bound to the best estimator returned by test_model
svr = test_model(XS, y, SVR(gamma="scale"), params)

#### boston
best params: {'C': 100.0}
best score: -11.360914527717174
MSE on training data: 3.3998257118459576
MSE on test data: 8.582564383431839
R^2 on training data: 0.9600045599513026
R^2 on test data: 0.8949634893106739


In [14]:
print("####", dsname)
# autofeat with a single feature-engineering step on the Boruta-selected features.
# `_` is the units dict unpacked from load_regression_dataset in the
# dataset-loading cell.
# NOTE(review): this relies on `_` not having been rebound since then - confirm
# with a fresh Restart & Run All.
test_autofeat(XS, y, _, feateng_steps=1)

#### boston
[AutoFeat] The 1 step feature engineering process could generate up to 63 features.
[AutoFeat] With 404 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 47 transformed features from 9 original features - done.
[feateng] Generated altogether 47 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 16 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 13 features after 5 feature selection runs
[featsel] 13 features after correlation filtering
[featsel] 11 features after noise filtering
[AutoFeat] Computing 4 new features.
[AutoFeat]     4/    4 new features ...done.
[AutoFeat] Final dataframe with 13 feature column

In [15]:
print("####", dsname)
# autofeat with two feature-engineering steps on the Boruta-selected features.
# `_` is the units dict unpacked from load_regression_dataset in the
# dataset-loading cell.
# NOTE(review): this relies on `_` not having been rebound since then - confirm
# with a fresh Restart & Run All.
test_autofeat(XS, y, _, feateng_steps=2)

#### boston
[AutoFeat] The 2 step feature engineering process could generate up to 2016 features.
[AutoFeat] With 404 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 47 transformed features from 9 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 1504 feature combinations from 1540 original feature tuples - done.
[feateng] Generated altogether 1552 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 684 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 41 features after 5 feature selection runs
[featsel] 32 features after correlation filtering
[featsel] 26 features after noise 

In [17]:
# takes a long time to run, therefore left disabled
#print("####", dsname)
#test_autofeat(XS, y, _, feateng_steps=3)