In [1]:
#pip install autofeat

In [1]:
import os
import sys
import warnings
import sklearn
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from autofeat import AutoFeatRegressor
from boruta import BorutaPy

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Silence every FutureWarning for the rest of the session.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
# Do not warn on floating-point division by zero. np.seterr returns the
# previous error settings, which is the dict shown as this cell's output.
np.seterr(divide = 'ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
# Regression dataset loader.
def load_regression_dataset(name):
    """Load one of the supported scikit-learn toy regression datasets.

    Args:
        name: Dataset identifier, either "boston" or "diabetes".

    Returns:
        Tuple ``(X, y, units)``: ``X`` and ``y`` are converted to float NumPy
        arrays; ``units`` is always an empty dict.

    Raises:
        RuntimeError: If ``name`` is not one of the known dataset names.
    """
    # return_X_y must be passed by keyword: recent scikit-learn releases reject
    # it as a positional argument.
    # NOTE(review): load_boston was removed in scikit-learn 1.2, so the
    # "boston" branch only works with older releases.
    if name == "boston":
        X, y = load_boston(return_X_y=True)
    elif name == "diabetes":
        X, y = load_diabetes(return_X_y=True)
    else:
        raise RuntimeError("Unknown dataset %r" % name)
    units = {}
    return np.array(X, dtype=float), np.array(y, dtype=float), units

In [3]:
#módosított modell tesztelő fv.
def test_model(X, y, model, param_grid):
    # load data
    #X, y, _ = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    
    if model.__class__.__name__ == "SVR":
        sscaler = StandardScaler()
        X_train = sscaler.fit_transform(X_train)
        X_test = sscaler.transform(X_test)
        
    # train model on train split incl cross-validation for parameter selection
    gsmodel = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test)))
    return gsmodel.best_estimator_

In [4]:
#módosított autofeat modell tesztelő fv.
def test_autofeat(X, y, units, feateng_steps=2):
    # load data
    #X, y, units = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    # run autofeat
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=feateng_steps, units=units)
    # fit autofeat on less data, otherwise ridge reg model with xval will overfit on new features
    X_train_tr = afreg.fit_transform(X_train, y_train)
    X_test_tr = afreg.transform(X_test)
    print("autofeat new features:", len(afreg.new_feat_cols_))
    print("autofeat MSE on training data:", mean_squared_error(y_train, afreg.predict(X_train_tr)))
    print("autofeat MSE on test data:", mean_squared_error(y_test, afreg.predict(X_test_tr)))
    print("autofeat R^2 on training data:", r2_score(y_train, afreg.predict(X_train_tr)))
    print("autofeat R^2 on test data:", r2_score(y_test, afreg.predict(X_test_tr)))
    
    
    # train rreg on transformed train split incl cross-validation for parameter selection
    print("# Ridge Regression")
    rreg = Ridge()
    param_grid = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000.]}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gsmodel = GridSearchCV(rreg, param_grid, scoring='neg_mean_squared_error', cv=5)
        gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))
    
    
    print("# Random Forest")
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    param_grid = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    gsmodel = GridSearchCV(rforest, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))
    
    print("# SVR")
    svr = SVR(gamma="scale")
    param_grid = {"C": [1., 10., 25., 50., 100., 250.]}
    sscaler = StandardScaler()
    X_train_tr = sscaler.fit_transform(X_train_tr)
    X_test_tr = sscaler.transform(X_test_tr)

    gsmodel = GridSearchCV(svr, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))

In [5]:
# Load the data set. No train/test split happens in this cell; test_model and
# test_autofeat split the data themselves.
dsname = 'diabetes'
print("####", dsname)
X, y, units = load_regression_dataset(dsname)
print(X.shape)

#### diabetes
(442, 10)


In [6]:
# Base estimator for Boruta. It is deliberately not fitted here: BorutaPy sets
# the estimator's n_estimators / random_state itself and refits it on every
# iteration, so a fit beforehand would be thrown away.
randforreg = RandomForestRegressor(random_state=13)
# define Boruta feature selection method
feat_selector = BorutaPy(randforreg, n_estimators='auto', verbose=2, random_state=11)
# find all relevant features
# NOTE(review): the selector is fitted on the full data set, i.e. including the
# rows that later end up in the test split, so the downstream test scores are
# likely optimistic -- consider selecting on the training part only.
feat_selector.fit(X, y)
# call transform() on X to filter it down to selected features
XS = feat_selector.transform(X)
XS.shape

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	3
Tentative: 	2
Rejected: 	5
Iteration: 	9 / 100
Confirmed: 	3
Tentative: 	2
Rejected: 	5
Iteration: 	10 / 100
Confirmed: 	3
Tentative: 	2
Rejected: 	5
Iteration: 	11 / 100
Confirmed: 	3
Tentative: 	2
Rejected: 	5
Iteration: 	12 / 100
Confirmed: 	3
Tentative: 	2
Rejected: 	5
Iteration: 	13 / 100
Confirmed: 	3
Tentative: 	2
Rejected: 	5
Iteration: 	14 / 100
Confirmed: 	3
Tentative: 	2
Rejected: 	5
Iteration: 	15 / 100
Confirmed: 	3
Tentative: 	2
Rejected: 	5
Iteration: 	16 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	5
Iteration:

(442, 4)

In [7]:
# Generate new features with autofeat (one feature-engineering step) ...
afreg = AutoFeatRegressor(verbose=1, feateng_steps=1, units=units)
# ... and keep the best ones; fit_transform is run on the Boruta-selected
# columns XS together with the full target vector y.
XS_af = afreg.fit_transform(XS, y)
XS_af.shape

[AutoFeat] The 1 step feature engineering process could generate up to 28 features.
[AutoFeat] With 442 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 20 transformed features from 4 original features - done.
[feateng] Generated altogether 20 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 16 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 10 features after 5 feature selection runs
[featsel] 9 features after correlation filtering
[featsel] 6 features after noise filtering
[AutoFeat] Computing 3 new features.
[AutoFeat]     3/    3 new features ...done.
[AutoFeat] Final dataframe with 7 feature columns (3 new).
[Aut

(442, 7)

In [8]:
# Base estimator for the second Boruta pass. It is not fitted here: BorutaPy
# sets the estimator's n_estimators / random_state itself and refits it on
# every iteration, so a fit beforehand would be thrown away.
randforreg = RandomForestRegressor(random_state=13)
# define Boruta feature selection method
feat_selector = BorutaPy(randforreg, n_estimators='auto', verbose=2, random_state=11)
# XS_af is a DataFrame; convert it once to the plain array handed to BorutaPy
XS_af_values = XS_af.to_numpy()
# find all relevant features
feat_selector.fit(XS_af_values, y)
# call transform() to filter the autofeat features down to the selected ones
XSS_af = feat_selector.transform(XS_af_values)
XSS_af.shape

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	7
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	4
Tentative: 	2
Rejected: 	1
Iteration: 	9 / 100
Confirmed: 	4
Tentative: 	2
Rejected: 	1
Iteration: 	10 / 100
Confirmed: 	4
Tentative: 	2
Rejected: 	1
Iteration: 	11 / 100
Confirmed: 	4
Tentative: 	2
Rejected: 	1
Iteration: 	12 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	2
Iteration: 	13 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	2
Iteration: 	14 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	2
Iteration: 	15 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	2
Iteration: 	16 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	2
Iteration: 	17 / 

(442, 5)

## Adathalmaz tesztelése

In [9]:
# Label printed in the headers of the evaluation cells below. This repeats the
# value assigned when the data was loaded; it does not reload anything.
dsname='diabetes'

In [10]:
# Ridge regression on the selected features
# NOTE(review): the recorded run picked alpha=1e-05, the smallest value in the
# grid -- consider extending the grid downwards.
print("####", dsname)
rreg = Ridge()
params = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000., 25000., 50000., 100000.]}
rreg = test_model(XSS_af, y, rreg, params)

#### diabetes
best params: {'alpha': 1e-05}
best score: -3139.048328329742
MSE on training data: 2987.7712953501004
MSE on test data: 3280.7156136311824
R^2 on training data: 0.5136110370109136
R^2 on test data: 0.3511508234244992


In [11]:
# SVR regression on the selected features (test_model standardizes them for SVR)
print("####", dsname)
svr = SVR(gamma="scale")
params = {"C": [1., 10., 25., 50., 100., 250.]}
svr = test_model(XSS_af, y, svr, params)

#### diabetes
best params: {'C': 25.0}
best score: -3323.170349705336
MSE on training data: 2758.183646603716
MSE on test data: 3476.2526671092073
R^2 on training data: 0.5509863537102365
R^2 on test data: 0.31247814615492364


In [12]:
# Random forest regression on the selected features; only min_samples_leaf
# (given as a fraction of the samples) is tuned.
print("####", dsname)
rforest = RandomForestRegressor(n_estimators=100, random_state=13)
params = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
rforest = test_model(XSS_af, y, rforest, params)

#### diabetes
best params: {'min_samples_leaf': 0.05}
best score: -3368.395081756684
MSE on training data: 2595.3140594756883
MSE on test data: 3263.464320717948
R^2 on training data: 0.577500421138674
R^2 on test data: 0.3545627275697737


In [13]:
# autofeat regression with 1 feature-engineering step
# NOTE(review): XSS_af already contains autofeat features that were generated
# and selected on the full data set, so the test scores printed here are
# likely optimistic.
print("####", dsname)
test_autofeat(XSS_af, y, units, feateng_steps=1)

#### diabetes
[AutoFeat] The 1 step feature engineering process could generate up to 35 features.
[AutoFeat] With 353 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 23 transformed features from 5 original features - done.
[feateng] Generated altogether 25 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 18 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 11 features after 5 feature selection runs
[featsel] 11 features after correlation filtering
[featsel] 10 features after noise filtering
[AutoFeat] Computing 6 new features.
[AutoFeat]     6/    6 new features ...done.
[AutoFeat] Final dataframe with 11 feature colu

In [14]:
# autofeat regression with 2 feature-engineering steps
print("####", dsname)
test_autofeat(XSS_af, y, units, feateng_steps=2)

#### diabetes
[AutoFeat] The 2 step feature engineering process could generate up to 630 features.
[AutoFeat] With 353 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 23 transformed features from 5 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 321 feature combinations from 378 original feature tuples - done.
[feateng] Generated altogether 387 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 199 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 17 features after 5 feature selection runs
[featsel] 16 features after correlation filtering
[featsel] 7 features after noise fil

In [15]:
# autofeat regression with 3 feature-engineering steps
print("####", dsname)
test_autofeat(XSS_af, y, units, feateng_steps=3)

#### diabetes
[AutoFeat] The 3 step feature engineering process could generate up to 14910 features.
[AutoFeat] With 353 data points this new feature matrix would use about 0.02 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 23 transformed features from 5 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 1396 feature combinations from 378 original feature tuples - done.
[feateng] Step 3: transformation of new features
[feateng] Generated 5794 transformed features from 1396 original features - done.
[feateng] Generated altogether 7828 new features in 3 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 3387 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[fe