In [1]:
#pip install autofeat

In [2]:
import os
import sys
import warnings
import sklearn
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from autofeat import AutoFeatRegressor
from boruta import BorutaPy

%matplotlib inline
%load_ext autoreload
%autoreload 2

# ignore all future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
np.seterr(divide = 'ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
#regr adatbeolvasó fv.
def load_regression_dataset(name):
    units = {}
    if name == "boston":
        X, y = load_boston(True)
    elif name == "diabetes":
        X, y = load_diabetes(True)
    else:
        raise RuntimeError("Unknown dataset %r" % name)
    return np.array(X, dtype=float), np.array(y, dtype=float), units

In [4]:
#módosított modell tesztelő fv.
def test_model(X, y, model, param_grid):
    # load data
    #X, y, _ = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    
    if model.__class__.__name__ == "SVR":
        sscaler = StandardScaler()
        X_train = sscaler.fit_transform(X_train)
        X_test = sscaler.transform(X_test)
        
    # train model on train split incl cross-validation for parameter selection
    gsmodel = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test)))
    return gsmodel.best_estimator_

In [5]:
#módosított autofeat modell tesztelő fv.
def test_autofeat(X, y, units, feateng_steps=2):
    # load data
    #X, y, units = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    # run autofeat
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=feateng_steps, units=units)
    # fit autofeat on less data, otherwise ridge reg model with xval will overfit on new features
    X_train_tr = afreg.fit_transform(X_train, y_train)
    X_test_tr = afreg.transform(X_test)
    print("autofeat new features:", len(afreg.new_feat_cols_))
    print("autofeat MSE on training data:", mean_squared_error(y_train, afreg.predict(X_train_tr)))
    print("autofeat MSE on test data:", mean_squared_error(y_test, afreg.predict(X_test_tr)))
    print("autofeat R^2 on training data:", r2_score(y_train, afreg.predict(X_train_tr)))
    print("autofeat R^2 on test data:", r2_score(y_test, afreg.predict(X_test_tr)))
    
    
    # train rreg on transformed train split incl cross-validation for parameter selection
    print("# Ridge Regression")
    rreg = Ridge()
    param_grid = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000.]}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gsmodel = GridSearchCV(rreg, param_grid, scoring='neg_mean_squared_error', cv=5)
        gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))
    
    
    print("# Random Forest")
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    param_grid = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    gsmodel = GridSearchCV(rforest, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))
    
    
    print("# SVR")
    svr = SVR(gamma="scale")
    param_grid = {"C": [1., 10., 25., 50., 100., 250.]}
    sscaler = StandardScaler()
    X_train_tr = sscaler.fit_transform(X_train_tr)
    X_test_tr = sscaler.transform(X_test_tr)
    gsmodel = GridSearchCV(svr, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))

In [6]:
#adatok beolvasása
X, y, units = load_regression_dataset("diabetes")
#új változók generálása autofeattel
afreg = AutoFeatRegressor(verbose=1, feateng_steps=1, units=units)
#és a legjobbak kiválasztása
X_af = afreg.fit_transform(X, y)
X_af.shape

[AutoFeat] The 1 step feature engineering process could generate up to 70 features.
[AutoFeat] With 442 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 45 transformed features from 10 original features - done.
[feateng] Generated altogether 45 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 36 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 14 features after 5 feature selection runs
[featsel] 13 features after correlation filtering
[featsel] 11 features after noise filtering
[AutoFeat] Computing 6 new features.
[AutoFeat]     6/    6 new features ...done.
[AutoFeat] Final dataframe with 16 feature columns (6 new).


(442, 16)

In [7]:
randforreg = RandomForestRegressor(random_state=13)
randforreg.fit(X_af, y)
# define Boruta feature selection method
feat_selector = BorutaPy(randforreg, n_estimators='auto', verbose=2, random_state=11)
# find all relevant features
feat_selector.fit(X_af.iloc[:,:].values, y)
# call transform() on X to filter it down to selected features
XS_af = feat_selector.transform(X_af.iloc[:,:].values)
XS_af.shape

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	3
Tentative: 	4
Rejected: 	9
Iteration: 	9 / 100
Confirmed: 	3
Tentative: 	4
Rejected: 	9
Iteration: 	10 / 100
Confirmed: 	3
Tentative: 	4
Rejected: 	9
Iteration: 	11 / 100
Confirmed: 	3
Tentative: 	4
Rejected: 	9
Iteration: 	12 / 100
Confirmed: 	3
Tentative: 	3
Rejected: 	10
Iteration: 	13 / 100
Confirmed: 	3
Tentative: 	3
Rejected: 	10
Iteration: 	14 / 100
Confirmed: 	3
Tentative: 	3
Rejected: 	10
Iteration: 	15 / 100
Confirmed: 	3
Tentative: 	3
Rejected: 	10
Iteration: 	16 / 100
Confirmed: 	3
Tentative: 	2
Rejected: 	11
Itera

(442, 4)

## Adathalmaz tesztelése

In [8]:
dsname='diabetes'

In [9]:
#Ridge regresszió
print("####", dsname)
rreg = Ridge()
params = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000., 25000., 50000., 100000.]}
rreg = test_model(XS_af, y, rreg, params)

#### diabetes
best params: {'alpha': 1e-05}
best score: -3119.837089002782
MSE on training data: 2997.335571347947
MSE on test data: 3241.406425322461
R^2 on training data: 0.5120540375539693
R^2 on test data: 0.35892526579310635


In [10]:
#SVR regresszió
print("####", dsname)
svr = SVR(gamma="scale")
params = {"C": [1., 10., 25., 50., 100., 250.]}
svr = test_model(XS_af, y, svr, params)

#### diabetes
best params: {'C': 25.0}
best score: -3264.700288701071
MSE on training data: 2873.2778736733253
MSE on test data: 3330.028841935485
R^2 on training data: 0.5322497918329803
R^2 on test data: 0.3413978148288801


In [11]:
#random forest regresszió
print("####", dsname)
rforest = RandomForestRegressor(n_estimators=100, random_state=13)
params = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
rforest = test_model(XS_af, y, rforest, params)

#### diabetes
best params: {'min_samples_leaf': 0.05}
best score: -3385.5534792766075
MSE on training data: 2630.1808994838243
MSE on test data: 3320.5738004705527
R^2 on training data: 0.5718243353617415
R^2 on test data: 0.34326780192667017


In [12]:
#autofeat regresszió 1 steppel
print("####", dsname)
test_autofeat(XS_af, y, units, feateng_steps=1)

#### diabetes
[AutoFeat] The 1 step feature engineering process could generate up to 28 features.
[AutoFeat] With 353 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 18 transformed features from 4 original features - done.
[feateng] Generated altogether 20 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 14 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 8 features after 5 feature selection runs
[featsel] 8 features after correlation filtering
[featsel] 8 features after noise filtering
[AutoFeat] Computing 5 new features.
[AutoFeat]     5/    5 new features ...done.
[AutoFeat] Final dataframe with 9 feature columns 

In [13]:
#autofeat regresszió 2 steppel
print("####", dsname)
warnings.simplefilter("ignore")
test_autofeat(XS_af, y, units, feateng_steps=2)

#### diabetes
[AutoFeat] The 2 step feature engineering process could generate up to 406 features.
[AutoFeat] With 353 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 18 transformed features from 4 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 192 feature combinations from 231 original feature tuples - done.
[feateng] Generated altogether 239 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 113 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 7 features after 5 feature selection runs
[featsel] 7 features after correlation filtering
[featsel] 3 features after noise filte

In [14]:
#autofeat regresszió 3 steppel
print("####", dsname)
warnings.simplefilter("ignore")
test_autofeat(XS_af, y, units, feateng_steps=3)

#### diabetes
[AutoFeat] The 3 step feature engineering process could generate up to 9478 features.
[AutoFeat] With 353 data points this new feature matrix would use about 0.01 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 18 transformed features from 4 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 837 feature combinations from 231 original feature tuples - done.
[feateng] Step 3: transformation of new features
[feateng] Generated 3447 transformed features from 837 original features - done.
[feateng] Generated altogether 4700 new features in 3 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 1955 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[feats