In [1]:
import os
import sys
import warnings
import sklearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

from autofeat import AutoFeatClassifier

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Silence every FutureWarning from here on; the filter is installed globally,
# so it applies to all libraries used in the notebook.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
# Tell NumPy to ignore division errors instead of warning about them.
# This call is the cell's last expression, so its return value is what is
# displayed as the cell output.
np.seterr(divide = 'ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
# same interface for loading all datasets
def load_classification_dataset(name):
    """Load one of the scikit-learn toy classification datasets.

    Parameters
    ----------
    name : str
        One of "iris", "wine" or "breast_cancer".

    Returns
    -------
    X : np.ndarray
        Feature matrix from the scikit-learn loader, cast to float.
    y : np.ndarray
        Target labels from the scikit-learn loader, cast to float.
    units : dict
        Always empty here; returned so every dataset shares one signature.

    Raises
    ------
    RuntimeError
        If `name` is not one of the supported dataset names.
    """
    units = {}
    # `return_X_y` is passed by keyword: scikit-learn made this parameter
    # keyword-only, so the positional form `load_iris(True)` no longer works
    # on current releases.
    if name == "iris":
        # sklearn iris dataset
        X, y = load_iris(return_X_y=True)
    elif name == "wine":
        # sklearn wine dataset
        X, y = load_wine(return_X_y=True)
    elif name == "breast_cancer":
        # sklearn breast_cancer dataset
        X, y = load_breast_cancer(return_X_y=True)
    else:
        raise RuntimeError("Unknown dataset %r" % name)
    return np.array(X, dtype=float), np.array(y, dtype=float), units

In [3]:
def test_model(X, y, model, param_grid):
    """Tune `model` by grid search on a fixed train split and print its accuracy.

    The data is split 80/20 with random_state=12. When the model's class is
    named "SVC", the features are standardized with a scaler fit on the
    training part only. A 5-fold GridSearchCV over `param_grid` is then fit on
    the training part with warnings suppressed, and the best parameters, best
    cross-validation score, and train/test accuracy are printed.

    Parameters
    ----------
    X, y : array-like
        Features and targets to split and fit on.
    model : estimator
        Estimator passed to GridSearchCV.
    param_grid : dict
        Parameter grid passed to GridSearchCV.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X, y, test_size=0.2, random_state=12
    )

    # only SVC gets standardized inputs
    needs_scaling = model.__class__.__name__ == "SVC"
    if needs_scaling:
        scaler = StandardScaler()
        X_fit = scaler.fit_transform(X_fit)
        X_hold = scaler.transform(X_hold)

    # cross-validated parameter selection on the training part, warnings muted
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        search = GridSearchCV(model, param_grid, cv=5)
        search.fit(X_fit, y_fit)

    print("best params:", search.best_params_)
    print("best score:", search.best_score_)
    train_acc = accuracy_score(y_fit, search.predict(X_fit))
    print("Acc. on training data:", train_acc)
    test_acc = accuracy_score(y_hold, search.predict(X_hold))
    print("Acc. on test data:", test_acc)
    return search.best_estimator_

def _fit_grid_and_report(model, param_grid, X_train, y_train, X_test, y_test,
                         silence_warnings=False):
    """Fit a 5-fold GridSearchCV for `model` and print params, CV score and accuracies."""
    gsmodel = GridSearchCV(model, param_grid, cv=5)
    if silence_warnings:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            gsmodel.fit(X_train, y_train)
    else:
        gsmodel.fit(X_train, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("Acc. on training data:", accuracy_score(y_train, gsmodel.predict(X_train)))
    print("Acc. on test data:", accuracy_score(y_test, gsmodel.predict(X_test)))
    return gsmodel


def test_autofeat(X, y, units, feateng_steps=2):
    """Run autofeat on a fixed train split and evaluate three classifiers on its output.

    The data is split 80/20 with random_state=12. An AutoFeatClassifier is fit
    on the training part and used to transform both parts; the number of new
    feature columns and the accuracy of `afclas.predict` on the transformed
    train/test features are printed. Then logistic regression, a random
    forest, and an SVC (on standardized features) are each tuned with a
    5-fold grid search on the transformed training features, and their best
    parameters, CV score, and train/test accuracy are printed.

    Parameters
    ----------
    X, y : array-like
        Features and targets to split and fit on.
    units : dict
        Passed through to AutoFeatClassifier as `units`.
    feateng_steps : int, default 2
        Passed through to AutoFeatClassifier as `feateng_steps`.

    Returns
    -------
    None. All results are printed.
    """
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    # autofeat is fit on the training part only; the test part is just transformed
    afclas = AutoFeatClassifier(verbose=1, feateng_steps=feateng_steps, units=units)
    X_train_tr = afclas.fit_transform(X_train, y_train)
    X_test_tr = afclas.transform(X_test)
    print("autofeat new features:", len(afclas.new_feat_cols_))
    print("autofeat Acc. on training data:", accuracy_score(y_train, afclas.predict(X_train_tr)))
    print("autofeat Acc. on test data:", accuracy_score(y_test, afclas.predict(X_test_tr)))

    # logistic regression on the transformed features; only this search runs
    # with warnings suppressed, as in the original code
    print("# Logistic Regression")
    _fit_grid_and_report(
        LogisticRegression(class_weight="balanced"),
        {"C": np.logspace(-4, 4, 10)},
        X_train_tr, y_train, X_test_tr, y_test,
        silence_warnings=True,
    )

    print("# Random Forest")
    _fit_grid_and_report(
        RandomForestClassifier(n_estimators=100, random_state=13),
        {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]},
        X_train_tr, y_train, X_test_tr, y_test,
    )

    print("# SVC")
    # SVC is trained on standardized features; the scaler is fit on the
    # training part only and kept under separate names so the unscaled
    # autofeat output is not overwritten
    sscaler = StandardScaler()
    X_train_sc = sscaler.fit_transform(X_train_tr)
    X_test_sc = sscaler.transform(X_test_tr)
    _fit_grid_and_report(
        SVC(gamma="scale", class_weight="balanced"),
        {"C": [1., 10., 25., 50., 100., 250.]},
        X_train_sc, y_train, X_test_sc, y_test,
    )

In [4]:
# Dataset name used by this cell and echoed in the headers of later cells.
dsname = 'iris'
print("####", dsname)
# X, y and units are notebook-level variables that the following cells read.
X, y, units = load_classification_dataset(dsname)
# Quick sanity check: feature-matrix shape and the distinct target values.
print(X.shape, np.unique(y))

#### iris
(150, 4) [0. 1. 2.]


In [5]:
# Random forest used as the base estimator for Boruta.
randforclas = RandomForestClassifier(class_weight='balanced', random_state=13)
# NOTE(review): presumably BorutaPy refits the estimator itself, which would
# make this explicit fit redundant — TODO confirm before removing.
randforclas.fit(X, y)
# define Boruta feature selection method
feat_selector = BorutaPy(randforclas, n_estimators='auto', verbose=2, random_state=11)
# find all relevant features
# NOTE(review): the selector is fit on every row of X, while test_model and
# test_autofeat later hold out 20% of the rows as a test set. Those held-out
# rows have therefore already influenced feature selection — confirm this is
# acceptable for the reported test accuracies.
feat_selector.fit(X, y)
# call transform() on X to filter it down to selected features
XS = feat_selector.transform(X)
# displayed as the cell output
XS.shape

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	4
Tentative: 	0
Rejected: 	0


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	4
Tentative: 	0
Rejected: 	0


(150, 4)

In [6]:
# generate new features with autofeat
afclas = AutoFeatClassifier(verbose=1, feateng_steps=2, units=units)
# NOTE(review): the remark previously attached to this line said to fit
# autofeat "on less data", but this cell fits on every row of XS (there is no
# train/test split here) — confirm this is intended.
XS_af = afclas.fit_transform(XS, y)
# displayed as the cell output
XS_af.shape

[AutoFeat] The 2 step feature engineering process could generate up to 406 features.
[AutoFeat] With 150 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 24 transformed features from 4 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 362 feature combinations from 378 original feature tuples - done.
[feateng] Generated altogether 387 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 112 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 13 features after 5 feature selection runs
[featsel] 6 features after correlation filtering
[featsel] 4 features after noise filtering
[AutoFea

(150, 8)

In [7]:
# Second Boruta pass, this time over the autofeat-augmented feature frame.
randforclas = RandomForestClassifier(class_weight='balanced', random_state=13)
randforclas.fit(XS_af, y)
# define Boruta feature selection method
feat_selector = BorutaPy(randforclas, n_estimators='auto', verbose=2, random_state=11)
# Extract the frame's values once instead of slicing with `.iloc[:,:].values`
# twice; the selector is fit on and applied to this one array.
XS_af_arr = XS_af.to_numpy()
# find all relevant features
feat_selector.fit(XS_af_arr, y)
# call transform() to filter the array down to the selected features
XSS_af = feat_selector.transform(XS_af_arr)
XSS_af.shape

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	0
Iteration: 	11 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	0
Iteration: 	12 / 100
Confirmed: 	8
Tentative: 	0
Rejected: 	0


BorutaPy finished running.

Iteration: 	13 / 100
Confirmed: 	8
Tentative: 	0
Rejected: 	0


(150, 8)

In [8]:
print("####", dsname)
# Logistic regression on the Boruta-filtered autofeat features (XSS_af).
rreg = LogisticRegression(class_weight="balanced")
# candidate C values: 10 log-spaced points between 1e-4 and 1e4
params = {"C": np.logspace(-4, 4, 10)}
# rreg is rebound to the best estimator returned by test_model
rreg = test_model(XSS_af, y, rreg, params)

#### iris
best params: {'C': 2.782559402207126}
best score: 0.9666666666666666
Acc. on training data: 0.975
Acc. on test data: 0.9666666666666667


In [9]:
print("####", dsname)
# Random forest on the Boruta-filtered autofeat features (XSS_af).
rforest = RandomForestClassifier(n_estimators=100, random_state=13)
# candidate values for min_samples_leaf searched by test_model
params = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
# rforest is rebound to the best estimator returned by test_model
rforest = test_model(XSS_af, y, rforest, params)

#### iris
best params: {'min_samples_leaf': 0.2}
best score: 0.9666666666666668
Acc. on training data: 0.975
Acc. on test data: 0.9666666666666667


In [10]:
print("####", dsname)
# SVC on the Boruta-filtered autofeat features (XSS_af); test_model
# standardizes the features itself when the model class is SVC.
svc = SVC(gamma="scale", class_weight="balanced")
# candidate C values searched by test_model
params = {"C": [1., 10., 25., 50., 100., 250.]}
# svc is rebound to the best estimator returned by test_model
svc = test_model(XSS_af, y, svc, params)

#### iris
best params: {'C': 10.0}
best score: 0.9583333333333334
Acc. on training data: 0.975
Acc. on test data: 0.9333333333333333


In [11]:
print("####", dsname)
# Run test_autofeat on the already autofeat-augmented, Boruta-filtered
# features, using a single feature-engineering step.
test_autofeat(XSS_af, y, units, feateng_steps=1)

#### iris
[AutoFeat] The 1 step feature engineering process could generate up to 56 features.
[AutoFeat] With 120 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 45 transformed features from 8 original features - done.
[feateng] Generated altogether 46 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 14 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 5 features after 5 feature selection runs
[featsel] 5 features after correlation filtering
[featsel] 5 features after noise filtering
[AutoFeat] Final dataframe with 8 feature columns (0 new).
[AutoFeat] Training final classification model.
[AutoFeat] Trained model: lar

In [12]:
print("####", dsname)
# Same evaluation as the previous cell, but with two feature-engineering steps.
test_autofeat(XSS_af, y, units, feateng_steps=2)

#### iris
[AutoFeat] The 2 step feature engineering process could generate up to 1596 features.
[AutoFeat] With 120 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 45 transformed features from 8 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 1349 feature combinations from 1378 original feature tuples - done.
[feateng] Generated altogether 1396 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 504 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 14 features after 5 feature selection runs
[featsel] 8 features after correlation filtering
[featsel] 5 features after noise filt

In [13]:
# would take a long time to run
#print("####", dsname)
#test_autofeat(XSS_af, y, units, feateng_steps=3)