In [1]:
import os
import sys
import warnings
import sklearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

from autofeat import AutoFeatClassifier

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Notebook-wide noise reduction:
#  - drop every FutureWarning
#  - do not report floating-point divide errors from numpy
from warnings import simplefilter
simplefilter("ignore", FutureWarning)
np.seterr(divide="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
# same interface for loading all datasets
def load_classification_dataset(name):
    """Load one of the scikit-learn toy classification datasets as arrays.

    Parameters
    ----------
    name : str
        Dataset identifier: "iris", "wine" or "breast_cancer".

    Returns
    -------
    X : np.ndarray
        Feature matrix converted to dtype float.
    y : np.ndarray
        Target vector converted to dtype float.
    units : dict
        Always an empty dict here; returned so every dataset is loaded
        through the same (X, y, units) interface.

    Raises
    ------
    RuntimeError
        If `name` is not one of the known dataset identifiers.
    """
    units = {}
    # `return_X_y` must be passed by keyword: recent scikit-learn releases
    # declare it keyword-only, so the positional form `load_iris(True)` fails.
    if name == "iris":
        # sklearn iris dataset
        X, y = load_iris(return_X_y=True)

    elif name == "wine":
        # sklearn wine dataset
        X, y = load_wine(return_X_y=True)

    elif name == "breast_cancer":
        # sklearn breast_cancer dataset
        X, y = load_breast_cancer(return_X_y=True)

    else:
        raise RuntimeError("Unknown dataset %r" % name)
    return np.array(X, dtype=float), np.array(y, dtype=float), units

In [3]:
def test_model(X, y, model, param_grid):
    """Tune `model` with a 5-fold grid search and report its accuracy.

    The data is split 80/20 with a fixed seed. When the estimator's class
    name is "SVC", both parts are standardized with a scaler fitted on the
    training part. The best parameters, best cross-validation score and the
    accuracy on the training and test parts are printed.

    Parameters
    ----------
    X, y : feature matrix and target vector
    model : estimator handed to GridSearchCV
    param_grid : parameter grid handed to GridSearchCV

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """
    # fixed random_state -> the same hold-out split on every call
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=12
    )

    # only SVC gets standardized inputs
    if model.__class__.__name__ == "SVC":
        scaler = StandardScaler()
        X_fit = scaler.fit_transform(X_fit)
        X_holdout = scaler.transform(X_holdout)

    # hyper-parameter selection by cross-validation on the training part,
    # with all warnings raised during the search suppressed
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        search = GridSearchCV(model, param_grid, cv=5)
        search.fit(X_fit, y_fit)

    print("best params:", search.best_params_)
    print("best score:", search.best_score_)
    report = (
        ("Acc. on training data:", X_fit, y_fit),
        ("Acc. on test data:", X_holdout, y_holdout),
    )
    for label, features, target in report:
        print(label, accuracy_score(target, search.predict(features)))
    return search.best_estimator_

def _grid_search_report(estimator, param_grid, X_train, y_train, X_test, y_test,
                        silence_warnings=False):
    """Fit a 5-fold GridSearchCV and print best params/score and train/test accuracy."""
    gsmodel = GridSearchCV(estimator, param_grid, cv=5)
    if silence_warnings:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            gsmodel.fit(X_train, y_train)
    else:
        gsmodel.fit(X_train, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("Acc. on training data:", accuracy_score(y_train, gsmodel.predict(X_train)))
    print("Acc. on test data:", accuracy_score(y_test, gsmodel.predict(X_test)))
    return gsmodel


def test_autofeat(X, y, units, feateng_steps=2):
    """Engineer features with AutoFeatClassifier and evaluate several models on them.

    The data is split 80/20 with a fixed seed, autofeat is fitted on the
    training part only, and then logistic regression, random forest and SVC
    are each tuned with a 5-fold grid search on the transformed features.
    All results are printed; nothing is returned.

    Parameters
    ----------
    X, y : feature matrix and target vector
    units : passed through to AutoFeatClassifier(units=...)
    feateng_steps : int, default 2
        Passed through to AutoFeatClassifier(feateng_steps=...).
    """
    # split in training and test parts (same seed as test_model)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

    # fit autofeat on the training part only; the test part is just transformed
    afclas = AutoFeatClassifier(verbose=1, feateng_steps=feateng_steps, units=units)
    X_train_tr = afclas.fit_transform(X_train, y_train)
    X_test_tr = afclas.transform(X_test)
    print("autofeat new features:", len(afclas.new_feat_cols_))
    print("autofeat Acc. on training data:", accuracy_score(y_train, afclas.predict(X_train_tr)))
    print("autofeat Acc. on test data:", accuracy_score(y_test, afclas.predict(X_test_tr)))

    # each model below is tuned by cross-validation on the transformed train split
    print("# Logistic Regression")
    _grid_search_report(
        LogisticRegression(class_weight="balanced"),
        {"C": np.logspace(-4, 4, 10)},
        X_train_tr, y_train, X_test_tr, y_test,
        silence_warnings=True,  # as before, only this search runs with warnings suppressed
    )

    print("# Random Forest")
    _grid_search_report(
        RandomForestClassifier(n_estimators=100, random_state=13),
        {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]},
        X_train_tr, y_train, X_test_tr, y_test,
    )

    print("# SVC")
    # SVC is trained on standardized features; keep them under separate names
    # instead of overwriting the unscaled autofeat output
    sscaler = StandardScaler()
    X_train_sc = sscaler.fit_transform(X_train_tr)
    X_test_sc = sscaler.transform(X_test_tr)
    _grid_search_report(
        SVC(gamma="scale", class_weight="balanced"),
        {"C": [1., 10., 25., 50., 100., 250.]},
        X_train_sc, y_train, X_test_sc, y_test,
    )

In [4]:
# Choose the dataset used by every following cell and load it through the
# shared loader, which returns (X, y, units).
dsname = 'wine'
print("####", dsname)
X, y, units = load_classification_dataset(dsname)
# quick sanity check: feature-matrix shape and the distinct class labels
print(X.shape, np.unique(y))

#### wine
(178, 13) [0. 1. 2.]


In [5]:
# generate new features with autofeat (single feature-engineering step)
afclas = AutoFeatClassifier(verbose=1, feateng_steps=1, units=units)
# NOTE(review): autofeat is fitted on the complete X, y here, i.e. before any
# train/test split is made; test accuracies reported by later cells that reuse
# X_af may therefore be optimistic.
X_af = afclas.fit_transform(X, y)
X_af.shape

[AutoFeat] The 1 step feature engineering process could generate up to 91 features.
[AutoFeat] With 178 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 73 transformed features from 13 original features - done.
[feateng] Generated altogether 73 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 16 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 12 features after 5 feature selection runs
[featsel] 12 features after correlation filtering
[featsel] 9 features after noise filtering
[AutoFeat] Computing 3 new features.
[AutoFeat]     3/    3 new features ...done.
[AutoFeat] Final dataframe with 16 feature columns (3 new).
[

(178, 16)

In [6]:
# BorutaPy is fed the raw array behind the autofeat frame
X_af_values = X_af.values

# random forest that serves as Boruta's base estimator
randforclas = RandomForestClassifier(random_state=13, class_weight="balanced")
randforclas.fit(X_af, y)

# Boruta feature selection wrapped around that forest
feat_selector = BorutaPy(
    randforclas,
    n_estimators="auto",
    verbose=2,
    random_state=11,
)
# search for all relevant features ...
feat_selector.fit(X_af_values, y)
# ... and keep only the selected columns
XS_af = feat_selector.transform(X_af_values)
XS_af.shape

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	0
Iteration: 	11 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	0
Iteration: 	12 / 100
Confirmed: 	16
Tentative: 	0
Rejected: 	0


BorutaPy finished running.

Iteration: 	13 / 100
Confirmed: 	16
Tentative: 	0
Rejected: 	0


(178, 16)

In [7]:
# Logistic regression on the Boruta-selected features.
print("####", dsname)
rreg = LogisticRegression(class_weight="balanced")
# C is searched over 10 log-spaced values between 1e-4 and 1e4
params = {"C": np.logspace(-4, 4, 10)}
# test_model returns the grid search's best estimator, so `rreg` is rebound
# to the tuned model
rreg = test_model(XS_af, y, rreg, params)

#### wine
best params: {'C': 0.3593813663804626}
best score: 0.9295566502463053
Acc. on training data: 0.9577464788732394
Acc. on test data: 0.9722222222222222


In [8]:
# Random forest on the Boruta-selected features.
print("####", dsname)
rforest = RandomForestClassifier(n_estimators=100, random_state=13)
# min_samples_leaf is given as floats, which scikit-learn interprets as a
# fraction of the training samples
params = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
# `rforest` is rebound to the best estimator returned by test_model
rforest = test_model(XS_af, y, rforest, params)

#### wine
best params: {'min_samples_leaf': 0.0001}
best score: 0.9859605911330049
Acc. on training data: 1.0
Acc. on test data: 0.9722222222222222


In [9]:
# SVC on the Boruta-selected features; test_model standardizes the inputs
# for estimators whose class name is "SVC".
print("####", dsname)
svc = SVC(gamma="scale", class_weight="balanced")
# candidate values for the regularization parameter C
params = {"C": [1., 10., 25., 50., 100., 250.]}
# `svc` is rebound to the best estimator returned by test_model
svc = test_model(XS_af, y, svc, params)

#### wine
best params: {'C': 1.0}
best score: 0.9857142857142858
Acc. on training data: 1.0
Acc. on test data: 0.9722222222222222


In [10]:
# Run a further 1-step autofeat pass on top of the Boruta-selected matrix
# (which already contains autofeat-generated columns) and evaluate logistic
# regression, random forest and SVC on the result.
print("####", dsname)
test_autofeat(XS_af, y, units, feateng_steps=1)

#### wine
[AutoFeat] The 1 step feature engineering process could generate up to 112 features.
[AutoFeat] With 142 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 91 transformed features from 16 original features - done.
[feateng] Generated altogether 91 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 24 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 12 features after 5 feature selection runs
[featsel] 12 features after correlation filtering
[featsel] 12 features after noise filtering
[AutoFeat] Computing 1 new features.
[AutoFeat]     1/    1 new features ...done.
[AutoFeat] Final dataframe with 17 feature column

In [12]:
# would run for a long time
#print("####", dsname)
#test_autofeat(XS_af, y, units, feateng_steps=2)

In [13]:
# would run for a long time
#print("####", dsname)
#test_autofeat(XS_af, y, units, feateng_steps=3)