In [1]:
import os
import sys
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from autofeat import AutoFeatClassifier

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# names of the benchmark datasets, in the order they are evaluated
datasets = [
    "iris",
    "wine",
    "breast_cancer",
]

# same interface for loading all datasets
def load_classification_dataset(name):
    """
    Load one of the sklearn toy classification datasets as numpy arrays.

    Inputs:
        - name: one of "iris", "wine", "breast_cancer"
    Returns:
        - X: feature matrix converted to a float numpy array
        - y: class labels converted to a float numpy array
        - units: dict for feature units; always empty for these datasets
    Raises:
        - RuntimeError if name is not one of the known dataset names
    """
    units = {}
    # return_X_y is passed by keyword on purpose: recent scikit-learn releases
    # made it keyword-only, so the positional form load_iris(True) fails there
    if name == "iris":
        # sklearn iris dataset
        X, y = load_iris(return_X_y=True)

    elif name == "wine":
        # sklearn wine dataset
        X, y = load_wine(return_X_y=True)

    elif name == "breast_cancer":
        # sklearn breast_cancer dataset
        X, y = load_breast_cancer(return_X_y=True)

    else:
        raise RuntimeError("Unknown dataset %r" % name)
    return np.array(X, dtype=float), np.array(y, dtype=float), units

In [3]:
# Sanity check: for every dataset print the shape of its feature matrix
# and the distinct class labels found in y.
for dsname in datasets:
    print("####", dsname)
    # the third return value (units) is not needed here
    X, y, _ = load_classification_dataset(dsname)
    print(X.shape, np.unique(y))

#### iris




(150, 4) [0. 1. 2.]
#### wine
(178, 13) [0. 1. 2.]
#### breast_cancer
(569, 30) [0. 1.]


In [2]:
# names of the benchmark datasets, in the order they are evaluated
datasets = [
    "iris",
    "wine",
    "breast_cancer",
]

# same interface for loading all datasets
def load_classification_dataset(name):
    """
    Load one of the sklearn toy classification datasets as numpy arrays.

    Inputs:
        - name: one of "iris", "wine", "breast_cancer"
    Returns:
        - X: feature matrix converted to a float numpy array
        - y: class labels converted to a float numpy array
        - units: dict for feature units; always empty for these datasets
    Raises:
        - RuntimeError if name is not one of the known dataset names
    """
    units = {}
    # return_X_y is passed by keyword on purpose: recent scikit-learn releases
    # made it keyword-only, so the positional form load_iris(True) fails there
    if name == "iris":
        # sklearn iris dataset
        X, y = load_iris(return_X_y=True)

    elif name == "wine":
        # sklearn wine dataset
        X, y = load_wine(return_X_y=True)

    elif name == "breast_cancer":
        # sklearn breast_cancer dataset
        X, y = load_breast_cancer(return_X_y=True)

    else:
        raise RuntimeError("Unknown dataset %r" % name)
    return np.array(X, dtype=float), np.array(y, dtype=float), units

def test_model(dataset, model, param_grid):
    """
    Grid-search one estimator on one dataset and print how the winner performs.

    Inputs:
        - dataset: dataset name understood by load_classification_dataset
        - model: estimator instance handed to GridSearchCV
        - param_grid: parameter grid handed to GridSearchCV
    Returns:
        - best_estimator_ of the fitted grid search
    """
    features, labels, _ = load_classification_dataset(dataset)
    # fixed random_state -> the same 80/20 split on every call
    feats_train, feats_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.2, random_state=12
    )
    # only estimators whose class is named "SVC" get standardized inputs;
    # the scaler is fit on the training part and merely applied to the test part
    if model.__class__.__name__ == "SVC":
        scaler = StandardScaler()
        feats_train = scaler.fit_transform(feats_train)
        feats_test = scaler.transform(feats_test)
    # 5-fold cross-validated parameter search on the training part;
    # every warning raised while searching is silenced
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        search = GridSearchCV(model, param_grid, cv=5)
        search.fit(feats_train, labels_train)
    print("best params:", search.best_params_)
    print("best score:", search.best_score_)
    # same accuracy report for both parts of the split, training first
    splits = (
        ("Acc. on training data:", feats_train, labels_train),
        ("Acc. on test data:", feats_test, labels_test),
    )
    for label, feats, target in splits:
        print(label, accuracy_score(target, search.predict(feats)))
    return search.best_estimator_

def test_autofeat(dataset, feateng_steps=2):
    """
    Run AutoFeatClassifier on one dataset and print its accuracy, then grid-search
    a logistic regression and a random forest on the autofeat-transformed features
    and print their accuracies as well.

    Inputs:
        - dataset: dataset name understood by load_classification_dataset
        - feateng_steps: forwarded to AutoFeatClassifier (default: 2)
    """
    features, labels, units = load_classification_dataset(dataset)
    # fixed random_state -> the same 80/20 split on every call
    feats_train, feats_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.2, random_state=12
    )
    # autofeat is fit on the training part only; the test part is merely transformed
    autofeat_clf = AutoFeatClassifier(verbose=1, feateng_steps=feateng_steps, units=units)
    train_tr = autofeat_clf.fit_transform(feats_train, labels_train)
    test_tr = autofeat_clf.transform(feats_test)
    print("autofeat new features:", len(autofeat_clf.new_feat_cols_))
    print("autofeat Acc. on training data:", accuracy_score(labels_train, autofeat_clf.predict(train_tr)))
    print("autofeat Acc. on test data:", accuracy_score(labels_test, autofeat_clf.predict(test_tr)))

    def report(search):
        # print the winning parameters and the accuracy on both transformed parts
        print("best params:", search.best_params_)
        print("best score:", search.best_score_)
        for label, feats, target in (
            ("Acc. on training data:", train_tr, labels_train),
            ("Acc. on test data:", test_tr, labels_test),
        ):
            print(label, accuracy_score(target, search.predict(feats)))

    # logistic regression on the transformed training part, warnings silenced
    print("# Logistic Regression")
    logreg = LogisticRegression(class_weight="balanced")
    logreg_grid = {"C": np.logspace(-4, 4, 10)}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        logreg_search = GridSearchCV(logreg, logreg_grid, cv=5)
        logreg_search.fit(train_tr, labels_train)
    report(logreg_search)

    # random forest on the same features; here warnings are not suppressed
    print("# Random Forest")
    forest = RandomForestClassifier(n_estimators=100, random_state=13)
    forest_grid = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    forest_search = GridSearchCV(forest, forest_grid, cv=5)
    forest_search.fit(train_tr, labels_train)
    report(forest_search)

In [4]:
# Baseline 1: class-balanced logistic regression on the original features,
# with the regularization strength C searched on a log-spaced grid.
for dsname in datasets:
    print("####", dsname)
    params = {"C": np.logspace(-4, 4, 10)}
    rreg = test_model(dsname, LogisticRegression(class_weight="balanced"), params)

#### iris
best params: {'C': 0.3593813663804626}
best score: 0.9666666666666666
Acc. on training data: 0.9666666666666667
Acc. on test data: 0.9666666666666667
#### wine




best params: {'C': 0.3593813663804626}
best score: 0.9364532019704435
Acc. on training data: 0.9507042253521126
Acc. on test data: 0.9444444444444444
#### breast_cancer




best params: {'C': 2.782559402207126}
best score: 0.9494505494505494
Acc. on training data: 0.945054945054945
Acc. on test data: 0.9473684210526315


In [5]:
# Baseline 2: class-balanced SVC on the original features (test_model
# standardizes the inputs for this estimator), searching over C.
for dsname in datasets:
    print("####", dsname)
    params = {"C": [1., 10., 25., 50., 100., 250.]}
    svc = test_model(dsname, SVC(gamma="scale", class_weight="balanced"), params)

#### iris
best params: {'C': 10.0}
best score: 0.975
Acc. on training data: 0.9916666666666667
Acc. on test data: 0.9666666666666667
#### wine




best params: {'C': 10.0}
best score: 0.9785714285714286
Acc. on training data: 1.0
Acc. on test data: 0.9722222222222222
#### breast_cancer




best params: {'C': 1.0}
best score: 0.9758241758241759
Acc. on training data: 0.989010989010989
Acc. on test data: 0.9649122807017544


In [6]:
# Baseline 3: random forest with a fixed seed on the original features,
# searching over min_samples_leaf.
for dsname in datasets:
    print("####", dsname)
    params = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    rforest = test_model(dsname, RandomForestClassifier(n_estimators=100, random_state=13), params)

#### iris




best params: {'min_samples_leaf': 0.2}
best score: 0.9666666666666668
Acc. on training data: 0.9666666666666667
Acc. on test data: 0.9333333333333333
#### wine




best params: {'min_samples_leaf': 0.0001}
best score: 0.9859605911330049
Acc. on training data: 1.0
Acc. on test data: 0.9722222222222222
#### breast_cancer




best params: {'min_samples_leaf': 0.0001}
best score: 0.9582417582417582
Acc. on training data: 1.0
Acc. on test data: 0.9385964912280702


In [7]:
# Run the autofeat benchmark on every dataset, limited to a single
# feature-engineering step (feateng_steps=1 instead of the default 2).
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=1)

#### iris
[AutoFeat] The 1 step feature engineering process could generate up to 28 features.
[AutoFeat] With 120 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng]               0/              4 features transformed



[feateng] Generated 24 transformed features from 4 original features - done.
[feateng] Generated altogether 24 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 8 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 6 features after 5 feature selection runs
[featsel] 5 features after correlation filtering
[featsel] 5 features after noise filtering
[AutoFeat] Computing 3 new features.
[AutoFeat]     3/    3 new features ...done.
[AutoFeat] Final dataframe with 7 feature columns (3 new).
[AutoFeat] Training final classification model.
[AutoFeat] Trained model: largest coefficients:
[ 7.05568401  1.04530727 -8.10099127]
0.257281 * exp(x002)
0.225728 * x003**3
0.054527 * 1/x003
0.036753 * x001
0.023388 * x002
[AutoFeat]



[feateng] Generated 73 transformed features from 13 original features - done.
[feateng] Generated altogether 73 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 17 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 11 features after 5 feature selection runs
[featsel] 11 features after correlation filtering
[featsel] 9 features after noise filtering
[AutoFeat] Computing 3 new features.
[AutoFeat]     3/    3 new features ...done.
[AutoFeat] Final dataframe with 16 feature columns (3 new).
[AutoFeat] Training final classification model.
[AutoFeat] Trained model: largest coefficients:
[-11.1838298   10.71610116   0.46772864]
9.398320 * x011
9.172419 * 1/x001
7.693593 * 1/x006
7.534347 * x002
6.912900 * 1/x009
6.7006

  return (log(t))
  return (log(t))
  return (log(t))
  return (log(t))
  return (log(t))
  return (log(t))


[feateng] Generated 155 transformed features from 30 original features - done.
[feateng] Generated altogether 164 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 62 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 14 features after 5 feature selection runs
[featsel] 10 features after correlation filtering
[featsel] 10 features after noise filtering
[AutoFeat] Computing 3 new features.
[AutoFeat]     3/    3 new features ...done.
[AutoFeat] Final dataframe with 33 feature columns (3 new).
[AutoFeat] Training final classification model.
[AutoFeat] Trained model: largest coefficients:
[53.63835254]
47.870369 * x007
43.976047 * x024
20.959313 * x028
7.683839 * x010
6.011701 * x026
5.590446 * 1/x013
1.458130 * x020