<a href="https://colab.research.google.com/github/harnalashok/classification/blob/main/autofeat_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Last amended: 5th Sep, 2021
# Ref: 1. https://github.com/cod3licious/autofeat
#      2. https://github.com/cod3licious/autofeat/blob/master/autofeat_examples.ipynb

In [None]:
!pip install autofeat

Collecting autofeat
  Downloading autofeat-2.0.9-py3-none-any.whl (24 kB)
Collecting pint
  Downloading Pint-0.17-py2.py3-none-any.whl (204 kB)
[K     |████████████████████████████████| 204 kB 3.6 MB/s 
Installing collected packages: pint, autofeat
Successfully installed autofeat-2.0.9 pint-0.17


In [None]:
# Show the value of every bare expression in a cell, not only the last one.
# Several cells below end with more than one expression and rely on this.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [None]:
# Mount Google Drive at /gdrive; the data files read further down live under it.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from autofeat import FeatureSelector, AutoFeatRegressor, AutoFeatClassifier

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Experiments

In [None]:
# Toy data: five independent features with 1000 samples each, shape (1000,).
# x1, x3, x5 are drawn with rand; x2, x4 with randn.
# The draws happen strictly in the order x1..x5 so the seeded stream is
# consumed exactly as if each variable were assigned on its own line.
np.random.seed(10)
x1, x2, x3, x4, x5 = (
    draw(1000)
    for draw in (
        np.random.rand,     # x1
        np.random.randn,    # x2
        np.random.rand,     # x3
        np.random.randn,    # x4
        np.random.rand,     # x5
    )
)


In [None]:
# Stack seven (1000,) variables on top of each other to get a (7, 1000) array:
# the five raw features followed by two engineered ones. Each ROW is one
# feature at this point.
X = np.vstack([
    x1,
    x2,
    x3,
    x4,
    x5,
    1/(x2 - 1/x3),            # engineered feature 6
    (x2 + np.log(x1))**3,     # engineered feature 7
])
X.shape

(7, 1000)

Transpose it to have our seven features as:<br>
f1=x1&nbsp;&nbsp; &nbsp; &nbsp;f2=x2&nbsp;&nbsp; &nbsp; &nbsp;    f3=x3&nbsp;&nbsp; &nbsp; &nbsp;     f4=x4&nbsp;&nbsp; &nbsp; &nbsp;    f5=x5&nbsp;&nbsp; &nbsp; &nbsp;   f6=1/(x2 - 1/x3)&nbsp;&nbsp; &nbsp; &nbsp;    f7=(x2 + np.log(x1))**3

In [None]:
# Transpose so that rows are samples and columns are features.
# NOTE: X is overwritten with its own transpose, so this cell is not
# idempotent: running it a second time flips the matrix back.
X = X.T

In [None]:
# Data as a dataframe: five raw columns (x1..x5) and the two engineered ones
# (eng6, eng7), in the same order in which they were stacked.
df = pd.DataFrame(X, columns=["x1", "x2", "x3", "x4", "x5", "eng6", "eng7"])

In [None]:
# Preview the first rows of the toy dataframe.
df.head()

Unnamed: 0,x1,x2,x3,x4,x5,eng6,eng7
0,0.771321,1.097121,0.22256,0.029988,0.72017,-0.294459,0.587364
1,0.020752,-0.538577,0.226277,-0.975896,0.723432,-0.201697,-85.981735
2,0.633648,2.195597,0.824412,1.021897,0.090843,1.017697,5.26199
3,0.748804,-0.702644,0.716858,0.632287,0.029328,-0.476731,-0.975961
4,0.498507,-0.0606,0.013533,0.5171,0.435986,-0.013522,-0.433346


In [None]:
# Create the target. Row by row it is a non-linear function of the features:
# a constant, a linear term in x1 and the two engineered expressions.
# Note that x4 and x5 are absent here:

target = 2 + 15*x1 + 3/(x2 - 1/x3) + 5*(x2 + np.log(x1))**3

In [None]:
# Print the FeatureSelector documentation; the constructor signature is
# repeated below for quick reference.
help(FeatureSelector)
# FeatureSelector(
#                  problem_type='regression',  # "classification"
#                  featsel_runs=5,
#                  keep=None,
#                  n_jobs=1,
#                  verbose=0)

In [None]:
# Instantiate feature selector (verbose=1; all other arguments left at their defaults)
fsel = FeatureSelector(verbose=1)

In [None]:
# Perform feature selection on the toy dataframe against the target;
# new_X holds the result of fit_transform.
new_X = fsel.fit_transform(df, target)

In [None]:
# should contain ["x1", "eng6", "eng7"] (in any order): these are the only
# terms that appear in the target formula
print(new_X.columns)

Index(['x1', 'eng7', 'eng6'], dtype='object')


## autofeat

### Generate some toy data

In [None]:
# Generate some toy data: three features of 1000 samples each.
# The draws are made in the order x1, x2, x3 so the seeded random stream is
# consumed exactly as with three separate assignments.
np.random.seed(10)
x1, x2, x3 = (
    draw(1000)
    for draw in (np.random.rand, np.random.randn, np.random.rand)
)

In [None]:
# Stack the three features as rows, then transpose to create a (1000,3)
# samples-by-features dataset
X = np.vstack([x1, x2, x3]).T

In [None]:
# Data frame of the raw features only. It is kept under its own name (df_org)
# because every AutoFeat run below is fitted on it again.
df_org = pd.DataFrame(X, columns=["x1", "x2", "x3"])
df_org.head()

In [None]:
# Now the target: a constant, a linear term in x1, a reciprocal term in x2 and
# x3, and a cubed term in x2 and log(x1).
target = 2 + 15*x1 + 3/(x2 - 1/x3) + 5*(x2 + np.log(x1))**3

In [None]:
# Add some noise to target
#  and create two more targets: Gaussian noise scaled to 1% and to 10% of the
#  target's standard deviation respectively
target_noisy = target + 0.01*target.std()*np.random.randn(1000)
target_very_noisy = target + 0.1*target.std()*np.random.randn(1000)

### Autofeat with different numbers of FE steps

In [None]:
# Print the AutoFeatRegressor documentation.
help(AutoFeatRegressor)

In [None]:
# Instantiate the regressor with a single feature-engineering step.
# `steps` is kept as a variable because the plot title further down reads it.
steps = 1
afreg = AutoFeatRegressor(feateng_steps=steps, verbose=1)

In [None]:
# Syntax of Regressor problem_type: regression
# This cell is reference documentation only: every line is a comment, so the
# cell runs as a no-op.
# AutoFeatRegressor(
#                    categorical_cols=None, # list of column names of cat features;
#                                           # these will be transformed into 0/1 encoding
#                    feateng_cols=None,     # list of col names for feature engineering
#                    units=None,            # Let it be as it is
#                    feateng_steps=2, # no. of steps to perform in the FE (default: 2)
#                    featsel_runs=5,
#                    max_gb=None, # max gigabytes to use
#                                 # this is no guarantee!
#                                 # it will lead to subsampling of the
#                                 # data points if the new dataframe
#                                 # generated is n_rows * n_cols * 32bit > max_gb
#                    transformations=('1/', 'exp', 'log', 'abs', 'sqrt', '^2', '^3'),
#                    apply_pi_theorem=True,
#                    always_return_numpy=False,
#                    n_jobs=1,
#                    verbose=0)

In [None]:
# Fit on the raw features and target; df receives the transformed dataframe
# (raw columns plus the engineered ones AutoFeat decided to keep).
df = afreg.fit_transform(df_org, target)

In [None]:
# Preview the transformed dataframe, including the newly engineered columns.
df.head()

Unnamed: 0,x1,x2,x3,1/x1,x2**2,x2**3,log(x1)
0,0.771321,1.097121,0.22256,1.296478,1.203674,1.320576,-0.259651
1,0.020752,-0.538577,0.226277,48.188244,0.290066,-0.156223,-3.875115
2,0.633648,2.195597,0.824412,1.578163,4.820645,10.584193,-0.456261
3,0.748804,-0.702644,0.716858,1.335463,0.493708,-0.346901,-0.289278
4,0.498507,-0.0606,0.013533,2.00599,0.003672,-0.000223,-0.696138


In [None]:
# Score the fitted model on the same data it was trained on, so this is a
# training-set figure, not an estimate of generalisation.
r2 = afreg.score(df_org, target)  # R^2/Accuracy returned by prediction_model
print("## Final R^2: %.4f" % r2)

In [None]:
# Predicted (x axis) versus true target (y axis) for the fitted model.
# The title reports the number of FE steps, the R^2 and the count of new features.
plt.figure()
plt.scatter(afreg.predict(df_org), target, s=2);
plt.title(
    "%i FE steps (R^2: %.4f; %i new features)"
    % (steps, r2, len(afreg.new_feat_cols_))
)

In [None]:
# Repeat the experiment with two feature-engineering steps.
# afreg and df are re-bound here, replacing the one-step results.
steps = 2
afreg = AutoFeatRegressor(verbose=1, feateng_steps=steps)
df = afreg.fit_transform(df_org, target)
df.head()

In [None]:
# Training-set score of the two-step model, then predicted (x) vs. true (y) scatter.
r2 = afreg.score(df_org, target)
print("## Final R^2: %.4f" % r2)
plt.figure()
plt.scatter(afreg.predict(df_org), target, s=2);
plt.title("%i FE steps (R^2: %.4f; %i new features)" % (steps, r2, len(afreg.new_feat_cols_)))

In [None]:
# Repeat the experiment with three feature-engineering steps.
# afreg and df are re-bound here, replacing the two-step results.
steps = 3
afreg = AutoFeatRegressor(verbose=1, feateng_steps=steps)
df = afreg.fit_transform(df_org, target)
df.head()

In [None]:
# Training-set score of the three-step model, then predicted (x) vs. true (y) scatter.
r2 = afreg.score(df_org, target)
print("## Final R^2: %.4f" % r2)
plt.figure()
plt.scatter(afreg.predict(df_org), target, s=2);
plt.title("%i FE steps (R^2: %.4f; %i new features)" % (steps, r2, len(afreg.new_feat_cols_)))

In [None]:
# Print the AutoFeatClassifier documentation.
help(AutoFeatClassifier)

# Problem

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score,cohen_kappa_score
import os, gc

In [None]:
# Path to data: folder on the mounted Google Drive that holds the CSV files.
# The trailing slash matters because file names are appended with `+` below.
path = "/gdrive/MyDrive/Colab_data_files/autofeat/"

In [None]:
# Read datafile (concrete data) from the data folder and preview the first rows
data = pd.read_csv(path + "Concrete_Data_Yeh.csv")
data.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [None]:
# Data should not have any nulls: count the missing values over the whole
# frame (per-column sum, then sum of those). Expected result is 0.
data.isnull().sum().sum()

0

In [None]:
# Predictors are every column except the last; the last column is the target.
# Both shapes are displayed as a quick sanity check.
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X.shape
y.shape

(1030, 8)

(1030,)

In [None]:
# Split dataset: 70% train / 30% test.
# random_state is fixed so that re-running the notebook reproduces the same
# partition, which keeps the model comparison further down repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardize: the scaler is fitted on the training split only and the same
# fitted transformation is then applied to the test split, so no test-set
# statistics leak into training.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# Wrap the scaled splits in DataFrames again, re-attaching the predictor
# column names (all columns of `data` except the last), then preview both.
X_train, X_test = (
    pd.DataFrame(split, columns=data.columns[:-1])
    for split in (X_train, X_test)
)
X_train.head()
X_test.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age
0,-0.022239,-0.864576,-0.865997,0.441054,-1.033417,-0.051765,1.021137,-0.509222
1,1.588466,-0.583054,0.367997,-0.890834,0.883435,-0.064725,-0.780808,0.16377
2,-1.493795,1.292588,-0.865997,1.015144,-1.033417,-0.178767,0.321632,0.708573
3,0.170283,-0.864576,0.969374,-0.302967,0.536419,0.658406,-0.261498,-0.509222
4,-0.398706,1.92132,-0.865997,2.14036,-1.033417,-0.518302,-2.257402,3.592823


Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age
0,1.588466,-0.583054,0.367997,-0.890834,0.883435,-0.064725,-0.780808,-0.621387
1,-0.933384,-0.864576,1.742573,-0.541788,0.222452,1.083473,0.042581,-0.284891
2,-0.756111,0.695526,-0.865997,0.486981,-1.033417,-0.08546,0.397964,-0.621387
3,-1.204058,0.331893,-0.865997,0.486981,-1.033417,-1.088513,2.111064,-0.284891
4,0.177907,0.742446,0.805362,0.900326,-0.041942,-1.218106,-1.494078,-0.284891


In [None]:
# Print the AutoFeatRegressor documentation again for reference before fitting.
help(AutoFeatRegressor)

Help on class AutoFeatRegressor in module autofeat.autofeat:

class AutoFeatRegressor(AutoFeatModel, sklearn.base.BaseEstimator, sklearn.base.RegressorMixin)
 |  AutoFeatRegressor(categorical_cols=None, feateng_cols=None, units=None, feateng_steps=2, featsel_runs=5, max_gb=None, transformations=('1/', 'exp', 'log', 'abs', 'sqrt', '^2', '^3'), apply_pi_theorem=True, always_return_numpy=False, n_jobs=1, verbose=0)
 |  
 |  Short-cut initialization for AutoFeatModel with problem_type: regression
 |  
 |  Method resolution order:
 |      AutoFeatRegressor
 |      AutoFeatModel
 |      sklearn.base.BaseEstimator
 |      sklearn.base.RegressorMixin
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, categorical_cols=None, feateng_cols=None, units=None, feateng_steps=2, featsel_runs=5, max_gb=None, transformations=('1/', 'exp', 'log', 'abs', 'sqrt', '^2', '^3'), apply_pi_theorem=True, always_return_numpy=False, n_jobs=1, verbose=0)
 |      multi-step feature enginee

In [None]:
# List the predictor names; one of them is chosen for feature engineering below.
X.columns

Index(['cement', 'slag', 'flyash', 'water', 'superplasticizer',
       'coarseaggregate', 'fineaggregate', 'age'],
      dtype='object')

In [None]:
%%time
# Restrict feature engineering to the 'slag' column (feateng_cols) and use
# three engineering steps. The cell is timed with %%time.
cols_fe = ['slag']
afreg = AutoFeatRegressor(verbose=1, feateng_steps=3, feateng_cols = cols_fe)
df = afreg.fit_transform(X_train, y_train)

[AutoFeat] The 3 step feature engineering process could generate up to 532 features.
[AutoFeat] With 721 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng]               0/              1 features transformed[feateng] Generated 5 transformed features from 1 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 56 feature combinations from 15 original feature tuples - done.
[feateng] Step 3: transformation of new features
[feateng] Generated 226 transformed features from 56 original features - done.
[feateng] Generated altogether 293 new features in 3 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 207 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[

In [None]:
# Preview the training frame: original predictors plus the engineered 'slag' features.
df.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,Abs(slag)/slag,exp(slag - 1/slag),exp(slag - exp(slag))
0,-0.022239,-0.864576,-0.865997,0.441054,-1.033417,-0.051765,1.021137,-0.509222,-1.0,1.339184,0.276428
1,1.588466,-0.583054,0.367997,-0.890834,0.883435,-0.064725,-0.780808,0.16377,-1.0,3.102021,0.319421
2,-1.493795,1.292588,-0.865997,1.015144,-1.033417,-0.178767,0.321632,0.708573,1.0,1.680255,0.095406
3,0.170283,-0.864576,0.969374,-0.302967,0.536419,0.658406,-0.261498,-0.509222,-1.0,1.339184,0.276428
4,-0.398706,1.92132,-0.865997,2.14036,-1.033417,-0.518302,-2.257402,3.592823,1.0,4.058628,0.007382


In [None]:
# Apply the already-fitted AutoFeat model to the test split (transform only, no refit).
df_test =  afreg.transform(X_test)

[AutoFeat] Computing 31 new features.
[AutoFeat]     0/   31 new features[AutoFeat]     1/   31 new features[AutoFeat]     2/   31 new features[AutoFeat]     3/   31 new features[AutoFeat]     4/   31 new features[AutoFeat]     5/   31 new features[AutoFeat]     6/   31 new features[AutoFeat]     7/   31 new features[AutoFeat]     8/   31 new features[AutoFeat]     9/   31 new features[AutoFeat]    10/   31 new features[AutoFeat]    11/   31 new features[AutoFeat]    12/   31 new features[AutoFeat]    13/   31 new features[AutoFeat]    14/   31 new features[AutoFeat]    15/   31 new features[AutoFeat]    16/   31 new features[AutoFeat]    17/   31 new features[AutoFeat]    18/   31 new features[AutoFeat]    19/   31 new features[AutoFeat]    20/   31 new features[AutoFeat]    21/   31 new features[AutoFeat]    22/   31 new features[AutoFeat]    23/   31 new features[AutoFeat]    24/   31 new features[AutoFeat]    25/   31 new features[AutoFeat]    26/   31 new

In [None]:
# Preview the transformed test frame.
df_test.head()

In [None]:
# Save the engineered train and test frames as CSV files in the data folder,
# without writing the row index.
df.to_csv(path + "concrete_train.csv", index = False)
df_test.to_csv(path+"concrete_test.csv", index = False)

In [None]:
# Random forest trained on the AutoFeat-engineered features.
# random_state is fixed so that the comparison with the baseline forest is not
# blurred by run-to-run randomness of the forest itself.
# NOTE: the variable is named clf_* although it holds a regressor; the name is
# kept as it is.
clf_autofeat = RandomForestRegressor(random_state=42)
clf_autofeat.fit(df, y_train)
pred_autofeat = clf_autofeat.predict(df_test)

In [None]:
# Baseline: random forest trained on the original (scaled) predictors only.
# random_state is fixed so that the comparison with the AutoFeat-based forest
# is not blurred by run-to-run randomness of the forest itself.
clf = RandomForestRegressor(random_state=42)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

In [None]:
# Sanity check: the test target, the test predictors and the baseline
# predictions should all have the same number of rows.
y_test.shape
X_test.shape
pred.shape

(309,)

(309, 8)

(309,)

In [None]:
# Compare both models on the test split. Each displayed tuple is ordered
# (baseline forest, forest on AutoFeat features).
from sklearn.metrics import explained_variance_score, mean_squared_error
# Explained variance: higher is better.
explained_variance_score(y_test, pred), explained_variance_score(y_test, pred_autofeat)
# Mean squared error: lower is better.
mean_squared_error(y_test, pred),mean_squared_error(y_test, pred_autofeat)

(0.9077746541204639, 0.8894223984074386)

(27.262571583041655, 32.60140986914953)

In [None]:
# Data preparation for heart.csv, following the same steps as for the concrete
# data: read, split into predictors/target, train/test split, standardize,
# re-wrap as DataFrames. Note that data, X, y and the four split variables are
# re-bound here and replace the concrete-data objects.

# Path to data
path = "/gdrive/MyDrive/Colab_data_files/autofeat/"

# Read datafile
data = pd.read_csv(path + "heart.csv")
data.head()

# Get predictors (all columns but the last) and target (the last column)
X = data.iloc[:,:-1]
X.shape
y = data.iloc[:,-1]
y.shape

# Split dataset 70/30. random_state is fixed so that re-running the notebook
# reproduces the same partition and therefore comparable scores.
X_train,X_test, y_train,y_test = train_test_split(X,y,
                                                  test_size = 0.3,
                                                  random_state = 42
                                                  )

# Standardize: fit on the training split only, apply to both splits
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Re-attach the predictor column names to the scaled splits and preview them
X_train = pd.DataFrame(X_train, columns = data.columns[:-1])
X_test = pd.DataFrame(X_test, columns = data.columns[:-1])
X_train.head()
X_test.head()

In [None]:
%%time
afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)
df = afreg.fit_transform(X_train, y_train)

[AutoFeat] The 3 step feature engineering process could generate up to 102466 features.
[AutoFeat] With 212 data points this new feature matrix would use about 0.09 gb of space.
[feateng] Step 1: transformation of original features
[feateng]               0/             13 features transformed[feateng] Generated 50 transformed features from 13 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 7772 feature combinations from 1953 original feature tuples - done.
[feateng] Step 3: transformation of new features
[feateng] Generated 32061 transformed features from 7772 original features - done.
[feateng] Generated altogether 39936 new features in 3 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 31358 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature se

In [None]:
# Apply the already-fitted AutoFeat model to the test split (transform only, no refit).
df_test =  afreg.transform(X_test)

[AutoFeat] Computing 24 new features.
[AutoFeat]     0/   24 new features[AutoFeat]     1/   24 new features[AutoFeat]     2/   24 new features[AutoFeat]     3/   24 new features[AutoFeat]     4/   24 new features[AutoFeat]     5/   24 new features[AutoFeat]     6/   24 new features[AutoFeat]     7/   24 new features[AutoFeat]     8/   24 new features[AutoFeat]     9/   24 new features[AutoFeat]    10/   24 new features[AutoFeat]    11/   24 new features[AutoFeat]    12/   24 new features[AutoFeat]    13/   24 new features[AutoFeat]    14/   24 new features[AutoFeat]    15/   24 new features[AutoFeat]    16/   24 new features[AutoFeat]    17/   24 new features[AutoFeat]    18/   24 new features[AutoFeat]    19/   24 new features[AutoFeat]    20/   24 new features[AutoFeat]    21/   24 new features[AutoFeat]    22/   24 new features[AutoFeat]    23/   24 new features[AutoFeat]    24/   24 new features ...done.


In [None]:
# Random forest (500 trees) trained on the AutoFeat-engineered features.
# random_state is fixed so that the comparison with the baseline forest is not
# blurred by run-to-run randomness of the forest itself.
# pred_autofeat holds class PROBABILITIES (one column per class), not labels.
clf_autofeat = RandomForestClassifier(n_estimators=500, random_state=42)
clf_autofeat.fit(df, y_train)
pred_autofeat = clf_autofeat.predict_proba(df_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Baseline: random forest (500 trees) trained on the original scaled predictors.
# random_state is fixed so that the comparison with the AutoFeat-based forest
# is not blurred by run-to-run randomness of the forest itself.
# pred holds class PROBABILITIES (one column per class), not labels.
clf = RandomForestClassifier(n_estimators=500, random_state=42)
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# First five rows of the baseline model's predicted class probabilities.
pred[:5]

array([[0.14 , 0.86 ],
       [0.298, 0.702],
       [0.058, 0.942],
       [0.102, 0.898],
       [0.204, 0.796]])

In [None]:
# First five rows of the AutoFeat-based model's predicted class probabilities.
pred_autofeat[:5]

array([[0.198, 0.802],
       [0.456, 0.544],
       [0.07 , 0.93 ],
       [0.036, 0.964],
       [0.092, 0.908]])

In [None]:
# ROC AUC of the baseline model, scored on the second probability column
# (presumably the probability of class 1 — confirm against clf.classes_).
roc_auc_score(y_test, pred[:,1])

0.8844476744186047

In [None]:
# ROC AUC of the AutoFeat-based model, scored on the second probability column
# (presumably the probability of class 1 — confirm against clf_autofeat.classes_).
roc_auc_score(y_test, pred_autofeat[:,1])

0.8599806201550387

In [None]:
# Accuracy of the AutoFeat-based model.
# pred_autofeat holds predict_proba output (one column per class), which
# accuracy_score cannot compare against class labels, so the hard label
# predictions are taken from the fitted model directly.
accuracy_score(y_test, clf_autofeat.predict(df_test))

0.7802197802197802

In [None]:
# Accuracy of the baseline model.
# pred holds predict_proba output (one column per class), which accuracy_score
# cannot compare against class labels, so the hard label predictions are taken
# from the fitted model directly.
accuracy_score(y_test, clf.predict(X_test))

0.7802197802197802

In [None]:
# Display the baseline predictions.
# NOTE(review): the saved output of this cell shows 0/1 class labels, whereas
# `pred` is assigned from predict_proba in the visible cells — presumably it
# was re-bound to clf.predict(X_test) in a cell that is no longer present; confirm.
pred

array([1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0])

In [None]:
# Display the AutoFeat-based predictions.
# NOTE(review): the saved output of this cell shows 0/1 class labels, whereas
# `pred_autofeat` is assigned from predict_proba in the visible cells —
# presumably it was re-bound to clf_autofeat.predict(df_test) in a cell that is
# no longer present; confirm.
pred_autofeat

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0])

In [None]:
# F1 score of both models, displayed as (baseline, AutoFeat-based).
# f1_score needs class labels; pred / pred_autofeat hold predict_proba output,
# so the hard label predictions are taken from the fitted models directly.
f1_score(y_test, clf.predict(X_test)), f1_score(y_test, clf_autofeat.predict(df_test))

(0.8076923076923077, 0.7959183673469388)