Licensed under the MIT License.

Copyright (c) 2021-2031. All rights reserved.

# Model Selection with TPOT

* TPOT uses genetic programming to optimize a machine learning pipeline so that it maximizes model performance. It has internal k-fold cross-validation. At the end of the pipeline optimization procedure, the best pipeline is trained on the entire set of provided samples, and you can export the best pipeline code.


In [12]:
import timeit
from pathlib import Path

import pandas as pd
from sklearn.metrics import r2_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor, TPOTClassifier

## About TPOT API

* TPOT API: http://epistasislab.github.io/tpot/api/
  * Different choices for `config_dict`: http://epistasislab.github.io/tpot/using/#built-in-tpot-configurations
    * The default choice takes a very long time on large datasets...
  * `generations` is the number of iterations to run the pipeline optimization
  * `population_size` is the number of pipelines to retain in the genetic programming population every generation
  * Larger `generations` and `population_size` values make the pipeline search take longer to run, and may not always bring better performance
  * Details of available values for each parameter: http://epistasislab.github.io/tpot/using/#tpot-on-the-command-line
  * How to use customized scoring function: http://epistasislab.github.io/tpot/using/#scoring-functions
  
### Regression

In [7]:
# Load the preprocessed sales table from the luigi_pipeline output directory.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df = pd.read_pickle('../luigi_pipeline/output/preprocessed_data.pkl')

# Time-based split: rows with Year before 2015 train the model, 2015 rows test it.
# The year is compared as text, so the mask is built once and reused.
year_as_text = df['Year'].astype(str)
train_df = df.loc[year_as_text < '2015']
test_df = df.loc[year_as_text == '2015']

# 'Sales' is the target; 'Date' and 'Year' are removed from the feature matrix.
non_feature_columns = ['Sales', 'Date', 'Year']
X_train = train_df.drop(columns=non_feature_columns).reset_index(drop=True)
X_test = test_df.drop(columns=non_feature_columns).reset_index(drop=True)
y_train = train_df['Sales'].reset_index(drop=True)
y_test = test_df['Sales'].reset_index(drop=True)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
X_train.head()

(532529, 19) (161332, 19) (532529,) (161332,)


Unnamed: 0,Store,Month,Quarter,Customers_larger_than_3000,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,12,729,0.0,2,0,1270.0,9,2008,0,-1,-1,0,3,327,1,0,0,1
1,1,12,728,0.0,2,0,1270.0,9,2008,0,-1,-1,0,2,703,1,0,0,1
2,1,12,727,0.0,2,0,1270.0,9,2008,0,-1,-1,0,1,700,1,0,0,1
3,1,12,726,0.0,2,0,1270.0,9,2008,0,-1,-1,0,7,0,0,0,0,1
4,1,12,725,0.0,2,0,1270.0,9,2008,0,-1,-1,0,6,684,1,0,0,1


In [3]:
# Search budget and reproducibility settings for the regression run.
GENERATIONS = 5          # iterations of the pipeline optimization
POPULATION_SIZE = 100    # pipelines retained in the population every generation
CV = 5                   # folds passed to TPOT's internal cross-validation
SEED = 10                # random_state for the search

# 'TPOT light' is one of TPOT's built-in configurations; no `scoring` is passed,
# so the regressor's default scoring is used.
tpot_regression = TPOTRegressor(
    config_dict='TPOT light',
    generations=GENERATIONS,
    population_size=POPULATION_SIZE,
    cv=CV,
    random_state=SEED,
    n_jobs=-1,
    verbosity=2,
)

# Wall-clock timing of the whole search, reported in seconds.
start = timeit.default_timer()
tpot_regression.fit(X_train, y_train)
stop = timeit.default_timer()
print(f'Time: {stop - start} seconds')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: -1700568.1847760915

Generation 2 - Current best internal CV score: -1529739.2334290748

Generation 3 - Current best internal CV score: -1529739.2334290748

Generation 4 - Current best internal CV score: -1526786.2560226254

Generation 5 - Current best internal CV score: -1526786.2560226254

Best pipeline: LassoLarsCV(VarianceThreshold(MinMaxScaler(ZeroCount(input_matrix)), threshold=0.001), normalize=False)
Time: 14507.524469 seconds


In [6]:
# Score the fitted regressor on the held-out test rows using R^2.
preds = tpot_regression.predict(X_test)
print(r2_score(y_test, preds))
print()

# Make sure the export directory exists before writing: export() does not
# create missing directories, so on a fresh checkout the call would otherwise
# fail after the (long) search has already finished.
regression_export_path = Path('tpot_exported_pipeline/tpot_regression_sales_pipeline.py')
regression_export_path.parent.mkdir(parents=True, exist_ok=True)
tpot_regression.export(str(regression_export_path))

# Called without a file name, export() returns the pipeline source as text.
print(tpot_regression.export())

0.8966096700362922

import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from tpot.builtins import ZeroCount
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=10)

# Average CV score on the training set was: -1526786.2560226254
exported_pipeline = make_pipeline(
    ZeroCount(),
    MinMaxScaler(),
    VarianceThreshold(threshold=0.001),
    LassoLarsCV(normalize=False)
)
# Fix random state for all the steps i

  return f(*args, **kwargs)


### Classification

In [8]:
# Load the leaf dataset; 'species' is the class label, all other columns are features.
df30 = pd.read_csv('../../crystal_ball/data_collector/structured_data/leaf.csv')

X30 = df30.drop(columns='species')
y30 = df30['species']

# Stratified 80/20 split so the class proportions of y30 are preserved in both parts.
split_parts = train_test_split(
    X30,
    y30,
    test_size=0.2,
    random_state=10,
    shuffle=True,
    stratify=y30,
)

# Give every part a fresh 0..n-1 index.
X_train30, X_test30, y_train30, y_test30 = (
    part.reset_index(drop=True) for part in split_parts
)

print(X_train30.shape, X_test30.shape, y_train30.shape, y_test30.shape)
print(y_train30.nunique(), y_test30.nunique())

(272, 15) (68, 15) (272,) (68,)
30 30


In [22]:
# Search budget and reproducibility settings for the classification run.
GENERATIONS = 5          # iterations of the pipeline optimization
POPULATION_SIZE = 100    # pipelines retained in the population every generation
CV = 5                   # folds passed to TPOT's internal cross-validation
SEED = 10                # random_state for the search

# 'TPOT light' is one of TPOT's built-in configurations; candidates are ranked
# by balanced accuracy.
tpot_classification = TPOTClassifier(
    config_dict='TPOT light',
    scoring='balanced_accuracy',
    generations=GENERATIONS,
    population_size=POPULATION_SIZE,
    cv=CV,
    random_state=SEED,
    n_jobs=-1,
    verbosity=2,
)

# Wall-clock timing of the whole search, reported in seconds.
start = timeit.default_timer()
tpot_classification.fit(X_train30, y_train30)
stop = timeit.default_timer()
print(f'Time: {stop - start} seconds')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.8679030263935925

Generation 2 - Current best internal CV score: 0.8748283878944256

Generation 3 - Current best internal CV score: 0.8748283878944256

Generation 4 - Current best internal CV score: 0.8748283878944256

Generation 5 - Current best internal CV score: 0.8852058781368808

Best pipeline: GaussianNB(PCA(Normalizer(ZeroCount(SelectFwe(input_matrix, alpha=0.007)), norm=l1), iterated_power=7, svd_solver=randomized))
Time: 61.20634079999945 seconds


In [19]:
# Score the fitted classifier on the held-out test split using balanced accuracy.
preds30 = tpot_classification.predict(X_test30)
print(balanced_accuracy_score(y_test30, preds30))
print()

# Make sure the export directory exists before writing: export() does not
# create missing directories, so on a fresh checkout the call would otherwise fail.
classification_export_path = Path('tpot_exported_pipeline/tpot_classification_leaves30_pipeline.py')
classification_export_path.parent.mkdir(parents=True, exist_ok=True)
tpot_classification.export(str(classification_export_path))

# Called without a file name, export() returns the pipeline source as text.
print(tpot_classification.export())

0.8055555555555555

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.builtins import ZeroCount
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=10)

# Average CV score on the training set was: 0.7830976430976431
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_classif, alpha=0.007),
    Normalizer(norm="l2"),
    ZeroCount(),
    ZeroCount(),
   