## データの準備

In [1]:
from sklearn.datasets import load_boston
housing = load_boston()

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target,
                                                    train_size=0.75, test_size=0.25,
                                                    random_state=2018)

## TPOT回帰を実践
### モジュールのインポート

In [3]:
from tpot import TPOTRegressor

### 引数を入れてインスタンス生成

In [4]:
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)

### フィッティング実行

In [5]:
import time
t0 = time.time()
tpot.fit(X_train, y_train)
print(time.time() - t0)



HBox(children=(IntProgress(value=0, description='Optimization Progress', max=300, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: -9.89499338132346
Generation 2 - Current best internal CV score: -9.89499338132346
Generation 3 - Current best internal CV score: -9.89499338132346
Generation 4 - Current best internal CV score: -9.20830986451281
Generation 5 - Current best internal CV score: -9.20830986451281

Best pipeline: GradientBoostingRegressor(SelectFwe(input_matrix, alpha=0.003), alpha=0.75, learning_rate=0.1, loss=ls, max_depth=6, max_features=0.35000000000000003, min_samples_leaf=3, min_samples_split=18, n_estimators=100, subsample=0.7500000000000001)
126.89158344268799


In [6]:
print(tpot.score(X_test, y_test))

-7.550367333235424


In [7]:
tpot.fitted_pipeline_

Pipeline(memory=None,
     steps=[('selectfwe', SelectFwe(alpha=0.003,
     score_func=<function f_regression at 0x000002483C0147B8>)), ('gradientboostingregressor', GradientBoostingRegressor(alpha=0.75, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=6,
             max_features=0.35...0.7500000000000001, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False))])

In [8]:
tpot.evaluated_individuals_

{'ElasticNetCV(input_matrix, ElasticNetCV__l1_ratio=0.7000000000000001, ElasticNetCV__tol=0.1)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 1,
  'internal_cv_score': -31.069535231284778},
 'ExtraTreesRegressor(DecisionTreeRegressor(input_matrix, DecisionTreeRegressor__max_depth=2, DecisionTreeRegressor__min_samples_leaf=1, DecisionTreeRegressor__min_samples_split=18), ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.8, ExtraTreesRegressor__min_samples_leaf=11, ExtraTreesRegressor__min_samples_split=6, ExtraTreesRegressor__n_estimators=100)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 2,
  'internal_cv_score': -26.761995303942495},
 'KNeighborsRegressor(input_matrix, KNeighborsRegressor__n_neighbors=42, KNeighborsRegressor__p=1, KNeighborsRegressor__weights=distance)': {'generation': 0,
  'mutation_count': 0,
  'crossover_cou

### テストデータに適用

In [9]:
from scipy.stats import pearsonr
print('r = {:.3f}, p = {:.3f}'.format(*pearsonr(tpot.predict(X_test), y_test)))

r = 0.949, p = 0.000


In [11]:
from sklearn.metrics import r2_score
r2_score(tpot.predict(X_test), y_test)

0.8904210023069816

## 引数を変えて試す

### 並列演算してみる

In [12]:
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, n_jobs=-1)
t0 = time.time()
tpot.fit(X_train, y_train)
print(time.time() - t0)



HBox(children=(IntProgress(value=0, description='Optimization Progress', max=300, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: -14.19487446474859
Generation 2 - Current best internal CV score: -14.19487446474859
Generation 3 - Current best internal CV score: -12.981679628725038
Generation 4 - Current best internal CV score: -12.981679628725038
Generation 5 - Current best internal CV score: -12.981679628725038

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.6500000000000001, min_samples_leaf=1, min_samples_split=15, n_estimators=100)
45.23204517364502


### 世代と人口を増やしてみる

In [13]:
tpot = TPOTRegressor(verbosity=2, n_jobs=-1)
t0 = time.time()
tpot.fit(X_train, y_train)
print(time.time() - t0)



HBox(children=(IntProgress(value=0, description='Optimization Progress', max=10100, style=ProgressStyle(descri…

Generation 1 - Current best internal CV score: -12.320834825896018
Generation 2 - Current best internal CV score: -12.320834825896018
Generation 3 - Current best internal CV score: -12.320834825896018
Generation 4 - Current best internal CV score: -11.456809145038346
Generation 5 - Current best internal CV score: -10.14467958287719
Generation 6 - Current best internal CV score: -10.14467958287719
Generation 7 - Current best internal CV score: -10.14467958287719
Generation 8 - Current best internal CV score: -10.14467958287719
Generation 9 - Current best internal CV score: -9.741908708600786
Generation 10 - Current best internal CV score: -9.741908708600786
Generation 11 - Current best internal CV score: -9.741908708600786
Generation 12 - Current best internal CV score: -9.741908708600786
Generation 13 - Current best internal CV score: -9.662321019789474
Generation 14 - Current best internal CV score: -9.314097566243934
Generation 15 - Current best internal CV score: -9.314097566243934


In [17]:
print('r = {:.3f}, p = {:.3f}'.format(*pearsonr(tpot.predict(X_test), y_test)))
print('R2 = {:.3f}'.format(r2_score(tpot.predict(X_test), y_test)))

r = 0.954, p = 0.000
R2 = 0.900


## パイプラインの出力

In [15]:
tpot.export('tpot_boston_pipeline.py')

True