In [10]:
import numpy as np 
import pandas as pd 
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split

In [4]:
train_data = pd.read_csv('Big_Mart_Train_Preprocessed.csv')
print(train_data.shape)
train_data.head()

(8523, 11)


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,0,0.016047,4,249.8092,9,18,1,0,1,3735.138
1,5.92,1,0.019278,14,48.2692,3,8,1,2,2,443.4228
2,17.5,0,0.01676,10,141.618,9,18,1,0,1,2097.27
3,19.2,1,0.0,6,182.095,0,19,1,2,0,732.38
4,8.93,0,0.0,9,53.8614,1,30,0,2,1,994.7052


In [12]:
tpot_train = train_data.iloc[:,:-1]
print(tpot_train.shape)
tpot_train.head()

(8523, 10)


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,9.3,0,0.016047,4,249.8092,9,18,1,0,1
1,5.92,1,0.019278,14,48.2692,3,8,1,2,2
2,17.5,0,0.01676,10,141.618,9,18,1,0,1
3,19.2,1,0.0,6,182.095,0,19,1,2,0
4,8.93,0,0.0,9,53.8614,1,30,0,2,1


In [8]:
target = train_data.iloc[:,-1]
print(len(target))
target.head()

8523


0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

In [15]:
X_train, X_val, y_train, y_val = train_test_split(tpot_train, target,
 train_size=0.75, test_size=0.25)

In [19]:
# for each seed, find a tpot selected model
seeds = range(5)
for seed in seeds:
    print(seed)
    tpot_estimator = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=seed)
    tpot_estimator.fit(X_train, y_train)
    print(tpot_estimator.score(X_val, y_val))   # By default neg mean squared error (neg MSE), larger indicates better
    tpot_estimator.export('tpot_bigmart_pipeline'+str(seed)+'.py')

0


Generation 1 - Current best internal CV score: -1192429.06591
Generation 2 - Current best internal CV score: -1189832.54309
Generation 3 - Current best internal CV score: -1189832.54309
Generation 4 - Current best internal CV score: -1184954.14
Generation 5 - Current best internal CV score: -1183871.74403

Best pipeline: ExtraTreesRegressor(RobustScaler(input_matrix), bootstrap=False, max_features=0.8, min_samples_leaf=20, min_samples_split=5, n_estimators=100)
-1125293.57683
1


Generation 1 - Current best internal CV score: -1195278.44618
Generation 2 - Current best internal CV score: -1188827.95852
Generation 3 - Current best internal CV score: -1182551.24937
Generation 4 - Current best internal CV score: -1181564.24502
Generation 5 - Current best internal CV score: -1181564.24502

Best pipeline: ExtraTreesRegressor(SelectFwe(input_matrix, alpha=0.018), bootstrap=False, max_features=0.65, min_samples_leaf=10, min_samples_split=20, n_estimators=100)
-1119006.32401
2


Generation 1 - Current best internal CV score: -1183996.65434
Generation 2 - Current best internal CV score: -1182903.75659
Generation 3 - Current best internal CV score: -1180972.83182
Generation 4 - Current best internal CV score: -1178786.73987
Generation 5 - Current best internal CV score: -1178786.73987

Best pipeline: ExtraTreesRegressor(RidgeCV(LassoLarsCV(input_matrix, normalize=True)), bootstrap=True, max_features=0.4, min_samples_leaf=10, min_samples_split=10, n_estimators=100)
-1122429.63203
3


Generation 1 - Current best internal CV score: -1183871.74403
Generation 2 - Current best internal CV score: -1183871.74403
Generation 3 - Current best internal CV score: -1183871.74403
Generation 4 - Current best internal CV score: -1178538.30484
Generation 5 - Current best internal CV score: -1178538.30484

Best pipeline: ExtraTreesRegressor(ElasticNetCV(SelectPercentile(input_matrix, percentile=18), l1_ratio=0.35, tol=0.0001), bootstrap=False, max_features=0.85, min_samples_leaf=20, min_samples_split=19, n_estimators=100)
-1121194.76123
4


Generation 1 - Current best internal CV score: -1179876.09176
Generation 2 - Current best internal CV score: -1179876.09176
Generation 3 - Current best internal CV score: -1179876.09176
Generation 4 - Current best internal CV score: -1179876.09176
Generation 5 - Current best internal CV score: -1179876.09176

Best pipeline: ExtraTreesRegressor(SelectFwe(input_matrix, alpha=0.015), bootstrap=True, max_features=0.75, min_samples_leaf=16, min_samples_split=2, n_estimators=100)
-1119358.04475


## Spot-Check Model Selection

* List potential models with initial optimized params from TPOT
  * Although I did  data preprocessing before using TPOT, it will still do some preprocessing for you.
* calculate average evaluation result and variance
  * All the scoring str in sklearn: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [28]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LassoLarsCV, RidgeCV, ElasticNetCV
from sklearn.feature_selection import SelectFwe, f_regression, SelectPercentile
from tpot.builtins import StackingEstimator
from sklearn.model_selection import cross_val_score

pipeline_lst = [make_pipeline(
    RobustScaler(),
    ExtraTreesRegressor(bootstrap=False, max_features=0.8, min_samples_leaf=20, min_samples_split=5, n_estimators=100)),
    
    make_pipeline(
    SelectFwe(score_func=f_regression, alpha=0.018),
    ExtraTreesRegressor(bootstrap=False, max_features=0.65, min_samples_leaf=10, min_samples_split=20, n_estimators=100)),
        
    make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=RidgeCV()),
    ExtraTreesRegressor(bootstrap=True, max_features=0.4, min_samples_leaf=10, min_samples_split=10, n_estimators=100)),
                
    make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=18),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.35, tol=0.0001)),
    ExtraTreesRegressor(bootstrap=False, max_features=0.85, min_samples_leaf=20, min_samples_split=19, n_estimators=100)),
                
    make_pipeline(
    SelectFwe(score_func=f_regression, alpha=0.015),
    ExtraTreesRegressor(bootstrap=True, max_features=0.75, min_samples_leaf=16, min_samples_split=2, n_estimators=100))]

In [29]:
def evaluate_models(X, y, pipeline_lst, metric='neg_mean_squared_error', cv_folds=10): # by default cv is stratified kfold
    agg_score_lst = []
    for pipeline in pipeline_lst:
        print(pipeline)
        scores = cross_val_score(pipeline, X, y, scoring=metric, cv=cv_folds, n_jobs=-1)
        avg_score = np.mean(scores)
        score_std = np.std(scores)
        agg_score_lst.append((avg_score, score_std))
    return agg_score_lst

In [30]:
agg_score_lst = evaluate_models(tpot_train, target, pipeline_lst)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('extratreesregressor', ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features=0.8, max_leaf_nodes=None, min_impurity_decrease=0.0,
    ...ators=100, n_jobs=None, oob_score=False,
          random_state=None, verbose=0, warm_start=False))])
Pipeline(memory=None,
     steps=[('selectfwe', SelectFwe(alpha=0.018, score_func=<function f_regression at 0x1a17c278c0>)), ('extratreesregressor', ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features=0.65, max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=No...ators=100, n_jobs=None,
          oob_score=False, random_state=None, verbose=0, warm_start=False))])
Pipeline(memory=None,
     steps=[('stackingestimator-1', StackingEstimator(estimator=LassoLarsCV(copy_X=True, cv='warn', eps=2.22044604

In [31]:
print(agg_score_lst)

[(-1166438.9446668164, 42581.834765934145), (-1163621.7129955809, 42356.911860905471), (-1164530.0664142575, 40158.963808928522), (-1163390.391937614, 39140.044374774894), (-1162337.9632020867, 42195.489678611018)]


So base on the spot check here, the last second pipeline is better in both avg score and score variance.

`make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=18),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.35, tol=0.0001)),
    ExtraTreesRegressor(bootstrap=False, max_features=0.85, min_samples_leaf=20, min_samples_split=19, n_estimators=100))`