In [1]:
import ray

# Start (or attach to) a local Ray instance for the parallel steps used later on.
# ignore_reinit_error=True keeps this cell re-runnable: without it, calling ray.init()
# a second time in the same kernel raises because Ray is already initialised.
ray.init(ignore_reinit_error=True)

2022-11-29 15:11:20,299	INFO worker.py:1518 -- Started a local Ray instance.


0,1
Python version:,3.9.5
Ray version:,2.0.1


In [None]:
# Stop the local Ray instance started by ray.init() above.
# NOTE(review): this cell was left unexecuted in the saved run; running it before the
# analysis below presumably forces Ray to be started again later — confirm whether it
# is meant to be run only once the analysis is finished.
ray.shutdown()

# Basic Example

## Load Data

The example data can be understood as two input columns (linear and random values) and three output columns (sine of linear x, 1 + sine of linear x, and random values).

In [2]:
import pandas as pd

In [3]:
# Load the example data set from the working directory; dtype=float makes pandas
# parse every column as floating point.
data = pd.read_csv("example_data.csv", dtype=float)

In [4]:
# Preview the first rows to check that the columns were read as expected.
data.head()

Unnamed: 0,lin_x,rand_x,sinx,randomy,sinx_plus1
0,1.0,-0.55,0.099833,0.09,1.099833
1,2.0,-0.44,0.198669,0.83,1.198669
2,3.0,0.94,0.29552,0.12,1.29552
3,4.0,-0.61,0.389418,0.41,1.389418
4,5.0,-0.51,0.479426,-0.7,1.479426


## Get Combinations function

This function determines the total number of combinations that the predictability routine analyses for a given number of data columns, fitting type, etc. This makes it possible to estimate the overall runtime of the predictability routine.

In [1]:
from src.ASD_predictability_utils.utils import get_column_combinations



The function returns a list of combination tuples, where the first `inputs` elements of each tuple are the input columns and the remaining `outputs` elements are the targets. The argument `targets` can be used to define columns that should be regarded exclusively as targets.

In [7]:
# Numerical example: six columns, of which 5 and 6 are reserved as targets.
# Each resulting tuple holds three inputs followed by one target.
get_column_combinations(
    all_cols=list(range(1, 7)),
    inputs=3,
    outputs=1,
    targets=[5, 6],
)

[(1, 2, 3, 5),
 (1, 2, 3, 6),
 (1, 2, 4, 5),
 (1, 2, 4, 6),
 (1, 3, 4, 5),
 (1, 3, 4, 6),
 (2, 3, 4, 5),
 (2, 3, 4, 6)]

In [8]:
# Same numerical example, but with amount_only=True the function outputs only the
# amount of combinations instead of the combinations themselves.
get_column_combinations(
    all_cols=list(range(1, 7)),
    inputs=3,
    outputs=1,
    targets=[5, 6],
    amount_only=True,
)

8

In [9]:
# Applied to the example data: pair one input column with one of the three target columns.
print(
    get_column_combinations(
        all_cols=data.columns,
        inputs=1,
        outputs=1,
        targets=["sinx", "randomy", "sinx_plus1"],
    )
)

[('lin_x', 'sinx'), ('lin_x', 'randomy'), ('lin_x', 'sinx_plus1'), ('rand_x', 'sinx'), ('rand_x', 'randomy'), ('rand_x', 'sinx_plus1')]


In [10]:
# Same call on the example data, but only report how many combinations there are.
print(
    get_column_combinations(
        all_cols=data.columns,
        inputs=1,
        outputs=1,
        targets=["sinx", "randomy", "sinx_plus1"],
        amount_only=True,
    )
)

6


## Predictability function

Running the predictability function over all possible 1+1 combinations where the respective target is either sin, sin+1 or random and the input is linear x or random values.

The purpose of including sin + 1 is to have a tuple that contains exclusively positive values, so that a power-law fit can be applied.

In [5]:
from bin.main import predictability



In [6]:
# Run the predictability routine with the kNN method over all combinations of one
# input column and one of the three target columns.
# NOTE(review): random_state_split=None presumably leaves the data split unseeded, so
# the metrics shown below may differ between runs — confirm.
metrics_dict, datas_dict = predictability(
    data=data,
    input_cols=1,
    output_cols=1,
    col_set=None,
    targets=["sinx", "randomy", "sinx_plus1"],
    method="kNN",
    random_state_split=None,
)

[2m[36m(parallel_pred_step_kNN pid=43240)[0m Fitting 3 folds for each of 6 candidates, totalling 18 fits
[2m[36m(parallel_pred_step_kNN pid=43235)[0m Fitting 3 folds for each of 6 candidates, totalling 18 fits
[2m[36m(parallel_pred_step_kNN pid=43249)[0m Fitting 3 folds for each of 6 candidates, totalling 18 fits
[2m[36m(parallel_pred_step_kNN pid=43244)[0m Fitting 3 folds for each of 6 candidates, totalling 18 fits




[2m[36m(parallel_pred_step_kNN pid=43237)[0m Fitting 3 folds for each of 6 candidates, totalling 18 fits




[2m[36m(parallel_pred_step_kNN pid=43247)[0m Fitting 3 folds for each of 6 candidates, totalling 18 fits


202211:15:11:42 : INFO : main : predictability : 165 : The whole run took 9.99s.


##### Note that starting the Ray instance accounts for most of the runtime. Re-run the cell above and the runtime will be <1s.

In [7]:
# Show the metrics as a table: one row per (input, target) tuple, one column per metric.
pd.DataFrame.from_dict(metrics_dict).T

Unnamed: 0,Unnamed: 1,kNN r2,linear r2,pow. law r2,mean r2,kNN RMSE,linear RMSE,pow. law RMSE,mean RMSE,kNN RMSE/std,linear RMSE/std,...,pow. law MAPE,mean MAPE,kNN rae,linear rae,pow. law rae,mean rae,kNN dcor,linear dcor,pow. law dcor,mean dcor
lin_x,sinx,0.975773,-0.094976,,-0.086062,0.111329,0.748444,,0.745391,0.15565,1.046411,...,,1.535684,37.772541,30.095942,,29.94321,0.98579,0.468877,,0.000216
lin_x,randomy,-0.074181,-0.090311,,-0.020105,0.631771,0.636496,,0.615663,1.036427,1.04418,...,,0.999583,29.926694,29.607401,,29.589768,0.207838,0.389297,,0.0
lin_x,sinx_plus1,0.996178,-0.013076,-0.412493,-0.00519,0.04529,0.73733,0.870631,0.734455,0.061824,1.006517,...,8.547186,14.02315,36.490942,30.802241,36.913528,30.641519,0.997819,0.422273,0.425928,0.000245
rand_x,sinx,0.025909,0.026123,,-0.010687,0.654294,0.654222,,0.666471,0.98696,0.986852,...,,1.071262,31.637508,30.987042,,30.701637,0.405535,0.371991,,0.000171
rand_x,randomy,-0.247209,-0.022426,,-0.009712,0.644845,0.583851,,0.580209,1.116785,1.011151,...,,0.968305,31.409502,30.323394,,30.247089,0.318527,0.211952,,0.00011
rand_x,sinx_plus1,-0.027956,0.004712,,-0.013886,0.638268,0.628044,,0.633884,1.013882,0.997641,...,,527.005458,30.334707,29.838283,,29.76062,0.292305,0.329566,,0.0


## Tuple Selection function

This function can be used to limit the number of tuples that are subsequently analysed in more detail.

In [8]:
from bin.main import tuple_selection

In [9]:
# Keep only the two best tuples from the metrics computed above (n_best=2);
# they are passed on to the refinement step below.
selected_tuples = tuple_selection(metrics_dict, n_best=2)
# Display the selection as the cell output.
selected_tuples

[('lin_x', 'sinx_plus1'), ('lin_x', 'sinx')]

## Refine Predictability function

This function is used to further refine the predictability of the previously selected best tuples.

In [10]:
from bin.main import refine_predictability

### auto-sklearn

In [None]:
# Refine the predictability of the selected tuples using the auto-sklearn backend.
# NOTE(review): per_run_time_limit equals time_left_for_this_task (both 30), so a single
# run may presumably consume the whole budget — confirm this is intended.
autoskl_refined_metrics_dict, autoskl_refined_datas_dict = refine_predictability(
    best_tuples=selected_tuples,
    data_dict=datas_dict,
    time_left_for_this_task=30,
    per_run_time_limit=30,
    n_jobs=-1,
    use_ray=True,
    package="autosklearn",
)

In [20]:
# Refined auto-sklearn metrics as a table: one row per tuple, one column per metric.
pd.DataFrame.from_dict(autoskl_refined_metrics_dict).T

Unnamed: 0,Unnamed: 1,r2,RMSE,MAPE,rae,dcor
lin_x,sinx_plus1,1.0,1.609804e-07,6.822043e-08,1.01615e-07,1.0
lin_x,sinx,0.867775,0.24251,0.05620455,0.08158683,0.935042


### Hyperopt

In [None]:
# Refine the predictability of the selected tuples using the Hyperopt backend.
# NOTE(review): per_run_time_limit (90) is larger than time_left_for_this_task (30);
# confirm how the Hyperopt backend interprets these two limits.
hyperopt_refined_metrics_dict, hyperopt_refined_datas_dict = refine_predictability(
    best_tuples=selected_tuples,
    data_dict=datas_dict,
    time_left_for_this_task=30,
    per_run_time_limit=90,
    n_jobs=-1,
    use_ray=True,
    package="hyperopt",
)

In [18]:
# Refined Hyperopt metrics as a table: one row per tuple, one column per metric.
pd.DataFrame.from_dict(hyperopt_refined_metrics_dict).T

Unnamed: 0,Unnamed: 1,r2,RMSE,MAPE,rae,dcor
lin_x,sinx_plus1,0.84202,0.269841,1.303316,0.410102,0.908912
lin_x,sinx,0.863891,0.246046,0.22667,0.154319,0.928859


In [18]:
# Print the "ensemble" entry stored by the Hyperopt refinement for every tuple.
for tuple_key, refined in hyperopt_refined_datas_dict.items():
    print(tuple_key, refined["ensemble"])


('lin_x', 'sinx') {'learner': ExtraTreesRegressor(bootstrap=True, max_features='sqrt', n_estimators=2752,
                    n_jobs=1, random_state=4, verbose=False), 'preprocs': (StandardScaler(with_mean=False),), 'ex_preprocs': ()}
('lin_x', 'sinx_plus1') {'learner': ExtraTreesRegressor(max_features=None, n_estimators=1421, n_jobs=1,
                    random_state=3, verbose=False), 'preprocs': (StandardScaler(with_std=False),), 'ex_preprocs': ()}


### TPOT

In [None]:
# Refine the predictability of the selected tuples using the TPOT backend,
# with a larger overall time budget (120) than the other two backends.
tpot_refined_metrics_dict, tpot_refined_datas_dict = refine_predictability(
    best_tuples=selected_tuples,
    data_dict=datas_dict,
    time_left_for_this_task=120,
    per_run_time_limit=90,
    n_jobs=-1,
    use_ray=True,
    package="tpot",
)

In [12]:
# Refined TPOT metrics as a table: one row per tuple, one column per metric.
pd.DataFrame.from_dict(tpot_refined_metrics_dict).T

Unnamed: 0,Unnamed: 1,r2,RMSE,MAPE,rae,dcor
lin_x,sinx_plus1,0.995503,0.049124,0.159309,0.051666,0.99709
lin_x,sinx,0.984351,0.089474,0.257285,0.100065,0.989501


In [13]:
# Print the "ensemble" entry (the fitted pipeline) stored by the TPOT refinement for every tuple.
for tuple_key, refined in tpot_refined_datas_dict.items():
    print(tuple_key, refined["ensemble"])


('lin_x', 'sinx_plus1') Pipeline(steps=[('stackingestimator',
                 StackingEstimator(estimator=ExtraTreesRegressor(max_features=0.25,
                                                                 min_samples_leaf=3,
                                                                 min_samples_split=15))),
                ('xgbregressor',
                 XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              f...one, gamma=0, gpu_id=-1,
                              grow_policy='depthwise', importance_type=None,
                              interaction_constraints='', learning_rate=0.1,
                              max_bin=256, max_cat_threshold=64,
                              max_cat_to_onehot=4, max_delta_step

In [23]:
# Print the "pareto_pipelines" entry stored by the TPOT refinement for every tuple.
for tuple_key, refined in tpot_refined_datas_dict.items():
    print(tuple_key, refined["pareto_pipelines"])

('lin_x', 'sinx') {'KNeighborsRegressor(input_matrix, KNeighborsRegressor__n_neighbors=5, KNeighborsRegressor__p=2, KNeighborsRegressor__weights=distance)': Pipeline(steps=[('kneighborsregressor',
                 KNeighborsRegressor(weights='distance'))]), 'LassoLarsCV(RBFSampler(input_matrix, RBFSampler__gamma=0.30000000000000004), LassoLarsCV__normalize=False)': Pipeline(steps=[('rbfsampler', RBFSampler(gamma=0.30000000000000004)),
                ('lassolarscv', LassoLarsCV(normalize=False))]), 'LassoLarsCV(RBFSampler(MaxAbsScaler(CombineDFs(CombineDFs(input_matrix, CombineDFs(input_matrix, input_matrix)), input_matrix)), RBFSampler__gamma=0.55), LassoLarsCV__normalize=False)': Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('featureunion',
                                                 FeatureUnion(transformer_list=[('functiontransformer',
                                                                                 FunctionTransformer(func=<

In [26]:
# List only the names of the Pareto pipelines for the second refined tuple;
# the names alone are far more compact than the full pipeline printout.
second_tuple = list(tpot_refined_datas_dict)[1]
tpot_refined_datas_dict[second_tuple]["pareto_pipelines"].keys()

dict_keys(['KNeighborsRegressor(input_matrix, KNeighborsRegressor__n_neighbors=4, KNeighborsRegressor__p=1, KNeighborsRegressor__weights=distance)', 'LassoLarsCV(RBFSampler(input_matrix, RBFSampler__gamma=0.05), LassoLarsCV__normalize=False)'])

In [22]:
# Print the "all_individuals" entry stored by the TPOT refinement for every tuple.
for tuple_key, refined in tpot_refined_datas_dict.items():
    print(tuple_key, refined["all_individuals"])

('lin_x', 'sinx') {'DecisionTreeRegressor(input_matrix, DecisionTreeRegressor__max_depth=2, DecisionTreeRegressor__min_samples_leaf=18, DecisionTreeRegressor__min_samples_split=17)': {'generation': 0, 'mutation_count': 0, 'crossover_count': 0, 'predecessor': ('ROOT',), 'operator_count': 1, 'internal_cv_score': -0.16192083307945257}, 'RidgeCV(RobustScaler(input_matrix))': {'generation': 0, 'mutation_count': 0, 'crossover_count': 0, 'predecessor': ('ROOT',), 'operator_count': 2, 'internal_cv_score': -0.43610853916798514}, 'KNeighborsRegressor(ZeroCount(input_matrix), KNeighborsRegressor__n_neighbors=14, KNeighborsRegressor__p=2, KNeighborsRegressor__weights=distance)': {'generation': 0, 'mutation_count': 0, 'crossover_count': 0, 'predecessor': ('ROOT',), 'operator_count': 2, 'internal_cv_score': -0.06571448102548552}, 'DecisionTreeRegressor(input_matrix, DecisionTreeRegressor__max_depth=4, DecisionTreeRegressor__min_samples_leaf=16, DecisionTreeRegressor__min_samples_split=6)': {'generat

In [None]:
# Print every regressor of the ensemble stored for the first refined tuple together
# with its ensemble weight.
# NOTE(review): the original cell read from `refined_datas_dict`, which is not defined
# anywhere in this notebook and fails with a NameError on a fresh kernel. The
# "ensemble_models" layout ("ensemble_weight" / "sklearn_regressor") looks like the
# auto-sklearn result, so `autoskl_refined_datas_dict` is used here — confirm.
first_tuple = next(iter(autoskl_refined_datas_dict))
ensemble_models = autoskl_refined_datas_dict[first_tuple]["ensemble_models"]
for model_id, regressor in ensemble_models["sklearn_regressor"].items():
    print(f'Weight: {ensemble_models["ensemble_weight"][model_id]}, \t Regressor: {regressor}')