In [2]:
import ray

# ignore_reinit_error=True keeps this cell safe to re-run: without it a
# second ray.init() in the same kernel raises a RuntimeError.
ray.init(ignore_reinit_error=True)

2022-11-08 15:24:24,606	INFO worker.py:1518 -- Started a local Ray instance.


0,1
Python version:,3.9.5
Ray version:,2.0.1


# Basic Example

## Load Data

The example data consists of two input columns (`lin_x`: linear values, `rand_x`: random values) and three output columns (`sinx`: sin of linear x, `sinx_plus1`: 1 + sin of linear x, `randomy`: random values).

In [3]:
import pandas as pd

In [4]:
# Load the example table, reading every column as float.
data = pd.read_csv(
    "example_data.csv",
    dtype=float,
)

In [5]:
# Preview the first rows of the loaded frame (head() defaults to 5 rows).
data.head()

Unnamed: 0,lin_x,rand_x,sinx,randomy,sinx_plus1
0,1.0,-0.55,0.099833,0.09,1.099833
1,2.0,-0.44,0.198669,0.83,1.198669
2,3.0,0.94,0.29552,0.12,1.29552
3,4.0,-0.61,0.389418,0.41,1.389418
4,5.0,-0.51,0.479426,-0.7,1.479426


## Get Combinations function

This function determines the total number of combinations that the predictability routine will analyse, given the number of data columns, the fitting type, etc. This makes it possible to estimate the overall runtime of the predictability routine.

In [6]:
from src.ASD_predictability_utils.utils import get_column_combinations

The function returns a list of combination tuples, where the first `inputs` elements of each tuple are the input columns and the remaining `outputs` elements are the targets. The `targets` argument can be used to define columns that should be regarded exclusively as targets.

In [7]:
# Numerical example: 3 input columns plus 1 output column per tuple,
# with columns 5 and 6 passed as the targets.
get_column_combinations(
    all_cols=[1, 2, 3, 4, 5, 6],
    inputs=3,
    outputs=1,
    targets=[5, 6],
)

[(1, 2, 3, 5),
 (1, 2, 3, 6),
 (1, 2, 4, 5),
 (1, 2, 4, 6),
 (1, 3, 4, 5),
 (1, 3, 4, 6),
 (2, 3, 4, 5),
 (2, 3, 4, 6)]

In [8]:
# Passing amount_only=True outputs only the number of combinations
# instead of the list of tuples.
get_column_combinations(
    all_cols=[1, 2, 3, 4, 5, 6],
    inputs=3,
    outputs=1,
    targets=[5, 6],
    amount_only=True,
)

8

In [9]:
# The same helper applied to the columns of the example data loaded above.
print(
    get_column_combinations(
        all_cols=data.columns,
        inputs=1,
        outputs=1,
        targets=["sinx", "randomy", "sinx_plus1"],
    )
)

[('lin_x', 'sinx'), ('lin_x', 'randomy'), ('lin_x', 'sinx_plus1'), ('rand_x', 'sinx'), ('rand_x', 'randomy'), ('rand_x', 'sinx_plus1')]


In [10]:
# Count only: how many 1-input/1-output combinations the example data yields.
print(
    get_column_combinations(
        all_cols=data.columns,
        inputs=1,
        outputs=1,
        targets=["sinx", "randomy", "sinx_plus1"],
        amount_only=True,
    )
)

6


## Predictability function

Running the predictability function over all possible 1+1 combinations where the respective target is either sin, sin+1 or random and the input is linear x or random values.

The purpose of including sin + 1 is to have a target that contains only positive values, so that a power law can be fitted.

In [11]:
from bin.main import predictability

In [12]:
# Run the kNN predictability analysis over every 1-input/1-output
# combination whose target is one of the listed columns.
# NOTE(review): random_state_split=None presumably leaves the train/test
# split unseeded, so the metrics may differ between runs -- confirm against
# predictability() and pass a fixed value if reproducibility is needed.
metrics_dict, datas_dict = predictability(
    data=data,
    input_cols=1,
    output_cols=1,
    col_set=None,
    targets=["sinx", "randomy", "sinx_plus1"],
    method="kNN",
    random_state_split=None,
)

[2m[36m(parallel_pred_step_kNN pid=102781)[0m Analysing ('rand_x', 'sinx_plus1') now.
[2m[36m(parallel_pred_step_kNN pid=102781)[0m start kNN routine
[2m[36m(parallel_pred_step_kNN pid=102781)[0m Fitting 3 folds for each of 6 candidates, totalling 18 fits
[2m[36m(parallel_pred_step_kNN pid=102786)[0m Analysing ('lin_x', 'sinx_plus1') now.
[2m[36m(parallel_pred_step_kNN pid=102786)[0m start kNN routine
[2m[36m(parallel_pred_step_kNN pid=102786)[0m Fitting 3 folds for each of 6 candidates, totalling 18 fits
[2m[36m(parallel_pred_step_kNN pid=102787)[0m Analysing ('lin_x', 'sinx') now.
[2m[36m(parallel_pred_step_kNN pid=102787)[0m start kNN routine
[2m[36m(parallel_pred_step_kNN pid=102787)[0m Fitting 3 folds for each of 6 candidates, totalling 18 fits
[2m[36m(parallel_pred_step_kNN pid=102778)[0m Analysing ('lin_x', 'randomy') now.
[2m[36m(parallel_pred_step_kNN pid=102778)[0m start kNN routine
[2m[36m(parallel_pred_step_kNN pid=102778)[0m Fitting 3 fo

##### Note that starting the Ray instance accounts for most of the runtime. Re-run the cell above and the runtime will be <1s.

In [13]:
# Tabulate the metrics: one row per (input, target) tuple, one column per metric.
pd.DataFrame.from_dict(metrics_dict).T

Unnamed: 0,Unnamed: 1,kNN r2,linear r2,pow. law r2,mean r2,kNN RMSE,linear RMSE,pow. law RMSE,mean RMSE,kNN RMSE/std,linear RMSE/std,...,pow. law MAPE,mean MAPE,kNN rae,linear rae,pow. law rae,mean rae,kNN dcor,linear dcor,pow. law dcor,mean dcor
lin_x,sinx,0.927973,-0.10376,,-0.031741,0.180688,0.707328,,0.683863,0.268378,1.0506,...,,1.025132,36.373312,30.354678,,30.225325,0.969348,0.472007,,0.000214
lin_x,randomy,0.076147,-0.005914,,-0.005303,0.600267,0.62636,,0.626169,0.961173,1.002953,...,,0.9975,29.864493,29.657798,,29.659535,0.459352,0.242814,,0.0
lin_x,sinx_plus1,0.986451,-0.205439,-0.959307,-0.077711,0.077002,0.72631,0.925978,0.686753,0.116401,1.097925,...,304.297736,504.329381,38.221088,35.267212,45.202231,33.264498,0.990308,0.435462,0.426597,0.0
rand_x,sinx,0.001501,0.004933,,-0.013052,0.632834,0.631745,,0.637429,0.999249,0.99753,...,,0.977183,32.306825,31.637759,,30.723015,0.274593,0.319025,,0.000193
rand_x,randomy,-0.326034,-0.220654,,-0.211552,0.690875,0.662854,,0.660378,1.151536,1.104832,...,,1.450535,35.179767,34.545465,,34.264513,0.23472,0.263941,,0.0
rand_x,sinx_plus1,-0.121684,-0.139449,,-0.169557,0.704903,0.710463,,0.719788,1.059096,1.06745,...,,557.091293,31.70171,31.119089,,30.952806,0.354361,0.358192,,0.0


## Tuple Selection function

This function can be used to limit the number of tuples that are subsequently analysed in more detail.

In [14]:
from bin.main import tuple_selection

In [15]:
# Ask tuple_selection for n_best=2 tuples based on the metrics computed by
# predictability(); the bare name on the last line displays the selection.
selected_tuples = tuple_selection(metrics_dict, n_best=2)
selected_tuples

[('lin_x', 'sinx_plus1'), ('lin_x', 'sinx')]

## Refine Predictability function

This function is used to further refine the predictability of the previously selected best tuples.

In [16]:
from bin.main import refine_predictability

In [17]:
# Refine the previously selected tuples.
# NOTE(review): the two time-limit arguments are presumably given in
# seconds -- confirm against refine_predictability().
refined_metrics_dict, refined_datas_dict = refine_predictability(
    best_tuples=selected_tuples,
    data_dict=datas_dict,
    time_left_for_this_task=120,
    per_run_time_limit=30,
    n_jobs=-1,
)

[2m[36m(parallel_refinement_step pid=102789)[0m Perhaps you already have a cluster running?
[2m[36m(parallel_refinement_step pid=102789)[0m Hosting the HTTP server on port 44551 instead


[2m[36m(parallel_refinement_step pid=102786)[0m Train R2 score: 0.9476159378347307
[2m[36m(parallel_refinement_step pid=102786)[0m Test R2 score: 0.9580847241030104


In [18]:
# Tabulate the refined metrics: one row per selected tuple.
pd.DataFrame.from_dict(refined_metrics_dict).T

Unnamed: 0,Unnamed: 1,r2,RMSE,MAPE,rae,dcor
lin_x,sinx_plus1,0.999999,0.000562,1.336834,38.394055,1.0
lin_x,sinx,0.958085,0.137838,0.212106,35.932202,0.989433


In [24]:
# Resolve the ensemble of the first refined tuple once, instead of repeating
# the nested lookup three times on every loop iteration.
first_refined_tuple = next(iter(refined_datas_dict))
first_ensemble = refined_datas_dict[first_refined_tuple]["ensemble_models"]
ensemble_weights = first_ensemble["ensemble_weight"]
ensemble_regressors = first_ensemble["sklearn_regressor"]

# Print each ensemble member's weight next to its fitted regressor.
for key in ensemble_regressors.keys():
    print(f'Weight: {ensemble_weights[key]}, \t Regressor: {ensemble_regressors[key]}')

Weight: 1.0, 	 Regressor: GaussianProcessRegressor(alpha=5.211916694590534e-14,
                         kernel=RBF(length_scale=1), n_restarts_optimizer=10,
                         normalize_y=True, random_state=1)


In [29]:
# Show the full ensemble table (weight, rank, regressor) of the first refined tuple.
refined_datas_dict[list(refined_datas_dict)[0]]["ensemble_models"]

Unnamed: 0,ensemble_weight,rank,sklearn_regressor
83,1.0,1,GaussianProcessRegressor(alpha=5.2119166945905...
