In [1]:
import ray
import pandas as pd


### Import Methods

#### Way 1

Without relying on the re-export
`from .predictability.utils import get_column_combinations`
in the `src/asd/__init__.py` file (the function is accessed through its full module path).

In [None]:
# Disabled example: the code below is wrapped in a string literal, so it is not
# executed. It shows how to reach get_column_combinations through the full
# module path asd.predictability.utils.
'''
import asd.predictability.utils as asdpu
asdpu.get_column_combinations(all_cols=[1, 2, 3, 4, 5, 6],
                                  inputs=4,
                                  outputs=1,
                                  targets=[5, 6]
                                  )
'''

#### Way 2

Using the re-export
`from .predictability.utils import get_column_combinations`
in the `src/asd/__init__.py` file (the function is then available directly as `asd.get_column_combinations`).

In [None]:
import asd

In [None]:
# Same call as in "Way 1", but through the top-level asd namespace.
asd.get_column_combinations(
    all_cols=[1, 2, 3, 4, 5, 6],
    inputs=4,
    outputs=1,
    targets=[5, 6],
)

In [3]:
# Start the local Ray instance. ignore_reinit_error=True keeps this cell
# re-runnable: a second call on a live kernel no longer raises because Ray is
# already initialised.
ray.init(ignore_reinit_error=True)

0,1
Python version:,3.9.5
Ray version:,2.0.1


# Basic Example

## Load Data

A basic data set consisting of two input columns (linear and random values) and three output columns (sin of the linear x, 1 + sin of the linear x, and random values).

In [3]:
# Read the example table, parsing every column as float.
data = pd.read_csv(
    "example_data.csv",
    dtype=float,
)

In [5]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,lin_x,rand_x,sinx,randomy,sinx_plus1
0,1.0,-0.55,0.099833,0.09,1.099833
1,2.0,-0.44,0.198669,0.83,1.198669
2,3.0,0.94,0.29552,0.12,1.29552
3,4.0,-0.61,0.389418,0.41,1.389418
4,5.0,-0.51,0.479426,-0.7,1.479426


## Get Combinations function

This function can be used to determine the overall number of combinations that the predictability routine analyses, given the number of data columns, the fitting type, etc. This makes it possible to estimate the overall runtime of the predictability routine.

The function returns a list of combination tuples, where the first inputs-many elements correspond to the inputs and the remaining outputs-many elements to the targets. The argument `targets` can be used to define columns that should exclusively be regarded as targets.

In [8]:
# Numerical example: 3 inputs and 1 output, with columns 5 and 6 reserved as targets.
asd.get_column_combinations(
    all_cols=[1, 2, 3, 4, 5, 6],
    inputs=3,
    outputs=1,
    targets=[5, 6],
)

[(1, 2, 3, 5),
 (1, 2, 3, 6),
 (1, 2, 4, 5),
 (1, 2, 4, 6),
 (1, 3, 4, 5),
 (1, 3, 4, 6),
 (2, 3, 4, 5),
 (2, 3, 4, 6)]

In [9]:
# The argument "amount_only" makes the function return only the number of combinations.
asd.get_column_combinations(
    all_cols=[1, 2, 3, 4, 5, 6],
    inputs=3,
    outputs=1,
    targets=[5, 6],
    amount_only=True,
)

8

In [10]:
# The same function applied to the columns of the example data loaded above.
print(
    asd.get_column_combinations(
        all_cols=data.columns,
        inputs=1,
        outputs=1,
        targets=["sinx", "randomy", "sinx_plus1"],
    )
)

[('lin_x', 'sinx'), ('lin_x', 'randomy'), ('lin_x', 'sinx_plus1'), ('rand_x', 'sinx'), ('rand_x', 'randomy'), ('rand_x', 'sinx_plus1')]


In [11]:
# Number of 1-input / 1-output combinations for the example data.
print(
    asd.get_column_combinations(
        all_cols=data.columns,
        inputs=1,
        outputs=1,
        targets=["sinx", "randomy", "sinx_plus1"],
        amount_only=True,
    )
)

6


## Predictability function

Running the predictability function over all possible 1+1 combinations where the respective target is either sin, sin+1 or random and the input is linear x or random values.

The purpose of including sin + 1 is to have a tuple that contains exclusively positive values, so that a power-law fit can be applied.

### "normal" routine

In [None]:
# Standard run: 1 input column and 1 output column per tuple, kNN method,
# restricted to the three target columns.
# NOTE(review): refined_n_best=0 presumably disables the refinement step -- confirm.
metrics_dict, datas_dict = asd.run_predictability(
    data=data,
    input_cols=1,
    output_cols=1,
    col_set=None,
    targets=["sinx", "randomy", "sinx_plus1"],
    method="kNN",
    random_state_split=None,
    refined_n_best=0,
)

_Note that starting the Ray instance accounts for most of the runtime. Run the cell above again and the runtime will be ~1s._

### greedy routine

In [None]:
# Greedy variant (greedy=True) with 2 input columns and 1 output column per tuple.
# NOTE(review): refined_n_best=0 presumably disables the refinement step -- confirm.
greedy_metrics_dict, greedy_datas_dict = asd.run_predictability(
    data=data,
    input_cols=2,
    output_cols=1,
    col_set=None,
    targets=["sinx", "randomy", "sinx_plus1"],
    method="kNN",
    random_state_split=None,
    greedy=True,
    refined_n_best=0,
)

In [None]:
# Show the greedy metrics as a table, transposed relative to the dict layout.
pd.DataFrame.from_dict(greedy_metrics_dict).T

### Results

In [None]:
# Show the metrics of the standard run as a table, transposed relative to the dict layout.
pd.DataFrame.from_dict(metrics_dict).T

In [None]:
# Show the metrics of the greedy run as a table, transposed relative to the dict layout.
pd.DataFrame.from_dict(greedy_metrics_dict).T

### Structure of returned dictionaries

In [None]:
# Summarise what is stored for the first tuple in datas_dict: nested dicts are
# listed by their keys, everything else by its type and (when available) shape.
struc_dict = datas_dict[list(datas_dict)[0]]
for key, value in struc_dict.items():
    if isinstance(value, dict):
        # isinstance also covers dict subclasses, which have no .shape either.
        print(key, "\t dict with key(s):\t", list(value.keys()))
    elif hasattr(value, "shape"):
        print(key, "\t type:\t", type(value), "\t shape:\t", value.shape)
    else:
        # Entries without a .shape attribute would otherwise raise
        # AttributeError and abort the summary; report only their type.
        print(key, "\t type:\t", type(value))

### Plotting

In [None]:
# Plot the results stored for the first tuple in datas_dict.
asd.predictability_plot(
    datas_dict,
    list(datas_dict)[0],
    plot_along=["linear", "mean"],
)

## Tuple Selection function

This function can be used to limit the number of tuples that are subsequently analysed in more detail.

In [None]:
# Select tuples from the metrics of the standard run and display the selection.
# NOTE(review): n_best=2 presumably keeps the two best-scoring tuples -- confirm.
selected_tuples = asd.predictability.core.tuple_selection(
    metrics_dict,
    n_best=2,
)
selected_tuples

## Refine Predictability function

This function is used to further refine the predictability of the previously selected best tuples.

In [None]:
# Refine the previously selected tuples, reusing the data from the standard run.
# NOTE(review): time_left_for_this_task is presumably a time budget in seconds -- confirm.
refined_metrics_dict, refined_datas_dict = asd.refine_predictability(
    best_tuples=selected_tuples,
    data_dict=datas_dict,
    time_left_for_this_task=60,
    n_jobs=-1,
    use_ray=True,
)

In [None]:
# Show the refined metrics as a table, transposed relative to the dict layout.
pd.DataFrame.from_dict(refined_metrics_dict).T

#### Structure of returned dictionaries

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Summarise what is stored for the first tuple in refined_datas_dict.
struc_dict = refined_datas_dict[list(refined_datas_dict)[0]]
for key, value in struc_dict.items():
    if isinstance(value, dict):
        # Only the key is printed; the nested keys are not listed.
        print(key, "\t dict with key(s):\t")
    elif isinstance(value, Pipeline) or not hasattr(value, "shape"):
        # Pipelines, and any other entry without a .shape attribute, are
        # reported by type only instead of raising AttributeError.
        print(key, "\t type:\t", type(value))
    else:
        print(key, "\t type:\t", type(value), "\t shape:\t", value.shape)

#### Plotting

In [None]:
# Plot the refined results for the first tuple, passing the data of the
# standard run as refined_input_datas_dict.
asd.predictability_plot(
    refined_datas_dict,
    list(refined_datas_dict)[0],
    refined_plot=True,
    refined_input_datas_dict=datas_dict,
    plot_along=["init"],  # "linear" and "mean" are left out here
)

## Predictability with direct refined run

In [None]:
# Same configuration as the standard run, but with refined_n_best=1.
# NOTE(review): presumably this refines the single best tuple within the same call -- confirm.
diref_metrics_dict, diref_datas_dict = asd.run_predictability(
    data=data,
    input_cols=1,
    output_cols=1,
    col_set=None,
    targets=["sinx", "randomy", "sinx_plus1"],
    method="kNN",
    random_state_split=None,
    refined_n_best=1,
)

In [None]:
# Plot the directly refined results for the first tuple; no separate
# refined_input_datas_dict is passed here.
asd.predictability_plot(
    diref_datas_dict,
    list(diref_datas_dict)[0],
    refined_plot=True,
    plot_along=["init"],  # "linear" and "mean" are left out here
)

In [None]:
# Show the metrics of the direct refined run as a table, transposed relative to the dict layout.
pd.DataFrame.from_dict(diref_metrics_dict).T

In [None]:
# Plot the refined results for the first tuple, passing the data of the
# standard run as refined_input_datas_dict.
asd.predictability_plot(
    refined_datas_dict,
    list(refined_datas_dict)[0],
    refined_plot=True,
    refined_input_datas_dict=datas_dict,
    plot_along=["init"],  # "linear" and "mean" are left out here
)

## Predictability with direct refined run

In [None]:
# Same configuration as the standard run, but with refined_n_best=1.
# NOTE(review): presumably this refines the single best tuple within the same call -- confirm.
diref_metrics_dict, diref_datas_dict = asd.run_predictability(
    data=data,
    input_cols=1,
    output_cols=1,
    col_set=None,
    targets=["sinx", "randomy", "sinx_plus1"],
    method="kNN",
    random_state_split=None,
    refined_n_best=1,
)

In [8]:
# Plot the directly refined results for the first tuple; no separate
# refined_input_datas_dict is passed here.
asd.predictability_plot(
    diref_datas_dict,
    list(diref_datas_dict)[0],
    refined_plot=True,
    plot_along=["init"],  # "linear" and "mean" are left out here
)

In [7]:
# Show the metrics of the direct refined run as a table, transposed relative to the dict layout.
pd.DataFrame.from_dict(diref_metrics_dict).T

Unnamed: 0,Unnamed: 1,kNN r2,linear r2,pow. law r2,mean r2,kNN RMSE,linear RMSE,pow. law RMSE,mean RMSE,kNN RMSE/std,linear RMSE/std,...,mean MAPE,kNN rae,linear rae,pow. law rae,mean rae,kNN dcor,linear dcor,pow. law dcor,mean dcor,refined_metrics
lin_x,sinx,0.995141,-0.020562,,-0.001055,0.044605,0.64646,,0.640252,0.069704,1.010229,...,0.906842,0.070344,1.013917,,0.994924,0.996683,0.37989,,0.0,"{'r2': 0.9967748393334996, 'RMSE': 0.036341041..."
lin_x,randomy,0.022256,-0.004075,,-0.00634,0.617967,0.626233,,0.626938,0.98881,1.002036,...,1.01127,0.97886,0.994079,,0.993836,0.318714,0.223778,,0.000212,
lin_x,sinx_plus1,0.990412,-0.054185,-0.366504,-0.067661,0.061225,0.641995,0.730934,0.646085,0.097916,1.026735,...,3.218067,0.084378,1.018854,1.149817,1.02164,0.993079,0.419273,0.417945,0.0,
rand_x,sinx,-0.660264,-0.021725,,-0.010156,0.860173,0.674784,,0.670952,1.288512,1.010804,...,1.147547,1.200781,0.999628,,0.992721,0.199828,0.325765,,0.0,
rand_x,randomy,-0.142981,-0.149405,,-0.023304,0.535255,0.536757,,0.506458,1.069103,1.072103,...,1.117261,1.0888,1.067399,,1.021308,0.279583,0.330535,,7.9e-05,
rand_x,sinx_plus1,-0.126655,-0.066726,,-0.001244,0.66203,0.644182,,0.624097,1.06144,1.032824,...,517.198107,1.055223,1.041813,,1.005622,0.321271,0.432239,,0.000134,
lin_x_plus1,sinx,0.97654,-0.248292,,-0.253399,0.093313,0.680676,,0.682067,0.153166,1.11727,...,1.915628,0.127478,1.094805,,1.099285,0.985258,0.46326,,0.000143,
lin_x_plus1,randomy,-0.117802,-0.032797,,-0.035206,0.661415,0.635769,,0.63651,1.057261,1.016266,...,1.021926,1.003721,0.990373,,0.989117,0.379206,0.312616,,0.000244,
lin_x_plus1,sinx_plus1,0.977159,-0.111789,-0.984062,-0.046013,0.092531,0.645563,0.862393,0.626176,0.151133,1.054414,...,0.857867,0.132254,1.057068,1.330046,1.017172,0.989511,0.531361,0.581373,0.0,
