In [2]:
import numpy as np
import pandas as pd
import ray

In [None]:
# Start Ray for this kernel; a later cell passes use_ray=True.
# ignore_reinit_error=True keeps this cell re-runnable: without it, executing
# ray.init() a second time in the same kernel raises a RuntimeError.
ray.init(ignore_reinit_error=True)

In [None]:
import asd

# The data

In [5]:
# Load the JET table; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="JET_(2036, 122)_DB3V13.csv")

In [6]:
# Preview the first five rows (5 is the pandas default for head()).
df.head(n=5)

Unnamed: 0,TOK,TOK_ID,DIVNAME,LCUPDATE,DATE,SHOT,TIME,TIME_ID,T1,T2,...,IAEA92,DB2P5,DB2P8,DB3IS,DB3V5,IAE2000N,IAE2000X,HMWS2003,SELDB3,SELDB3X
0,JET,6,MarkGBSR,20031119,20020530,56145,62.6474,62647,62.56,62.75,...,0,0,0,0,0,0,0,0,1111111010,1110
1,JET,6,MarkGBSR,20031119,20020916,56603,64.882996,64883,64.8,65.0,...,0,0,0,0,0,0,0,0,1111111111,11111100
2,JET,6,MarkGBSR,20031119,20020916,56605,64.882996,64883,64.8,65.0,...,0,0,0,0,0,0,0,0,1111111111,11111100
3,JET,6,MarkGBSR,20031119,20020916,56606,64.882996,64883,64.8,65.0,...,0,0,0,0,0,0,0,0,1111111111,11111100
4,JET,6,MarkGBSR,20031119,20020916,56610,65.379761,65380,65.25,65.5,...,0,0,0,0,0,0,0,0,1111111111,11111100


In [15]:
# Pool of columns from which 4-1 fits (4 inputs, 1 output) may be built.
cols_4 = [
    "PLTH", "TAUTOT",  # the two columns passed as `targets` in later cells
    "AMIN", "RGEO", "KAPPA", "KAPPAA", "KAREA", "VOL",
    "NEL", "PNBI", "PINJ",
    "WTH", "WTOT", "IP",
    "DELTA", "SEPLIM", "XPLIM", "AREA",
]

# Predictability Run

First check the number of combinations:

In [12]:
# With amount_only=True the call displays a single number (see the output
# below) for 4-input / 1-output fits over cols_4 with the two given targets.
asd.get_column_combinations(
    all_cols=cols_4,
    inputs=4,
    outputs=1,
    targets=["PLTH", "TAUTOT"],
    amount_only=True,
)

3640

These 3640 combinations should be analysed in reasonable time when using ``method="kNN"``. If the run still takes too long, feel free to reduce ``cols_4`` further.

In [None]:
# Run the predictability analysis on df for 4-input / 1-output fits built from
# cols_4, with PLTH and TAUTOT as targets and the kNN method.
# Returns two dicts: the metrics and the associated data, both reused below.
metrics_dict, datas_dict = asd.run_predictability(
    data=df,
    input_cols=4,
    output_cols=1,
    col_set=cols_4,
    primkey_cols=["TOK"],
    targets=["PLTH", "TAUTOT"],
    method="kNN",
    # NOTE(review): None presumably means an unseeded split, so numbers may
    # differ between runs — confirm against the asd documentation.
    random_state_split=None,
    refined_n_best=0,
)

Let's have a look at the results:

In [17]:
# One row per tuple, one column per metric, best kNN r2 first.
metrics_df = (
    pd.DataFrame.from_dict(metrics_dict)
    .T
    .sort_values(by="kNN r2", ascending=False)
)

In [18]:
# Show the five tuples with the highest kNN r2.
metrics_df.head(n=5)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,kNN r2,linear r2,pow. law r2,mean r2,kNN RMSE,linear RMSE,pow. law RMSE,mean RMSE,kNN RMSE/std,linear RMSE/std,...,pow. law MAPE,mean MAPE,kNN rae,linear rae,pow. law rae,mean rae,kNN dcor,linear dcor,pow. law dcor,mean dcor
NEL,KAREA,PNBI,AMIN,PLTH,0.960817,0.892848,,-0.001207,918523.507544,1518940.0,,4643039.0,0.197947,0.327341,...,,1.210217,0.164337,0.291313,,1.003275,0.975373,0.943493,,0.0
AREA,NEL,PNBI,KAPPA,PLTH,0.960785,0.888489,,-0.000367,927024.033367,1563225.0,,4682117.0,0.198029,0.333933,...,,1.300888,0.158087,0.289579,,1.001617,0.975281,0.936923,,0.00035
VOL,NEL,SEPLIM,PNBI,PLTH,0.955914,0.897683,,-0.006571,960022.613494,1462531.0,,4587253.0,0.209967,0.319871,...,,1.297672,0.169171,0.289345,,1.007241,0.972935,0.944284,,0.0
AREA,IP,PNBI,WTH,PLTH,0.955079,0.891353,,-0.000565,986172.76252,1533682.0,,4654238.0,0.211947,0.329617,...,,1.228608,0.159112,0.300718,,0.999032,0.974066,0.941806,,0.001305
VOL,NEL,PNBI,KAPPA,PLTH,0.954949,0.879881,,-0.001392,967828.434491,1580343.0,,4562976.0,0.212252,0.346581,...,,1.21132,0.170784,0.306019,,1.003358,0.973653,0.939394,,0.0


## Refine predictability

First select the tuples that should be further analysed.

In [19]:
# Choose the tuples to refine from the initial metrics (n_best=3).
selected_tuples = asd.tuple_selection(metrics_dict, n_best=3)

Then run the ``refine_predictability`` routine.

In [None]:
# Re-analyse only the selected tuples, reusing the data from the initial run.
# NOTE(review): time_left_for_this_task looks like a time budget (units not
# visible here) and use_ray=True presumably relies on the Ray session started
# earlier — confirm against the asd documentation.
refined_metrics_dict, refined_datas_dict = asd.refine_predictability(
    best_tuples=selected_tuples,
    data_dict=datas_dict,
    time_left_for_this_task=60,
    n_jobs=-1,
    use_ray=True,
)

In [22]:
# Tabulate the refined metrics: one row per refined tuple, one column per metric.
pd.DataFrame.from_dict(refined_metrics_dict).T

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,r2,RMSE,MAPE,rae,dcor
NEL,KAREA,PNBI,AMIN,PLTH,0.968342,825628.154668,0.117957,0.139893,0.980591
AREA,NEL,PNBI,KAPPA,PLTH,0.967612,842474.221484,0.143575,0.133107,0.980397
VOL,NEL,SEPLIM,PNBI,PLTH,0.970533,784877.484422,0.125215,0.13327,0.982324


We may now compare the results:

In [52]:
# For every refined tuple, print the initial kNN r2 / RMSE on the first line
# and the refined ("ref.") r2 / RMSE on the line underneath.
for key in refined_metrics_dict:
    print(
        f'{key}: \t r2: {round(metrics_dict[key]["kNN r2"], 2)}\t\t\t',
        f'RMSE: {round(metrics_dict[key]["kNN RMSE"], 2)}\n ',
        f'\t\t\t\t\t\t\t\t\t\t\t ref.: {round(refined_metrics_dict[key]["r2"], 2)}\t\t\t',
        f'ref.: {round(refined_metrics_dict[key]["RMSE"], 2)}\n',
        sep="",  # pieces are joined without separator, as in plain concatenation
    )

('NEL', 'KAREA', 'PNBI', 'AMIN', 'PLTH'): 	 r2: 0.96			RMSE: 918523.51
 											 ref.: 0.97			ref.: 825628.15

('AREA', 'NEL', 'PNBI', 'KAPPA', 'PLTH'): 	 r2: 0.96			RMSE: 927024.03
 											 ref.: 0.97			ref.: 842474.22

('VOL', 'NEL', 'SEPLIM', 'PNBI', 'PLTH'): 	 r2: 0.96			RMSE: 960022.61
 											 ref.: 0.97			ref.: 784877.48



Even with ``time_left_for_this_task=60``, the refined predictions already improve on the initial kNN results: r2 increases and RMSE decreases for all three tuples.

# Plotting

In [27]:
# Plot the refined result for the first refined tuple, alongside the initial
# ("init") and "linear" results taken from the initial run's data.
asd.predictability_plot(
    refined_datas_dict,
    list(refined_datas_dict.keys())[0],
    refined_plot=True,
    initial_datas_dict=datas_dict,
    plot_along=[
        "init",
        "linear",
        # "pl",  # left out: in this dataset and for 4-1 fits there is, in most cases, some non-positive value
        # "mean",
    ],
)