In [1]:
import pandas as pd
import pickle
import plotly.express as px
from plotly.subplots import make_subplots    
import plotly.graph_objects as go
import numpy as np
import time

In [2]:
from ASD_utils import predictability, plot_result, get_column_combinations, get_column_combinations_w_targets, data_prep_split, rae


In [3]:
# Cap the size of rendered frames: at most 100 rows and 50 columns.
for option_name, option_value in (("display.max_rows", 100), ("display.max_columns", 50)):
    pd.set_option(option_name, option_value)

## New long run, with power law fit and confinement time as target

In [4]:
# JET data file, read relative to the working directory
# (the file name suggests 2036 rows x 122 columns -- TODO confirm).
jet_csv_path = "tokamak_general/JET_(2036, 122)_DB3V13.csv"
df = pd.read_csv(jet_csv_path)

In [131]:
# Candidate columns; the cell that follows checks which of them have
# zero or negative entries.
cols_4 = (
    "PFLOSS PL PLTH TAUTH TAUTOT AMIN RGEO KAPPA KAPPAA "
    "KAREA INDENT VOL NEL PALPHA PNBI PINJ "
    "WTH WTOT IP PECRH PICRH PICRHC DELTA SEPLIM XPLIM AREA"
).split()

In [133]:
# Per candidate column: how many entries are non-zero, how many are negative.
for col in cols_4:
    n_nonzero = len(df.loc[df[col] != 0])
    n_negative = len(df.loc[df[col] < 0])
    # Counts of up to three digits get one extra tab so the "neg." column lines up.
    gap = " \t neg.: " if len(str(n_nonzero)) > 3 else " \t \t neg.: "
    print(col + "\t non-zero: " + str(n_nonzero) + gap + str(n_negative))
print("overall entries: " + str(len(df)))

PFLOSS	 non-zero: 1573 	 neg.: 0
PL	 non-zero: 2036 	 neg.: 0
PLTH	 non-zero: 2036 	 neg.: 0
TAUTH	 non-zero: 2036 	 neg.: 0
TAUTOT	 non-zero: 2036 	 neg.: 0
AMIN	 non-zero: 2036 	 neg.: 0
RGEO	 non-zero: 2036 	 neg.: 0
KAPPA	 non-zero: 2036 	 neg.: 0
KAPPAA	 non-zero: 2036 	 neg.: 0
KAREA	 non-zero: 2036 	 neg.: 0
INDENT	 non-zero: 0 	 	 neg.: 0
VOL	 non-zero: 2036 	 neg.: 0
NEL	 non-zero: 2036 	 neg.: 0
PALPHA	 non-zero: 104 	 	 neg.: 0
PNBI	 non-zero: 1573 	 neg.: 0
PINJ	 non-zero: 1573 	 neg.: 0
WTH	 non-zero: 2036 	 neg.: 0
WTOT	 non-zero: 2036 	 neg.: 0
IP	 non-zero: 2036 	 neg.: 2036
PECRH	 non-zero: 0 	 	 neg.: 0
PICRH	 non-zero: 15 	 	 neg.: 0
PICRHC	 non-zero: 15 	 	 neg.: 0
DELTA	 non-zero: 2036 	 neg.: 0
SEPLIM	 non-zero: 2036 	 neg.: 0
XPLIM	 non-zero: 2018 	 neg.: 498
AREA	 non-zero: 2036 	 neg.: 0
overall entries: 2036


#### for power-law fitting:
* IP has only negative values – use IP --> -IP ?
* XPLIM has both zero and negative entries
* PFLOSS, PNBI, PINJ have zero entries but no negative ones

#### as IP is always negative for this JET data, may as well use |IP| instead, with the benefit of enabling more power law fits

In [158]:
# Absolute plasma current as an extra column (IP itself is negative in every row).
df["|IP|"] = df["IP"].abs()

In [5]:
# Column set for the 4-1 run; the first two entries are the targets.
cols_4 = [
    "PLTH", "TAUTOT",
    "AMIN", "RGEO", "KAPPA", "KAPPAA", "KAREA", "VOL",
    "NEL", "PNBI", "PINJ", "WTH", "WTOT", "IP",
    "DELTA", "SEPLIM", "XPLIM", "AREA",
    # "|IP|",  # when including |IP|, drop "IP"
]

In [9]:
# Number of 4-input / 1-output combinations over 18 columns,
# where the columns at positions 0 and 1 serve as targets only.
column_positions = np.arange(18)
len(get_column_combinations_w_targets(column_positions, 4, 1, [0, 1]))

3640

In [None]:
# The long 4-input / 1-output run over cols_4 with PLTH and TAUTOT as targets
# (see the notes on the run for its duration).
mlp_layer_grid = [
    # (70, 5),
    (70, 25, 5),
    (120, 70, 30, 5),
]
metrics_4_1, datas_4_1 = predictability(
    data=df,
    input_cols=4,
    output_cols=1,
    col_set=cols_4,
    primkey_cols=["TOK"],
    targets=["PLTH", "TAUTOT"],
    hidden_layers=mlp_layer_grid,
    alphas=[0.0005],
    scaling="yes",
    scoring="RMSE",
    n_jobs=-1,
)

##### Notes on the run

The run over 3640 combinations and with GridSearchCV on 2 hidden layer sizes, 1 alpha value, scaling=yes took 49h.

Runs with PLTH as target took roughly 10–100x as long as the corresponding combination with TAUTOT as target (a few seconds vs. 100–400 s)

In [None]:
# Persist the results of the long predictability run.
# Disabled by default so that re-running the notebook cannot overwrite the
# stored pickles; set SAVE_RESULTS_4_1 = True after a fresh run to save them.
SAVE_RESULTS_4_1 = False
if SAVE_RESULTS_4_1:
    with open('JET_metrics_4_1.pkl', 'wb') as f:
        pickle.dump(metrics_4_1, f)
    with open('JET_datas_4_1.pkl', 'wb') as f:
        pickle.dump(datas_4_1, f)

In [6]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Results of the long 4-1 run, as written by this notebook.
metrics_4_1 = _read_pickle('JET_metrics_4_1.pkl')
datas_4_1 = _read_pickle('JET_datas_4_1.pkl')

In [7]:
index_level_names = ["input1", "input2", "input3", "input4", "target"]
# One row per column combination, one column per metric.
metrics_frame = pd.DataFrame.from_dict(metrics_4_1).transpose()

# Highest MLP r2 first ...
metrics_4_1_df = metrics_frame.sort_values(by="MLP r2", ascending=False)
metrics_4_1_df.index.names = index_level_names

# ... and lowest MLP r2 first.
worse_metrics_4_1_df = metrics_frame.sort_values(by="MLP r2", ascending=True)
worse_metrics_4_1_df.index.names = index_level_names


In [148]:
# Alternative views (uncomment one to display it instead):
#   best results:                 metrics_4_1_df.head(50)
#   best results with power law:  metrics_4_1_df.loc[np.isnan(metrics_4_1_df["pow. law r2"])==False].head(10)
#   best confinement time fits:   metrics_4_1_df.loc[metrics_4_1_df.index.get_level_values("target")=="TAUTOT"].head(10)
#   worst results:                worse_metrics_4_1_df.head(20)

# Shown here: the worst confinement-time fits.
is_tautot_target = worse_metrics_4_1_df.index.get_level_values("target") == "TAUTOT"
worse_metrics_4_1_df.loc[is_tautot_target].head(20)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,MLP r2,linear r2,pow. law r2,mean r2,MLP RMSE,linear RMSE,pow. law RMSE,mean RMSE,MLP RMSE/std,linear RMSE/std,pow. law RMSE/std,mean RMSE/std,MLP MAPE,linear MAPE,pow. law MAPE,mean MAPE,MLP rae,linear rae,pow. law rae,mean rae,MLP dcor,linear dcor,pow. law dcor,mean dcor
input1,input2,input3,input4,target,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
PNBI,RGEO,XPLIM,AMIN,TAUTOT,-0.044083,0.4051,,-4.7e-05,0.330604,0.249553,,0.323557,1.021804,0.771298,,1.000023,0.473861,0.344068,,0.546632,592.275466,0.667473,,609.979845,0.000132,0.71348,,0.0
IP,RGEO,SEPLIM,NEL,TAUTOT,-0.042153,0.407998,,-4.7e-05,0.330298,0.248944,,0.323557,1.020859,0.769417,,1.000023,0.475337,0.329658,,0.546632,592.426132,0.669799,,609.979845,0.0,0.709565,,0.0
KAPPAA,AREA,NEL,AMIN,TAUTOT,-0.040158,0.046278,0.432896,-4.7e-05,0.329982,0.315974,0.243653,0.323557,1.019881,0.976587,0.753063,1.000023,0.476909,0.541695,0.28741,0.546632,592.598289,0.963203,0.627028,609.979845,0.000432,0.249364,0.746772,0.0
PINJ,IP,AREA,NEL,TAUTOT,-0.039726,0.410835,,-4.7e-05,0.329913,0.248347,,0.323557,1.01967,0.767571,,1.000023,0.477455,0.33264,,0.546632,592.645862,0.670458,,609.979845,0.054643,0.710532,,0.0
PNBI,PINJ,IP,KAPPA,TAUTOT,-0.039066,0.462359,,-4.7e-05,0.329809,0.237239,,0.323557,1.019346,0.73324,,1.000023,0.477786,0.291609,,0.546632,592.694316,0.607772,,609.979845,0.000464,0.763652,,0.0
WTH,RGEO,DELTA,KAPPAA,TAUTOT,-0.039022,0.400723,0.342176,-4.7e-05,0.329802,0.250469,0.262419,0.323557,1.019324,0.77413,0.811064,1.000023,0.477821,0.334021,0.286704,0.546632,592.698239,0.667121,0.662644,609.979845,0.0,0.728526,0.722803,0.0
PINJ,KAPPA,NEL,KAREA,TAUTOT,-0.038625,0.050076,,-4.7e-05,0.329739,0.315344,,0.323557,1.019129,0.97464,,1.000023,0.478144,0.541032,,0.546632,592.733539,0.958174,,609.979845,0.0,0.260047,,0.0
WTH,RGEO,SEPLIM,XPLIM,TAUTOT,-0.03832,0.357723,,-4.7e-05,0.32969,0.259299,,0.323557,1.01898,0.801422,,1.000023,0.478392,0.324389,,0.546632,592.760769,0.687678,,609.979845,0.0,0.69474,,0.0
PNBI,AREA,VOL,NEL,TAUTOT,-0.037718,0.051486,,-4.7e-05,0.329595,0.31511,,0.323557,1.018684,0.973917,,1.000023,0.478885,0.534563,,0.546632,592.814768,0.95762,,609.979845,0.000518,0.257828,,0.0
WTH,DELTA,NEL,WTOT,TAUTOT,-0.037431,0.385496,0.459831,-4.7e-05,0.329549,0.253631,0.237797,0.323557,1.018544,0.783903,0.734962,1.000023,0.479122,0.344612,0.24121,0.546632,592.840702,0.675803,0.573171,609.979845,0.000521,0.707432,0.76892,0.0


In [8]:
# metrics_4_1_df is ordered by descending MLP r2, so the first matching row is the best one.
has_power_law_fit = ~np.isnan(metrics_4_1_df["pow. law r2"])
targets_tautot = metrics_4_1_df.index.get_level_values("target") == "TAUTOT"

best_pl_res = metrics_4_1_df.loc[has_power_law_fit].index[0]
best_tautot_res = metrics_4_1_df.loc[targets_tautot].index[0]
# All combinations, best MLP r2 first.
best_results_4_1 = metrics_4_1_df.sort_values("MLP r2", ascending=False).index


##### which hyperparameters were chosen (alpha was fixed)

In [141]:

# Grid-search winner of every combination, then how often each value was picked.
grid_search_params = [datas_4_1[key]["GridSearchParams"] for key in datas_4_1]
chosen_layers = [params["mlp__hidden_layer_sizes"] for params in grid_search_params]
chosen_alphas = [params["mlp__alpha"] for params in grid_search_params]

for chosen_values in (chosen_layers, chosen_alphas):
    for value in set(chosen_values):
        print(str(value) + " frequency: " + str(chosen_values.count(value)))


(70, 25, 5) frequency: 1052
(120, 70, 30, 5) frequency: 2588
0.0005 frequency: 3640


#### plotting

##### results

In [160]:
# Last entry of the ranking (lowest MLP r2), plotted along with the linear and
# power-law fits; "mean" can be added to plot_along as well.
plot_result(datas_4_1, best_results_4_1[-1], plot_along=["linear", "pl"])

no power law fit performed, some columns did not include positive values only


##### column importance

In [149]:

input_cols = ["input1", "input2", "input3", "input4", "target"]
# The second reset_index() adds an "index" column holding each combination's
# position in the ranking (0 = best MLP r2).
input_pos_df = metrics_4_1_df.reset_index()[input_cols].reset_index()

# To leave columns out of the plot, filter cols_4 here.
inputs = cols_4

# Members of every combination as a plain array, taken before any indicator
# column is added.
members = input_pos_df[input_cols].values
for curr_input in inputs:
    # 1 if the column takes part in the combination (as input or as target), else 0.
    input_pos_df[curr_input] = (members == curr_input).any(axis=1).astype(np.int64)

input_pos_df = input_pos_df[[*inputs, "index"]]


In [150]:
# Long format: one row per (rank, column) pair with its 0/1 indicator.
input_pos_plt_df = pd.melt(
    input_pos_df,
    id_vars=["index"],
    var_name="input",
    value_name="value",
)

In [151]:
# Give every input column its own integer position, used as the x coordinate in the plot.
separator_conditions = [input_pos_plt_df["input"] == name for name in inputs]
sep = list(range(len(inputs)))
input_pos_plt_df["sep"] = np.select(separator_conditions, sep)

In [None]:
# One line per input column: presence (0/1) against the rank of the combination.
fig = px.line_3d(
    input_pos_plt_df,
    x="sep",
    y="index",
    z="value",
    color="input",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
scene_axes = {
    "xaxis": {"title": "input"},
    "yaxis": {"title": "n-th best"},
    "zaxis": {"title": "present y/n", "range": [0, 2]},
}
fig.update_layout(scene=scene_axes)
fig.show()

## Additional correlation coefficients as metrics

In [8]:
import scipy.stats

In [107]:
# Independent (deep) copy that the correlation columns are added to.
ext_metrics_4_1_df = metrics_4_1_df.copy()

In [None]:
def _correlation_triplet(y_pred, y_true):
    """Return (Pearson, Spearman, Kendall tau) between predictions and truth.

    Both inputs are flattened first, so 1-D and (n, 1) arrays are treated
    alike. Only the statistic of each test is kept; the p-value is dropped.
    Constant predictions (e.g. the mean predictor) yield NaN.
    """
    y_pred = np.asarray(y_pred).ravel()
    y_true = np.asarray(y_true).ravel()
    return tuple(
        corr(y_pred, y_true)[0]
        for corr in (scipy.stats.pearsonr, scipy.stats.spearmanr, scipy.stats.kendalltau)
    )


# (column-name prefix, key of the stored test-set predictions, optional?)
# Only the power-law prediction may be absent for a combination; a missing key
# for any other predictor still raises KeyError.
predictors = [
    ("MLP", "y_test_pred", False),
    ("linear", "y_test_pred_linear", False),
    ("mean", "y_test_pred_mean", False),
    ("pow. law", "y_test_pred_pl", True),
]

for prefix, pred_key, optional in predictors:
    list_pearsons = []
    list_spearmans = []
    list_kendalls = []
    for comb in ext_metrics_4_1_df.index:
        run_data = datas_4_1[comb]
        if optional and pred_key not in run_data:
            # No fit of this kind was stored for the combination.
            pearson = spearman = kendall = np.nan
        else:
            pearson, spearman, kendall = _correlation_triplet(run_data[pred_key], run_data["y_test"])
        list_pearsons.append(pearson)
        list_spearmans.append(spearman)
        list_kendalls.append(kendall)
    ext_metrics_4_1_df[prefix + " Pearson"] = list_pearsons
    ext_metrics_4_1_df[prefix + " Spearman"] = list_spearmans
    ext_metrics_4_1_df[prefix + " KendallTau"] = list_kendalls

In [153]:
# The five combinations with the highest power-law r2.
ranked_by_power_law = ext_metrics_4_1_df.sort_values("pow. law r2", ascending=False)
ranked_by_power_law.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,MLP r2,linear r2,pow. law r2,mean r2,MLP RMSE,linear RMSE,pow. law RMSE,mean RMSE,MLP RMSE/std,linear RMSE/std,pow. law RMSE/std,mean RMSE/std,MLP MAPE,linear MAPE,pow. law MAPE,mean MAPE,MLP rae,linear rae,pow. law rae,mean rae,MLP dcor,linear dcor,pow. law dcor,mean dcor,MLP Pearson,MLP Spearman,MLP KendallTau,linear Pearson,linear Spearman,linear KendallTau,mean Pearson,mean Spearman,mean KendallTau,pow. law Pearson,pow. law Spearman,pow. law KendallTau
input1,input2,input3,input4,target,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
WTH,AREA,NEL,WTOT,PLTH,0.836926,0.759966,0.823548,-0.006278,1847488.0,2241427.0,1921773.0,4589313.0,0.403825,0.489932,0.420062,1.003134,0.239479,0.347491,0.235714,1.265121,783.374443,0.435,0.343972,617.368137,0.914574,0.887605,0.913249,0.0,0.914888,0.929145,0.767254,0.872264,0.90155,0.730601,,,,0.910979,0.931177,0.771966
WTH,VOL,NEL,WTOT,PLTH,0.799915,0.759966,0.822244,-0.006278,2046426.0,2241427.0,1928859.0,4589313.0,0.447309,0.489932,0.421611,1.003134,0.280882,0.347491,0.237386,1.265121,769.375634,0.435,0.34712,617.368137,0.89286,0.887605,0.912871,0.0,0.894952,0.914637,0.743427,0.872264,0.90155,0.730601,,,,0.910605,0.930681,0.77115
WTH,NEL,WTOT,AMIN,PLTH,0.810571,0.759966,0.811687,-0.006278,1991188.0,2241427.0,1985313.0,4589313.0,0.435235,0.489932,0.433951,1.003134,0.261777,0.347491,0.240728,1.265121,774.34425,0.435,0.359532,617.368137,0.900316,0.887605,0.906827,0.0,0.900567,0.919528,0.750875,0.872264,0.90155,0.730601,,,,0.90438,0.926733,0.766299
WTH,NEL,WTOT,KAREA,PLTH,0.876756,0.759966,0.808057,-0.006278,1606099.0,2241427.0,2004357.0,4589313.0,0.351062,0.489932,0.438113,1.003134,0.210017,0.347491,0.254953,1.265121,797.382711,0.435,0.360015,617.368137,0.935823,0.887605,0.911445,0.0,0.936554,0.944417,0.796995,0.872264,0.90155,0.730601,,,,0.903406,0.926164,0.764077
WTH,KAPPAA,NEL,WTOT,PLTH,0.799777,0.759966,0.801238,-0.006278,2047132.0,2241427.0,2039651.0,4589313.0,0.447463,0.489932,0.445828,1.003134,0.281842,0.347491,0.258447,1.265121,772.162841,0.435,0.367297,617.368137,0.90143,0.887605,0.909929,0.0,0.895379,0.915508,0.74846,0.872264,0.90155,0.730601,,,,0.89958,0.924249,0.761726


##### Remarks:

* For constant predictions (e.g. the mean predictor or failed MLP regressions), the correlation coefficients are ill-defined, yielding NaN.

* A Pearson correlation coefficient of 1 does not mean $y_{\text{pred}} = y_{\text{true}}$ but rather $y_{\text{pred}} = M\cdot y_{\text{true}}+c$ <br>
For dcor (distance correlation): $M = b\cdot O$ with $b\ \text{real}, O \in O(n)$ <br>
For the Spearman correlation coefficient: perfect monotone function

In [64]:
import dcor

In [96]:

test_values = np.arange(1, 5)

test_values_x2 = 2 * test_values

test_values_x2_p1 = test_values_x2 + 1

# Rows [1,1,1,1], [2,2,2,2], ...; M @ test_values equals 10 * test_values here.
M = np.outer(test_values, np.ones(4, dtype=test_values.dtype))
test_values_xMp1 = M @ test_values + 1


def _statistic_only(corr_func):
    """Wrap a scipy correlation test so that it returns the statistic without the p-value."""
    return lambda a, b: corr_func(a, b)[0]


coefficients = [
    ("Pearson: \t", _statistic_only(scipy.stats.pearsonr)),
    ("Spearman: \t", _statistic_only(scipy.stats.spearmanr)),
    ("Kendall: \t", _statistic_only(scipy.stats.kendalltau)),
    ("dcor: \t \t", dcor.distance_correlation),
]
# Each coefficient against the three transformed copies of test_values.
for label, coefficient in coefficients:
    print(label + " *2: " + str(coefficient(test_values, test_values_x2))
          + "\t *2+1: " + str(coefficient(test_values, test_values_x2_p1))
          + "\t *M+1: " + str(coefficient(test_values, test_values_xMp1)))



Pearson: 	 *2: 1.0	 *2+1: 1.0	 *M+1: 1.0
Spearman: 	 *2: 1.0	 *2+1: 1.0	 *M+1: 1.0
Kendall: 	 *2: 1.0	 *2+1: 1.0	 *M+1: 1.0
dcor: 	 	 *2: 1.0	 *2+1: 1.0	 *M+1: 1.0


## On errors

For derived quantities, info_variables_and_data_2.txt does not specify the explicit errors but they have to be derived instead.

* $\mathrm{PLTH} = \mathrm{PL} - \mathrm{PFLOSS}$ <br>
<u>For JET</u>:<br>
    * $\mathrm{PL} = \mathrm{POHM} + \mathrm{PNBI} + \mathrm{PICRH} - \mathrm{DWDIA}$
    * $\mathrm{PFLOSS} = \frac{\mathrm{PINJ}}{100} \cdot \exp\left(3.35 - 0.667\cdot\frac{|\mathrm{IP}|}{10^{6}} - 0.2 \cdot\frac{\mathrm{NEL}}{10^{19}}\right)$

* $\mathrm{TAUTOT} = \frac{\mathrm{WTOT}}{\mathrm{PL}}$ <br>
<u>For JET</u>: <br>
    * $\mathrm{WTOT} = \mathrm{WTH} + \mathrm{WFPER} + \mathrm{WFPAR} + \mathrm{WFICRH}$ <br>
And "If WFPER and WFPAR are missing they are replaced by WFFORM."
        * <u>For JET</u>: $\mathrm{WTH} = \mathrm{WDIA} - 1.5\cdot(\mathrm{WFPER}+\mathrm{WFICRH})$<br>
            And "If WFPER is missing WFPER is replaced by WFANI $\times$ WFFORM."
        * <u> For JET</u> and SHOT $\leq 18760$: $\mathrm{WFFORM} = 0.16 \cdot 10^{19} \cdot \frac{\mathrm{PINJ}}{\mathrm{NEV}}$

Their respective errors then are stated as follows:

| quantity | error |
| :-: | :-: |
| POHM | $ \pm 20 $ % |
| PNBI | $ \pm 10 $ % |
| PICRH | $ \pm 10 $ % |
| DWDIA | $ \pm 10 $ % |
| PINJ | $ \pm 6 $ % |
| IP | $ \pm 1 $ % |
| NEL | $ \pm 8 $ % |
| WDIA | $ \pm 5 $ % |
| WFPER | $ \pm 30 $ % |
| WFPAR | $ \pm 30 $ % |
| WFICRH | $ \pm 50 $ % |
| WFFORM | $ \pm 50 $ % |


##### Note:
The formulae above also motivate why PNBI turns out to be a valuable input!

## Checks on MLP performance

We notice that there are always MLP fits that get (presumably) stuck in a local minimum. Does fitting again in principle solve this problem?

For quicker runs, do with TAUTOT as target

In [9]:
# All combinations with TAUTOT as target, still ordered best MLP r2 first.
tautot_rows = metrics_4_1_df.index.get_level_values("target") == "TAUTOT"
best_tautot_results_df = metrics_4_1_df.loc[tautot_rows]
best_tautot_results = best_tautot_results_df.index

In [10]:
# Index tuples (four inputs + target) of the best and worst TAUTOT fit, as lists.
best_ct = list(best_tautot_results[0])
worst_ct = list(best_tautot_results[-1])
print(best_ct, worst_ct)

['PNBI', 'IP', 'WTH', 'XPLIM', 'TAUTOT'] ['PNBI', 'RGEO', 'XPLIM', 'AMIN', 'TAUTOT']


In [None]:
def _refit_tautot(col_set, run):
    """Repeat the 4-1 fit with TAUTOT as target on `col_set`.

    Returns (metrics, datas, result), where `result` is the metrics frame with
    one row per combination and the run number in a "run" column.
    """
    metrics, datas = predictability(
        data=df,
        input_cols=4,
        output_cols=1,
        col_set=col_set,
        primkey_cols=["TOK"],
        targets=["TAUTOT"],
        hidden_layers=[
            # (70, 5),
            (70, 25, 5),
            (120, 70, 30, 5),
        ],
        alphas=[0.0005],
        scaling="yes",
        scoring="RMSE",
        n_jobs=-1,
    )
    result = pd.DataFrame.from_dict(metrics).transpose()
    # Name the index levels like the reference rows so that the concatenated
    # frame keeps them; later cells look levels up by name ("input1", ...).
    result.index.names = best_tautot_results_df.index.names
    result["run"] = run
    return metrics, datas, result


n_check_runs = 50

# Run 0: best and worst TAUTOT result of the original long run.
# .assign() returns a new frame, so nothing is written onto a slice of
# best_tautot_results_df (no SettingWithCopyWarning).
check_frames = [best_tautot_results_df.iloc[[0, -1]].assign(run=0)]

for run in range(1, n_check_runs + 1):
    metrics_checks_best, datas_checks_best, result_best = _refit_tautot(best_ct, run)
    metrics_checks_worst, datas_checks_worst, result_worst = _refit_tautot(worst_ct, run)
    check_frames += [result_best, result_worst]

# Concatenate once at the end instead of growing the frame inside the loop.
check_runs = pd.concat(check_frames).set_index("run", append=True)

In [13]:
#check_runs.to_pickle("check_runs.pkl")

In [14]:
# Load the stored re-fit results instead of repeating the runs.
check_runs_path = "check_runs.pkl"
check_runs = pd.read_pickle(check_runs_path)

In [15]:
# An MLP run with an r2 score below this value counts as "failed".
threshold = 0.1

In [None]:
def _highlight_failed(r2):
    """Return the CSS for one cell: yellow background if r2 is below `threshold`."""
    return 'background-color : yellow' if r2 < threshold else ''


r2_styler = check_runs[["MLP r2"]].style
# Styler.applymap was renamed to Styler.map in pandas 2.1 (the old name is
# deprecated); use whichever the installed version provides.
_style_elementwise = getattr(r2_styler, "map", None) or r2_styler.applymap
_style_elementwise(_highlight_failed)


In [17]:
# Take the number of runs from the loaded frame rather than from the loop
# variable of the (normally skipped) re-fit cell, so this also works on a
# fresh kernel.
n_runs = check_runs.index.get_level_values("run").max()
n_failed = len(check_runs.loc[check_runs["MLP r2"] < threshold])
print(f"Out of the {n_runs} runs, there were {n_failed} with an r2 score < {threshold}")

Out of the 50 runs, there were 7 with an r2 score < 0.1


In [21]:
# Rows with RGEO among their inputs; of the two combinations printed above,
# only the worst one contains RGEO.
contains_rgeo = np.logical_or.reduce([
    check_runs.index.get_level_values(level) == "RGEO"
    for level in ("input1", "input2", "input3", "input4")
])
n_failed_with_rgeo = len(check_runs.loc[contains_rgeo & (check_runs["MLP r2"] < threshold)])
# NOTE(review): the "+1" is carried over unchanged; its purpose is not evident from this cell -- confirm.
print("Of those, " + str(n_failed_with_rgeo + 1) + " came from the worst result.")

Of those, 5 came from the worst result.


## Checks on MLP runtime for power loss predictions

We noticed that fitting the MLP with power loss (PLTH) as target takes roughly 10–100x as long as with confinement time (TAUTOT) as target.

Check whether this has to do with the power loss values spanning several orders of magnitude.

##### first check on confinement time

In [22]:
# The best and the worst confinement-time combination are used for the analysis.
print(best_ct, worst_ct, sep=" ")

['PNBI', 'IP', 'WTH', 'XPLIM', 'TAUTOT'] ['PNBI', 'RGEO', 'XPLIM', 'AMIN', 'TAUTOT']


In [None]:
def _timed_tautot_fit(col_set, scaling, run):
    """Fit the 4-1 model with TAUTOT as target on `col_set` and time it.

    Returns (metrics, datas, result), where `result` is the metrics frame with
    the extra columns "run", "runtime" (seconds, rounded to 2 decimals) and
    "scaling".
    """
    # The timer starts immediately before this fit, so every runtime covers
    # exactly one call to predictability().
    fit_start = time.perf_counter()
    metrics, datas = predictability(
        data=df,
        input_cols=4,
        output_cols=1,
        col_set=col_set,
        primkey_cols=["TOK"],
        targets=["TAUTOT"],
        hidden_layers=[
            # (70, 5),
            (70, 25, 5),
            (120, 70, 30, 5),
        ],
        alphas=[0.0005],
        scaling=scaling,
        scoring="RMSE",
        n_jobs=-1,
    )
    runtime = round(time.perf_counter() - fit_start, 2)
    result = pd.DataFrame.from_dict(metrics).transpose()
    result["run"] = run
    result["runtime"] = runtime
    result["scaling"] = scaling
    return metrics, datas, result


# Runs 1..29; per run: best and worst combination, each with and without scaling.
runtime_frames = []
for run in range(1, 30):
    for col_set in (best_ct, worst_ct):
        for scaling in ("yes", "no"):
            metrics_checks, datas_checks, result = _timed_tautot_fit(col_set, scaling, run)
            runtime_frames.append(result)

# Concatenate once, then move run / scaling / runtime into the index.
check_runtime_ct = pd.concat(runtime_frames).set_index(["run", "scaling", "runtime"], append=True)


In [72]:
#check_runtime_ct.to_pickle("check_runtime_confinement_time.pkl")

In [23]:
# Load the cached confinement-time timing results from disk.
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself.
check_runtime_ct = pd.read_pickle("check_runtime_confinement_time.pkl")

In [None]:

check_runtime_ct[["MLP r2"]]


In [24]:
# Summarise the timing check: highest run number and total runtime in minutes
# (the "runtime" index level is summed and divided by 60).
n_runs_ct = max(check_runtime_ct.index.get_level_values("run"))
total_minutes_ct = round(np.sum(check_runtime_ct.index.get_level_values("runtime")) / 60, 1)
print("These " + str(n_runs_ct) + " runs took " + str(total_minutes_ct) + "min")

These 29 runs took 3.3min


##### analyse Power Loss now

In [31]:
# Use the first and the last entry of best_results_4_1 as the "best" and "worst"
# Power Loss (PLTH) column combination for the runtime analysis.
# NOTE(review): assumes best_results_4_1 (defined outside this section) is ordered
# best-first - confirm.
best_pl = best_results_4_1[0]
worst_pl = best_results_4_1[-1]
print(best_pl, worst_pl)

('PNBI', 'IP', 'WTH', 'VOL', 'PLTH') ('KAPPA', 'AREA', 'XPLIM', 'KAREA', 'PLTH')


As the Power Loss covers a wider range of orders of magnitude, also test the influence of this on the runtime.

In [20]:
# Smallest PLTH value and the spread (max - min) of the raw data.
plth_min = min(df["PLTH"])
plth_range = max(df["PLTH"]) - plth_min
print("min: " + str(plth_min) + "\t range: " + str(plth_range))

min: 270400.0	 range: 23449600.0


Divide PLTH values by 100,000 (i.e. 1e5)

In [28]:
# Work on a deep copy so that the raw frame `df` keeps its original PLTH values.
df_pl_divided = df.copy(deep=True)
df_pl_divided["PLTH"] = df_pl_divided["PLTH"] / 100000

# Same min / range report as for the raw data, now on the rescaled column.
plth_div_min = min(df_pl_divided["PLTH"])
plth_div_range = max(df_pl_divided["PLTH"]) - plth_div_min
print("min: " + str(plth_div_min) + "\t range: " + str(plth_div_range))


min: 2.704	 range: 234.49599999999998


In [29]:
# Lookup from the "raw/divided" label to the data frame used for that variant.
divided_map = dict(divided=df_pl_divided, raw=df)

So we check the best and worst Power Loss result from the initial 4-1 run and have 4 different runs per input combination: with / without scaling and raw / divided data.

We then do this several times to get some statistics.

Note that random_state of train_test_split was always set to 1, so there is no change in the split from run to run.

In [None]:
def _timed_pl_run(run, col_set, scaling, div):
    """Fit the power loss (PLTH) once for one configuration and time the call.

    Parameters
    ----------
    run : int
        Repetition counter, stored in the "run" column of the returned frame.
    col_set : sequence of str
        Column combination forwarded to ``predictability`` as ``col_set``.
    scaling : str
        "yes" or "no"; forwarded to ``predictability`` as ``scaling``.
    div : str
        Key into ``divided_map`` ("raw" or "divided") selecting the data frame.

    Returns
    -------
    tuple
        ``(metrics, datas, frame)``: the two return values of ``predictability`` and
        the transposed metrics frame extended by the columns "run", "runtime",
        "scaling" and "raw/divided".
    """
    # The timer is started right before the fit, so every run is timed on its own
    # and can never pick up the start time of a previous run.
    start = time.time()
    metrics, datas = predictability(
        data=divided_map[div],
        input_cols=4,
        output_cols=1,
        col_set=col_set,
        primkey_cols=["TOK"],
        targets=["PLTH"],
        hidden_layers=[(70, 25, 5), (120, 70, 30, 5)],
        alphas=[0.0005],
        scaling=scaling,
        scoring="RMSE",
        n_jobs=-1,
    )
    elapsed = round(time.time() - start, 2)

    frame = pd.DataFrame.from_dict(metrics).transpose()
    frame["run"] = run
    frame["runtime"] = elapsed
    frame["scaling"] = scaling
    frame["raw/divided"] = div
    return metrics, datas, frame


run = 1
# Result frames are collected here and concatenated once at the end.
pl_frames = []

# Run 1, best combination, with scaling (raw data first, then divided data).
metrics_checks_best, datas_checks_best, result_1 = _timed_pl_run(run, best_pl, "yes", "raw")
metrics_checks_best, datas_checks_best, result_2 = _timed_pl_run(run, best_pl, "yes", "divided")
pl_frames.extend([result_1, result_2])

# Run 1, best combination, without scaling.
# NOTE: these two runs were previously timed against a stale start time, so their
# stored runtime also contained the preceding fit(s); results cached before this
# change may still carry those inflated values.
for div in ["raw", "divided"]:
    metrics_checks_best, datas_checks_best, result = _timed_pl_run(run, best_pl, "no", div)
    pl_frames.append(result)

# Run 1, worst combination: the same four configurations.
for scaling in ["yes", "no"]:
    for div in ["raw", "divided"]:
        metrics_checks, datas_checks, result = _timed_pl_run(run, worst_pl, scaling, div)
        pl_frames.append(result)

# Run over all combinations several times to get some statistics.
for run in np.arange(2, 13):
    for inputs in [best_pl, worst_pl]:
        for scaling in ["yes", "no"]:
            for div in ["raw", "divided"]:
                metrics_checks, datas_checks, result = _timed_pl_run(run, inputs, scaling, div)
                pl_frames.append(result)

# Move the run metadata into the index, behind the existing column-combination levels.
check_runtime_pl = pd.concat(pl_frames).set_index(
    ["run", "scaling", "raw/divided", "runtime"], append=True
)


In [77]:
#check_runtime_pl.to_pickle("check_runtime_power_loss.pkl")

In [26]:
# Load the cached power-loss timing results from disk.
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself.
check_runtime_pl = pd.read_pickle("check_runtime_power_loss.pkl")

In [None]:

check_runtime_pl[["MLP r2"]]


In [27]:
# Summarise the timing check: highest run number and total runtime in minutes
# (the "runtime" index level is summed and divided by 60).
n_runs_pl = max(check_runtime_pl.index.get_level_values("run"))
total_minutes_pl = round(np.sum(check_runtime_pl.index.get_level_values("runtime")) / 60, 1)
print("These " + str(n_runs_pl) + " runs took " + str(total_minutes_pl) + "min")

These 12 runs took 130.6min


In [None]:
# Select the "faster" fits: runtime below threshold_runtime AND an r2 above
# threshold_pl. The r2 condition filters out quick runs that merely got stuck in a
# local minimum.
threshold_pl = 0.1
threshold_runtime = 50

is_quick = check_runtime_pl.index.get_level_values("runtime") < threshold_runtime
is_acceptable = check_runtime_pl["MLP r2"] > threshold_pl
quick_pl_runs = check_runtime_pl.loc[is_quick & is_acceptable]
quick_pl_runs

In [40]:
# For each scaling / data-variant configuration, report how many quick runs there
# are and their average r2. Each entry: (message prefix, scaling value,
# raw/divided value, tab padding used in the message).
quick_report_rows = [
    ("Number quick results with scaling and divided values: ", "yes", "divided", "\t"),
    ("Number quick results with scaling and raw values: ", "yes", "raw", "\t\t"),
    ("Number quick results without scaling and divided values: ", "no", "divided", "\t"),
    ("Number quick results without scaling and raw values: ", "no", "raw", "\t\t"),
]
for prefix, scaling_opt, data_variant, padding in quick_report_rows:
    quick_subset = quick_pl_runs.loc[
        (quick_pl_runs.index.get_level_values("scaling") == scaling_opt)
        & (quick_pl_runs.index.get_level_values("raw/divided") == data_variant)
    ]
    print(prefix + str(len(quick_subset)) + padding + " average r2: "
          + str(np.mean(quick_subset["MLP r2"])))


Number quick results with scaling and divided values: 22	 average r2: 0.7280231133992033
Number quick results with scaling and raw values: 4		 average r2: 0.3654670026544008
Number quick results without scaling and divided values: 10	 average r2: 0.3486273925537749
Number quick results without scaling and raw values: 6		 average r2: 0.8748658802648178


In [None]:
# analyse "slower" fits: runtime of at least threshold_runtime and an r2 above
# threshold_pl. ">=" (rather than ">") makes this selection the exact complement of a
# "runtime < threshold_runtime" selection, so a run whose runtime equals the
# threshold is not silently dropped from the quick/slow split.
slow_pl_runs = check_runtime_pl.loc[
    (check_runtime_pl.index.get_level_values("runtime") >= threshold_runtime)
    & (check_runtime_pl["MLP r2"] > threshold_pl)
]
slow_pl_runs

In [43]:
# For each scaling / data-variant configuration, report how many slow runs there
# are and their average r2. Each entry: (message prefix, scaling value,
# raw/divided value, tab padding used in the message).
slow_report_rows = [
    ("Number slow results with scaling and divided values: ", "yes", "divided", "\t\t"),
    ("Number slow results with scaling and raw values: ", "yes", "raw", "\t\t"),
    ("Number slow results without scaling and divided values: ", "no", "divided", "\t"),
    ("Number slow results without scaling and raw values: ", "no", "raw", "\t\t"),
]
for prefix, scaling_opt, data_variant, padding in slow_report_rows:
    slow_subset = slow_pl_runs.loc[
        (slow_pl_runs.index.get_level_values("scaling") == scaling_opt)
        & (slow_pl_runs.index.get_level_values("raw/divided") == data_variant)
    ]
    print(prefix + str(len(slow_subset)) + padding + " average r2: "
          + str(np.mean(slow_subset["MLP r2"])))


Number slow results with scaling and divided values: 2		 average r2: 0.4439555615648225
Number slow results with scaling and raw values: 20		 average r2: 0.7142967587809911
Number slow results without scaling and divided values: 2	 average r2: 0.3561767194730459
Number slow results without scaling and raw values: 18		 average r2: 0.5053268674128307


In [29]:
# Bad fits: every run whose MLP r2 lies below threshold_pl (only the r2 column is shown).
is_bad_fit = check_runtime_pl["MLP r2"] < threshold_pl
check_runtime_pl.loc[is_bad_fit, ["MLP r2"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,MLP r2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,run,scaling,raw/divided,runtime,Unnamed: 9_level_1
VOL,WTH,IP,PNBI,PLTH,1,no,divided,242.41,-1912.537722
VOL,WTH,IP,PNBI,PLTH,2,no,divided,188.13,-3688.71821
VOL,WTH,IP,PNBI,PLTH,3,no,divided,159.44,-328.519825
VOL,WTH,IP,PNBI,PLTH,4,no,divided,2.01,-933.10681
VOL,WTH,IP,PNBI,PLTH,5,no,divided,152.84,-2.381878
VOL,WTH,IP,PNBI,PLTH,6,no,divided,253.07,-2.322388
VOL,WTH,IP,PNBI,PLTH,7,no,divided,191.48,-2.392489
VOL,WTH,IP,PNBI,PLTH,8,no,divided,218.98,-2227.30011
VOL,WTH,IP,PNBI,PLTH,9,no,divided,67.22,-7360.180098
VOL,WTH,IP,PNBI,PLTH,10,no,divided,50.78,-2.333439


So all of the bad fits (r2 below threshold_pl) come from runs of the best combination without scaling but with manually divided PLTH values!

# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

# First 9-1-run

In [28]:
# Load the filtered multi-tokamak table (it has a "TOK" column that is filtered on next).
df1 = pd.read_csv("tokamaks/DB3V13_filtered_plus_Bool2Int.csv")

In [29]:
# Keep only the rows whose tokamak label is "JET".
is_jet = df1["TOK"] == "JET"
JET_df = df1.loc[is_jet]

### ~ » Info about data, columns at the end of the notebook « ~

In [30]:
# Candidate columns for the 9-1 run; they are checked for non-zero entries in the
# next cell before the final selection is made.
cols = ["PFLOSS", "PL", "PLTH", "AMIN", "RGEO", "KAPPA", "KAPPAA", 
        "KAREA", "INDENT", "VOL", "NEL", "PALPHA", "PNBI", "PINJ",
        "WTH", "WTOT", "IP"]

In [31]:
# Print, per candidate column, the number of JET rows with a non-zero value.
for col in cols:
    nonzero_rows = JET_df.loc[JET_df[col] != 0]
    print(col + ": " + str(len(nonzero_rows)))

PFLOSS: 2632
PL: 3951
PLTH: 3951
AMIN: 3951
RGEO: 3951
KAPPA: 3951
KAPPAA: 3951
KAREA: 3951
INDENT: 0
VOL: 3951
NEL: 3951
PALPHA: 106
PNBI: 2632
PINJ: 2632
WTH: 3951
WTOT: 3951
IP: 3951


So INDENT (no non-zero entries) and PALPHA (only 106 non-zero entries) should in fact not be used.

In [32]:
# Final column set for the 9-1 run: the target PLTH plus 12 candidate inputs.
# Compared with the candidate list above, PFLOSS, PL, INDENT and PALPHA are dropped.
cols = ["PLTH", "AMIN", "RGEO", "KAPPA", "KAPPAA", 
        "KAREA", "VOL", "NEL", "PNBI", "PINJ",
        "WTH", "WTOT", "IP"]

In [None]:
# First 9-1 run: call predictability (from ASD_utils) on the JET rows with 9 input
# columns and 1 output column, target PLTH, two candidate hidden-layer layouts and
# scaling switched on. No `alphas` argument is passed, so predictability's own
# default applies here.
# NOTE(review): `prim_keys` is not defined in this section (the other calls pass
# ["TOK"]) - confirm it is defined before this cell runs on a fresh kernel.
metrics_9_1, datas_9_1 = predictability(data=JET_df,
                                input_cols=9,
                                output_cols=1,
                                col_set=cols,
                                primkey_cols = prim_keys,
                                targets=["PLTH"],
                                hidden_layers=[(70,5,), (70, 25, 5)],
                                scaling="yes",
                                scoring="RMSE",
                                n_jobs=-1
                               )

In [None]:
# Persisting the 9-1 results is disabled by wrapping the code in a string literal;
# remove the surrounding triple quotes to write the two pickle files again.
'''
with open('JET_metrics_9_1_first.pkl', 'wb') as f:
    pickle.dump(metrics_9_1, f)
with open('JET_datas_9_1_first.pkl', 'wb') as f:
    pickle.dump(datas_9_1, f)
'''

In [9]:
# Load the cached 9-1 results (metrics and per-combination data) from disk.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('JET_metrics_9_1_first.pkl', 'rb') as f:
    metrics_9_1 = pickle.load(f)
with open('JET_datas_9_1_first.pkl', 'rb') as f:
    datas_9_1 = pickle.load(f)

In [10]:
# Tabulate the 9-1 metrics with one row per column combination, highest "MLP r2" first.
metrics_9_1_df = pd.DataFrame.from_dict(metrics_9_1).transpose().sort_values(by="MLP r2", ascending=False)


#### check whether, in fact, always the same model size is chosen

In [35]:

# Collect the hyper-parameters stored under "GridSearchParams" for every column
# combination and print how often each layer layout / alpha value occurs.
grid_params_9_1 = [datas_9_1[key]["GridSearchParams"] for key in list(datas_9_1.keys())]
chosen_layers = [params["mlp__hidden_layer_sizes"] for params in grid_params_9_1]
chosen_alphas = [params["mlp__alpha"] for params in grid_params_9_1]

for layer in set(chosen_layers):
    print(f"{layer!s} frequency: {chosen_layers.count(layer)!s}")
for alpha in set(chosen_alphas):
    print(f"{alpha!s} frequency: {chosen_alphas.count(alpha)!s}")

(70, 25, 5) frequency: 217
(70, 5) frequency: 3
1e-05 frequency: 42
0.0001 frequency: 73
0.001 frequency: 105


##### best results

In [36]:
metrics_9_1_df.head(5)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,MLP r2,linear r2,mean r2,MLP RMSE,linear RMSE,mean RMSE,MLP RMSE/std,linear RMSE/std,mean RMSE/std,MLP MAPE,linear MAPE,mean MAPE,MLP rae,linear rae,mean rae,MLP dcor,linear dcor,mean dcor
IP,PNBI,WTH,RGEO,KAPPAA,NEL,KAPPA,KAREA,AMIN,PLTH,0.952364,0.868638,-2.9e-05,971683.090493,1613582.0,4452070.0,0.218257,0.362439,1.000014,0.131841,0.239447,1.132448,1562.033783,0.298292,1186.827282,0.972386,0.936554,0.0
IP,PNBI,PINJ,WTH,RGEO,KAPPAA,NEL,KAPPA,KAREA,PLTH,0.951834,0.891461,-2.9e-05,977074.766381,1466726.0,4452070.0,0.219468,0.329453,1.000014,0.133135,0.212578,1.132448,1565.395769,0.263558,1186.827282,0.971597,0.942152,0.0
IP,PNBI,WTH,RGEO,KAPPAA,NEL,KAPPA,AMIN,VOL,PLTH,0.951701,0.868638,-2.9e-05,978417.625509,1613582.0,4452070.0,0.21977,0.362439,1.000014,0.130405,0.239447,1.132448,1563.762154,0.298292,1186.827282,0.972469,0.936554,0.0
IP,PNBI,PINJ,WTH,RGEO,NEL,KAPPA,KAREA,AMIN,PLTH,0.951605,0.891464,-2.9e-05,979392.138946,1466704.0,4452070.0,0.219989,0.329448,1.000014,0.131952,0.212575,1.132448,1561.081031,0.263551,1186.827282,0.972257,0.942153,0.0
IP,PNBI,WTH,RGEO,KAPPAA,NEL,KAREA,AMIN,VOL,PLTH,0.951533,0.868638,-2.9e-05,980119.998232,1613582.0,4452070.0,0.220152,0.362439,1.000014,0.130668,0.239447,1.132448,1564.633472,0.298292,1186.827282,0.971995,0.936554,0.0


##### worst results

In [37]:
# Five worst 9-1 results: sort ascending by "MLP r2" and show the head.
worst_first_9_1 = pd.DataFrame.from_dict(metrics_9_1).T.sort_values(by="MLP r2")
worst_first_9_1.head(5)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,MLP r2,linear r2,mean r2,MLP RMSE,linear RMSE,mean RMSE,MLP RMSE/std,linear RMSE/std,mean RMSE/std,MLP MAPE,linear MAPE,mean MAPE,MLP rae,linear rae,mean rae,MLP dcor,linear dcor,mean dcor
IP,PNBI,WTH,KAPPAA,NEL,KAPPA,KAREA,AMIN,VOL,PLTH,-2.053631,0.868624,-2.9e-05,7779721.0,1613669.0,4452070.0,1.747464,0.362459,1.000014,0.99996,0.239572,1.132448,2040.438509,0.298348,1186.827282,0.0,0.936555,0.0
IP,PNBI,WTH,RGEO,KAPPAA,NEL,KAPPA,KAREA,VOL,PLTH,-2.05363,0.868638,-2.9e-05,7779720.0,1613582.0,4452070.0,1.747464,0.362439,1.000014,0.99996,0.239447,1.132448,2040.438033,0.298292,1186.827282,0.0,0.936554,0.0
IP,WTH,RGEO,KAPPAA,NEL,KAPPA,KAREA,AMIN,VOL,PLTH,0.833233,0.704137,-2.9e-05,1818072.0,2421592.0,4452070.0,0.408372,0.543933,1.000014,0.220551,0.358746,1.132448,1535.071438,0.485319,1186.827282,0.917575,0.857338,0.0
IP,WTH,RGEO,WTOT,KAPPAA,NEL,KAREA,AMIN,VOL,PLTH,0.835751,0.77959,-2.9e-05,1804293.0,2090119.0,4452070.0,0.405276,0.469478,1.000014,0.230517,0.322957,1.132448,1533.070262,0.422118,1186.827282,0.918582,0.901721,0.0
IP,PINJ,WTH,RGEO,NEL,KAPPA,KAREA,AMIN,VOL,PLTH,0.843968,0.744269,-2.9e-05,1758579.0,2251372.0,4452070.0,0.395008,0.505698,1.000014,0.223889,0.324547,1.132448,1541.922642,0.452728,1186.827282,0.923231,0.885257,0.0


### plotting different results

#### error plots

In [11]:
# Index (column-combination tuples) of the 9-1 results ordered by descending "MLP r2";
# position 0 is the best result.
best_results_9_1 = metrics_9_1_df.sort_values(by="MLP r2", ascending=False).index

In [17]:
# Position in best_results_9_1 (0 = best) of the 9-1 result to plot in the next cell.
result = 2

In [None]:
# Plot the 9-1 result at position `result` with the plot_result helper from ASD_utils.
# NOTE(review): `result` is also used as a DataFrame name in the runtime-check loops;
# make sure the `result = 2` cell was the last one to set it before running this cell.
plot_result(datas_9_1, best_results_9_1[result])

#### plot which input appears in which of the best results

In [55]:

# Build a 0/1 presence table: one row per 9-1 result (best first), one column per
# candidate input, 1 if that input is part of the result's column combination.
input_cols = ["level_" + str(pos) for pos in range(9)]
input_pos_df = metrics_9_1_df.reset_index()[input_cols].reset_index()

# Every column of the run except the target PLTH.
inputs = [name for name in cols if name != "PLTH"]

# The level columns do not change while flags are added, so read them once.
level_values = input_pos_df[input_cols].values
for curr_input in inputs:
    input_pos_df[curr_input] = (level_values == curr_input).any(axis=1).astype("int64")

# Keep only the presence flags and the rank column "index".
input_pos_df = input_pos_df[inputs + ["index"]]

In [56]:
input_pos_df.head(5)

Unnamed: 0,AMIN,RGEO,KAPPA,KAPPAA,KAREA,VOL,NEL,PNBI,PINJ,WTH,WTOT,IP,index
0,1,1,1,1,1,0,1,1,0,1,0,1,0
1,0,1,1,1,1,0,1,1,1,1,0,1,1
2,1,1,1,1,0,1,1,1,0,1,0,1,2
3,1,1,1,0,1,0,1,1,1,1,0,1,3
4,1,1,0,1,1,1,1,1,0,1,0,1,4


In [57]:
# Reshape to long format: one row per (rank "index", input name) pair with the 0/1
# presence flag in "value".
input_pos_plt_df = input_pos_df.melt(id_vars=["index"], var_name="input", value_name="value")

In [58]:
# Give every input its own integer position "sep" (its index in `inputs`), which is
# used as the x coordinate that separates the lines in the 3D plot.
separator_conditions = [input_pos_plt_df["input"] == curr_input for curr_input in inputs]
sep = list(range(len(inputs)))
input_pos_plt_df["sep"] = np.select(separator_conditions, sep)

In [None]:
# 3D line plot of the presence flags: one line per input along the result rank.
fig = px.line_3d(
    input_pos_plt_df,
    x="sep",
    y="index",
    z="value",
    color="input",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.update_layout(
    scene={
        "xaxis": {"title": "input"},
        "yaxis": {"title": "n-th best"},
        "zaxis": {"title": "present y/n", "range": [0, 2]},
    }
)
fig.show()

# New 13-1-run on extended inputs

In [9]:
# (Re)load the JET data set for the 13-1 run; this rebinds `df`, which earlier
# sections of the notebook also use.
df = pd.read_csv("tokamak_general/JET_(2036, 122)_DB3V13.csv")

In [63]:
df.head()

Unnamed: 0,TOK,TOK_ID,DIVNAME,LCUPDATE,DATE,SHOT,TIME,TIME_ID,T1,T2,...,IAEA92,DB2P5,DB2P8,DB3IS,DB3V5,IAE2000N,IAE2000X,HMWS2003,SELDB3,SELDB3X
0,JET,6,MarkGBSR,20031119,20020530,56145,62.6474,62647,62.56,62.75,...,0,0,0,0,0,0,0,0,1111111010,1110
1,JET,6,MarkGBSR,20031119,20020916,56603,64.882996,64883,64.8,65.0,...,0,0,0,0,0,0,0,0,1111111111,11111100
2,JET,6,MarkGBSR,20031119,20020916,56605,64.882996,64883,64.8,65.0,...,0,0,0,0,0,0,0,0,1111111111,11111100
3,JET,6,MarkGBSR,20031119,20020916,56606,64.882996,64883,64.8,65.0,...,0,0,0,0,0,0,0,0,1111111111,11111100
4,JET,6,MarkGBSR,20031119,20020916,56610,65.379761,65380,65.25,65.5,...,0,0,0,0,0,0,0,0,1111111111,11111100


In [None]:
set(df.columns)

### ~ » Info about data, columns at the end of the notebook « ~

In [12]:
# Candidate columns for the 13-1 run; they are checked for zero / negative entries in
# the next cell before the final selection is made.
cols_13 = ["PFLOSS", "PL", "PLTH", 
           "TAUTH", "TAUTOT",
           "AMIN", "RGEO", "KAPPA", "KAPPAA", 
           "KAREA", "INDENT", "VOL", "NEL", "PALPHA", "PNBI", "PINJ",
           "WTH", "WTOT", "IP", "PECRH", "PICRH", "PICRHC", "DELTA", "SEPLIM", "XPLIM", "AREA"]

In [17]:
# Report, per candidate column, how many rows are non-zero and how many are negative.
for col in cols_13:
    n_nonzero = len(df.loc[df[col] != 0])
    n_negative = len(df.loc[df[col] < 0])
    print(col + "\t non-zero: " + str(n_nonzero) + " \t neg.: " + str(n_negative))
print("overall entries: " + str(len(df)))

PFLOSS	 non-zero: 1573 	 neg.: 0
PL	 non-zero: 2036 	 neg.: 0
PLTH	 non-zero: 2036 	 neg.: 0
TAUTH	 non-zero: 2036 	 neg.: 0
TAUTOT	 non-zero: 2036 	 neg.: 0
AMIN	 non-zero: 2036 	 neg.: 0
RGEO	 non-zero: 2036 	 neg.: 0
KAPPA	 non-zero: 2036 	 neg.: 0
KAPPAA	 non-zero: 2036 	 neg.: 0
KAREA	 non-zero: 2036 	 neg.: 0
INDENT	 non-zero: 0 	 neg.: 0
VOL	 non-zero: 2036 	 neg.: 0
NEL	 non-zero: 2036 	 neg.: 0
PALPHA	 non-zero: 104 	 neg.: 0
PNBI	 non-zero: 1573 	 neg.: 0
PINJ	 non-zero: 1573 	 neg.: 0
WTH	 non-zero: 2036 	 neg.: 0
WTOT	 non-zero: 2036 	 neg.: 0
IP	 non-zero: 2036 	 neg.: 2036
PECRH	 non-zero: 0 	 neg.: 0
PICRH	 non-zero: 15 	 neg.: 0
PICRHC	 non-zero: 15 	 neg.: 0
DELTA	 non-zero: 2036 	 neg.: 0
SEPLIM	 non-zero: 2036 	 neg.: 0
XPLIM	 non-zero: 2018 	 neg.: 498
AREA	 non-zero: 2036 	 neg.: 0
overall entries: 2036


#### for power-law fitting:
* IP has only negative values – use IP --> -IP ?
* XPLIM has both zero and negative entries
* PFLOSS, PNBI, PINJ have zero entries but no negative ones

In [17]:
# Final column set for the 13-1 run: PLTH and TAUTOT (the targets of the run below)
# plus 16 candidate inputs. Compared with the candidate list above, PFLOSS, PL, TAUTH,
# INDENT, PALPHA, PECRH, PICRH and PICRHC are dropped.
cols_13 = ["PLTH", "TAUTOT", "AMIN", "RGEO", "KAPPA", "KAPPAA", 
        "KAREA", "VOL", "NEL", "PNBI", "PINJ",
        "WTH", "WTOT", "IP", "DELTA", "SEPLIM", "XPLIM", "AREA"]

In [None]:
# 13-1 run: call predictability (from ASD_utils) on df with 13 input columns and
# 1 output column, targets PLTH and TAUTOT, two candidate hidden-layer layouts, a
# single alpha value (0.0005) and scaling switched on.
metrics_13_1, datas_13_1 = predictability(data=df,
                                input_cols=13,
                                output_cols=1,
                                col_set=cols_13,
                                primkey_cols = ["TOK"],
                                targets=["PLTH", "TAUTOT"],
                                hidden_layers=[#(70,5,), 
                                               (70, 25, 5),
                                                (120,70,30,5)
                                              ],
                                alphas=[0.0005],
                                scaling="yes",
                                scoring="RMSE",
                                n_jobs=-1
                               )

In [None]:
# Persisting the 13-1 results is disabled by wrapping the code in a string literal;
# remove the surrounding triple quotes to write the two pickle files again.
'''
with open('JET_metrics_13_1.pkl', 'wb') as f:
    pickle.dump(metrics_13_1, f)
with open('JET_datas_13_1.pkl', 'wb') as f:
    pickle.dump(datas_13_1, f)
    '''

In [19]:
# Load the cached 13-1 results (metrics and per-combination data) from disk.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('JET_metrics_13_1.pkl', 'rb') as f:
    metrics_13_1 = pickle.load(f)
with open('JET_datas_13_1.pkl', 'rb') as f:
    datas_13_1 = pickle.load(f)

#### check whether, in fact, always the same model size is chosen

note that we allowed only one value for alpha (0.0005) for the 13-1-run

In [68]:

# Collect the hyper-parameters stored under "GridSearchParams" for every column
# combination and print how often each layer layout / alpha value occurs.
grid_params_13 = [datas_13_1[key]["GridSearchParams"] for key in list(datas_13_1.keys())]
chosen_layers_13 = [params["mlp__hidden_layer_sizes"] for params in grid_params_13]
chosen_alphas_13 = [params["mlp__alpha"] for params in grid_params_13]

for layer in set(chosen_layers_13):
    print(f"{layer!s} frequency: {chosen_layers_13.count(layer)!s}")
for alpha in set(chosen_alphas_13):
    print(f"{alpha!s} frequency: {chosen_alphas_13.count(alpha)!s}")

(70, 25, 5) frequency: 128
(120, 70, 30, 5) frequency: 432
0.0005 frequency: 560


In [20]:
# Tabulate the 13-1 metrics with one row per column combination, highest "MLP r2" first.
metrics_13_1_df = pd.DataFrame.from_dict(metrics_13_1).transpose().sort_values(by="MLP r2", ascending=False)

#### best results

In [15]:
metrics_13_1_df.head(7)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,MLP r2,linear r2,mean r2,MLP RMSE,linear RMSE,mean RMSE,MLP RMSE/std,linear RMSE/std,mean RMSE/std,MLP MAPE,linear MAPE,mean MAPE,MLP rae,linear rae,mean rae,MLP dcor,linear dcor,mean dcor
XPLIM,PNBI,NEL,AREA,PINJ,IP,SEPLIM,RGEO,KAPPA,WTOT,AMIN,KAPPAA,WTH,PLTH,0.957076,0.899104,-0.006278,947842.639834,1453198.0,4589313.0,0.20718,0.317641,1.003134,0.131397,0.212356,1.265121,807.334496,0.249217,617.368137,0.974986,0.941913,0.0
XPLIM,PNBI,NEL,KAREA,AREA,PINJ,DELTA,VOL,IP,KAPPA,AMIN,KAPPAA,WTH,PLTH,0.956286,0.89427,-0.006278,956529.844089,1487606.0,4589313.0,0.209079,0.325162,1.003134,0.143667,0.222194,1.265121,805.99451,0.262328,617.368137,0.973905,0.937893,0.0
XPLIM,PNBI,NEL,KAREA,PINJ,DELTA,VOL,IP,KAPPA,WTOT,AMIN,KAPPAA,WTH,PLTH,0.955891,0.899104,-0.006278,960837.61681,1453198.0,4589313.0,0.21002,0.317641,1.003134,0.142679,0.212356,1.265121,807.69731,0.249217,617.368137,0.973608,0.941913,0.0
XPLIM,PNBI,NEL,KAREA,AREA,VOL,IP,SEPLIM,RGEO,KAPPA,WTOT,AMIN,KAPPAA,PLTH,0.955581,0.87925,-0.006278,964208.407302,1589762.0,4589313.0,0.210757,0.347491,1.003134,0.138266,0.231439,1.265121,808.563698,0.273524,617.368137,0.973999,0.93654,0.0
XPLIM,PNBI,NEL,KAREA,AREA,PINJ,DELTA,VOL,IP,WTOT,AMIN,KAPPAA,WTH,PLTH,0.95558,0.899104,-0.006278,964226.444436,1453198.0,4589313.0,0.210761,0.317641,1.003134,0.139989,0.212356,1.265121,806.995513,0.249217,617.368137,0.973356,0.941913,0.0
XPLIM,PNBI,NEL,KAREA,AREA,PINJ,DELTA,VOL,IP,KAPPA,WTOT,AMIN,WTH,PLTH,0.955255,0.899104,-0.006278,967740.342197,1453198.0,4589313.0,0.211529,0.317641,1.003134,0.14544,0.212356,1.265121,805.259915,0.249217,617.368137,0.973133,0.941913,0.0
XPLIM,PNBI,NEL,KAREA,AREA,PINJ,DELTA,IP,RGEO,KAPPA,AMIN,KAPPAA,WTH,PLTH,0.955102,0.89427,-0.006278,969396.905991,1487606.0,4589313.0,0.211891,0.325162,1.003134,0.143339,0.222194,1.265121,804.421764,0.262328,617.368137,0.973309,0.937893,0.0


#### worst results

In [71]:
# Seven worst 13-1 results: sort ascending by "MLP r2" and show the head.
worst_first_13_1 = pd.DataFrame.from_dict(metrics_13_1).T.sort_values(by="MLP r2")
worst_first_13_1.head(7)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,MLP r2,linear r2,mean r2,MLP RMSE,linear RMSE,mean RMSE,MLP RMSE/std,linear RMSE/std,mean RMSE/std,MLP MAPE,linear MAPE,mean MAPE,MLP rae,linear rae,mean rae,MLP dcor,linear dcor,mean dcor
XPLIM,PNBI,NEL,KAREA,PINJ,DELTA,VOL,SEPLIM,RGEO,KAPPA,WTOT,KAPPAA,WTH,PLTH,-2.387327,0.898074,-0.006278,8420100.0,1460603.0,4589313.0,1.840469,0.319259,1.003134,0.999979,0.207246,1.265121,1111.043212,0.246172,617.368137,0.000454,0.942058,0.0
XPLIM,PNBI,NEL,PINJ,DELTA,VOL,IP,SEPLIM,KAPPA,WTOT,AMIN,KAPPAA,WTH,PLTH,-2.387327,0.899104,-0.006278,8420099.0,1453198.0,4589313.0,1.840469,0.317641,1.003134,0.999979,0.212356,1.265121,1111.043048,0.249217,617.368137,0.000602,0.941913,0.0
XPLIM,NEL,KAREA,AREA,DELTA,VOL,IP,SEPLIM,RGEO,KAPPA,AMIN,KAPPAA,WTH,PLTH,0.774883,0.687805,-0.006278,2170663.0,2556241.0,4589313.0,0.474465,0.558744,1.003134,0.309509,0.416418,1.265121,767.842345,0.520833,617.368137,0.878427,0.834113,0.0
XPLIM,PNBI,KAREA,AREA,PINJ,DELTA,IP,SEPLIM,RGEO,KAPPA,AMIN,KAPPAA,WTH,PLTH,0.778427,0.893174,-0.006278,2153509.0,1495296.0,4589313.0,0.470715,0.326843,1.003134,0.420754,0.239137,1.265121,816.387227,0.280604,617.368137,0.87651,0.933306,0.0
XPLIM,KAREA,AREA,DELTA,VOL,IP,SEPLIM,RGEO,KAPPA,WTOT,AMIN,KAPPAA,WTH,PLTH,0.793129,0.745538,-0.006278,2080839.0,2307811.0,4589313.0,0.454831,0.504443,1.003134,0.283162,0.379875,1.265121,777.999519,0.461529,617.368137,0.896018,0.866491,0.0
XPLIM,NEL,KAREA,AREA,VOL,IP,SEPLIM,RGEO,KAPPA,WTOT,AMIN,KAPPAA,WTH,PLTH,0.803645,0.769265,-0.006278,2027263.0,2197584.0,4589313.0,0.44312,0.480349,1.003134,0.286623,0.365569,1.265121,779.341988,0.442766,617.368137,0.897615,0.890204,0.0
XPLIM,NEL,KAREA,AREA,DELTA,VOL,IP,SEPLIM,RGEO,KAPPA,WTOT,AMIN,KAPPAA,PLTH,0.811677,0.71782,-0.006278,1985363.0,2430254.0,4589313.0,0.433962,0.531206,1.003134,0.282796,0.400185,1.265121,782.327808,0.497752,617.368137,0.901246,0.854653,0.0


### plotting different results

#### error plots

In [21]:
# Index (column-combination tuples) of the 13-1 results ordered by descending "MLP r2";
# position 0 is the best result.
best_results_13 = metrics_13_1_df.sort_values(by="MLP r2", ascending=False).index

In [22]:
# Position in best_results_13 (0 = best) of the 13-1 result to inspect.
result_13 = 298

In [None]:
# Plot the 13-1 result at position `result_13` (0 = best). The selector must be
# `result_13`, not `result`: the latter belongs to the 9-1 analysis and is also
# reused as a DataFrame name in the runtime-check loops.
plot_result(datas_13_1, best_results_13[result_13])

#### plot which input appears in which of the best results

In [18]:

# Build a 0/1 presence table: one row per 13-1 result (best first), one column per
# candidate input, 1 if that input is part of the result's column combination.
input_cols_13 = ["level_" + str(pos) for pos in range(13)]
input_pos_13_df = metrics_13_1_df.reset_index()[input_cols_13].reset_index()

# Every column of the run except PLTH.
# NOTE(review): TAUTOT is a target of the 13-1 run too but stays in this list - confirm
# that this is intended.
inputs_13 = [name for name in cols_13 if name != "PLTH"]

# The level columns do not change while flags are added, so read them once.
level_values_13 = input_pos_13_df[input_cols_13].values
for curr_input in inputs_13:
    input_pos_13_df[curr_input] = (level_values_13 == curr_input).any(axis=1).astype("int64")

# Keep only the presence flags and the rank column "index".
input_pos_13_df = input_pos_13_df[inputs_13 + ["index"]]


In [19]:
input_pos_13_df.head(5)

Unnamed: 0,TAUTOT,AMIN,RGEO,KAPPA,KAPPAA,KAREA,VOL,NEL,PNBI,PINJ,WTH,WTOT,IP,DELTA,SEPLIM,XPLIM,AREA,index
0,0,1,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,0
1,0,1,0,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1
2,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,2
3,0,1,1,1,1,1,1,1,1,0,0,1,1,0,1,1,1,3
4,0,1,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,4


In [20]:
# Reshape to long format: one row per (rank "index", input name) pair with the 0/1
# presence flag in "value".
input_pos_plt_13_df = input_pos_13_df.melt(id_vars=["index"], var_name="input", value_name="value")

In [21]:
# Give every input its own integer position "sep" (its index in `inputs_13`), which
# is used as the x coordinate that separates the lines in the 3D plot.
separator_conditions_13 = [input_pos_plt_13_df["input"] == curr_input for curr_input in inputs_13]
sep_13 = list(range(len(inputs_13)))
input_pos_plt_13_df["sep"] = np.select(separator_conditions_13, sep_13)

In [None]:
# 3D line plot of the presence flags: one line per input along the result rank.
fig = px.line_3d(
    input_pos_plt_13_df,
    x="sep",
    y="index",
    z="value",
    color="input",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.update_layout(
    scene={
        "xaxis": {"title": "input"},
        "yaxis": {"title": "n-th best"},
        "zaxis": {"title": "present y/n", "range": [0, 2]},
    }
)
fig.show()

### INFO ABOUT EACH VARIABLE

colname    dtype  unique  nan_total  percent_nan_total     info                                                                                           
0        TOK   object      19          0                0.0     1. __TOK__: This variable designates which tokamak has supplied the data.
 
56  HITER96L  float64    4044          0                0.0     142. __HITER96L__: Enhancement factor TAUTH × TAUC92 / ITERL96-P
 
55       H89  float64    3632          0                0.0     141. __H89__: Enhancement factor TAUTOT × TAUC92 / ITER89-P
 
54    TAUC93  float64     122          0                0.0     140. __TAUC93__: Correction factor for thermal confinement time TAUTH
 
53    TAUC92  float64     122          0                0.0     139. __TAUC92__: Correction factor for thermal confinement time TAUTH
 
52     TAUTH  float64    7039          0                0.0     138. __TAUTH__: Estimated thermal energy confinement time in seconds.
 
51    TAUTOT  float64    7083          0                0.0     137. __TAUTOT__: Estimated total energy confinement time in seconds.
 
50      PLTH  float64    6011          0                0.0     136. __PLTH__: Estimated Loss Power corrected for charge exchange and unconfined orbit losses in Wat...
 
57       H93  float64    5728          0                0.0     143. __H93__: Enhancement factor TAUTH × TAUC92 / ITERH93-P
 
49        PL  float64    6007          0                0.0     135. __PL__: Estimated Loss Power not corrected for charge exchange and unconfined orbit losses in w...
 
47      WTOT  float64    6412          0                0.0     133. __WTOT__: Estimated total plasma energy content in Joules.
 
46    ICFORM   object       2          0                0.0     126. __WFICFORM__: Total fast ion energy due to ICRH in joules estimated from approximate formula.
 
45    WFICRH   object      93          0                0.0     124. __WFICRH__: Estimate of the perpendicular fast ion energy content during ICRH heating in Joules.
 
44    WFFORM  float64    5386          0                0.0     122. __WFFORM__: Total fast ion energy due to NBI in joules estimated from approximate formula.
 
43      DWHC     bool       2          0                0.0     107. __DWHC__: Equal to 1 when DWDIA or DWMHD have been corrected by hand for ASDEX.
 
42    PALPHA    int32     107          0                0.0     103. __PALPHA__: Estimated alpha heating power in Deuterium-Tritium plasmas in watts.
 
41     PICRH  float64     782          0                0.0     102. __PICRH__: ICRH power in watts absorbed by the plasma. __Zero, if no ICRH is applied.
 
48       WTH  float64    6672          0                0.0     134. __WTH__: Estimated thermal plasma energy content in Joules.
 
40    PICRHC  float64     791          0                0.0     101. __PICRHC__: ICRH power in watts coupled to the plasma. __Zero, if no ICRH is applied.
 
58  HITER92Y  float64    4894          0                0.0     144. __HITER92Y__: Enhancement factor TAUTH × TAUC92 / ITERH92Y
 
60   HIPB98Y  float64    4373          0                0.0     146. __HIPB98Y__: Enhancement factor TAUTH × TAUC92 / IPB98(y)
 
76  HMWS2003     bool       2          0                0.0     173. __HMWS2003__: Standard dataset flag.
 
75  IAE2000X     bool       2          0                0.0     172. __IAE2000X__: Standard dataset flag.
 
74  IAE2000N     bool       2          0                0.0     171. __IAE2000N__: Standard dataset flag.
 
73     DB3V5     bool       2          0                0.0     170. __DB3V5__: Standard dataset flag.
 
72     DB3IS     bool       2          0                0.0     169. __DB3IS__: Standard dataset flag.
 
71     DB2P8     bool       2          0                0.0     168. __DB2P8__: Standard dataset flag.
 
70     DB2P5     bool       2          0                0.0     167. __DB2P5__: Standard dataset flag.
 
59    HEPS97  float64    4606          0                0.0     145. __HEPS97__: Enhancement factor TAUTH × TAUC93 / EPS97(ELMy)
 
69    IAEA92     bool       2          0                0.0     166. __IAEA92__: Standard dataset flag.
 
67    SELDB2    int32      59          0                0.0     164. __SELDB2__: Flagging variable for standard selection in DB.2
 
66    SELDB1    int32       6          0                0.0     163. __SELDB1__: Flagging variable making connections to DB.1.
 
65  STANDARD     bool       2          0                0.0     162. __STANDARD__: Standard dataset flag.
 
64  HIPB98Y4  float64    4741          0                0.0     150. __HIPB98Y4__: Enhancement factor TAUTH × TAUC92 / IPB98(y,4)
 
63  HIPB98Y3  float64    4558          0                0.0     149. __HIPB98Y3__: Enhancement factor TAUTH × TAUC92 / IPB98(y,3)
 
62  HIPB98Y2  float64    4468          0                0.0     148. __HIPB98Y2__: Enhancement factor TAUTH × TAUC92 / IPB98(y,2)
 
61  HIPB98Y1  float64    4370          0                0.0     147. __HIPB98Y1__: Enhancement factor TAUTH × TAUC92 / IPB98(y,1)
 
68   SELDB2X    int32      95          0                0.0     165. __SELDB2X__: Flagging variable for extra selection criteria in DB.2
 
77    SELDB3    int32      41          0                0.0     179. __SELDB3__: Flagging variable for new standard selection
 
39    ICFREQ  float64      32          0                0.0     98. __ICFREQ__: Frequency of ICRH waves in Hz. __Zero, if no ICRH is applied.
 
37   ECHFREQ    int64       4          0                0.0     93. __ECHFREQ__: ECRH frequency in Hz. __Zero if no ECRH is applied.
 
16    BGASZ2   object       5          0                0.0     31. __BGASZ2__: Charge number of the second neutral beam gas (integer value).
 
15    BGASA2    int32       4          0                0.0     30. __BGASA2__: Mass number of the second neutral beam gas (real value).
 
14     BGASZ   object       5          0                0.0     29. __BGASZ__: Charge number of the neutral beam gas (integer value).
 
13     BGASA  float64       8          0                0.0     28. __BGASA__: Mass number of the neutral beam gas (real value).
 
12     PGASZ    int32       3          0                0.0     27. __PGASZ__: Charge number of the plasma working gas (integer value).
 
11     PGASA  float64       6          0                0.0     26. __PGASA__: Mass number of the plasma working gas (real value).
 
10      MEFF  float64     402          0                0.0     25. __MEFF__: Effective atomic mass in AMU.
 
17     XGASZ    int32       8          0                0.0     34. __XGASZ__: Atomic Charge of extra fuelled gas.
 
9        TPI    int32       7          0                0.0     23. __TPI__: Time point indicator (ASDEX only).
 
7    TIME_ID    int32    2111          0                0.0     8. __TIME_ID__: Integer equivalent of TIME, defined as INT[1000 × ROUND{TIME, 0.001}]
 
6       TIME  float64    2147          0                0.0     7. __TIME__: Time during the shot at which the data are taken in seconds.
 
5       SHOT    int32    3453          0                0.0     6. __SHOT__: The shot from which the data are taken.
 
4       DATE    int32     588          0                0.0     4. __LCUPDATE__: The date of the most recent update for any variable listed in the database.
 
3   LCUPDATE    int32      79          0                0.0     4. __LCUPDATE__: The date of the most recent update for any variable listed in the database.
 
2    DIVNAME   object      19          0                0.0     3. __DIVNAME__: Describes each device's divertor configuration
 
1     TOK_ID    int32      19          0                0.0     2. __TOK_ID__: This integer variable designates which tokamak has supplied the data.
 
8      PHASE   object       9          0                0.0     12. __PHASE__: The phase of the discharge at TIME.
 
38     PECRH    int32      47          0                0.0     97. __PECRH__: ECRH power in watts absorbed by the plasma. __Zero if no ECRH is applied.
 
18     XGASA  float64      12          0                0.0     35. __XGASA__: Atomic Mass of extra fuelled gas.
 
20      RGEO  float64     806          0                0.0     37. __RGEO__: The plasma geometrical major radius in meters, from an MHD equilibrium fit, defined as...
 
36    PFLOSS  float64    5126          0                0.0     92. __PFLOSS__: Amount of neutral beam power in watts that is lost from the plasma through charge ex...
 
35      PNBI  float64    4275          0                0.0     91. __PNBI__: Total injected neutral beam power minus shine through in watts. __Zero if no beams are o...
 
34      PINJ  float64    3612          0                0.0     86. __PINJ__: The injected neutral beam power with beam of (BGASA, BGASZ) that passes into the torus...
 
33      POHM  float64    5954          0                0.0     84. __POHM__: Total Ohmic power in watts.
 
32   NELFORM     bool       2          0                0.0     76. __NELFORM__: Indicates if NEL is not a direct measurement.
 
31       NEL  float64    4967          0                0.0     75. __NEL__: Central line average electron density in m⁻³ from interferometer or LIDAR (JET).
 
30        IP  float64    3536          0                0.0     61. __IP__: The plasma current in Amperes determined from an external Rogowski loop with vessel curr...
 
19    CONFIG   object      11          0                0.0     36. __CONFIG__: The plasma configuration.
 
29      IEML  float64      88          0                0.0     59. __IEML__: Ergodic magnetic field coil current in Amperes.
 
27    WALMAT   object       6          0                0.0     51. __WALMAT__: The material of the vessel wall.
 
26       VOL  float64    3317          0                0.0     48. __VOL__: The plasma volume in m³ determined from an MHD equilibrium fit.
 
25    INDENT  float64     228          0                0.0     46. __INDENT__: Indentation of the plasma determined from an MHD equilibrium fit.
 
24     KAREA  float64    1826          0                0.0     42. __KAREA__: New plasma elongation definition (= VOL / (2π RGEO) / (π AMIN²))
 
23    KAPPAA  float64    7107          0                0.0     41. __KAPPAA__: New plasma elongation definition (= AREA / (π AMIN²))
 
22     KAPPA  float64    1242          0                0.0     40. __KAPPA__: The plasma elongation determined from an MHD equilibrium fit.
 
21      AMIN  float64    2530          0                0.0     39. __AMIN__: The horizontal plasma minor radius in meters from an MHD equilibrium fit.
 
28        BT  float64    2904          0                0.0     58. __BT__: The vacuum Toroidal magnetic field in Tesla at RGEO determined from the TF coil current.
 
78   SELDB3X    int32      14          0                0.0     180. __SELDB3X__: Flagging variable for extra selection criteria in DB.3

##### Power losses:

* __PFLOSS__ Amount of neutral beam power in watts that is lost from the plasma through charge exchange
 and unconfined orbits.

* __PL__: Estimated Loss Power not corrected for charge exchange and unconfined orbit losses in watts.<br>
| TOK | PL = |
| :-: | :-: |
| ASDEX: | POHM + PNBI - DWDIA/3 - 2 DWMHD/3.| 
|AUG: | POHM + PNBI + PICRH + PECRH - DWMHD.| 
|CMOD: | POHM + PICRH - DWMHD.| 
|COMPASS: | POHM + PECRH - DWDIA.| 
|D3D: | POHM + PNBI + PECRH - DWMHD.| 
|JET: | POHM + PNBI + PICRH - DWDIA.| 
|JFT2M: | POHM + PNBI - DWDIA.| 
|JT60U: |POHM + PNBI - DWDIA| 
|MAST: | POHM + PNBI - DWMHD| 
|NSTX: | POHM + PNBI + PICRH - DWMHD| 
|PBXM: | POHM + PNBI - DWMHD.| 
|PDX: | POHM + PNBI - DWMHD.| 
|START: | POHM + PNBI - DWMHD.| 
|TCV: | POHM - DWMHD.| 
|TDEV: | POHM+PECRH-DWDIA| 
|TEXTOR: | POHM + PNBI + PICRH - DWDIA.| 
|TFTR: | POHM + PNBI - DWDIA| 
|TUMAN3M: | POHM - DWDIA.| 
|T10: | POHM + PECRH - DWDIA. |
* __PLTH__: Estimated Loss Power corrected for charge exchange and unconfined orbit losses in Watts.

__but__: PLTH = PL - PFLOSS<br>
=> the three are linearly dependent, so don't use all three together in one column set

##### Geometric quantities about the plasma

* __AMIN__: The horizontal plasma minor radius in meters from an MHD equilibrium fit.

* __RGEO__: The plasma geometrical major radius in meters, from an MHD equilibrium fit, defined as the average of the minimum and the maximum radial extent of the plasma.

* __KAPPA__: The plasma elongation determined from an MHD equilibrium fit.

* __KAPPAA__: New plasma elongation definition: $\mathrm{AREA} / (\pi\,\mathrm{AMIN}^2)$

* __KAREA__: New plasma elongation definition: $\mathrm{VOL} / (2\pi\,\mathrm{RGEO}) / (\pi\,\mathrm{AMIN}^2)$

* __INDENT__: Indentation of the plasma determined from an MHD equilibrium fit.

* __VOL__: The plasma volume in m$^3$ determined from an MHD equilibrium fit.

Not part of the data, but in info-file:

* __DELTA__: The triangularity of the plasma boundary from an MHD equilibrium fit.

* __DELTAU__: Upper triangularity of the plasma boundary from an MHD equilibrium fit.

* __DELTAL__: Lower triangularity of the plasma boundary from an MHD equilibrium fit.

* __SEPLIM__: The minimum distance between the separatrix flux surface and either the vessel wall or limiters in meters from an MHD equilibrium fit.

* __XPLIM__: The minimum distance between the X-point and either the vessel walls or limiters in meters from an MHD equilibrium fit. The value is positive if X-point is inside either the vessel wall or limiters.

* __AREA__: Area of plasma cross section in m$^2$ determined from an MHD equilibrium fit.

##### Further plasma quantities

* __NEL__: Central line average electron density in m$^{-3}$ from interferometer or LIDAR (JET).

* __PALPHA__: Estimated alpha heating power in Deuterium-Tritium plasmas in watts

* __PNBI__: Total injected neutral beam power minus shine through in watts.

* __PINJ__: The injected neutral beam power with beam of (BGASA, BGASZ) that passes into the torus..

* __PECRH__: ECRH power in watts absorbed by the plasma.<br>
=> always =0 for JET

* __PICRH__: ICRH power in watts absorbed by the plasma.

* __PICRHC__: ICRH power in watts coupled to the plasma.<br>
=> ICRH only !=0 in 37 entries for JET

* __WTH__: Estimated thermal plasma energy content in Joules

* __WTOT__: Estimated total plasma energy content in Joules.

* __IP__: The plasma current in Amperes determined from an external Rogowski loop ...

### tests

###### new predictability routine with power law fit

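The power-law fit is only attempted when every column of a combination is strictly positive (see the `do power law fit: True/False` lines in the run output below), so don't use columns with negative entries like IP, PNBI and XPLIM.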

In [10]:
# Column set for the power-law test run. IP is included although the sign check at the
# top of the notebook reports it as negative in every row, so combinations containing
# it cannot pass the positivity check in the next cell.
cols_test = [
    "TAUTOT",
    "WTH",
    "SEPLIM",
    "NEL",
    "KAPPA",
    "IP",
]

In [11]:
# True only if every entry in every column of cols_test is strictly positive.
df[cols_test].gt(0).all().all()

False

In [12]:
# Run the new predictability routine on the test column set with TAUTOT as the only
# target. It returns a metrics object and a data object, which the cells below turn
# into a table and pass to plot_result.
new_metrics_test, new_datas_test = predictability(
    data=df,
    input_cols=4,
    output_cols=1,
    col_set=cols_test,
    primkey_cols=["TOK"],
    targets=["TAUTOT"],
    # Two hidden-layer layouts are tried (the run log reports "2 candidates");
    # the smaller (70, 5) layout is currently left out.
    hidden_layers=[
        (70, 25, 5),
        (120, 70, 30, 5),
    ],
    alphas=[0.0005],
    scaling="yes",
    scoring="RMSE",
    n_jobs=-1,
)

Analysing ('WTH', 'NEL', 'SEPLIM', 'KAPPA', 'TAUTOT') now.
do power law fit: True
start MLP routine
Fitting 3 folds for each of 2 candidates, totalling 6 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



The analysis of this tuple took 3.41s.
-----1/5-----
Analysing ('WTH', 'NEL', 'SEPLIM', 'IP', 'TAUTOT') now.
do power law fit: False
start MLP routine
Fitting 3 folds for each of 2 candidates, totalling 6 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



The analysis of this tuple took 3.46s.
-----2/5-----
Analysing ('WTH', 'NEL', 'KAPPA', 'IP', 'TAUTOT') now.
do power law fit: False
start MLP routine
Fitting 3 folds for each of 2 candidates, totalling 6 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



The analysis of this tuple took 2.47s.
-----3/5-----
Analysing ('WTH', 'SEPLIM', 'KAPPA', 'IP', 'TAUTOT') now.
do power law fit: False
start MLP routine
Fitting 3 folds for each of 2 candidates, totalling 6 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



The analysis of this tuple took 3.0s.
-----4/5-----
Analysing ('NEL', 'SEPLIM', 'KAPPA', 'IP', 'TAUTOT') now.
do power law fit: False
start MLP routine
Fitting 3 folds for each of 2 candidates, totalling 6 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



The analysis of this tuple took 2.67s.
-----5/5-----
The whole run took 15.01s.


In [13]:
# One row per analysed combination, ordered from highest to lowest "MLP r2".
new_metrics_test_df = (
    pd.DataFrame.from_dict(new_metrics_test)
    .T
    .sort_values(by="MLP r2", ascending=False)
)

# Combination tuples in ranking order (best "MLP r2" first).
new_best_results_test = (
    new_metrics_test_df
    .sort_values(by="MLP r2", ascending=False)
    .index
)

In [14]:
# Display the metrics table of the test run (sorted by "MLP r2", best first).
new_metrics_test_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,MLP r2,linear r2,pow. law r2,mean r2,MLP RMSE,linear RMSE,pow. law RMSE,mean RMSE,MLP RMSE/std,linear RMSE/std,...,pow. law MAPE,mean MAPE,MLP rae,linear rae,pow. law rae,mean rae,MLP dcor,linear dcor,pow. law dcor,mean dcor
WTH,SEPLIM,KAPPA,IP,TAUTOT,0.605675,0.406506,,-4.7e-05,0.203174,0.249258,,0.323557,0.627953,0.770386,...,,0.546632,750.659998,0.675322,,609.979845,0.834767,0.715943,,0.0
NEL,SEPLIM,KAPPA,IP,TAUTOT,0.600907,0.400273,,-4.7e-05,0.204398,0.250563,,0.323557,0.631738,0.774421,...,,0.546632,757.861811,0.672503,,609.979845,0.82879,0.703441,,0.0
WTH,NEL,SEPLIM,KAPPA,TAUTOT,0.593827,0.317917,0.390495,-4.7e-05,0.206203,0.267214,0.252598,0.323557,0.637317,0.825883,...,0.271447,0.546632,774.297147,0.751575,0.628236,609.979845,0.818581,0.640322,0.726423,0.0
WTH,NEL,KAPPA,IP,TAUTOT,0.580246,0.439577,,-4.7e-05,0.209623,0.242214,,0.323557,0.647884,0.748614,...,,0.546632,779.195324,0.64643,,609.979845,0.820113,0.728304,,0.0
WTH,NEL,SEPLIM,IP,TAUTOT,0.542575,0.439577,,-4.7e-05,0.218827,0.242214,,0.323557,0.676332,0.748614,...,,0.546632,797.04392,0.64643,,609.979845,0.8409,0.728304,,0.0


In [16]:
# Top-ranked combination of the test run (first entry of new_best_results_test).
new_test_comb = new_best_results_test[0]

In [None]:
# Plot the combination at position 3 of the ranking; "mean" is deliberately left out
# of plot_along, so only "linear" and "pl" are requested.
plot_result(
    new_datas_test,
    new_best_results_test[3],
    plot_along=["linear", "pl"],
)

##### test run old routine

In [4]:
# Column set for the old-routine test run (replaces the cols_test defined for the
# power-law test above).
cols_test = [
    "PLTH",
    "TAUTOT",
    "XPLIM",
    "PNBI",
    "NEL",
    "IP",
]

In [None]:
# Same test set-up as for the new routine, but run through the old routine and with
# both PLTH and TAUTOT as targets.
# NOTE(review): old_predictability is not among the names imported from ASD_utils in
# the import cell at the top of the notebook — confirm it is imported/defined before
# this cell runs.
metrics_test, datas_test = old_predictability(
    data=df,
    input_cols=4,
    output_cols=1,
    col_set=cols_test,
    primkey_cols=["TOK"],
    targets=["PLTH", "TAUTOT"],
    # The smaller (70, 5) layout is currently left out.
    hidden_layers=[
        (70, 25, 5),
        (120, 70, 30, 5),
    ],
    alphas=[0.0005],
    scaling="yes",
    scoring="RMSE",
    n_jobs=-1,
)

In [7]:
# One row per analysed combination, ordered from highest to lowest "MLP r2" and limited
# to the top five. Sorting ascending before .head() (as this cell did before) keeps the
# five *worst* rows, so the "best results" below could miss the actual best ones.
metrics_test_df = (
    pd.DataFrame.from_dict(metrics_test)
    .transpose()
    .sort_values(by="MLP r2", ascending=False)
    .head()
)

# Combination tuples in ranking order (best "MLP r2" first).
best_results_test = metrics_test_df.sort_values(by="MLP r2", ascending=False).index

In [8]:
# Display the ranked combination tuples of the old-routine test run.
best_results_test

MultiIndex([('IP', 'XPLIM', 'PNBI', 'NEL',   'PLTH'),
            ('IP', 'XPLIM', 'PNBI', 'NEL', 'TAUTOT')],
           )

In [9]:
# First entry of best_results_test, i.e. the first combination in the ranking.
test_comb = best_results_test[0]

In [None]:
# Plot the combination at position 1 of best_results_test, requesting "mean" and
# "linear" via plot_along (presumably the reference models drawn next to the MLP).
plot_result(datas_test, best_results_test[1], plot_along=["mean", "linear"])

In [16]:
# Sanity check of the combination helper on six column indices (0..5) with
# 4 inputs, 1 output and the indices 0 and 1 as allowed targets.
get_column_combinations_w_targets(np.arange(6), 4, 1, [0, 1])

[(2, 3, 4, 5, 0), (2, 3, 4, 5, 1)]