# EDA Model Training vs Real Distributions

## Constant Fluctuations
The NN model was trained assuming that the following input values were always constant:
* `distgen:total_charge:value`
* `L0A_scale:voltage`
* `L0B_scale:voltage`

To check whether that assumption holds, and whether instability in these values during live production would disrupt training, we look at how much they fluctuate in real machine behaviour over a day.

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

In [None]:
# Load the recorded machine data into a DataFrame.
# The file name suggests it spans 2022-09-23 12:18:58 to 23:03:01 (UTC-07:00) — TODO confirm.
# NOTE: pd.read_pickle unpickles the file, so only load files from a trusted source.
time_series = pd.read_pickle('data/full_2022-09-23_12_18_58-07_00__2022-09-23_23_03_01-07_00.pkl')
# List the available columns so the PV names used further down can be checked against them.
print(time_series.columns)

In [None]:
# time_series = pd.read_pickle('data/relevant_2022-09-23_12_18_58-07_00__2022-09-23_23_03_01-07_00.pkl')

In [None]:
# Load the PV metadata. Later cells read the 'sim_name_to_pv_name',
# 'pv_name_to_sim_name', 'pv_to_sim_factor' and 'sim_to_pv_factor' tables from it.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('configs/pv_info.json', 'r') as f:
    pv_info = json.load(f)

In [None]:
# Simulation-side names of the inputs that were held fixed when the model was trained.
constants = [
    'distgen:total_charge:value',
    'L0A_scale:voltage',
    'L0B_scale:voltage',
]
# Look up the PV name of each of them, keeping the same order as `constants`.
pv_constants = [pv_info['sim_name_to_pv_name'][sim_name] for sim_name in constants]
print(pv_constants)

In [None]:
# Keep only the columns of the supposedly-constant PVs and drop every row in
# which any of them is missing (dropna's default behaviour).
constant_df = time_series[pv_constants].dropna()

In [None]:
# Summary statistics of the recorded values of each constant PV (before any
# conversion to sim values).
constant_df.describe()

In [None]:
# One histogram per constant PV, showing how its recorded values are spread.
constant_df.hist()
plt.show()

In [None]:
# Rebuild the constants table as sim values: every PV column is multiplied by
# its entry in pv_info['pv_to_sim_factor'] and stored under the sim name.
pv_to_sim_factor = pv_info['pv_to_sim_factor']
sim_constant_df = pd.DataFrame()

# constant_df was built from pv_constants, so its columns are in the same
# order as `constants` and the two can be paired positionally.
for sim_name, col_name in zip(constants, constant_df.columns):
    sim_constant_df[sim_name] = constant_df[col_name] * pv_to_sim_factor[col_name]

# Distribution of each constant after conversion.
sim_constant_df.hist()
plt.show()

In [None]:
# Summary statistics of the constants after conversion to sim values.
sim_constant_df.describe()

## Input Ranges

In [None]:
# Load the model metadata. Later cells read 'model_in_list', 'loc_in',
# 'train_input_mins' and 'train_input_maxs' from it.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('configs/model_info.json', 'r') as f:
    model_info = json.load(f)

In [None]:
# Collect the column name to read from the recorded data for every model input.
input_features = []
for feature in model_info['model_in_list']:
    pv_name = pv_info['sim_name_to_pv_name'][feature]
    # 'Pulse_length' is left out of the comparison.
    # NOTE(review): presumably it has no matching recorded column — confirm.
    if pv_name == 'Pulse_length':
        continue
    # The metadata uses the BDES form of a PV name; the BCTRL form is used here instead.
    input_features.append(pv_name.replace('BDES', 'BCTRL'))

input_features


In [None]:
# Number of input PVs that are compared against the training ranges below.
len(input_features)

In [None]:
# For every model input, compare the distribution recorded on the machine with
# the range the model was trained on (red vertical lines).
# The grid is sized from the number of inputs instead of being fixed at 3 x 5,
# so more than 15 inputs no longer run past the end of the axes array.
n_cols = 5
n_rows = max(1, (len(input_features) + n_cols - 1) // n_cols)
# Keeps the original 15 x 8 figure for three rows and scales the height otherwise.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, 8 * n_rows / 3), squeeze=False)
ax = ax.ravel()

for i, input_name in enumerate(input_features):
    # The metadata tables are keyed by the BDES form of the PV name, while the
    # recorded column uses BCTRL, so map the name back before any lookup.
    sim_name = pv_info['pv_name_to_sim_name'][input_name.replace('BCTRL', 'BDES')]
    ax[i].set_title(input_name)

    # Range that was used during training, converted with the sim-to-PV factor.
    sim_to_pv_factor = pv_info['sim_to_pv_factor'][sim_name]
    input_loc = model_info['loc_in'][sim_name]
    train_min = model_info['train_input_mins'][input_loc] * sim_to_pv_factor
    train_max = model_info['train_input_maxs'][input_loc] * sim_to_pv_factor

    ax[i].axvline(train_min, c='r')
    ax[i].axvline(train_max, c='r')

    # Histogram of the real (recorded) data.
    time_series[input_name].hist(ax=ax[i])

# Hide any panels that have no input assigned to them.
for unused_ax in ax[len(input_features):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Closer look at a single input: recorded distribution versus the training
# range (red vertical lines) on one axis.
fig, ax = plt.subplots()
input_name = 'QUAD:IN20:441:BCTRL'
print(time_series[input_name].describe())
# Series.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
time_series[input_name].info()

# The metadata tables are keyed by the BDES form of the PV name.
sim_name = pv_info['pv_name_to_sim_name'][input_name.replace('BCTRL', 'BDES')]
ax.set_title(f'{input_name} | {sim_name}')

# Range that was used during training, converted with the sim-to-PV factor.
sim_to_pv_factor = pv_info['sim_to_pv_factor'][sim_name]
input_loc = model_info['loc_in'][sim_name]
train_min = model_info['train_input_mins'][input_loc] * sim_to_pv_factor
train_max = model_info['train_input_maxs'][input_loc] * sim_to_pv_factor

ax.axvline(train_min, c='r')
ax.axvline(train_max, c='r')

# Histogram of the real (recorded) data.
time_series[input_name].hist(ax=ax)

plt.show()