# Machine Predictions
In this notebook we compare the predictions of the **uncalibrated** surrogate model against the measured machine data and evaluate the errors.

In [None]:
# set it up to autoreload updates
%load_ext autoreload
%autoreload 2

import json
import torch
import pandas as pd
from utils import load_lcls, plot_series, chunk_dataset
import matplotlib.pyplot as plt
import numpy as np


## Load Data

In [None]:
# Measured machine data for one acquisition window, stored as a pickled DataFrame.
data_file = 'data/full_2022-09-23_12_18_58-07_00__2022-09-23_23_03_01-07_00.pkl'
full_data = pd.read_pickle(data_file)

# Combined beam size: geometric mean of the measured X and Y RMS sizes.
measured_xrms = full_data['OTRS:IN20:621:XRMS']
measured_yrms = full_data['OTRS:IN20:621:YRMS']
full_data['beam_size'] = np.sqrt(measured_xrms * measured_yrms)
full_data.head()

## Load Model
Because we're dealing with the raw PV data, we have to add an additional transformation layer to the model to convert the PV units to the simulation units. We do the same on the output layer as well, converting the scales from m to mm. We also have to use the PV names as the inputs rather than the simulation ones. 

In [None]:
from lume_model.utils import variables_from_yaml
from botorch.models.transforms.input import AffineInputTransform, InputTransform
from lume_model.torch import PyTorchModel

In [None]:
class DebuggingPyTorchModel(PyTorchModel):
    """Pass-through subclass of ``PyTorchModel``.

    The constructor forwards every argument, in the same order, to the parent
    class and adds no behaviour of its own. Presumably kept as a hook for
    adding debugging output around the model -- TODO confirm intent.
    """

    def __init__(
        self,
        model_file: str,
        input_variables,
        output_variables,
        input_transformers,
        output_transformers,
        output_format,
        feature_order,
        output_order,
    ) -> None:
        """Forward all arguments positionally to ``PyTorchModel.__init__``."""
        super().__init__(
            model_file,
            input_variables,
            output_variables,
            input_transformers,
            output_transformers,
            output_format,
            feature_order,
            output_order,
        )



In [None]:
# PV metadata; the 'pv_to_sim_factor' table is read from it further down.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('configs/pv_info.json', 'r') as f:
    pv_info = json.load(f)

# Input / output variable definitions of the surrogate model, keyed by PV name.
with open('configs/lcls_pv_variables.yml') as f:
    input_variables, output_variables = variables_from_yaml(f)

# Min / scale values used to build the normalisation transforms.
with open('configs/normalization.json', "r") as f:
    norm_data = json.load(f)


In [None]:
# Names of the model inputs and outputs, in the order given by the variable definitions.
features = [*input_variables.keys()]
outputs = [*output_variables.keys()]
print(features)
print(outputs)

# Keep only the variables that were actually recorded, plus the timestamp column.
recorded_columns = full_data.columns
valid_keys = [name for name in (*features, *outputs) if name in recorded_columns]
valid_keys.append('timestamp')

In [None]:
# Keep only the model inputs/outputs (plus the timestamp) and drop incomplete rows.
# `reset_index(drop=True)` replaces the previous bare `reindex()` call, which does
# nothing when called without arguments: after `dropna()` the row labels have gaps,
# so label-aligned operations against a frame with a default 0..N-1 index would
# pair up the wrong rows.
data = full_data[valid_keys].dropna().reset_index(drop=True)
data.info()

In [None]:
# Preview the first rows of the filtered data.
data.head()

In [None]:
class PVtoSimFactor(InputTransform, torch.nn.Module):
    """Element-wise scaling transform used to convert between PV and simulation units.

    ``transform`` multiplies its input by the stored conversion factors and
    ``untransform`` divides by them.
    """

    def __init__(self, conversion: torch.Tensor) -> None:
        """Store the conversion factors and configure the botorch transform flags.

        Args:
            conversion: factors multiplied onto the input in ``transform``.
        """
        super().__init__()
        # botorch transform flags: enabled for train and eval, disabled for fantasizing
        self.transform_on_fantasize = False
        self.transform_on_eval = True
        self.transform_on_train = True
        self._conversion = conversion

    def transform(self, x):
        """Return ``x`` multiplied by the conversion factors."""
        scaled = x * self._conversion
        return scaled

    def untransform(self, x):
        """Return ``x`` divided by the conversion factors (inverse of ``transform``)."""
        unscaled = x / self._conversion
        return unscaled

# Conversion factors are looked up per PV name; input PVs named ...BCTRL are
# looked up under the corresponding ...BDES name.
pv_to_sim = pv_info['pv_to_sim_factor']
input_factors = [pv_to_sim[name.replace('BCTRL', 'BDES')] for name in features]
input_conversions = PVtoSimFactor(torch.tensor(input_factors))

# Same lookup for the outputs (original note: converts mm to m for the measured
# sigma vs. the simulated sigma and leaves the other outputs as they are).
output_factors = [pv_to_sim[name] for name in outputs]
output_conversions = PVtoSimFactor(torch.tensor(output_factors))

print(input_conversions._conversion)
print(output_conversions._conversion)

In [None]:
def _affine_transform_for(prefix):
    """Build the AffineInputTransform for the "<prefix>_min" / "<prefix>_scale" entries of norm_data.

    The transform is created with coefficient ``1 / scale`` and offset ``-min / scale``.
    """
    minimum = torch.tensor(norm_data[f"{prefix}_min"], dtype=torch.double)
    scaling = torch.tensor(norm_data[f"{prefix}_scale"], dtype=torch.double)
    return AffineInputTransform(
        len(norm_data[f"{prefix}_min"]),
        1 / scaling,
        -minimum / scaling,
    )


# "x" holds the normalisation of the network inputs, "y" that of the outputs.
transformers = [_affine_transform_for(prefix) for prefix in ("x", "y")]
input_normalization, output_normalization = transformers

nn_model = PyTorchModel(
    'torch_model.pt',
    input_variables,
    output_variables,
    input_transformers=[input_conversions, input_normalization],
    # first we go from nn to sim units, then from sim units to PV units
    output_transformers=[output_normalization, output_conversions],
    feature_order=features,
    output_order=outputs,
)


## Make Predictions
Use the measured data to make predictions for each of the output values. 

**NOTE:** For now we ignore the input features whose measured values lie outside the training range, because including them causes the errors to become enormous. This might be a useful indicator in future for when to retrain / gather new simulations to retrain the base model. These cases cannot be handled by training a calibration layer, because the errors make the training too unstable.

In [None]:
# Features left out of the model input because all of their measured values
# are outside the training range.
ignored_features = ['IRIS:LR20:130:CONFG_SEL','ACCL:IN20:400:L0B_ADES']
# (use an empty list instead to pass every feature to the model)

# Collect one tensor per remaining feature; every skipped feature name is printed.
input_dict = {}
for feature in features:
    if feature in ignored_features:
        print(feature)
        continue
    try:
        input_dict[feature] = torch.from_numpy(data[feature].values)
    except KeyError:
        # No column for this PV in the measured data: the feature is left out of
        # input_dict. The original note says the default value is then used;
        # that happens outside this cell, if at all -- TODO confirm.
        print(feature)

In [None]:
# Show the timestamp column of the measured data.
data['timestamp']

If we don't ignore these features outside of the training range, we get unphysical values of sigma_x and sigma_y

In [None]:
# Number of measured rows that are fed to the model.
len(data[list(input_dict)])

In [None]:
# Evaluate the surrogate on the measured inputs and derive the combined beam size
# from the two predicted RMS sizes (same formula as for the measured data).
result = nn_model.evaluate(input_dict)
result['beam_size'] = torch.sqrt(result['OTRS:IN20:621:XRMS'] * result['OTRS:IN20:621:YRMS'])

result = {key: value.detach().numpy() for key, value in result.items()}

# The new frame gets a default 0..N-1 index, one row per evaluated input row.
model_df = pd.DataFrame(result)
assert len(model_df) == len(data), (
    f"expected one prediction per measured row, got {len(model_df)} predictions "
    f"for {len(data)} rows"
)

# Attach the timestamps by POSITION. `data` may carry a non-contiguous index
# (rows are removed by dropna() when it is built), and assigning the Series
# directly aligns on index labels, which puts timestamps on the wrong rows or
# leaves NaNs. Dropping the index first makes the labels match model_df's.
model_df['timestamp'] = data['timestamp'].reset_index(drop=True)

# The same label mismatch is why concatenating the inputs produced a frame of a
# different length. If the input columns are needed here, reset the index first:
# model_df = pd.concat([data[list(input_dict.keys())].reset_index(drop=True), model_df], axis=1)
print(len(model_df))
model_df[outputs[0:2]].head()


In [None]:
# Row count of the prediction frame.
len(model_df)

In [None]:
# model_df[list(input_dict.keys())].head()

In [None]:
# Measured values of the first two outputs, for comparison with the predictions.
data[outputs[:2]].head()

In [None]:
# Plot the measured inputs and RMS beam sizes over the whole data set, passing
# the model predictions as pred_df.
series_columns = [*input_dict, 'OTRS:IN20:621:XRMS', 'OTRS:IN20:621:YRMS']
fig, ax = plot_series(data, columns=series_columns, pred_df=model_df)
plt.show()

In [None]:
# Summary statistics of the first two measured outputs.
data[outputs[:2]].describe()

In [None]:
# Summary statistics of the first two predicted outputs.
model_df[outputs[:2]].describe()

In [None]:
# Split the measured data and the predictions with the same chunking helper,
# measured frame first.
chunked_dfs, chunked_model_dfs = (
    chunk_dataset(frame) for frame in (data, model_df)
)

In [None]:
# One figure per chunk: the measured chunk is plotted with the matching
# prediction chunk passed as pred_df.
rms_columns = ['OTRS:IN20:621:XRMS', 'OTRS:IN20:621:YRMS']
for true_df, pred_df in zip(chunked_dfs, chunked_model_dfs):
    fig, ax = plot_series(
        true_df,
        columns=[*input_dict, *rms_columns],
        pred_df=pred_df,
    )
    plt.show()