# Data Processing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import os
import glob
from copy import deepcopy
import json

In [None]:

# Collect every .npy file under data/machine (subfolders included) and report how many were found
filenames = glob.glob('data/machine/**/*.npy', recursive=True)
print(len(filenames))

In [None]:
# Look at the character 7 places from the end of the first filename; the
# renaming cell that follows expects a ':' there. Strings are immutable, so
# no copy is needed.
char = filenames[0][-7]
print(char)

In [None]:
# Replace the filenames if they've got a weird character in (only on linux).
# `char` is whatever sits 7 places from the end of the first filename found;
# if it is already ':' there is nothing to rename.
char = filenames[0][-7]
if char != ':':
    # only the files in data/machine/missing are renamed
    for filename in glob.glob('data/machine/missing/*.npy'):
        os.rename(filename, filename.replace(char, ':'))

In [None]:
# Gather the timestamps as a set so that a timestamp shared by several files is
# only kept once. [-29:-4] takes the 25 characters just before the '.npy' suffix.
timestamps = {
    path[-29:-4]
    for path in glob.glob('data/machine/**/*.npy', recursive=True)
}
len(timestamps)

For each timestamp, we need to read both the input (`values_*`) file and the output (`imgs_*`) file and load them into a dataframe, keeping the timestamp as a column that we then sort by.

In [None]:
def _load_record(timestamp, prefix):
    """Return the first element stored in ``<prefix>_<timestamp>.npy``.

    The file is looked up in data/machine/ first and then in
    data/machine/missing/. Returns None when it exists in neither folder.
    """
    for folder in ('data/machine', 'data/machine/missing'):
        try:
            # NOTE(review): allow_pickle=True unpickles the file contents, which
            # can execute arbitrary code -- only load files from a trusted source.
            contents = np.load(f'{folder}/{prefix}_{timestamp}.npy', allow_pickle=True)
        except FileNotFoundError:
            continue
        # presumably a single pickled dict per file (it is passed to dict.update below)
        return contents.flatten()[0]
    return None


time_series = []
input_errors = []   # timestamps with no values_* file in either folder
output_errors = []  # timestamps with no imgs_* file in either folder

for timestamp in timestamps:
    data = {'timestamp': timestamp}
    # inputs first, then outputs, so output keys win if a key appears in both
    for prefix, errors in (('values', input_errors), ('imgs', output_errors)):
        record = _load_record(timestamp, prefix)
        if record is None:
            errors.append(timestamp)
        else:
            data.update(record)
    time_series.append(data)

time_series = pd.DataFrame(time_series)
time_series['timestamp'] = pd.to_datetime(time_series['timestamp'])
# 'timestamp' stays a regular column: the cells below access time_series['timestamp']
print(time_series[['timestamp', 'SOLN:IN20:121:BACT', 'QUAD:IN20:121:BACT']].head())
# drop=True discards the old row labels (they only reflect the arbitrary set
# iteration order) instead of storing them as an extra 'index' column
time_series = time_series.sort_values('timestamp').reset_index(drop=True)
print(time_series[['timestamp', 'SOLN:IN20:121:BACT', 'QUAD:IN20:121:BACT']].head())
# remove columns that never received a value
time_series = time_series.dropna(axis=1, how='all')

In [None]:
# Number of timestamps with no output (imgs_*) file, then with no input (values_*) file
print(len(output_errors))
print(len(input_errors))

In [None]:
# Show the timestamps whose input (values_*) file was not found in either folder
input_errors

In [None]:
# Column dtypes and non-null counts of the full time series
time_series.info()

In [None]:
# Summary statistics of the full time series
time_series.describe()

In [None]:
# Save the full time series, naming the file after its first and last
# timestamps (spaces swapped for underscores). start_time and end_time are
# kept at module level because a later cell reuses them.
start_time, end_time = (
    str(time_series['timestamp'].iloc[pos]).replace(' ', '_') for pos in (0, -1)
)
time_series.to_pickle(f'data/full_{start_time}__{end_time}.pkl')

## Filter Data
Now that we have our data loaded, we need to take the subset of the data that we use with our model. Some of the names in the PV info file appear to be wrong (they contain `BDES` where our data has `BCTRL`), so we replace `BDES` with `BCTRL` in those names before selecting the columns.

In [None]:
# The with-block closes the file on exit, so no explicit close() is needed
with open('configs/pv_info.json', 'r') as f:
    pv_info = json.load(f)

# Rename BDES -> BCTRL once per PV. dict.fromkeys keeps the original order while
# dropping repeats, so two names that differ only by BDES/BCTRL cannot produce
# a duplicated column in the subset.
candidate_pvs = dict.fromkeys(
    pv_name.replace('BDES', 'BCTRL') for pv_name in pv_info['pv_name_to_sim_name']
)
# keep only the PVs that are actually present in the loaded data
input_pvs = [pv_name for pv_name in candidate_pvs if pv_name in time_series.columns]
output_pvs = ['OTRS:IN20:621:XRMS', 'OTRS:IN20:621:YRMS']
time_series_subset = time_series[['timestamp'] + input_pvs + output_pvs]
time_series_subset.head()

In [None]:
# Column dtypes and non-null counts of the filtered subset
time_series_subset.info()

In [None]:
# List the columns that made it into the subset (timestamp, input PVs, output PVs)
time_series_subset.columns

In [None]:
# Only the length of this list is used (it sets the number of panels).
# NOTE(review): the order here does not match the panels below, where the OTRS
# outputs are drawn on the last panel and the remaining columns on the middle one.
axes = ['magnets', 'outputs', 'others']


def plot_series(time_series_subset):
    """Plot every column after the first against time on stacked panels.

    Columns whose name contains 'QUAD' or 'SOLN' are drawn on the top panel,
    columns containing 'OTRS' on the bottom panel, and all others on the
    middle panel. The figure title shows the first and last timestamp.

    Args:
        time_series_subset: DataFrame with a 'timestamp' column. The first
            column is skipped when plotting (the callers in this notebook
            put 'timestamp' first).
    """
    fig, ax = plt.subplots(len(axes), figsize=(15, 10))
    ax = ax.ravel()
    times = time_series_subset['timestamp']

    for col in time_series_subset.columns[1:]:
        if 'QUAD' in col or 'SOLN' in col:
            panel = ax[0]
        elif 'OTRS' in col:
            panel = ax[2]
        else:
            panel = ax[1]
        panel.plot(times, time_series_subset[col], '.-', markersize=5, label=col)

    # Only add a legend where something was drawn; calling legend() on a panel
    # without labelled lines makes matplotlib emit a warning.
    for panel in ax:
        handles, _ = panel.get_legend_handles_labels()
        if handles:
            panel.legend()

    start_time = str(time_series_subset['timestamp'].iloc[0])
    end_time = str(time_series_subset['timestamp'].iloc[-1])
    # [:-6] drops the last six characters of each timestamp string
    # (presumably a UTC offset such as '-07:00' -- TODO confirm)
    fig.suptitle(f'{start_time[:-6]} -- {end_time[:-6]}')
    fig.tight_layout()
    plt.show()


plot_series(time_series_subset)

In [None]:
# Save the filtered subset; start_time and end_time are the module-level values
# set when the full time series was saved (plot_series only uses local copies)
time_series_subset.to_pickle(f'data/relevant_{start_time}__{end_time}.pkl')

## Create Time Chunks
We can see here that there are some gaps in the data where different runs were executed. In order to visualise these better, we want to break up the larger dataframe into smaller 'chunks', one for each run.

In [None]:
def chunk_dataset(time_series_subset, time_gap='20 minutes'):
    """Split a time-ordered dataframe wherever consecutive rows are far apart.

    A new chunk starts at every row whose 'timestamp' is more than `time_gap`
    after the previous row's.

    Args:
        time_series_subset: DataFrame with a 'timestamp' column, already sorted.
        time_gap: anything accepted by pd.to_timedelta.

    Returns:
        List of dataframes (slices of the input) covering all rows in order.
    """
    threshold = pd.to_timedelta(time_gap)
    is_break = time_series_subset['timestamp'].diff() > threshold
    # positions of the rows that open a new chunk
    break_positions = np.flatnonzero(is_break.to_numpy())

    pieces = []
    lower = 0
    for upper in break_positions:
        print(lower, upper)
        pieces.append(time_series_subset[lower:upper])
        lower = upper

    # whatever is left after the final break forms the last chunk
    pieces.append(time_series_subset[lower:])
    print(f'Found {len(pieces)} dataframes')
    return pieces

In [None]:
# Split the filtered data into runs separated by gaps of more than 20 minutes
dfs = chunk_dataset(time_series_subset, time_gap='20 minutes')
# do a final check to make sure the length of all the chunks add up to the total number of points
# (the sum is only displayed here; it is not compared automatically)
np.sum(np.array([len(df) for df in dfs]))

In [None]:
# First row of the first chunk
dfs[0].head(1)

In [None]:
# Last two rows of the first chunk
dfs[0].tail(2)

In [None]:
# First row of the second chunk
dfs[1].head(1)

In [None]:
# Plot each chunk on its own figure so every run can be inspected separately
for df in dfs:
    plot_series(df)

In [None]:
# Finally, write every chunk to its own pickle, named after the chunk's first
# and last timestamps (spaces swapped for underscores), to be loaded later
for df in dfs:
    stamps = df['timestamp']
    start_time, end_time = (
        str(stamps.iloc[pos]).replace(' ', '_') for pos in (0, -1)
    )
    df.to_pickle(f'data/{start_time}__{end_time}.pkl')