<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [23]:
# Do once to get all libraries up to date
if False:
    !pip install -U pandas process_improve plotly IPython

import pathlib
import pandas as pd
pd.options.plotting.backend = "plotly"

from process_improve.batch.preprocessing import find_reference_batch, batch_dtw
from process_improve.batch.data_input import melted_to_dict, dict_to_wide
from process_improve.batch.plotting import plot__all_batches_per_tag, plot__multitags
from process_improve.multivariate.methods import PCA, MCUVScaler
import process_improve.datasets.batch as batch_ds 

# Single import of the plotly offline helpers (previously imported twice).
from plotly.offline import iplot, plot, init_notebook_mode
import plotly.io as pio

# Select the "iframe" renderer as plotly's default renderer.
pio.renderers.default = "iframe"
# NOTE(review): init_notebook_mode() may itself reset `pio.renderers.default` in
# recent plotly versions, which would override the line above - confirm which
# renderer is actually intended.
init_notebook_mode(connected=True)

# Settings


In [None]:
# How many principal components the PCA model is fitted with.
A = 4

# Tags used for the alignment; ideally more than one. Every one of these columns
# must exist in the data frame of every batch, and none may contain missing data.
columns_to_align = [
    "AgitatorPower",
    "AgitatorTorque",
    "JacketTemperature",
    "DryerTemp",
]

# The single tag shown in the per-tag plots: the last of the alignment tags.
tag_to_plot = columns_to_align[-1]

intro_message = f"This notebook will show the plots for '{tag_to_plot}'. You can of course select any other tag."
print(intro_message)

# Data import and visualization

In [None]:
# Import the data: a dictionary of dataframes, keyed by batch.
# `list(...)[0]` works whether `__path__` is a plain list (regular package) or a
# namespace-package path object, and avoids the private `_recalculate()` method.
dataset_dir = pathlib.Path(list(batch_ds.__path__)[0])
dryer_raw = pd.read_csv(dataset_dir / "dryer.csv")
dryer_df_dict = melted_to_dict(dryer_raw, batch_id_col="batch_id")
print(f"The batch numbers are: {dryer_df_dict.keys()}")

# Every column of the raw file except the batch identifier is treated as a tag.
full_tag_list = dryer_raw.columns.to_list()
full_tag_list.remove('batch_id')
print(f"The tags in the data set are\n {full_tag_list}.")

In [None]:
# First look at the raw data: the selected tag, for all batches, against clock time.
raw_overview_fig = plot__all_batches_per_tag(
    df_dict=dryer_df_dict,
    tag=tag_to_plot,
    time_column="ClockTime",
    x_axis_label="Time [hours]",
)
iplot(raw_overview_fig)

print("Note how the batches all have different durations. They are 'unaligned'.")

# Batch data alignment

In [None]:
# Let the library suggest which batch the others should be aligned on.
reference_search_settings = {"robust": False}
good_reference_candidate = find_reference_batch(
    dryer_df_dict,
    columns_to_align=columns_to_align,
    settings=reference_search_settings,
)
print(f"Batch number {good_reference_candidate} was found to be the most 'average' batch to use a reference batch")

In [None]:
# Align every batch on the reference candidate, using the specified tags/columns.
dtw_settings = {
    "robust": False,
    # A high tolerance of around 1.0 will run the alignment only once. It is
    # typically set to 0.1, so that at least 2 or 3 alignment iterations take place.
    # NOTE(review): 0.05 is tighter than the "typical" 0.1 mentioned above - confirm
    # this is intentional.
    "tolerance": 0.05,
    # Show progress: the total "distance" of each batch relative to the reference.
    "show_progress": True,
}

print("About to align the data. This can take up to a few minutes on a slow computer and a large number of batches ...")
aligned_out = batch_dtw(
    dryer_df_dict,
    columns_to_align=columns_to_align,
    reference_batch=good_reference_candidate,
    settings=dtw_settings,
)
print('Done.')

In [None]:
# Explain how to read the table, then show the weight history with one column per
# alignment tag.
print('Weight history. The higher the weight, the greater the importance of that tag in the ')
print('alignment process; tags with a very small number have little influence in alignment.')
weight_history_df = pd.DataFrame(
    aligned_out['weight_history'],
    columns=columns_to_align,
)
display(weight_history_df)

In [None]:
# Same per-tag plot as for the raw data, but now on the aligned batches.
print(f"Plot the aligned (synced) data for tag: {tag_to_plot}")
aligned_tag_fig = plot__all_batches_per_tag(
    aligned_out['aligned_batch_dfdict'],
    tag_to_plot,
    x_axis_label='Normalized duration',
    html_aspect_ratio_w_over_h=2,
)
iplot(aligned_tag_fig)

In [None]:
print("Let's visualize all the aligned batch data in all tags simultaneously. You can zoom in one tag and all others will follow.")

# The call is deliberately left as the last bare expression of the cell, so that
# the notebook displays whatever it returns.
multitag_settings = {"nrows": 3}
plot__multitags(
    aligned_out['aligned_batch_dfdict'],
    tag_list=full_tag_list,
    settings=multitag_settings,
)

# Principal component analysis on the aligned batch data

Unfold the data into a wide ("fat") matrix with one row per batch and many columns. The columns are grouped per tag: all the data for the first tag come first, then all the data for the next tag, and so on.

In [None]:
# Unfold the aligned batches into a single wide matrix and report its size.
aligned_batches = aligned_out['aligned_batch_dfdict']
wide_matrix = dict_to_wide(aligned_batches)
n_rows = wide_matrix.shape[0]
n_columns = wide_matrix.shape[1]
print(f"The unfolded matrix has {n_rows} rows and {n_columns} columns")

In [24]:
# Preprocess the data: center and scale.
# A single fit_transform both fits the scaler and applies it; the scaler was
# previously fitted twice on the same data.
scaler = MCUVScaler()
wide_matrix_mcuv = scaler.fit_transform(wide_matrix)

# Fit an initial PCA model, with "A" principal components.
pca = PCA(n_components=A).fit(wide_matrix_mcuv)
print(f'The cumulative R2, per component, is:\n{pca.R2cum}')

limit_SPE_95 = pca.SPE_limit(conf_level=0.95)
limit_T2_95 = pca.T2_limit(conf_level=0.95)


def plot_metric_with_limit(metric_df, limit, name):
    """Scatter-plot a per-batch PCA metric with its 95% limit drawn as a red line.

    Parameters
    ----------
    metric_df
        Metric table from the PCA model. After `reset_index()` it must have a
        "batch_id" column; only the first and the last columns are kept, and the
        last one is assumed to be labelled `str(A)` - TODO confirm against the
        library's column labels.
    limit
        y-value at which the horizontal limit line is drawn.
    name
        Name of the metric; used as the y-axis title and as the start of the
        figure title.

    Returns
    -------
    The figure object, so the caller decides when to show it.
    """
    first_and_last = metric_df.reset_index().iloc[:, [0, -1]]
    metric_fig = first_and_last.plot.scatter(
        x="batch_id",
        y=str(A),
        title=name + f" after {A} components, with the 95% confidence limit",
    )
    metric_fig.update_layout(xaxis_title_text="Batch number")
    metric_fig.update_layout(yaxis_title_text=name)
    metric_fig.add_hline(
        y=limit,
        line_color="red",
        annotation_text="95% limit",
        annotation_position="bottom right",
    )
    return metric_fig


# Plot the SPE metric
name = "Squared prediction error"
fig = plot_metric_with_limit(pca.squared_prediction_error, limit_SPE_95, name)
fig.show()

# Plot the Hotelling's T2 metric
name = "$\\text{Hotelling's}~T^2$"
fig = plot_metric_with_limit(pca.Hotellings_T2, limit_T2_95, name)
fig.show()


The cumulative R2, per component, is:
1    0.182630
2    0.309237
3    0.397748
4    0.458345
Name: Cumulative model's R^2, per component, dtype: float64
