<a href="https://colab.research.google.com/github/kgdunn/pid-book/blob/master/Model_inversion_demonstration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U numpy pandas scipy matplotlib sklearn process_improve ipysheet ipywidgets plotly notebook 
# The above line only needs to be run the first time. After that you will have all the packages necessary.
import numpy as np
import pandas as pd
import scipy as sp
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from process_improve.multivariate import PCA, MCUVScaler
from ipysheet import sheet, column, to_dataframe, row
import ipywidgets as widgets
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Default canvas for every matplotlib figure in this notebook: wide and high-resolution.
plt.rcParams.update({"figure.figsize": (15, 5), "figure.dpi": 200})

## Strategy

1. Load the data from CSV
2. Preprocess the data
3. Fit the PCA model (2 components for this demonstration)
4. Show the scores and loadings
5. Show sliders for the score values and invert the model at the chosen scores
6. Show the values of the x-variables that could be used to achieve a desired set of score values (chosen by hovering with the mouse over the score plot)

In [None]:
# 1. Load the data file and discard the column that pandas labelled 'Unnamed: 0'.
foods = pd.read_csv("https://openmv.net/file/food-texture.csv")
foods = foods.drop(columns=["Unnamed: 0"])

# Show a handful of randomly chosen rows as a quick sanity check of what was loaded.
print('Random sample of 7 rows')
display(foods.sample(n=7))

In [None]:
# 2. Preprocess the data: mean-center (MC) and unit-variance (UV).
# The scaler is fitted exactly once, by `fit_transform`. The fitted `scaler` object is kept
# because later cells call `scaler.inverse_transform` to return to the original units.
scaler = MCUVScaler()
foods_mcuv = scaler.fit_transform(foods)

In [None]:
# 3. Fit the PCA model on the scaled data, using "A" principal components (A = 2 in this case study).
A = 2
pca = PCA(n_components=A).fit(foods_mcuv)

# Squared prediction error (SPE) per observation, using the last column of the SPE table,
# with the 95% limit drawn as a horizontal red line across all N observations.
plt.figure(figsize=(15, 3))
plt.plot(pca.squared_prediction_error.iloc[:, -1])
plt.plot([0, pca.N], [pca.SPE_limit(conf_level=0.95)] * 2, 'r')
plt.grid()
plt.title(f"Squared prediction error plot, after {A} components, with the 95% confidence limit")

# Hotelling's T2 per observation, drawn the same way with its own 95% limit.
plt.figure(figsize=(15, 3))
plt.plot(pca.Hotellings_T2.iloc[:, -1])
plt.plot([0, pca.N], [pca.T2_limit(conf_level=0.95)] * 2, 'r')
plt.grid()
plt.title(f"Hotelling's $T^2$ plot, after {A} components, with the 95% confidence limit");

In [None]:
# 4. Plot the model to understand relationships between variables
def score_plot(a_horiz, a_vert, ax=None, hide_annotations=False, T2_limit_conf_level=0.95):
    """Draw the score plot of component `a_horiz` against component `a_vert`.

    Reads the fitted model from the notebook-level `pca` object.

    Parameters
    ----------
    a_horiz, a_vert : int
        1-based component numbers for the horizontal and vertical axes.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the left panel of a 1 x 2 subplot grid
        is used. In both cases the axes is made the current pyplot axes.
    hide_annotations : bool
        When True, skip the title, axis labels, zero lines, equal aspect ratio
        and grid; only the scores and the ellipse are drawn.
    T2_limit_conf_level : float
        Confidence level passed to `pca.ellipse_coordinates` for the ellipse.

    Returns
    -------
    The axes that was drawn on.
    """
    if ax is None:
        ax = plt.subplot(1, 2, 1)
    else:
        # Callers go on to use pyplot-level calls, so keep `ax` as the current axes.
        plt.sca(ax)

    # Score columns are stored 0-based, while components are numbered from 1.
    ax.plot(pca.t_scores.iloc[:, a_horiz - 1], pca.t_scores.iloc[:, a_vert - 1], 'k.')

    ci_x, ci_y = pca.ellipse_coordinates(score_horiz=a_horiz,
                                         score_vert=a_vert,
                                         T2_limit_conf_level=T2_limit_conf_level)
    ax.plot(ci_x, ci_y, '-', color=np.array([219, 112, 147]) / 255.0, linewidth=2)

    if not hide_annotations:
        ax.set_title(f'Score plot: $t_{a_horiz}$ vs $t_{a_vert}$')
        ax.set_xlabel(f'$t_{a_horiz}$ scores')
        ax.set_ylabel(f'$t_{a_vert}$ scores')
        ax.axvline(linewidth=2)
        ax.axhline(linewidth=2)
        ax.set_aspect('equal')
        ax.grid()

    return ax

def loadings_plot(a_horiz, a_vert):
    """Draw the loadings plot of component `a_horiz` against component `a_vert`.

    Reads the fitted model from the notebook-level `pca` object and takes the
    point labels from the column names of `foods`. The plot is drawn in the
    right panel of a 1 x 2 subplot grid.

    Parameters
    ----------
    a_horiz, a_vert : int
        1-based component numbers for the horizontal and vertical axes.

    Returns
    -------
    The axes that was drawn on.
    """
    ax = plt.subplot(1, 2, 2)

    fuzz = 0.02  # small offset so each label sits beside its marker, not on top of it
    for idx, label in enumerate(foods.columns):
        p_horiz = pca.loadings.iloc[idx, a_horiz - 1]
        p_vert = pca.loadings.iloc[idx, a_vert - 1]
        ax.plot(p_horiz, p_vert, 'k.')
        ax.text(p_horiz + fuzz, p_vert - fuzz, label)

    # Both axes show loadings (p); the horizontal one was previously mislabelled as "[t]".
    ax.set_title(f'Loadings plot: $p_{a_horiz}$ vs $p_{a_vert}$')
    ax.set_xlabel(f'$p_{a_horiz}$ loadings')
    ax.set_ylabel(f'$p_{a_vert}$ loadings')
    ax.axvline(linewidth=2)
    ax.axhline(linewidth=2)
    ax.set_aspect('equal')
    ax.grid()

    return ax
    
# One figure with two panels: scores in the left panel, loadings in the right panel.
fig = plt.figure(figsize=(15, 10))
fig.subplots_adjust(top=0.9, bottom=0.1, left=0.0, right=0.9, hspace=0.5, wspace=0.2)
score_plot(a_horiz=1, a_vert=2)
loadings_plot(a_horiz=1, a_vert=2);

In [None]:
# 5. PCA model inversion with the sliders.
def invert_model(t1, t2):
    """Overlay the x-values implied by the scores (t1, t2) on a scatter matrix.

    The scatter matrix shows every column of `foods` plus the "PC 1" and "PC 2"
    score columns. The scores are passed through `pca.inverse_transform` and
    then `scaler.inverse_transform`, and the resulting values, followed by
    t1 and t2 themselves, are marked in red.
    """
    frame = pd.concat([foods, pca.t_scores["PC 1"], pca.t_scores["PC 2"]], axis=1)
    spm = scatter_matrix(frame, figsize=(7, 7))

    # One predicted value per scatter-matrix variable: the K x-variables, then t1 and t2.
    x_pred_mcuv = pca.inverse_transform([t1, t2])
    x_pred_orig = scaler.inverse_transform(x_pred_mcuv)
    x_pred_plot = x_pred_orig.tolist() + [t1, t2]

    n_panels = pca.K + 2
    ax_t1, ax_t2 = pca.K, pca.K + 1  # row / column of the t1-vs-t2 panel
    for i in range(n_panels):
        # Diagonal panel: vertical red line at the predicted value of variable i.
        spm[i][i].axvline(x=x_pred_plot[i], color='r', linewidth=2)
        # Panels above the diagonal: red dot at the predicted (column, row) pair.
        for j in range(i + 1, n_panels):
            if (i, j) == (ax_t1, ax_t2):
                continue  # the score panel is drawn separately below
            spm[i][j].plot(x_pred_plot[j], x_pred_plot[i], 'r.', markersize=10)

    # The score panel gets the full score plot (points and ellipse) plus the chosen point.
    score_panel = spm[ax_t1, ax_t2]
    score_plot(a_horiz=1, a_vert=2, ax=score_panel, hide_annotations=True)
    score_panel.plot(x_pred_plot[ax_t2], x_pred_plot[ax_t1], 'r.', markersize=10)
   

# Sliders for the two score values. `continuous_update=False` is set on both sliders.
# The descriptions are raw strings so that `\(` and `\)` reach the widget unchanged
# without relying on Python passing an invalid escape sequence through.
t_1 = widgets.FloatSlider(min=-4, max=+4, step=0.2, value=0, continuous_update=False,
                          orientation='vertical', readout_format='.1f', description=r'\(t_1\) value')
t_2 = widgets.FloatSlider(min=-3, max=+3, step=0.1, value=0, continuous_update=False,
                          readout_format='.1f', description=r'\(t_2\) value')

ui = widgets.HBox([t_1, t_2])
# Calls invert_model(t1=..., t2=...) with the slider values and captures its output in `out`.
out = widgets.interactive_output(invert_model, {'t1': t_1, 't2': t_2})

# NOTE(review): `invert_model` does not read these two names (it hard-codes components 1 and 2),
# and the next cell reassigns them to 1 and 2 before use. Presumably leftovers; confirm before removing.
a_horiz = 0
a_vert = 1
display(ui, out);

In [None]:
# 6. Set up interactive plotting.

# 1-based component numbers shown on the horizontal and vertical axes of the interactive plot.
a_horiz, a_vert = 1, 2
def score_plot_points(a_horiz: int, a_vert: int, T2_limit_conf_level: float): # score plot of t1 vs t2
    """Build the Plotly traces and axis ranges for the interactive score plot.

    Reads the fitted model from the notebook-level `pca` object.

    Parameters
    ----------
    a_horiz, a_vert : int
        1-based component numbers for the horizontal and vertical axes.
    T2_limit_conf_level : float
        Confidence level passed to `pca.ellipse_coordinates`.

    Returns
    -------
    (points, ellipse, x_range, y_range, loadings): the scores trace, the
    ellipse trace, the two symmetric `[low, high]` axis ranges, and the
    loadings trace.
    """
    t_horiz = pca.t_scores.iloc[:, a_horiz - 1]
    t_vert = pca.t_scores.iloc[:, a_vert - 1]
    p_horiz = pca.loadings.iloc[:, a_horiz - 1]
    p_vert = pca.loadings.iloc[:, a_vert - 1]

    # Settings shared by every trace: no hover text and no legend entry.
    silent = dict(hoverinfo='none', showlegend=False)

    points = go.Scatter(x=t_horiz,
                        y=t_vert,
                        mode='markers',
                        marker_color='rgba(0, 0, 0, 1)',
                        name="Scores",
                        **silent)

    # Loadings are stretched by the largest absolute score on each axis so they share the score scale.
    loadings = go.Scatter(x=p_horiz * t_horiz.abs().max(),
                          y=p_vert * t_vert.abs().max(),
                          text=pca.loadings.index,
                          textposition="bottom center",
                          textfont=dict(family="arial", size=18, color="lightskyblue"),
                          mode='markers+text',
                          marker_color="lightskyblue",
                          marker_size=15,
                          marker_symbol='x',
                          name="Loadings",
                          **silent)

    ci_x, ci_y = pca.ellipse_coordinates(a_horiz, a_vert, T2_limit_conf_level=T2_limit_conf_level)
    ellipse = go.Scatter(x=ci_x,
                         y=ci_y,
                         mode='lines',
                         marker_color='rgba(219,112,147, 1)',
                         name='',
                         **silent)

    def symmetric_range(axis):
        # Range centred on zero that covers both the scores and the ellipse, with a 5% margin.
        half_width = max(max(abs(points[axis])), max(abs(ellipse[axis])))
        return (half_width * np.array([-1.05, 1.05])).tolist()

    x_range = symmetric_range('x')
    y_range = symmetric_range('y')
    return points, ellipse, x_range, y_range, loadings

  
# Build the interactive score plot (a Plotly FigureWidget) and the two ipysheet tables
# that `update_point` writes its results into.
points, ellipse, x_range, y_range, loadings = score_plot_points(a_horiz, a_vert, T2_limit_conf_level=0.95)

# A fully transparent heatmap spanning the plotted range. It is added as the first trace
# and picked up as `clickground` below, which is the trace the hover callback is attached to.
resolution = 5
x_map = np.linspace(x_range[0], x_range[1], resolution + 1)
y_map = np.linspace(y_range[0], y_range[1], resolution + 1)
z = np.ones((resolution + 1, resolution + 1))
clickmap = go.Heatmap(x=x_map, y=y_map, z=z,
                      showscale=False,
                      hoverinfo='none',
                      colorscale=[[0, "rgba(255, 255, 255, 0)"], [1, "rgba(255, 255, 255, 0 )"]]
                      )

scores_layout = dict(width=600,
                     height=600,
                     title_text=f'$t_{a_horiz} \\text{{vs}}\\, t_{a_vert}$ with the loadings (light blue)',
                     hovermode="closest",
                     autosize=True,
                     margin=dict(l=10, r=10, b=5, t=80),  # Defaults: l=80, r=80, t=100, b=80
                     spikedistance=0,
                     xaxis=dict(
                         title=dict(
                             text=f'$t_{a_horiz}$ scores',
                             font=dict(size=16),
                         ),
                         mirror=True,  # ticks are mirrored at the top of the frame also
                         autorange=False,
                         range=x_range,
                         showspikes=True,
                         visible=True,
                     ),
                     yaxis=dict(
                         title=dict(
                             text=f'$t_{a_vert}$ scores',
                             font=dict(size=16),
                         ),
                         type="linear",
                         autorange=False,
                         range=y_range,
                         showspikes=True,
                         visible=True,
                         domain=[0, 1],
                     ),
                     )

f = go.FigureWidget([clickmap, loadings, points, ellipse, ], layout=scores_layout)
clickground = f.data[0]  # the transparent heatmap: first trace in the list above
# NOTE(review): this overrides the hovermode="closest" given in `scores_layout`; confirm which is intended.
f.layout.hovermode = 'x'

box_layout = widgets.Layout(display='inline-flex', flex_flow='row', align_items='stretch', width='90%')
box_auto = widgets.Box(children=[f, ], layout=box_layout)
display(widgets.VBox([box_auto, ]));

# One slider, one read-out and one checkbox per x-variable (pca.K of each).
# NOTE(review): `readout_format` is passed to FloatText here; confirm that widget actually honours it.
sliders = [widgets.FloatSlider(min=-4, max=+4, step=0.2, value=0, readout_format='.1f') for _ in range(pca.K)]
true_value = [widgets.FloatText(value=np.nan, readout_format='.1f') for _ in range(pca.K)]
checkboxes = [widgets.Checkbox(value=False, disabled=False, indent=False) for _ in range(pca.K)]
# Or replace this with a list of the column names
properties = list(pca.loadings.index)
sheet1 = sheet(columns=4,
               column_headers=["X variable", "Normalized value", "Real value", "Constrained?"],
               column_width=[10, 40, 40, 10])
column0 = column(0, properties)
column1 = column(1, sliders)
column2 = column(2, true_value)
# Previously this was assigned to `column2` as well, which overwrote the handle to the "Real value" column.
column3 = column(3, checkboxes)
display(sheet1)

# Second table: a single row holding the SPE and Hotelling's T^2 read-outs.
sheet2 = sheet(rows=1,
               columns=2,
               column_headers=["Squared prediction error", "Hotelling's T^2"],
               column_width=[40, 40])
SPE_handle = widgets.FloatText(value=np.nan, readout_format='.1f')
T2_handle = widgets.FloatText(value=np.nan, readout_format='.1f')
s2_row0 = row(0, [SPE_handle, T2_handle])
display(sheet2)

def update_point(trace, points, selector):
    """Hover callback: invert the PCA model at the hovered (t1, t2) location.

    Takes the first hovered point as the score vector, computes the matching
    x-values in scaled units, and writes them to `sliders`. The same values,
    converted with `scaler.inverse_transform`, are written to `true_value`.
    Variables whose checkbox is ticked are treated as constrained: their
    current slider value is kept and their contribution is subtracted from the
    scores before inverting. The SPE and T2 read-outs are updated as well.

    Parameters
    ----------
    trace : the trace the callback is registered on (unused).
    points : object exposing `xs` and `ys`, the coordinates of the hovered points.
    selector : unused.

    Returns
    -------
    None. When `points` carries no coordinates the call is a no-op.
    """
    # A hover event can arrive with no points for this trace; there is nothing to invert then.
    if len(points.xs) == 0:
        return

    scores = np.array([points.xs[0], points.ys[0]])  # 1 x A vector

    # Constrained variables: boolean mask taken from the checkboxes, values from the sliders.
    loadings = pca.components_.T  # K x A matrix
    idxC = [c.value for c in checkboxes]
    Rc = loadings[idxC, :]
    xc = np.array([s.value for s in sliders])[idxC].reshape((-1, 1))

    # Subtract out (i.e. eliminate from the optimization) the constrained parts of the multivariate projection
    x_pred_mcuv = ((scores - xc.T @ Rc) @ loadings.T @ loadings @ loadings.T).ravel()
    x_pred_mcuv[idxC] = xc.ravel()  # constrained variables keep their slider value
    x_pred_orig = scaler.inverse_transform(x_pred_mcuv.ravel())
    for idx, slider in enumerate(sliders):
        slider.value = x_pred_mcuv[idx]
        true_value[idx].value = f"{x_pred_orig[idx]:.3g}"

    # SPE is reported as zero: the original author's note is that the point is always on the model plane.
    # NOTE(review): entries overwritten with constrained values above are not re-projected;
    # confirm that zero is still the intended read-out when constraints are active.
    SPE_handle.value = 0.0
    T2_handle.value = f"{np.sum((scores / pca.scaling_factor_for_scores)**2):.3g}"
           
# Run `update_point` whenever the mouse hovers over the `clickground` trace.
clickground.on_hover(update_point)
# NOTE(review): this VBox is constructed but not assigned or passed to display(); the trailing `;`
# presumably suppresses the cell output, so it may be a leftover. Confirm before removing.
widgets.VBox([out]);

# TODO:
# * show loadings points superimposed

**NOTE**

The last cell above will NOT work in Google's Colab. 
You can download the notebook (click on "File", then "Download .ipynb")
and run that notebook on your own hardware. Then you will have a fully interactive score plot 
where the model-inversion happens in real time.