# Component Ordering

What is the order of the components (columns) of the PARAFAC2 decomposition factor matrices? As shown in [PARAFAC2 Decomposition As Shapes](pca_analysis/experiments/notebooks/decomposition_as_shapes.ipynb), the pure analytes corresponding to the selected rank of the model are represented throughout $A$, $B$, and $C$ as their columns, where $A$ is the concentration, $B$ is the elution profile and $C$ is the spectral profile. Observe:

Are they ordered along $I$? This is not discussed by either @kiers_parafac2parti_1999 or @bro_parafac2partii_1999.


In [None]:
%load_ext autoreload
%autoreload 2

from tensorly.decomposition import parafac2 as tl_parafac2
import matplotlib.pyplot as plt
from pca_analysis.get_sample_data import get_zhang_data
from pca_analysis.experiments.notebooks.code.parafac2 import Parafac2
import scipy.spatial.distance as distance
from matplotlib.patches import Patch


ModuleNotFoundError: No module named 'pca_analysis.experiments'

In [None]:
# Load the sample data via get_zhang_data() and preview the first three entries.
raw_data = get_zhang_data()

raw_data.head(3)


NameError: name 'get_zhang_data' is not defined

In [None]:
# Fit a PARAFAC2 model to the raw data with tensorly.
# The second positional argument of tensorly's parafac2 is the rank, so this
# requests a three-component model.
_decomp, err = tl_parafac2(
    raw_data.to_numpy(),
    3,
    return_errors=True,  # also hand back the per-iteration errors as `err`
    verbose=True,
    n_iter_max=500,
    nn_modes="all",  # request non-negativity on every mode (see tensorly docs for the B-mode caveat)
)


In [None]:
# Wrap the tensorly result together with the source data in the project's Parafac2 helper.
parafac2 = Parafac2(_decomp, raw_data)


In [None]:
# Inspect the dimensions of the pure C factor matrix.
parafac2.pure.C.shape


In [None]:
# Present the raw signal alongside the A, B and C factors of the decomposition
# for a single sample and a single m/z channel.

fig, axs = plt.subplots(2, 2, layout="constrained")
axs = axs.ravel()

mz = 44
mz_idx = 39  # presumably the position of m/z 44 along the mz dimension -- confirm
sample_idx = 10

# Raw Data
raw_data.isel(sample=sample_idx, mz=mz_idx).plot.line(ax=axs[0], color="black")
axs[0].set_title("raw data")
axs[0].set_ylabel("abs")

# A
A = parafac2.pure.A

# One tab10 colour per component (column of A); the same colours are reused
# for the C bars and for the figure legend.
# see <https://stackoverflow.com/questions/71256037/control-order-of-tab10-colors-in-colormap> for source
colors = [plt.cm.tab10.colors[i] for i in range(A.shape[1])]

axs[1].bar(range(len(A[sample_idx])), A[sample_idx], color=colors)
axs[1].set_title("A")
axs[1].set_xlabel("K")

# B
axs[2].plot(parafac2.pure.B[sample_idx])
axs[2].set_title("B")
axs[2].set_xlabel("I")
axs[2].set_ylabel("y")

# C
C_44 = parafac2.pure.C[mz_idx]
axs[3].bar(
    x=range(len(C_44)),
    height=C_44,
    color=colors,
)
axs[3].set_title("C")
axs[3].set_xlabel("J")
axs[3].set_ylabel("y")

fig.suptitle("Comparison of Raw Data and its Decompositions")

# Build the caption from the selection variables so it cannot drift from the
# data actually plotted (doubled braces are literal braces in an f-string).
fig.text(
    0,
    -0.05,
    rf"For a $X \in R^{{I, J, K}}$ where mz ($J$) = {mz}, sample ($K$) = {sample_idx}",
)

# Pass explicit proxy handles to the legend: one coloured patch per component.
# This avoids depending on per-bar label support in the installed matplotlib.
legend_elements = [Patch(color=color, label=i) for i, color in enumerate(colors)]

fig.legend(handles=legend_elements)


In [None]:
# Draw one line per component: each row of A.T is a single component's
# values across all samples.
for idx, component in enumerate(A.T):
    plt.plot(component, label=idx)

# A is indexed by sample, which this notebook denotes K (J is the m/z mode),
# so the x-axis is K, matching the title.
plt.xlabel("K")
plt.ylabel("y")
plt.title("A as a function of K")
plt.legend()


The ordering of the components is not repeatable, even if the solution is. For example, in the above plot, the noise (the lowest-intensity signal) may be component 0, 1 or 2. We can demonstrate the random nature of component ordering through a series of correlation matrices over a number of runs:

In [None]:
def run_parafac(data):
    """
    Fit a single three-component PARAFAC2 model to `data` and wrap the result.

    Parameters
    ----------
    data
        The dataset to decompose. It must provide ``to_numpy()`` and is also
        handed to ``Parafac2`` alongside the tensorly result.

    Returns
    -------
    Parafac2
        The wrapped decomposition of `data`.
    """
    # Decompose the argument, not the notebook-level `raw_data`, so the
    # function honours whatever dataset the caller passes in.
    _decomp, _err = tl_parafac2(
        data.to_numpy(),
        3,
        return_errors=True,
        verbose=True,
        n_iter_max=500,
        nn_modes="all",
    )
    return Parafac2(_decomp, data)


def run_decomps(data, n_runs=4):
    """
    Decompose `data` repeatedly and collect the results.

    Calls ``run_parafac(data)`` `n_runs` times and returns the results as a
    list in run order. A non-positive `n_runs` yields an empty list.
    """
    return [run_parafac(data) for _ in range(n_runs)]


# Repeat the decomposition on the raw data using run_decomps' default run count.
runs = run_decomps(raw_data)


In [None]:
def build_run_corr(runs):
    """
    Compare the A factor of every run against the first run, as heatmaps.

    For each run, the pairwise distances between the columns of the reference
    A (run 0) and the columns of that run's A are computed with
    ``scipy.spatial.distance.cdist`` using its default metric (Euclidean).
    These are distance matrices rather than correlations: a value near zero
    means the two components match.

    One heatmap is drawn per run, on a grid sized to fit the number of runs.

    Parameters
    ----------
    runs
        Sequence of decomposition results exposing ``.pure.A``. Must contain
        at least one run, because the first is used as the reference.
    """
    import math

    import seaborn as sns

    As = [run.pure.A for run in runs]

    # Using A_0 as a reference
    A_0 = As[0]

    corrs = [distance.cdist(A_0.T, A.T) for A in As]

    # Size the grid from the number of runs instead of assuming exactly four;
    # four runs still produce the original 2x2 layout.
    n_cols = math.ceil(math.sqrt(len(corrs)))
    n_rows = math.ceil(len(corrs) / n_cols)

    # squeeze=False keeps `axs` two-dimensional even for a single panel.
    fig, axs = plt.subplots(n_rows, n_cols, layout="constrained", squeeze=False)

    flat_axs = axs.ravel()

    for i, corr in enumerate(corrs):
        sns.heatmap(corr, ax=flat_axs[i], xticklabels="auto", annot=True, cbar=False)
        flat_axs[i].set_title(i)

    # Hide any spare panels left over when the grid has more cells than runs.
    for spare_ax in flat_axs[len(corrs):]:
        spare_ax.set_visible(False)

    fig.suptitle("Correlation Matrices")

    s = """
    Correlation Matrices between the 0th run (0) and other runs.
    If all runs had the same component ordering, all maps would appear the same
    """
    fig.text(x=0.0, y=-0.1, s=s)

    # TODO: want to iterate over each column through the run mode.


# Draw the run-versus-reference heatmaps for the repeated decompositions.
build_run_corr(runs)


Thus, time-wise peak labeling would have to be done by locating the maximum of each peak individually and then assigning labels accordingly.

At a later date I will need to investigate how the components are ordered during the decomposition. I assume it's based on the random initialisation, but I would have thought that, if anything, the largest peak (component) would always come first, with the remaining components ordered by decreasing size.