---
cdt: 2024-08-24T00:00:00
title: First Look at PARAFAC2 on Core Dataset
description: "Can I run PARAFAC on my data? This experiment will merely prove that I can run the tensorly implementation of PARAFAC2 with my dataset. It will require the following: 1. my data in the right format, 2. the tensorly PARAFAC2 code, 3. interpretation of results. What are the results? Visualisation of the decomposed modes and a fit, or reconstruction report."
conclusion: ""
status: open
project: parafac2
---

# Tensorly Demonstration Code

The following has been adapted from <https://tensorly.org/stable/auto_examples/decomposition/plot_parafac2.html>.


In [None]:
import duckdb as db
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import tensorly as tl
from IPython.display import Markdown, display
from tensorly.decomposition import parafac2


# Data Preparation


In [None]:
DEFAULT_DB_PATH = "/Users/jonathan/mres_thesis/wine_analysis_hplc_uv/wines.db"


def get_data(db_path=DEFAULT_DB_PATH):
    """
    Get the raw shiraz signal data and its sample metadata out of the db.

    Parameters
    ----------
    db_path : str
        Path of the DuckDB database file to read. Defaults to the author's
        local copy so existing ``get_data()`` calls keep working.

    Returns
    -------
    tuple
        ``(data, sm)`` as polars DataFrames. ``data`` holds the rows of
        ``pbl.chromatogram_spectra_long`` joined (on ``id``) to the raw
        shiraz samples, restricted to ``mins < 30`` and ordered by
        ``sample_num``, ``wavelength``, ``idx``. ``sm`` holds the rows of
        the ``raw_shiraz`` temp table, i.e. the ``pbl.sample_metadata``
        rows with ``detection='raw'`` and ``varietal='shiraz'``.
    """

    with db.connect(db_path) as con:
        # First statement (re)creates the temp table of raw shiraz samples;
        # the second one joins the long-format signal table against it.
        data_query = """--sql
            CREATE OR REPLACE TEMP TABLE raw_shiraz AS (
            SELECT
                *
            FROM
                pbl.sample_metadata
            WHERE
                detection='raw'
            AND
              varietal='shiraz'
            ORDER BY
                sample_num
            );
            SELECT
                *
            FROM
                pbl.chromatogram_spectra_long as cs
            JOIN
            raw_shiraz
            USING
                (id)
            WHERE
                cs.mins < 30
            ORDER BY
                sample_num, cs.wavelength, idx
                ;
            """

        # The temp table only lives on this connection, so the metadata has
        # to be read back before the `with` block closes it.
        get_sm_query = """--sql
        select * from raw_shiraz;
        """

        data = con.sql(data_query).pl()
        sm = con.sql(get_sm_query).pl()

        return data, sm


# Load the long-format signal table and the matching sample metadata, then
# preview both. The second preview gets its own heading: it shows the head and
# tail of the signal table, not the sample metadata.
long_data, sm = get_data()
display(Markdown("## Sample Metadata"), sm)
display(Markdown("## Long Data"), long_data.head(), long_data.tail())


In [None]:
# Duplicate-sample check: count the rows each sample contributes at a single
# wavelength (256). Presumably a count far above the others would point at a
# sample that was loaded twice.

rows_at_256 = long_data.filter(pl.col("wavelength") == 256)
rows_at_256.group_by("sample_num").len().sort("sample_num")


In [None]:
# Smallest and largest wavelength per sample, summarised across all samples.
wavelength_bounds = [
    pl.col("wavelength").min().alias("wavelength_min"),
    pl.col("wavelength").max().alias("wavelength_max"),
]
long_data.group_by("sample_num").agg(wavelength_bounds).describe()


Time ranges:

In [None]:
# Earliest and latest time point per sample, summarised across all samples.
time_bounds = [
    pl.col("mins").min().alias("min"),
    pl.col("mins").max().alias("max"),
]
long_data.group_by("sample_num").agg(time_bounds).describe()


The time range varies a bit between samples, so cut it off at 25 mins.

## Fold to Tensor

Need to fold the data across the sample and wavelength modes to form a 3-mode tensor. See <https://tensorly.org/stable/user_guide/tensor_basics.html#folding>.

In [None]:
# Keep only the first 25 minutes of every run and the four columns needed to
# build the tensor.
within_25_mins = pl.col("mins") <= 25
long_df = long_data.filter(within_25_mins).select(
    ["sample_num", "mins", "absorbance", "wavelength"]
)
long_df


In [None]:
# Dense-rank sample_num so the samples get consecutive numeric labels with no
# gaps; the rank is stored alongside the original column.

sample_rank = pl.col("sample_num").rank(method="dense").alias("sample_num_rank")
long_df = long_df.with_columns(sample_rank)
long_df


In [None]:
# Count the distinct values of sample_num_rank, i.e. how many samples remain.
long_df.select("sample_num_rank").n_unique()


Are all wavelength ranges the same?

In [None]:
# Show the wavelength column of the trimmed long-format frame.
long_df.select("wavelength")


In [None]:
# Pivot to one row per (sample, time point) with one column per wavelength,
# then split into one frame per sample. partition_by already returns a list of
# frames, so no extra comprehension is needed.
dfs = (
    long_df.select("sample_num_rank", "mins", "absorbance", "wavelength")
    .pivot(on="wavelength", index=["sample_num_rank", "mins"], values="absorbance")
    .partition_by("sample_num_rank")
)

print([df.shape for df in dfs])

# Target shape for a single sample: I samples x J time points x K columns.
# J is read off the frame instead of being hard-coded, so the reshape keeps
# working when the number of time points per sample changes.
I = 1
J = dfs[1].height
# + 2 because each pivoted frame still carries the two index columns
# (sample_num_rank and mins) next to the wavelength columns.
K = long_df.select("wavelength").n_unique() + 2

df_1_reshaped = dfs[1].to_numpy().reshape(I, J, K)

print("reshaped frame shape:", df_1_reshaped.shape)


In [None]:
# Lookup table from each distinct wavelength to its zero-based dense rank.
# NOTE(review): the rank equals a column position in the stacked numpy tensor
# only if the tensor's last axis holds the wavelength columns alone - confirm
# against the cell that builds the tensor.

unique_wavelengths = long_df.select(pl.col("wavelength").unique())
zero_based_rank = pl.col("wavelength").rank(method="dense") - 1
wavelength_ranking = unique_wavelengths.with_columns(zero_based_rank.alias("rank"))
wavelength_ranking.filter(pl.col("wavelength") == 256)


In [None]:
# Inspect entry 0 of the wavelength lookup table.
wavelength_ranking[0]


In [None]:
# Stack the per-sample frames into a (sample, time, wavelength) array.
# The pivot index columns are dropped first: sample_num_rank and mins are
# labels, not absorbance signal, and leaving them in would both feed them to
# the decomposition and shift every wavelength's column position by two.
index_columns = ["sample_num_rank", "mins"]
np_arrays = [df.drop(index_columns).to_numpy() for df in dfs]
# np.stack requires every sample to have the same (time, wavelength) shape.
tensor = np.stack(np_arrays)
print(tensor.shape)
# Quick look at one wavelength column of the second sample.
plt.plot(tensor[1, :, 33])


In [None]:
# Shape of the stacked tensor.
tensor.shape


# Running the data

Let's give the data a whirl, now that it's tensor-i-fied. I suspect some white wines would be an easier prospect considering the absence of a shifting baseline.

In [None]:
# Fit PARAFAC2 to the whole tensor and keep the run whose final error is the
# lowest. Each run is seeded with its own run index.
true_rank = 30

best_err = np.inf
decomposition = None

for run in range(1):
    print(f"Training model {run}...")
    trial_decomposition, trial_errs = parafac2(
        tensor,
        true_rank,
        return_errors=True,
        tol=1e-8,
        n_iter_max=500,
        random_state=run,
        verbose=True,
    )
    final_err = trial_errs[-1]
    print(f"Number of iterations: {len(trial_errs)}")
    print(f"Final error: {final_err}")
    if final_err < best_err:
        # New best run: remember its error, its error trace and its model.
        best_err, err, decomposition = final_err, trial_errs, trial_decomposition
    print("-------------------------------")
print(f"Best model error: {best_err}")


In [None]:
# Display the decomposition object kept by the fitting loop.
decomposition


In [None]:
# Shape of the first entry of est_B.
# NOTE(review): in the visible notebook est_B is only assigned by the
# fit_parafac_2 call in a later cell, so a top-to-bottom run raises NameError
# here - confirm the intended cell order.
est_B[0].shape


In [None]:
# Indexing with a list ([[1]]) keeps the sample axis, so this is the shape of
# a one-sample tensor holding sample index 1.
tensor[[1]].shape


In [None]:
def fit_parafac_2(tensor, true_rank, *, n_runs=1, **kwargs):
    """
    Fit PARAFAC2 to ``tensor`` and return the best of ``n_runs`` seeded runs.

    Parameters
    ----------
    tensor
        Data handed to ``parafac2`` unchanged.
    true_rank : int
        Number of components to fit. It is passed straight to ``parafac2``
        (the previous version overwrote it with 30).
    n_runs : int, keyword-only
        Number of fits to try; run ``i`` uses ``random_state=i``. Defaults to
        1, which matches the previous hard-coded behaviour.
    **kwargs
        Extra keyword arguments for ``parafac2`` (e.g. ``nn_modes``). They
        override the defaults ``tol=1e-8``, ``n_iter_max=500`` and
        ``verbose=True``. ``return_errors`` is always forced to ``True`` and
        ``random_state`` is set per run, so neither should be passed.

    Returns
    -------
    tuple
        ``(decomposition, est_tensor, est_weights, est_A, est_B, est_C)``:
        the best decomposition, the tensor rebuilt from it with
        ``parafac2_to_tensor``, and the weights and three factors returned by
        ``apply_parafac2_projections``.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    RuntimeError
        If no run ends with an error below infinity (e.g. every fit
        diverged to NaN), so there is no model to return.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # Caller-supplied options win over the defaults; merging them into one
    # dict avoids a duplicate-keyword TypeError when e.g. tol is passed.
    options = {"tol": 1e-8, "n_iter_max": 500, "verbose": True, **kwargs}
    # The error trace is what the runs are ranked by, so it is always requested.
    options["return_errors"] = True

    best_err = np.inf
    decomposition = None

    for run in range(n_runs):
        print(f"Training model {run}...")
        trial_decomposition, trial_errs = parafac2(
            tensor,
            true_rank,
            random_state=run,
            **options,
        )
        print(f"Number of iterations: {len(trial_errs)}")
        print(f"Final error: {trial_errs[-1]}")
        if best_err > trial_errs[-1]:
            best_err = trial_errs[-1]
            decomposition = trial_decomposition
        print("-------------------------------")
    print(f"Best model error: {best_err}")

    if decomposition is None:
        raise RuntimeError("no PARAFAC2 run finished with a finite error")

    est_tensor = tl.parafac2_tensor.parafac2_to_tensor(decomposition)
    est_weights, (est_A, est_B, est_C) = tl.parafac2_tensor.apply_parafac2_projections(
        decomposition
    )

    return decomposition, est_tensor, est_weights, est_A, est_B, est_C


# Fit a rank-30 model to a one-sample tensor (sample index 1; the list index
# keeps the sample axis). nn_modes="all" is forwarded to parafac2 - per the
# tensorly docs it requests non-negativity constraints on the modes.
decomposition, est_tensor, est_weights, est_A, est_B, est_C = fit_parafac_2(
    tensor=tensor[[1]], true_rank=30, nn_modes="all"
)


In [None]:
# Inspect the projections attribute of the fitted decomposition.
decomposition.projections


In [None]:
# Collapse one sample's estimated B factor across its columns by summing the
# absolute values in every row, then plot the resulting profile.
sample = 0

b_factor = est_B[sample]
print(b_factor.shape)
x = np.abs(b_factor).sum(axis=1)
print(x.shape)
plt.plot(x);


In [None]:
# Residual of the reconstruction against the sample it was fitted on, shown
# relative to the measured values (element-wise).
observed = tensor[[1]]
tensor_diff = observed - est_tensor
tensor_diff / observed


In [None]:
# Overlay the reconstruction and the measurement for column 33.
# est_tensor was fitted on tensor[[1]], so its only slice (index 0) belongs to
# sample index 1 of the full tensor - compare against that sample, not sample 0.
x = est_tensor[0, :, 33]
print(x.shape)
plt.plot(x, label="reconstruction")
plt.plot(tensor[1, :, 33], label="measured")
plt.legend()


In [None]:
# Plot the first estimated B matrix; the trailing semicolon suppresses the
# notebook's text output for the cell.
plt.plot(est_B[0]);
