In [None]:
import init_notebook

# Task 2 - Diffusion Maps

This task focuses on dimensionality reduction using diffusion maps.

In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
import math as m
import matplotlib.pyplot as plt
from difusion_maps import diffusion_map, DiffusionMap
from sklearn.datasets import make_swiss_roll
from mpl_toolkits.mplot3d import Axes3D  # For 3D plotting
import pandas as pd


## Part 1
In this part of the task, a dataset of $N$ points on the unit circle is created according to the following formulas:
$$
X = \{x_k \in \mathbb{R}^2\}_{k=1}^{N}
$$
$$
x_k = (\cos(t_k), \sin(t_k))
$$
$$
t_k = \frac{2\pi k}{N+1}
$$

In [None]:
N = 1000  # number of points sampled for the Part 1 dataset

def t(k: int) -> float:
    """Return the angle t_k = 2*pi*k / (N + 1) for sample index ``k``.

    The module-level ``N`` is read at call time.
    """
    return (2 * m.pi * k) / (N + 1)

# k runs over 1..N inclusive, as in the dataset definition X = {x_k}, k = 1..N.
t_values = np.array([t(k) for k in range(1, N + 1)], dtype=np.float64)

# Coordinates of each sample: x_k = (cos(t_k), sin(t_k)).
data_x = np.cos(t_values)
data_y = np.sin(t_values)

# dstack yields shape (1, N, 2); taking [0] leaves one (x, y) row per sample.
data = np.dstack((data_x, data_y))[0]
# Draw the sampled points as a connected curve, titled with the sample count.
part1_title = f"Part 1 data with {N} points"
plt.plot(data_x, data_y)
plt.title(part1_title)
plt.show()

In [None]:
# Build the diffusion map of the Part 1 data, requesting five eigen-pairs.
mapping = DiffusionMap(data, num_eigenvalues=5)

# Three stacked panels: column 0 alone, then columns 1-2 and columns 3-4 in pairs.
eigen_figure = plt.figure(figsize=(10,10))
constant_plot = eigen_figure.add_subplot(311)
first_period_plot = eigen_figure.add_subplot(312)
second_period_plot = eigen_figure.add_subplot(313)
eigen_panels = (constant_plot, first_period_plot, second_period_plot)

# Column 0 is rounded to 10 decimals before plotting.
# NOTE(review): presumably this hides floating-point noise in a constant
# eigenvector - confirm against DiffusionMap.
constant_plot.plot(t_values, np.around(mapping.points[:,0], 10), label=f'Value{0}={mapping.eigen_value(0):.5f}')

# Remaining columns are drawn unmodified, two per panel.
for panel, eig_indices in ((first_period_plot, (1, 2)), (second_period_plot, (3, 4))):
    for eig_idx in eig_indices:
        panel.plot(t_values, mapping.points[:,eig_idx], label=f'Value{eig_idx}={mapping.eigen_value(eig_idx):.5f}')

# Tick every quarter of pi from 0 to 2*pi, labelled as multiples of pi.
unit = 0.25
x_tick = np.arange(0, 2+unit, unit)

x_label = [r"$" + format(r, ".2g")+ r"\pi$" for r in x_tick]

# Same ticks, tick labels and legend placement on every panel.
for panel in eigen_panels:
    panel.set_xticks(x_tick*np.pi)
    panel.set_xticklabels(x_label, fontsize=10)
    panel.legend(loc="upper left")

eigen_figure.subplots_adjust(hspace=0.5)
# plt.show() as in the other cells: Figure.show() emits a warning on non-GUI
# backends instead of displaying anything.
plt.show()

## Part 2


In [None]:
# Step 1: Generate the "swiss roll" dataset
np.random.seed(42)  # Set a seed for reproducibility
n_swiss_points = 5000
X, X_color = make_swiss_roll(n_samples=n_swiss_points)

# Random ordering of all sample indices, used to select the plotted points.
rng = np.random.default_rng(1)
idx_plot = rng.permutation(n_swiss_points)[:n_swiss_points]

# Step 1.1: Plot the initial 3D "swiss roll" dataset
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
roll_x, roll_y, roll_z = (X[idx_plot, column] for column in range(3))
ax.scatter(roll_x, roll_y, roll_z, c=X_color[idx_plot], cmap=plt.cm.Spectral, s=20)
ax.set_title("Swiss Roll Dataset")
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_zlabel("Z")
plt.show()

# Step 2: Apply the diffusion maps algorithm
eigenvalues, eigenvectors = diffusion_map(X, diameter_percent=0.05, num_eigenvalues=10)

# Step 3: Plot the first non-constant eigenfunction against the other eigenfunctions.
# NOTE(review): assumes column 0 of `eigenvectors` holds the constant
# eigenfunction φ0 (as column 0 of DiffusionMap.points appears to in Part 1),
# so φ1 is column 1 and φl is column l - confirm against diffusion_map.
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
panels = axes.flatten()
phi1 = eigenvectors[idx_plot, 1]
higher_indices = range(2, eigenvectors.shape[1])

# One panel per higher eigenfunction; each label names exactly the columns drawn.
for ax, col in zip(panels, higher_indices):
    ax.scatter(phi1, eigenvectors[idx_plot, col], s=5, c=X_color[idx_plot], cmap=plt.cm.Spectral, alpha=0.5)
    ax.set_title(f'φ1 vs φ{col}')
    ax.set_xlabel('φ1')
    ax.set_ylabel(f'φ{col}')

# Hide grid cells that have no eigenfunction to show.
for ax in panels[len(higher_indices):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Step 4: Determine when φl, l > 1 is no longer a function of φ1
# This is judged by eye: inspect the plots above for the point where the
# relationship changes.

# Step 5: Compute the three principal components of the "swiss roll" dataset
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

# Step 6: Discuss why it's impossible to represent the data using only two principal components.

# Step 7: Investigate the effect of using only 1000 data points
# Only the coordinates are kept; the second return value is discarded.
X_1000, _ = make_swiss_roll(n_samples=1000)
eigenvalues_1000, eigenvectors_1000 = diffusion_map(
    X_1000,
    diameter_percent=0.05,
    num_eigenvalues=10,
)


In [None]:
# Load the trajectories file: space-separated values, no header row.
trajs = pd.read_csv(
    "../data/data_DMAP_PCA_vadere.txt",
    sep=" ",
    header=None,
)
trajs.head()

## Bonus

In this section we use the `datafold` tool to plot the eigenvectors of the swiss roll dataset.

In [None]:
import datafold.dynfold as dfold
import datafold.pcfold as pfold
from datafold.dynfold import LocalRegressionSelection
from datafold.utils.plot import plot_pairwise_eigenvector

In [None]:

# Pick a random subset of the swiss roll points (X, X_color from Part 2) for plotting.
nr_samples = X.shape[0]  # index range follows X, so idx_plot can never run out of bounds
# Number of points drawn in the scatter plot; was previously undefined (NameError).
# NOTE(review): 1000 is a chosen value - raise it up to nr_samples to plot everything.
nr_samples_plot = 1000
idx_plot = rng.permutation(nr_samples)[0:nr_samples_plot]

# plot
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    X[idx_plot, 0],
    X[idx_plot, 1],
    X[idx_plot, 2],
    c=X_color[idx_plot],
    cmap=plt.cm.Spectral,
)
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_zlabel("z")
# The data plotted here is the swiss roll, not an S-curve.
ax.set_title("point cloud on swiss roll manifold")
ax.view_init(10, 70)

# Wrap the full dataset as a datafold point-cloud manifold and let datafold
# choose its parameters; the resulting epsilon and cut-off are printed below.
X_pcm = pfold.PCManifold(X)
X_pcm.optimize_parameters()

print(f"epsilon={X_pcm.kernel.epsilon}, cut-off={X_pcm.cut_off}")