# Prepare Data

This notebook prepares the data necessary to generate the visualizations and animations.
It creates two files: `data/digits.parquet` and `data/epochs.parquet`.
Because these two files are already present in the repository, you don't necessarily need to run this notebook.

In [2]:
import polars as pl
import numpy as np
import umap

from sklearn.datasets import fetch_openml

In [3]:
# Download MNIST (70k 28x28 digits flattened to 784 columns) from OpenML.
# `as_frame=True` makes the pandas return type explicit: the cells below slice
# `mnist.data` / `mnist.target` and hand them to `pl.from_pandas` / `pl.Series`,
# so they must not silently become NumPy arrays on a different scikit-learn default.
mnist = fetch_openml("mnist_784", version=1, as_frame=True)

In [7]:
# Number of UMAP optimisation epochs for which an embedding is kept.
num_epochs = 200
# Number of leading MNIST rows used throughout the notebook.
num_digits = 30_000

## Save the MNIST dataset

In [11]:
# Pixel columns of the first `num_digits` images plus their label as a `digit` column.
df_digits = pl.from_pandas(mnist.data[:num_digits]).with_columns(
    pl.Series("digit", mnist.target[:num_digits])
)

# Persist the subset so the visualizations can load it without re-downloading MNIST.
df_digits.write_parquet("data/digits.parquet")

## Apply UMAP

In [12]:
# Passing a list to `n_epochs` asks UMAP to keep the embedding at each listed
# epoch; after fitting, the snapshots are exposed as `reducer.embedding_list_`.
reducer = umap.UMAP(random_state=42, n_epochs=list(range(num_epochs)))
final_embedding = reducer.fit_transform(df_digits.drop("digit"))

# Write one snapshot per epoch. A later cell reads `epochs/epoch-{i:06d}.npy`
# for every `i in range(num_epochs)`, so saving only the final embedding would
# leave that cell without its inputs on a fresh kernel.
# NOTE(review): expects `embedding_list_` to hold one entry per requested epoch
# (the last one being the final layout) -- confirm with the installed umap-learn.
for epoch, embedding in enumerate(reducer.embedding_list_):
    np.save(f"epoch-{epoch:06d}.npy", embedding)

# The final embedding is always stored under the last epoch index.
np.save(f"epoch-{num_epochs-1:06d}.npy", final_embedding)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [13]:
! mkdir epochs && mv *.npy epochs

## Save intermediate embeddings at every epoch

In [21]:
# The label column is identical for every epoch, so convert it from pandas once
# instead of on each loop iteration; all per-epoch frames then share one series.
digit_labels = pl.Series(mnist.target[:num_digits])

df_epochs = (
    pl.concat([
        (
            # One (x, y) row per digit for epoch `i`, tagged with its label,
            # the epoch number and a per-epoch row index.
            pl.from_numpy(np.load(f"epochs/epoch-{i:06d}.npy"), schema=["x", "y"])
            .with_columns(
                digit=digit_labels,
                epoch=pl.lit(i).cast(pl.UInt16)
            ).with_row_index()
        ) for i in range(num_epochs)
    ])
    .sort("epoch", "index")
    # Rescale x and y independently to [0, 1]; min and max are taken over all
    # epochs together, so every epoch shares the same coordinate frame.
    .with_columns(pl.col("x", "y") - pl.col("x", "y").min())
    .with_columns(pl.col("x", "y") / pl.col("x", "y").max())
)

In [22]:
# Preview the combined frame (one row per digit per epoch).
df_epochs

index,x,y,digit,epoch
u32,f32,f32,cat,u16
0,0.481639,0.526492,"""5""",0
1,0.61237,0.573228,"""0""",0
2,0.42596,0.636468,"""4""",0
3,0.400917,0.414444,"""1""",0
4,0.41914,0.602288,"""9""",0
…,…,…,…,…
29995,0.382934,0.505783,"""8""",199
29996,0.36772,0.719987,"""9""",199
29997,0.826281,0.416917,"""6""",199
29998,0.207882,0.718433,"""7""",199


In [23]:
# Persist the normalised per-epoch embeddings; this is the second output file of the notebook.
df_epochs.write_parquet("data/epochs.parquet")