In [None]:
# All notebook dependencies live in this single cell so a fresh kernel can
# "Restart & Run All" without hidden import state.
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
# Fixed spelling: the class is `LocallyLinearEmbedding` (the previous
# triple-"d" spelling raised ImportError and aborted the whole cell).
from sklearn.manifold import LocallyLinearEmbedding

import numpy as np
import pandas as pd

In [15]:
# Download (or load from the local cache) the 784-feature MNIST dataset.
mnist = fetch_openml('mnist_784', version=1, cache=True)
# `.values` turns the feature table into a plain ndarray; `y` keeps whatever
# container fetch_openml returned for the target column.
X, y = mnist["data"].values, mnist["target"]

# Hold out 20% of the samples. A fixed random_state makes the split (and every
# downstream number in the notebook) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [16]:
# X_centered = X - X.mean(axis=0)
# U, s, Vt = np.linalg.svd(X_centered)
# c1 = Vt.T[:, 0]
# c2 = Vt.T[:, 1]

In [17]:
# W2 = Vt.T[:, :2]
# X2D = X_centered.dot(W2)

In [18]:
# pca = PCA(n_components=2)
# X2D = pca.fit_transform(X)

In [19]:
# pca.explained_variance_ratio_

In [20]:
# Fit an unrestricted PCA (all components kept) on the training set.
pca = PCA()
pca.fit(X_train)

# Running total of the variance explained by the first k components.
cumsum = pca.explained_variance_ratio_.cumsum()

# argmax on the boolean mask yields the index of the first True, i.e. the
# first position where the running total reaches 95%; +1 converts that
# zero-based index into a component count.
d = np.argmax(cumsum >= 0.95) + 1

In [21]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to retain that fraction of the variance (here 95%).
pca = PCA(0.95)
X_reduced = pca.fit_transform(X_train)

In [22]:
# Compress the training set to 154 dimensions, then map it back to the
# original feature space to obtain a lossy reconstruction.
# NOTE(review): 154 is presumably the value of `d` found earlier — confirm.
pca = PCA(
    n_components=154,
)
X_reduced = pca.fit_transform(X_train)
X_recovered = pca.inverse_transform(X_reduced)

In [23]:
# PCA using the randomized SVD solver. The solver is stochastic, so it is
# seeded to keep the projection reproducible between runs.
rnd_pca = PCA(n_components=154, svd_solver="randomized", random_state=42)
# fit_transform (not fit) so that X_reduced holds the projected data;
# `fit` alone returns the estimator itself, which is what was stored before.
X_reduced = rnd_pca.fit_transform(X_train)

In [24]:
# Incremental PCA: feed the training set to the estimator one chunk at a time
# via partial_fit instead of fitting on the whole array in a single call.
inc_pca = IncrementalPCA(n_components=154)
n_batches = 100
for X_batch in np.array_split(X_train, indices_or_sections=n_batches):
    inc_pca.partial_fit(X_batch)

# Project the full training set with the incrementally fitted model.
X_reduced = inc_pca.transform(X_train)

In [27]:
# X_mm = np.memmap("./visuals/memmap", dtype="float32", mode="readonly", shape=(m, n))

# batch_size = m // n_batches
# inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)
# inc_pca.fit(X_mm)

In [None]:
# Kernel PCA with an RBF kernel, projecting down to two components.
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.04)
# Fixed typo: the method is `fit_transform` (`fist_transform` raised
# AttributeError).
# NOTE(review): kernel PCA builds an n_samples x n_samples kernel matrix, so
# fitting on all of X may be very memory-hungry — consider a subsample.
X_reduced = rbf_pca.fit_transform(X)

In [None]:
clf = Pipeline([
    ("kcpa", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression())
])

In [None]:
param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kcpa__kernel": ["rbf", "sigmoid"]
}]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)

In [None]:
# Best hyper-parameter combination found by the grid search; left as the
# cell's last expression so the notebook displays it as the cell output.
grid_search.best_params_

In [None]:
# RBF kernel PCA that also learns an inverse mapping
# (fit_inverse_transform=True), so reduced points can be sent back to the
# original feature space as approximate pre-images.
rbf_pca = KernelPCA(
    n_components=2,
    kernel="rbf",
    gamma=0.0433,
    fit_inverse_transform=True,
)
X_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)

In [None]:
# Reconstruction error: mean squared difference between the original samples
# and their kernel-PCA pre-images.
mean_squared_error(y_true=X, y_pred=X_preimage)

In [None]:
# Locally Linear Embedding down to two dimensions, using each sample's
# 10 nearest neighbours.
lle = LocallyLinearEmbedding(
    n_neighbors=10,
    n_components=2,
)
X_reduced = lle.fit_transform(X)