In [None]:
# 2020-10-25 created by Akson

In [None]:
# Code8.1
# Build a small synthetic 3-D dataset.

import numpy as np

np.random.seed(4)   # fixed seed so the generated points are repeatable
m = 60              # number of samples
w1, w2 = 0.1, 0.3   # weights that mix features 0 and 1 into feature 2
noise = 0.1         # scale applied to the Gaussian noise terms

# Random angles; drawn first so the random stream is consumed in the same
# order every time (angles, then noise for columns 0, 1 and 2).
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5

x0 = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2
x1 = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
# The third feature is a noisy linear combination of the first two.
x2 = x0 * w1 + x1 * w2 + noise * np.random.randn(m)

# Assemble the (m, 3) data matrix, one feature per column.
X = np.column_stack((x0, x1, x2))

In [None]:
# Code8.2
# Dimensionality reduction using NumPy's SVD.

# Center the data by subtracting each feature's mean.
X_centered = X - X.mean(axis=0)
print('X[0]: %s' % (X[0],))
print(X.mean(axis=0)[0])
print('X_centered[0]: %s' % (X_centered[0],))

# Singular value decomposition of the centered data.
U, s, Vt = np.linalg.svd(X_centered)

# Take the vectors for the first two principal components
# (row i of Vt is the same vector as column i of Vt.T).
c1 = Vt[0]
c2 = Vt[1]

print(c1)
print(c2)

# Project the centered data onto the first two components.
W2 = Vt[:2].T
X2D = np.dot(X_centered, W2)
print(X2D.shape)

In [None]:
# Code8.3
# Dimensionality reduction with scikit-learn.

from sklearn.decomposition import PCA

# n_components may instead be a float between 0.0 and 1.0, meaning the
# fraction of variance that must remain after the reduction.
pca = PCA(n_components=2)

# Fit on X and project it in a single step.
X2D = pca.fit_transform(X)
print(X2D.shape)

# Explained variance ratio of each retained component.
print(pca.explained_variance_ratio_)

In [None]:
# Code8.4
# Load the MNIST dataset and split it into training and test sets.

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import time

# as_frame=False asks for plain NumPy arrays. Recent scikit-learn versions
# otherwise return a pandas DataFrame for this dataset, which the
# np.array_split-based batching further down was not written for.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Cast the labels to unsigned 8-bit integers.
mnist.target = mnist.target.astype(np.uint8)

X = mnist['data']
y = mnist['target']

# Seed the shuffle so the split (and every timing / result computed from it)
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Code8.5
# Compress the training set with PCA, then reconstruct it.

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
time_start = time.perf_counter()
# random_state pins the result whenever a randomized solver ends up in use.
# NOTE(review): svd_solver is left at its default 'auto', which may itself
# select the randomized solver for data of this size. If this cell is meant
# to be the exact-SVD baseline for the timing in Code8.6, pass
# svd_solver='full' -- confirm intent.
pca = PCA(n_components=154, random_state=42)
X_reduced = pca.fit_transform(X_train)
time_end = time.perf_counter()

# Map the compressed data back to the original feature space.
X_recovered = pca.inverse_transform(X_reduced)

# Seconds spent fitting and transforming (the reconstruction is not timed).
print(time_end - time_start)


In [None]:
# Code8.6
# Randomized PCA.

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
time_start = time.perf_counter()
# The randomized solver is stochastic; seed it so the projection is
# reproducible from run to run.
rnd_pca = PCA(n_components=154, svd_solver='randomized', random_state=42)
X_reduce_rnd = rnd_pca.fit_transform(X_train)
time_end = time.perf_counter()

# Seconds spent fitting and transforming.
print(time_end - time_start)

In [None]:
# Code8.7
# Incremental PCA: fit on the training set one mini-batch at a time.

from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca = IncrementalPCA(n_components=154)

# Feed the training set to the model in n_batches consecutive chunks.
for X_batch in np.array_split(X_train, indices_or_sections=n_batches):
    inc_pca.partial_fit(X_batch)

# After all batches have been seen, project the whole training set.
X_reduced = inc_pca.transform(X_train)

In [None]:
# Code8.8
# Build the Swiss-roll dataset.

from sklearn.datasets import make_swiss_roll

# X holds the generated samples; t is the second array returned by
# make_swiss_roll (one value per sample).
X, t = make_swiss_roll(
    n_samples=1000,
    noise=0.2,
    random_state=42,
)

# Boolean labels: True where t exceeds 6.9.
y = t > 6.9

In [None]:
# Code8.9
# Kernel PCA: pick the kernel and gamma by grid search.

from sklearn.decomposition import KernelPCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Two-step pipeline: reduce to 2 dimensions with kernel PCA, then classify
# with logistic regression.
clf = Pipeline(steps=[
    ('kpca', KernelPCA(n_components=2)),
    ('log_reg', LogisticRegression()),
])

# Candidate settings for the kernel PCA step: 10 evenly spaced gamma values
# in [0.03, 0.05], crossed with two kernels.
param_grid = [
    {
        'kpca__gamma': np.linspace(start=0.03, stop=0.05, num=10),
        'kpca__kernel': ['rbf', 'sigmoid'],
    },
]

# 3-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X, y)

print(grid_search.best_params_)


In [None]:
# Code8.10
# Compute the reconstruction pre-image error.

from sklearn.decomposition import KernelPCA
from sklearn.metrics import mean_squared_error

# n_components is 2 here to match the kernel-PCA pipeline tuned in Code8.9.
# The previous value of 154 was the MNIST setting from the earlier cells and
# does not fit this 3-feature Swiss-roll data.
# fit_inverse_transform=True is required for inverse_transform() to be
# available on a KernelPCA model.
rbf_pca = KernelPCA(
    n_components=2,
    kernel='rbf',
    gamma=0.0433,
    fit_inverse_transform=True,
)
X_reduced = rbf_pca.fit_transform(X)

# Map the reduced points back to the original space (the "pre-image").
X_preimage = rbf_pca.inverse_transform(X_reduced)

# Mean squared error between the original samples and their pre-images.
print(mean_squared_error(X, X_preimage))

In [None]:
# Code8.11
# LLE (Locally Linear Embedding)

from sklearn.manifold import LocallyLinearEmbedding

# random_state seeds the eigen-solver so the embedding is reproducible,
# in line with the seeded Swiss-roll generation this cell consumes.
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42)

# Unroll the data into 2 dimensions.
X_reduced = lle.fit_transform(X)