In [None]:
# Load a high-dimensional dataset and visualize its structure with PCA.

from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

# Fetch the digits dataset and split it into a feature matrix and label vector
digits = load_digits()
X, y = digits.data, digits.target

# Report the dimensions of the feature matrix
print("Data shape:", X.shape)

In [None]:
# Reduce dimensions with PCA, transforming data into 2D for visualization.

from sklearn.decomposition import PCA

# Initialize PCA to reduce to 2 components.
# random_state pins the projection whenever PCA's default solver selection
# ends up using a randomized SVD; deterministic solvers simply ignore it,
# so the cell gives the same embedding on every Restart & Run All.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

# Display the shape of transformed data
print("Transformed Data shape:", X_pca.shape)

In [None]:
# Scatter the samples on the first two principal components, colored by digit
# label, to observe structure and clustering.

fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=y,
    cmap='viridis',
    s=10,
    alpha=0.7,
)
fig.colorbar(scatter, ax=ax, label="Digit Label")
ax.set_title("2D PCA Visualization of Digits Dataset")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
plt.show()

In [None]:
# Measure how much of the total variance is retained as components are added,
# to assess information retention.

# Fit PCA with its default settings (no n_components specified)
pca_full = PCA()
pca_full.fit(X)

# Running total of the per-component explained-variance ratios
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()
component_counts = range(1, len(cumulative_variance) + 1)

# Plot the cumulative explained variance against the number of components
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_variance, marker='o')
ax.set_title("Cumulative Explained Variance by PCA Components")
ax.set_xlabel("Number of Components")
ax.set_ylabel("Cumulative Explained Variance")
ax.grid()
plt.show()

In [None]:
# Reduce dimensions to 3D and visualize data for a more detailed view of clustering.

from mpl_toolkits.mplot3d import Axes3D

# Initialize PCA for 3 components.
# random_state pins the projection whenever PCA's default solver selection
# ends up using a randomized SVD; deterministic solvers simply ignore it,
# so the 3D embedding is the same on every Restart & Run All.
pca_3d = PCA(n_components=3, random_state=42)
X_pca_3d = pca_3d.fit_transform(X)

# 3D plot of the first three principal components, colored by digit label
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(
    X_pca_3d[:, 0],
    X_pca_3d[:, 1],
    X_pca_3d[:, 2],
    c=y,
    cmap='viridis',
    s=10,
    alpha=0.6,
)
# Attach the colorbar explicitly to the 3D axes rather than relying on
# pyplot's notion of the "current" axes.
fig.colorbar(scatter, ax=ax, label="Digit Label")
ax.set_title("3D PCA Visualization of Digits Dataset")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_zlabel("Principal Component 3")
plt.show()