# Module 04: Dimensionality Reduction

Techniques for exploring and visualizing high-dimensional data.

## Learning Objectives

1. Understand why dimensionality reduction is useful
2. Apply PCA for linear dimensionality reduction
3. Use t-SNE for visualization
4. Interpret reduced representations
5. Apply to chemical engineering datasets

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

## The Curse of Dimensionality

Real datasets often have many features:
- Process data: 100s of sensor readings
- Spectroscopy: 1000s of wavelengths
- Molecular descriptors: 100s of properties

Problems with high dimensions:
- Hard to visualize
- Many features may be correlated
- Models can overfit
- Distance metrics become less meaningful

**Dimensionality reduction** finds a lower-dimensional representation that preserves important structure.

In [None]:
# Synthetic catalyst characterization data: three catalyst families, each
# described by eight properties scattered around a family-specific baseline.
np.random.seed(42)  # fixed seed so every run produces the same dataset
n_samples = 200

# Column labels; values in base_props are listed in this same order
feature_names = ['surface_area', 'pore_volume', 'acidity', 'crystallite_size',
                 'metal_dispersion', 'reduction_temp', 'impurity_level', 'particle_size']

# Noise-free baseline property values for each catalyst family
base_props = {
    'Type_A': [100, 50, 0.8, 300, 25, 1.2, 0.05, 2.5],
    'Type_B': [150, 30, 0.5, 250, 35, 0.8, 0.08, 3.0],
    'Type_C': [80, 70, 0.9, 350, 20, 1.5, 0.03, 2.0]
}

# Randomly assign one of the three families to every sample
catalyst_type = np.random.choice(['Type_A', 'Type_B', 'Type_C'], n_samples)

# Look up each sample's baseline, then perturb it with Gaussian noise whose
# standard deviation is 10% of the baseline value (drawn sample by sample).
centers = [np.array(base_props[cat]) for cat in catalyst_type]
features = [base + np.random.normal(0, 0.1, len(base)) * base for base in centers]

# Assemble the table: eight numeric columns plus the family label
df = pd.DataFrame(features, columns=feature_names)
df['catalyst_type'] = catalyst_type

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Visualizing 8 dimensions is hard!
# Let's look at pairwise relationships: six hand-picked feature pairs in a
# 2x3 grid, one scatter series per catalyst type.
fig, axes = plt.subplots(2, 3, figsize=(12, 8))

colors = {'Type_A': 'blue', 'Type_B': 'red', 'Type_C': 'green'}

pairs = [('surface_area', 'pore_volume'), ('acidity', 'crystallite_size'),
         ('metal_dispersion', 'reduction_temp'), ('surface_area', 'acidity'),
         ('pore_volume', 'particle_size'), ('impurity_level', 'metal_dispersion')]

# NOTE: the loop variables are deliberately not called `x` / `y`. Loop
# variables stay bound after the loop, and a notebook-level `y` is used
# elsewhere in this notebook as the label array; rebinding it to a column
# name here would break label masks whenever this cell is re-run.
for ax, (x_col, y_col) in zip(axes.flat, pairs):
    for cat, color in colors.items():
        mask = df['catalyst_type'] == cat
        ax.scatter(df.loc[mask, x_col], df.loc[mask, y_col], c=color, label=cat, alpha=0.6)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)

# One legend is enough: every panel uses the same colour coding
axes[0, 0].legend()
plt.tight_layout()
plt.show()

## Principal Component Analysis (PCA)

PCA finds orthogonal directions of maximum variance.

In [None]:
# Split the table into class labels (y) and the numeric feature matrix (X)
y = df['catalyst_type'].values
X = df[feature_names].values

# Scaling is important for PCA: the features are on very different numeric
# scales, and PCA is driven by variance. StandardScaler standardizes each
# column; the prints below let the reader check the resulting mean and std.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print(f"Original shape: {X.shape}")
print(f"Mean of scaled data: {X_scaled.mean(axis=0).round(2)}")
print(f"Std of scaled data: {X_scaled.std(axis=0).round(2)}")

In [None]:
# Fit PCA with no component limit and project the scaled data onto the
# resulting principal axes.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Report the fraction of total variance captured by each component,
# one line per component.
print("Explained variance ratio per component:")
print("\n".join(
    f"  PC{i+1}: {var:.3f} ({var*100:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_)
))

# Share of variance retained by the 2-D projection used in the plots below
print(f"\nCumulative variance with 2 components: {pca.explained_variance_ratio_[:2].sum():.1%}")

In [None]:
# Scree plot (left) and cumulative explained variance (right)
ratios = pca.explained_variance_ratio_
cumulative = np.cumsum(ratios)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
scree_ax, cum_ax = axes

# Left panel: variance explained by each individual component
scree_ax.bar(range(1, len(ratios) + 1), ratios, edgecolor='black')
scree_ax.set(
    xlabel='Principal Component',
    ylabel='Explained Variance Ratio',
    title='Scree Plot',
)

# Right panel: running total, with a reference line at 90%
cum_ax.plot(range(1, len(cumulative) + 1), cumulative, 'o-')
cum_ax.axhline(y=0.9, color='r', linestyle='--', label='90% threshold')
cum_ax.set(
    xlabel='Number of Components',
    ylabel='Cumulative Explained Variance',
    title='Cumulative Variance',
)
cum_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter the samples in the plane of the first two principal components,
# one series per catalyst type.
plt.figure(figsize=(10, 8))

for cat in ('Type_A', 'Type_B', 'Type_C'):
    members = X_pca[y == cat]  # rows belonging to this catalyst type
    plt.scatter(members[:, 0], members[:, 1], label=cat, alpha=0.6, s=50)

# Axis labels carry the variance share of each component
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('PCA of Catalyst Properties')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Interpret principal components: loadings
# pca.components_ holds one row per component and one column per feature, so
# the transpose gives a features x components table.
# Column labels are sized from the fitted model rather than from
# len(feature_names): the two only coincide when PCA keeps as many components
# as there are features.
n_pcs = pca.components_.shape[0]
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i+1}' for i in range(n_pcs)],
    index=feature_names
)

print("PC Loadings (contribution of each feature):")
# Positional slice: shows the first three components, or fewer if the model
# has fewer than three, instead of failing on a missing 'PC3' column.
print(loadings.iloc[:, :3].round(3))

In [None]:
# Biplot: samples (points) and feature loadings (arrows) in the PC1-PC2 plane
fig, ax = plt.subplots(figsize=(12, 10))

# Samples, one scatter series per catalyst type
for cat in ('Type_A', 'Type_B', 'Type_C'):
    members = X_pca[y == cat]
    ax.scatter(members[:, 0], members[:, 1], label=cat, alpha=0.5, s=30)

# Loadings as arrows from the origin. The raw coefficients are small compared
# with the spread of the scores, so they are stretched by a constant factor
# purely for visibility.
scale = 3
arrow_tips = loadings.iloc[:, :2].to_numpy() * scale  # (PC1, PC2) tip per feature
for feature, (tip_x, tip_y) in zip(feature_names, arrow_tips):
    ax.arrow(0, 0, tip_x, tip_y,
             head_width=0.1, head_length=0.05, fc='black', ec='black')
    # Place the label slightly beyond the arrow tip
    ax.text(tip_x*1.1, tip_y*1.1, feature, fontsize=9)

ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
ax.set_title('PCA Biplot')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## t-SNE: Nonlinear Visualization

t-SNE is a nonlinear method that preserves local structure (which points are close neighbors), which often makes it better than PCA for visualizing clusters.

In [None]:
# Embed the scaled features in 2-D with t-SNE; random_state is fixed so the
# layout is repeatable.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Scatter the embedding, one series per catalyst type
plt.figure(figsize=(10, 8))

for cat in ('Type_A', 'Type_B', 'Type_C'):
    members = X_tsne[y == cat]
    plt.scatter(members[:, 0], members[:, 1], label=cat, alpha=0.6, s=50)

plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE of Catalyst Properties')
plt.legend()
plt.show()

In [None]:
# Effect of perplexity: fit t-SNE at three perplexity values and show the
# resulting embeddings side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, perp in zip(axes, [5, 30, 100]):
    # Loop-local names on purpose: rebinding the notebook-level `tsne` /
    # `X_tsne` here would leave them pointing at the last (perplexity=100)
    # fit once this cell has run.
    sweep_model = TSNE(n_components=2, perplexity=perp, random_state=42)
    sweep_embedding = sweep_model.fit_transform(X_scaled)

    for cat in ['Type_A', 'Type_B', 'Type_C']:
        mask = y == cat
        ax.scatter(sweep_embedding[mask, 0], sweep_embedding[mask, 1], label=cat, alpha=0.6)

    ax.set_title(f'Perplexity = {perp}')
    ax.legend()

plt.tight_layout()
plt.show()

## PCA vs t-SNE

| Aspect | PCA | t-SNE |
|--------|-----|-------|
| Type | Linear | Nonlinear |
| Preserves | Global structure | Local structure |
| Interpretable | Yes (loadings) | No |
| Speed | Fast | Slow |
| New data | Can project | Cannot project |
| Use for | Features, preprocessing | Visualization |

In [None]:
# Side-by-side comparison of the PCA projection and a t-SNE embedding
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Re-fit t-SNE with perplexity=30 here so this cell does not depend on
# whichever embedding X_tsne currently holds.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Each panel: (axes, 2-D coordinates, x label, y label, title)
panels = [
    (axes[0], X_pca, 'PC1', 'PC2', 'PCA'),
    (axes[1], X_tsne, 't-SNE 1', 't-SNE 2', 't-SNE'),
]

for panel, coords, x_label, y_label, title in panels:
    for cat in ('Type_A', 'Type_B', 'Type_C'):
        mask = y == cat
        panel.scatter(coords[mask, 0], coords[mask, 1], label=cat, alpha=0.6, s=50)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    panel.set_title(title)
    panel.legend()

plt.tight_layout()
plt.show()

## Using PCA for Feature Reduction

PCA can be used as a preprocessing step before modeling.

In [None]:
# Keep components that explain 90% of variance: a fractional n_components
# asks PCA to choose the number of components needed to reach that share.
pca_90 = PCA(n_components=0.90)
X_reduced = pca_90.fit_transform(X_scaled)

# Summarize how many dimensions were kept and how much variance they retain
print(
    f"Original features: {X_scaled.shape[1]}",
    f"Reduced features: {X_reduced.shape[1]}",
    f"Explained variance: {pca_90.explained_variance_ratio_.sum():.1%}",
    sep="\n",
)

## Summary

| Technique | Use Case |
|-----------|----------|
| PCA | Feature reduction, preprocessing, interpretable visualization |
| t-SNE | Cluster visualization, exploring structure |

Key points:
- Always scale your data before dimensionality reduction
- Check explained variance to choose number of components
- Use loadings to interpret PCA components
- t-SNE is for visualization only (distances between clusters and cluster sizes in the embedding are not reliable, so don't interpret them quantitatively)

## Next Steps

Now we'll learn how to build predictive models using linear regression.