# Module 6.1: Deep Learning Analysis - Interactive Notebook

This notebook provides hands-on implementation of deep learning techniques for semiconductor manufacturing process engineering. We'll explore multi-layer perceptrons (MLPs) for tabular process data using both PyTorch and TensorFlow.

## Learning Objectives
- Implement MLPs for regression and classification tasks
- Compare PyTorch vs TensorFlow implementations  
- Apply regularization techniques to prevent overfitting
- Optimize hyperparameters for semiconductor datasets
- Evaluate models using manufacturing-specific metrics

## Outline
1. Environment Setup & Data Loading
2. Neural Network Theory Visualization
3. Simple MLP Implementation (PyTorch)
4. TensorFlow Implementation Comparison
5. Regularization Techniques Deep Dive
6. Hyperparameter Optimization
7. Manufacturing Metrics & Evaluation
8. Production Pipeline Integration
9. Case Study: Yield Prediction
10. Advanced Topics & Next Steps

## 1. Environment Setup & Data Loading

In [None]:
# Core imports: standard library first, then third-party packages
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Hide every warning so the tutorial output stays compact
warnings.filterwarnings('ignore')

# One shared seed so the random draws in this notebook are repeatable
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Plot appearance: default style sheet, then figure size, font size and palette
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})
sns.set_palette("husl")

print("✓ Core libraries imported successfully")

In [None]:
# Set up data paths (Module 6 is in advanced tier)
from pathlib import Path
DATA_DIR = Path('../../../datasets').resolve()

print(f"Data directory: {DATA_DIR}")
print(f"Data directory exists: {DATA_DIR.exists()}")

# Import our pipeline module
import sys
from importlib import import_module

# Make the current directory importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if '.' not in sys.path:
    sys.path.append('.')

# The module name contains hyphens, so it cannot be written as a regular
# `import` statement; import_module takes the name as a string instead.
try:
    pipeline_module = import_module('6-1-deep-learning-pipeline')
    print("✓ Pipeline module loaded successfully")
except ImportError as e:
    # Keep the name bound so later code can check `pipeline_module is None`
    # instead of failing with a NameError.
    pipeline_module = None
    print(f"⚠ Could not import pipeline: {e}")
    print("Will implement functions directly in notebook")

In [None]:
# Deep learning framework imports with graceful fallbacks
# Both flags start False and are only flipped to True at the end of the
# corresponding try block, i.e. after every import and the seeding succeeded.
# Later cells branch on these flags instead of re-attempting the imports.
HAS_TORCH = False
HAS_TF = False

try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import torch.nn.functional as F
    from torch.utils.data import DataLoader, TensorDataset
    
    # Seed PyTorch's random number generators with the notebook-wide seed.
    # Only seeding is done here; no deterministic-algorithm switches are set.
    torch.manual_seed(RANDOM_SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(RANDOM_SEED)
    
    HAS_TORCH = True
    print("✓ PyTorch loaded successfully")
    print(f"  PyTorch version: {torch.__version__}")
    print(f"  CUDA available: {torch.cuda.is_available()}")
    
except ImportError:
    # Only a missing package is tolerated; any other error still propagates.
    print("⚠ PyTorch not available")

try:
    import tensorflow as tf
    from tensorflow import keras
    # NOTE: this binds the module-level name `layers` to the Keras layers module.
    from tensorflow.keras import layers, Sequential
    
    # Seed TensorFlow's random number generator with the notebook-wide seed
    tf.random.set_seed(RANDOM_SEED) 
    
    HAS_TF = True
    print("✓ TensorFlow loaded successfully")
    print(f"  TensorFlow version: {tf.__version__}")
    print(f"  GPU available: {len(tf.config.list_physical_devices('GPU')) > 0}")
    
except ImportError:
    # Only a missing package is tolerated; any other error still propagates.
    print("⚠ TensorFlow not available")

# Scikit-learn for comparison and metrics
# These imports are not guarded: scikit-learn is a hard requirement here.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor
print("✓ Scikit-learn imported successfully")

### Generate Synthetic Semiconductor Data

We'll create realistic semiconductor process datasets for both regression (yield prediction) and classification (defect detection) tasks.

In [None]:
def generate_semiconductor_yield_data(n_samples=1000, n_features=10, noise_level=0.1):
    """
    Build a synthetic yield-regression dataset from simulated process parameters.

    Ten named parameters are always generated. If ``n_features`` is larger
    than that, standard-normal filler columns ``param_1``, ``param_2``, ... are
    appended; smaller values leave the ten named columns as they are.

    The global NumPy generator is re-seeded with ``RANDOM_SEED`` on every call,
    so identical arguments always give identical data.

    Args:
        n_samples: Number of rows to generate.
        n_features: Requested number of feature columns (see above).
        noise_level: Noise standard deviation as a fraction of the standard
            deviation of the noise-free yield.

    Returns:
        Tuple ``(X, y)``: ``X`` is a DataFrame of process parameters and ``y``
        is the yield in percent, clipped to the range [60, 98].
    """
    np.random.seed(RANDOM_SEED)

    # (column, mean, std) of the normally distributed parameters. The order is
    # significant: it fixes the order of draws from the seeded generator.
    gaussian_params = [
        ('temperature', 450, 20),     # Celsius
        ('temp_ramp_rate', 5, 1),     # C/min
        ('pressure', 2.5, 0.3),       # Torr
        ('pump_speed', 100, 10),      # L/s
        ('n2_flow', 200, 30),         # sccm
        ('ar_flow', 50, 10),          # sccm
        ('process_time', 300, 30),    # seconds
        ('dwell_time', 60, 10),       # seconds
        ('rf_power', 1500, 100),      # Watts
    ]
    columns = {
        name: np.random.normal(mean, std, n_samples)
        for name, mean, std in gaussian_params
    }
    columns['chamber_cycles'] = np.random.poisson(10, n_samples)  # cycles since clean

    # Filler columns, only when more features were requested than are named
    n_extra = n_features - len(columns)
    for idx in range(1, n_extra + 1):
        columns[f'param_{idx}'] = np.random.normal(0, 1, n_samples)

    X = pd.DataFrame(columns)
    temperature = X['temperature']
    pressure = X['pressure']
    rf_power = X['rf_power']
    process_time = X['process_time']

    # Individual contributions to the yield, listed in summation order
    contributions = (
        -0.05 * (temperature - 450)**2 / 100,           # penalty away from 450 C
        -2.0 * (pressure - 2.5)**2,                     # penalty away from 2.5 Torr
        0.01 * X['n2_flow'] * X['ar_flow'] / 10000,     # gas flow interaction
        0.02 * np.log(process_time / 300),              # logarithmic time effect
        -0.3 * X['chamber_cycles'],                     # chamber condition
        0.005 * (rf_power - 1500) / 100,                # RF power
        -0.001 * temperature * pressure,                # temperature x pressure
        0.0001 * rf_power * process_time / 1000,        # power x time
    )

    y = 85.0  # Base yield %
    for contribution in contributions:
        y = y + contribution

    # Noise scaled to the spread of the noise-free yield
    noise = np.random.normal(0, noise_level * y.std(), n_samples)
    y = y + noise

    # Constrain to the yield range used throughout the notebook
    return X, np.clip(y, 60, 98)

def generate_semiconductor_defect_data(n_samples=1000, n_features=8):
    """
    Generate synthetic semiconductor defect classification data.

    Eight named etch-process parameters are always generated. If ``n_features``
    is larger than that, standard-normal filler columns ``param_1``,
    ``param_2``, ... are appended, matching ``generate_semiconductor_yield_data``;
    smaller values leave the eight named columns as they are. The filler
    columns are drawn after the labels, so the named columns and the labels do
    not depend on ``n_features``.

    The global NumPy generator is re-seeded with ``RANDOM_SEED`` on every call,
    so identical arguments always give identical data.

    Args:
        n_samples: Number of rows to generate.
        n_features: Requested number of feature columns (see above).

    Returns:
        Tuple ``(X, y)``: ``X`` is a DataFrame of process parameters and ``y``
        is an array of class ids (0: no defect, 1: micro-trenching,
        2: sidewall damage, 3: incomplete etch).
    """
    np.random.seed(RANDOM_SEED)

    # Process parameters
    data = {}
    data['etch_rate'] = np.random.normal(100, 15, n_samples)     # nm/min
    data['selectivity'] = np.random.normal(20, 3, n_samples)    # ratio
    data['uniformity'] = np.random.normal(2, 0.5, n_samples)    # % 1-sigma
    data['particles'] = np.random.poisson(5, n_samples)         # count/wafer
    data['endpoint_time'] = np.random.normal(45, 8, n_samples)  # seconds
    data['gas_pressure'] = np.random.normal(15, 2, n_samples)   # mTorr
    data['plasma_power'] = np.random.normal(800, 50, n_samples) # Watts
    data['wafer_temp'] = np.random.normal(25, 5, n_samples)     # Celsius
    n_named_features = len(data)

    X = pd.DataFrame(data)

    # Unnormalized class weights per sample
    # 0: No defect, 1: Micro-trenching, 2: Sidewall damage, 3: Incomplete etch
    defect_prob = np.zeros((n_samples, 4))

    # Base weight for the most common class
    defect_prob[:, 0] = 0.7  # No defect

    # Micro-trenching (high etch rate AND low selectivity)
    micro_trench_risk = ((X['etch_rate'] > 110) & (X['selectivity'] < 18)).astype(float)
    defect_prob[:, 1] = 0.05 + 0.25 * micro_trench_risk

    # Sidewall damage (high power OR many particles)
    sidewall_risk = ((X['plasma_power'] > 850) | (X['particles'] > 7)).astype(float)
    defect_prob[:, 2] = 0.05 + 0.2 * sidewall_risk

    # Incomplete etch (low etch rate AND high pressure)
    incomplete_risk = ((X['etch_rate'] < 90) & (X['gas_pressure'] > 16)).astype(float)
    defect_prob[:, 3] = 0.03 + 0.15 * incomplete_risk

    # Turn the weights into per-sample probabilities that sum to 1
    defect_prob = defect_prob / defect_prob.sum(axis=1, keepdims=True)

    # Sample one defect category per row. Kept as one draw per row so the
    # seeded random stream (and therefore the labels) stays the same.
    y = np.array([np.random.choice(4, p=p) for p in defect_prob])

    # Honor n_features: pad with filler columns that carry no label signal.
    # Drawn last so the default call produces exactly the same data as before.
    for i in range(n_features - n_named_features):
        X[f'param_{i+1}'] = np.random.normal(0, 1, n_samples)

    return X, y

# Build the two datasets used by the rest of the notebook
print("Generating synthetic semiconductor datasets...")

# Yield prediction (regression)
X_yield, y_yield = generate_semiconductor_yield_data(n_samples=1500, n_features=10)
n_yield_rows, n_yield_cols = X_yield.shape
print(f"✓ Yield dataset: {n_yield_rows} samples, {n_yield_cols} features")
print(f"  Yield range: {y_yield.min():.1f}% - {y_yield.max():.1f}%")

# Defect detection (classification); list position in defect_names == class id
X_defects, y_defects = generate_semiconductor_defect_data(n_samples=1200, n_features=8)
defect_names = ['No Defect', 'Micro-trenching', 'Sidewall Damage', 'Incomplete Etch']
n_defect_rows, n_defect_cols = X_defects.shape
print(f"✓ Defect dataset: {n_defect_rows} samples, {n_defect_cols} features")
print("  Defect distribution:")

# Count every class in a single pass, then report count and share per class
defect_counts = np.bincount(y_defects, minlength=len(defect_names))
for name, count in zip(defect_names, defect_counts):
    print(f"    {name}: {count} ({count/len(y_defects)*100:.1f}%)")

## 2. Neural Network Theory Visualization

Let's visualize key concepts in neural networks to build intuition.

In [None]:
# Plot four common activation functions side by side
x = np.linspace(-5, 5, 1000)

# Evaluate each activation on the shared input grid
relu = np.maximum(0, x)
sigmoid = 1 / (1 + np.exp(-x))
tanh = np.tanh(x)
leaky_relu = np.where(x > 0, x, 0.01 * x)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Activation Functions in Neural Networks', fontsize=16)

# One entry per panel: (axis, curve, line format, title)
panels = [
    (axes[0, 0], relu, 'b-', 'ReLU: f(x) = max(0, x)'),
    (axes[0, 1], sigmoid, 'r-', 'Sigmoid: f(x) = 1/(1+e^(-x))'),
    (axes[1, 0], tanh, 'g-', 'Tanh: f(x) = tanh(x)'),
    (axes[1, 1], leaky_relu, 'm-', 'Leaky ReLU: f(x) = max(0.01x, x)'),
]
for panel_ax, curve, line_format, title in panels:
    panel_ax.plot(x, curve, line_format, linewidth=2)
    panel_ax.set_title(title)
    panel_ax.grid(True, alpha=0.3)

# Label only the outer edges: y label on the left column, x label on the bottom row
for left_ax in axes[:, 0]:
    left_ax.set_ylabel('Output')
for bottom_ax in axes[1, :]:
    bottom_ax.set_xlabel('Input')

plt.tight_layout()
plt.show()

# Short text summary printed below the figure
summary_lines = [
    "\nActivation Function Properties:",
    "• ReLU: Fast, prevents vanishing gradients, can cause dead neurons",
    "• Sigmoid: Smooth, bounded [0,1], suffers from vanishing gradients",
    "• Tanh: Zero-centered [-1,1], faster convergence than sigmoid",
    "• Leaky ReLU: Prevents dead neurons, small gradient for negative inputs",
]
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Visualize loss landscapes and optimization
def create_loss_landscape():
    """Return meshgrid arrays (X, Y) and a bumpy 2D loss surface Z over [-3, 3]^2."""
    # Both axes use the same 100-point grid
    axis = np.linspace(-3, 3, 100)
    X, Y = np.meshgrid(axis, axis)

    # Quadratic bowl + sinusoidal ripple (adds local minima) + quartic walls
    bowl = X**2 + Y**2
    ripple = 0.5 * np.sin(3*X) * np.cos(3*Y)
    walls = 0.1 * (X**4 + Y**4)

    return X, Y, bowl + ripple + walls

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Loss landscape
X, Y, Z = create_loss_landscape()
contour = axes[0].contour(X, Y, Z, levels=20, cmap='viridis')
axes[0].set_title('Neural Network Loss Landscape')
axes[0].set_xlabel('Weight 1')
axes[0].set_ylabel('Weight 2')
plt.colorbar(contour, ax=axes[0], label='Loss')

# Illustrative optimization paths. These coordinates are hand-written for the
# figure, not produced by running an optimizer on Z; both end at the origin.
# SGD path (more zigzag)
sgd_path_x = np.array([2.5, 2.3, 2.0, 1.8, 1.5, 1.2, 0.9, 0.6, 0.3, 0.1, 0.0])
sgd_path_y = np.array([2.2, 1.9, 1.8, 1.4, 1.2, 0.9, 0.7, 0.5, 0.3, 0.1, 0.0])

# Adam path (more direct)
adam_path_x = np.array([2.5, 2.1, 1.6, 1.2, 0.8, 0.4, 0.1, 0.0])
adam_path_y = np.array([2.2, 1.8, 1.4, 1.0, 0.6, 0.3, 0.1, 0.0])

axes[0].plot(sgd_path_x, sgd_path_y, 'ro-', alpha=0.7, label='SGD', markersize=4)
axes[0].plot(adam_path_x, adam_path_y, 'bo-', alpha=0.7, label='Adam', markersize=4)

# Mark the lowest point of the sampled surface. The origin is not that point:
# Z is 0 there, while the sinusoidal term pulls the surface below zero slightly
# off-center, so the marker position is looked up instead of hard-coded.
min_row, min_col = np.unravel_index(np.argmin(Z), Z.shape)
axes[0].plot(X[min_row, min_col], Y[min_row, min_col], 'g*',
             markersize=15, label='Global Minimum')
axes[0].legend()

# Learning curves comparison (synthetic curves: exponential decay plus a small
# oscillation and a constant floor; not measured from real training runs)
epochs = np.arange(1, 101)
sgd_loss = 5 * np.exp(-0.03 * epochs) + 0.1 * np.sin(0.2 * epochs) + 0.2
adam_loss = 5 * np.exp(-0.05 * epochs) + 0.05 * np.sin(0.1 * epochs) + 0.1
momentum_loss = 5 * np.exp(-0.04 * epochs) + 0.08 * np.sin(0.15 * epochs) + 0.15

axes[1].plot(epochs, sgd_loss, 'r-', label='SGD', linewidth=2)
axes[1].plot(epochs, adam_loss, 'b-', label='Adam', linewidth=2)
axes[1].plot(epochs, momentum_loss, 'g-', label='SGD + Momentum', linewidth=2)
axes[1].set_title('Optimizer Convergence Comparison')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Training Loss')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
# Log scale is safe here: each curve's floor exceeds its oscillation amplitude,
# so all plotted values stay positive.
axes[1].set_yscale('log')

plt.tight_layout()
plt.show()

print("\nOptimizer Characteristics:")
print("• SGD: Simple, can oscillate, sensitive to learning rate")
print("• Adam: Adaptive learning rates, fast convergence, good default choice")
print("• SGD + Momentum: Better than plain SGD, helps escape local minima")

## 3. Simple MLP Implementation (PyTorch)

Let's build a multi-layer perceptron in PyTorch, assembling it from basic `torch.nn` layers, and train it for yield prediction.

In [None]:
if HAS_TORCH:
    class MLPRegressor(nn.Module):
        """
        Fully connected network for single-target regression.

        Each hidden layer is Linear -> ReLU -> Dropout; the output layer is a
        single linear unit without activation.

        Args:
            input_dim: Number of input features.
            hidden_dims: Sizes of the hidden layers, in order. ``None`` selects
                the default ``[64, 32]``.
            dropout: Dropout probability applied after every hidden layer.
        """

        def __init__(self, input_dim, hidden_dims=None, dropout=0.3):
            super().__init__()

            # None stands in for the default so that no mutable list object is
            # shared between instances through the function signature.
            if hidden_dims is None:
                hidden_dims = [64, 32]

            self.input_dim = input_dim
            self.hidden_dims = list(hidden_dims)  # private copy
            self.dropout = dropout

            # Build network layers
            modules = []
            prev_dim = input_dim

            for hidden_dim in self.hidden_dims:
                modules.extend([
                    nn.Linear(prev_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Dropout(dropout)
                ])
                prev_dim = hidden_dim

            # Output layer (no activation for regression)
            modules.append(nn.Linear(prev_dim, 1))

            self.network = nn.Sequential(*modules)

            # Initialize weights
            self.apply(self._init_weights)

        def _init_weights(self, module):
            """Xavier-uniform weights and zero biases for every Linear layer."""
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        def forward(self, x):
            """Return predictions with the trailing size-1 output axis removed."""
            # Squeeze only the last axis: a bare squeeze() would also drop the
            # batch axis for a batch of one and return a 0-d tensor.
            return self.network(x).squeeze(-1)

        def predict(self, x):
            """
            Run inference and return the predictions as a NumPy array.

            Switches the module to eval mode (and leaves it there). ``x`` may
            be a tensor or anything ``np.asarray`` can convert; it is cast to
            float32 and moved to the device that holds the model parameters.
            """
            self.eval()
            device = next(self.parameters()).device
            with torch.no_grad():
                if not torch.is_tensor(x):
                    x = np.asarray(x, dtype=np.float32)
                x = torch.as_tensor(x, dtype=torch.float32, device=device)
                # .cpu() so the conversion also works for a model on an accelerator
                return self.forward(x).cpu().numpy()
    
    # Training function
    def train_pytorch_model(model, X_train, y_train, X_val, y_val, 
                           num_epochs=100, learning_rate=0.001, batch_size=32):
        """
        Train a regression model with Adam and MSE loss, validating every epoch.

        Uses weight decay 1e-4, gradient-norm clipping at 1.0 and a
        ReduceLROnPlateau schedule (patience 10, factor 0.5) driven by the
        validation loss. Progress is printed every 20 epochs.

        Args:
            model: ``nn.Module`` whose output has one value per sample.
            X_train, X_val: Feature matrices (array-like or tensor).
            y_train, y_val: Targets (array-like, pandas Series or tensor).
            num_epochs: Number of passes over the training data.
            learning_rate: Initial Adam learning rate.
            batch_size: Mini-batch size for the training loader.

        Returns:
            Dict with per-epoch lists ``'train_loss'`` (mean of the batch
            losses) and ``'val_loss'``.

        Raises:
            ValueError: If the training set is empty.
        """
        # Keep the data on the same device as the model parameters
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

        def _to_tensor(values):
            """Convert array-like input to a float32 tensor on the model's device."""
            if torch.is_tensor(values):
                return values.to(device=device, dtype=torch.float32)
            # Go through NumPy first: torch.tensor() treats a pandas Series as a
            # generic sequence and indexes it by label, which breaks for the
            # shuffled index that train_test_split produces.
            return torch.tensor(np.asarray(values, dtype=np.float32), device=device)

        # Convert to PyTorch tensors; targets are flattened to 1-D so they
        # always line up with the flattened model outputs below
        X_train_tensor = _to_tensor(X_train)
        y_train_tensor = _to_tensor(y_train).reshape(-1)
        X_val_tensor = _to_tensor(X_val)
        y_val_tensor = _to_tensor(y_val).reshape(-1)

        if len(X_train_tensor) == 0:
            raise ValueError("X_train is empty; cannot train a model without samples")

        # Create data loaders
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Setup optimizer and loss
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.5)
        criterion = nn.MSELoss()

        # Training history
        history = {'train_loss': [], 'val_loss': []}

        # Training loop
        for epoch in range(num_epochs):
            # Training phase
            model.train()
            train_loss = 0.0

            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                # reshape(-1) gives (batch,) whether the model returns
                # (batch, 1), (batch,) or a 0-d tensor for a batch of one
                outputs = model(batch_X).reshape(-1)
                loss = criterion(outputs, batch_y)
                loss.backward()

                # Gradient clipping to prevent exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                optimizer.step()
                train_loss += loss.item()

            # Validation phase
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_val_tensor).reshape(-1)
                val_loss = criterion(val_outputs, y_val_tensor).item()

            # Update learning rate
            scheduler.step(val_loss)

            # Record history
            avg_train_loss = train_loss / len(train_loader)
            history['train_loss'].append(avg_train_loss)
            history['val_loss'].append(val_loss)

            # Print progress
            if (epoch + 1) % 20 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], '
                      f'Train Loss: {avg_train_loss:.4f}, '
                      f'Val Loss: {val_loss:.4f}, '
                      f'LR: {optimizer.param_groups[0]["lr"]:.6f}')

        return history
    
    # Prepare data
    # Split first, then fit the scaler on the training rows only. Fitting it on
    # the full dataset would let validation statistics leak into training.
    X_train_raw, X_val_raw, y_train, y_val = train_test_split(
        X_yield, y_yield, test_size=0.2, random_state=RANDOM_SEED
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_val = scaler.transform(X_val_raw)

    # Whole dataset under the train-fitted scaling; the name is kept so any
    # code that expects X_yield_scaled still finds it.
    X_yield_scaled = scaler.transform(X_yield)
    
    print(f"Training PyTorch MLP on yield data...")
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")
    
    # Create and train model
    pytorch_model = MLPRegressor(input_dim=X_train.shape[1], 
                                hidden_dims=[128, 64, 32], 
                                dropout=0.3)
    
    print(f"\nModel architecture:")
    print(pytorch_model)
    
    # Train the model
    # Targets are passed as plain NumPy arrays: the split hands back
    # label-indexed pandas Series, which torch.tensor() may fail to convert.
    history = train_pytorch_model(pytorch_model, X_train, np.asarray(y_train),
                                 X_val, np.asarray(y_val),
                                 num_epochs=100, learning_rate=0.001, batch_size=32)
    
    print("\n✓ PyTorch training completed!")
    
else:
    print("⚠ PyTorch not available - skipping PyTorch implementation")