# Feature Engineering: PyTorch vs TensorFlow

**Learning Objectives:**
- Master preprocessing techniques for neural networks
- Compare categorical encoding approaches
- Learn normalization and feature selection methods
- Build framework-specific data pipelines

**Prerequisites:** NumPy/Pandas foundations, data preparation

**Estimated Time:** 40 minutes

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Add src to path
sys.path.append(os.path.join('..', '..', 'src'))

from foundations.data_utils import get_tutorial_tabular_data
from utils.comparison_tools import create_side_by_side_comparison

# Try to import frameworks
# Each framework is optional: a failed import only flips the matching
# *_AVAILABLE flag, which later cells check before touching torch / tf.
try:
    import torch
    import torch.nn as nn
    from torch.utils.data import Dataset, DataLoader
    PYTORCH_AVAILABLE = True
    print(f"✅ PyTorch {torch.__version__} available")
except ImportError:
    PYTORCH_AVAILABLE = False
    print("❌ PyTorch not available")

try:
    import tensorflow as tf
    from tensorflow.keras import layers, models
    TENSORFLOW_AVAILABLE = True
    print(f"✅ TensorFlow {tf.__version__} available")
except ImportError:
    TENSORFLOW_AVAILABLE = False
    print("❌ TensorFlow not available")

# Set random seeds
# NumPy is always seeded; the framework RNGs are seeded only when the
# framework imported successfully (otherwise `torch` / `tf` are undefined).
np.random.seed(42)
if PYTORCH_AVAILABLE:
    torch.manual_seed(42)
if TENSORFLOW_AVAILABLE:
    tf.random.set_seed(42)

## 1. Data Loading and Exploration

Loading tabular data and understanding its characteristics.

In [None]:
print("=" * 60)
print("DATA LOADING AND EXPLORATION")
print("=" * 60)

# Load tabular data with mixed types.
# NOTE(review): the helper is assumed to return a dict whose 'dataframe'
# entry is a pandas DataFrame containing a 'target' column — confirm against
# foundations.data_utils.
data = get_tutorial_tabular_data(num_samples=2000, return_as_dataframe=True, include_categorical=True)
df = data['dataframe']

print(f"Dataset shape: {df.shape}")
print("\nColumn information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

print("\nFirst few rows:")
print(df.head())

print("\nTarget distribution:")
print(df['target'].value_counts().sort_index())

# Identify column types: numeric dtypes vs. object/category dtypes.
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove target from features so it is never treated as an input column.
if 'target' in numeric_columns:
    numeric_columns.remove('target')
if 'target' in categorical_columns:
    categorical_columns.remove('target')

print("\nColumn types:")
print(f"  Numeric columns ({len(numeric_columns)}): {numeric_columns}")
print(f"  Categorical columns ({len(categorical_columns)}): {categorical_columns}")

# Basic statistics
print("\nNumeric features statistics:")
print(df[numeric_columns].describe())

if categorical_columns:
    print("\nCategorical features:")
    for col in categorical_columns:
        print(f"  {col}: {df[col].nunique()} unique values - {list(df[col].unique())}")

## 2. Feature Engineering Techniques

Applying various preprocessing techniques to prepare data for neural networks.

In [None]:
print("\n" + "=" * 60)
print("FEATURE ENGINEERING TECHNIQUES")
print("=" * 60)

# Create a copy for processing so the raw `df` stays untouched.
df_processed = df.copy()

# 1. Handle missing values (if any)
print("\n1. Missing Value Analysis:")
missing_counts = df_processed.isnull().sum()
print("Missing values per column:")
for col, count in missing_counts.items():
    if count > 0:
        print(f"  {col}: {count} ({count/len(df_processed)*100:.1f}%)")

if missing_counts.sum() == 0:
    print("  No missing values found ✅")
else:
    # Fill missing values.
    # Assign the filled Series back to the frame instead of calling
    # `df[col].fillna(..., inplace=True)`: the in-place call operates on an
    # intermediate object, which pandas deprecates and which silently leaves
    # `df_processed` unchanged when Copy-on-Write is enabled.
    for col in numeric_columns:
        if df_processed[col].isnull().any():
            df_processed[col] = df_processed[col].fillna(df_processed[col].median())

    for col in categorical_columns:
        if df_processed[col].isnull().any():
            modes = df_processed[col].mode()
            # mode() is empty when every value in the column is missing;
            # there is nothing sensible to impute in that case.
            if not modes.empty:
                df_processed[col] = df_processed[col].fillna(modes.iloc[0])

# 2. Categorical Encoding
# Builds two alternative encodings of the categorical columns (integer labels
# and one-hot indicators) and stores each resulting frame in `encoded_dfs`.
print("\n2. Categorical Encoding:")

encoded_dfs = {}

if not categorical_columns:
    print("  No categorical columns to encode")
    encoded_dfs['original'] = df_processed
else:
    # Method 1: Label Encoding — one fitted encoder is kept per column.
    label_encoders = {}
    df_label_encoded = df_processed.copy()

    for col in categorical_columns:
        le = LabelEncoder()
        label_encoders[col] = le
        df_label_encoded[col] = le.fit_transform(df_label_encoded[col])
        print(f"  Label encoded {col}: {df_processed[col].nunique()} categories → {df_label_encoded[col].nunique()} integers")

    encoded_dfs['label_encoded'] = df_label_encoded

    # Method 2: One-Hot Encoding — each column is swapped for its indicator
    # columns; drop_first=True omits the first category's indicator.
    df_onehot = df_processed.copy()

    for col in categorical_columns:
        dummies = pd.get_dummies(df_onehot[col], prefix=col, drop_first=True)
        df_onehot = pd.concat([df_onehot.drop(columns=[col]), dummies], axis=1)
        print(f"  One-hot encoded {col}: {df_processed[col].nunique()} categories → {len(dummies.columns)} binary features")

    encoded_dfs['onehot_encoded'] = df_onehot

    print(f"\n  Original shape: {df_processed.shape}")
    print(f"  Label encoded shape: {df_label_encoded.shape}")
    print(f"  One-hot encoded shape: {df_onehot.shape}")

# 3. Feature Scaling
# Splits the data into train/test and standardizes the features, fitting the
# scaler on the training split only.
print("\n3. Feature Scaling:")

# Use label encoded version for scaling demonstration
# Falls back to df_processed when no 'label_encoded' frame was produced.
df_for_scaling = encoded_dfs.get('label_encoded', df_processed).copy()

# Separate features and target
# X keeps every non-target column, so any label-encoded categorical columns
# are standardized together with the numeric ones below.
X = df_for_scaling.drop('target', axis=1)
y = df_for_scaling['target']

print(f"  Features shape: {X.shape}")
print(f"  Target shape: {y.shape}")

# Split data first
# 80/20 split, stratified on the target, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\n  Train set: {X_train.shape}")
print(f"  Test set: {X_test.shape}")

# Apply StandardScaler
# fit_transform on the training split, then transform (no refit) on the test
# split, so test-set statistics never influence the scaling parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\n  Scaling statistics (training set):")
# "Original" averages the per-column mean/std of X_train; "Scaled" calls
# mean()/std() directly on the scaler output.
# NOTE(review): presumably the scaler returns a NumPy array, in which case
# the "Scaled" figures are taken over all elements at once — confirm.
print(f"    Original - Mean: {X_train.mean().mean():.3f}, Std: {X_train.std().mean():.3f}")
print(f"    Scaled - Mean: {X_train_scaled.mean():.3f}, Std: {X_train_scaled.std():.3f}")

# 4. Feature Selection
# Keeps the k highest-scoring features and bundles every preprocessing
# artifact into `processed_data` for the framework-specific cells.
print("\n4. Feature Selection:")

# Select top k features using ANOVA F-test
k_best = min(10, X_train_scaled.shape[1])  # Select top 10 or all features if less
selector = SelectKBest(score_func=f_classif, k=k_best)
# The selector is fit on the training split only and then reused on the test split.
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Get selected feature names
# get_support() is used as a boolean mask over the columns of X, so names and
# scores stay in the original column order (not sorted by score).
selected_features = X.columns[selector.get_support()].tolist()
feature_scores = selector.scores_[selector.get_support()]

print(f"  Selected {k_best} best features from {X_train_scaled.shape[1]} total features")
print(f"  Selected features and their scores:")
for feature, score in zip(selected_features, feature_scores):
    print(f"    {feature}: {score:.2f}")

print(f"\n  Final feature matrix shape: {X_train_selected.shape}")

# Store processed data
# Targets are stored via `.values`; the fitted scaler and selector are kept
# so the same transformations can be re-applied later.
processed_data = {
    'X_train_scaled': X_train_scaled,
    'X_test_scaled': X_test_scaled,
    'X_train_selected': X_train_selected,
    'X_test_selected': X_test_selected,
    'y_train': y_train.values,
    'y_test': y_test.values,
    'feature_names': X.columns.tolist(),
    'selected_features': selected_features,
    'scaler': scaler,
    'selector': selector
}

print(f"\n✅ Feature engineering completed!")
print(f"  Original features: {len(X.columns)}")
print(f"  Selected features: {len(selected_features)}")
print(f"  Training samples: {X_train_selected.shape[0]}")
print(f"  Test samples: {X_test_selected.shape[0]}")

## 3. PyTorch Data Pipeline

Creating PyTorch-specific data pipelines and preprocessing.

In [None]:
if PYTORCH_AVAILABLE:
    print("\n" + "=" * 60)
    print("PYTORCH DATA PIPELINE")
    print("=" * 60)
    
    # Custom Dataset class with preprocessing
    class TabularDataset(Dataset):
        """In-memory tabular dataset yielding ``(feature, target)`` tensor pairs.

        Args:
            features: Array-like of per-sample feature rows; stored as a
                float32 tensor.
            targets: Array-like of integer class labels, one per sample;
                stored as an int64 tensor.
            transform: Optional callable applied to the feature tensor of a
                sample each time it is fetched (targets are never transformed).

        Raises:
            ValueError: If ``features`` and ``targets`` differ in length.
        """

        def __init__(self, features, targets, transform=None):
            # torch.as_tensor is the supported replacement for the legacy
            # torch.FloatTensor / torch.LongTensor constructors and accepts
            # lists, NumPy arrays and tensors alike.
            self.features = torch.as_tensor(features, dtype=torch.float32)
            self.targets = torch.as_tensor(targets, dtype=torch.long)
            # Fail fast: __len__ is derived from the features only, so a
            # mismatch would otherwise surface as an IndexError mid-epoch or
            # as silently ignored targets.
            if len(self.features) != len(self.targets):
                raise ValueError(
                    f"features and targets must have the same length, "
                    f"got {len(self.features)} and {len(self.targets)}"
                )
            self.transform = transform

        def __len__(self):
            """Return the number of samples."""
            return len(self.features)

        def __getitem__(self, idx):
            """Return the (optionally transformed) feature and the target at ``idx``."""
            feature = self.features[idx]
            target = self.targets[idx]

            # Compare against None so a callable that happens to be falsy
            # (e.g. an object defining __len__) is still applied.
            if self.transform is not None:
                feature = self.transform(feature)

            return feature, target
    
    # Custom transforms
    class AddNoise:
        """Callable transform that perturbs a tensor with scaled Gaussian noise."""

        def __init__(self, noise_factor=0.1):
            # Multiplier applied to the standard-normal draw in __call__.
            self.noise_factor = noise_factor

        def __call__(self, tensor):
            # One standard-normal sample per element, matching the input's
            # shape and dtype, scaled by the configured factor.
            scaled_noise = torch.randn_like(tensor).mul(self.noise_factor)
            return tensor + scaled_noise
    
    class FeatureDropout:
        """Callable transform that zeroes a random subset of a tensor's entries."""

        def __init__(self, dropout_prob=0.1):
            # Threshold compared against a uniform draw for every element.
            self.dropout_prob = dropout_prob

        def __call__(self, tensor):
            # An element survives when its uniform draw exceeds dropout_prob;
            # survivors are passed through unscaled.
            keep = torch.rand_like(tensor).gt(self.dropout_prob)
            return tensor * keep.float()
    
    # Create datasets with different preprocessing
    # All datasets below wrap the selected-feature arrays from processed_data.
    print("\n🔥 Creating PyTorch Datasets:")
    
    # Basic dataset
    train_dataset_basic = TabularDataset(
        processed_data['X_train_selected'], 
        processed_data['y_train']
    )
    
    test_dataset_basic = TabularDataset(
        processed_data['X_test_selected'], 
        processed_data['y_test']
    )
    
    # Dataset with augmentation
    # Same arrays as the basic training set; the AddNoise transform runs each
    # time a sample is fetched. The test set gets no augmentation.
    train_dataset_augmented = TabularDataset(
        processed_data['X_train_selected'], 
        processed_data['y_train'],
        transform=AddNoise(noise_factor=0.05)
    )
    
    print(f"  Basic training dataset: {len(train_dataset_basic)} samples")
    print(f"  Augmented training dataset: {len(train_dataset_augmented)} samples")
    print(f"  Test dataset: {len(test_dataset_basic)} samples")
    
    # Create DataLoaders
    # Training loaders shuffle; the test loader keeps a fixed order.
    train_loader_basic = DataLoader(train_dataset_basic, batch_size=64, shuffle=True)
    train_loader_augmented = DataLoader(train_dataset_augmented, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset_basic, batch_size=64, shuffle=False)
    
    print(f"\n  DataLoaders created:")
    print(f"    Basic training batches: {len(train_loader_basic)}")
    print(f"    Augmented training batches: {len(train_loader_augmented)}")
    print(f"    Test batches: {len(test_loader)}")
    
    # Demonstrate batch processing
    # Only the first batch is inspected; the loop exits via `break`.
    print(f"\n  Sample batch:")
    for batch_features, batch_targets in train_loader_basic:
        print(f"    Features shape: {batch_features.shape}")
        print(f"    Targets shape: {batch_targets.shape}")
        print(f"    Feature range: [{batch_features.min():.3f}, {batch_features.max():.3f}]")
        print(f"    Target classes: {torch.unique(batch_targets).tolist()}")
        break
    
    # Compare augmented vs non-augmented
    print(f"\n  Augmentation comparison:")
    
    # Get same sample from both datasets
    # Index 0 is read from the datasets directly (not the shuffled loaders),
    # so both values refer to the same underlying row.
    basic_sample, _ = train_dataset_basic[0]
    augmented_sample, _ = train_dataset_augmented[0]
    
    print(f"    Original sample (first 5 features): {basic_sample[:5]}")
    print(f"    Augmented sample (first 5 features): {augmented_sample[:5]}")
    print(f"    Difference: {(augmented_sample[:5] - basic_sample[:5]).abs()}")
    
    # Store PyTorch data
    # num_features is the column count of the selected training matrix;
    # num_classes is the number of distinct labels in y_train.
    pytorch_data = {
        'train_loader_basic': train_loader_basic,
        'train_loader_augmented': train_loader_augmented,
        'test_loader': test_loader,
        'num_features': processed_data['X_train_selected'].shape[1],
        'num_classes': len(np.unique(processed_data['y_train']))
    }
    
    print(f"\n✅ PyTorch pipeline ready!")
    print(f"  Input features: {pytorch_data['num_features']}")
    print(f"  Output classes: {pytorch_data['num_classes']}")

else:
    print("PyTorch not available - skipping PyTorch pipeline")
    pytorch_data = None

## 4. TensorFlow Data Pipeline

Creating TensorFlow-specific data pipelines and preprocessing.

In [None]:
if TENSORFLOW_AVAILABLE:
    print("\n" + "=" * 60)
    print("TENSORFLOW DATA PIPELINE")
    print("=" * 60)
    
    # Create tf.data.Dataset
    print("\n🟠 Creating TensorFlow Datasets:")
    
    # Basic dataset
    # Each dataset element is one (features, label) pair; features are cast to
    # float32 and labels to int32 before being handed to TensorFlow.
    train_dataset_tf = tf.data.Dataset.from_tensor_slices((
        processed_data['X_train_selected'].astype(np.float32),
        processed_data['y_train'].astype(np.int32)
    ))
    
    test_dataset_tf = tf.data.Dataset.from_tensor_slices((
        processed_data['X_test_selected'].astype(np.float32),
        processed_data['y_test'].astype(np.int32)
    ))
    
    # Sample counts are taken from the label arrays rather than the datasets.
    print(f"  Training dataset created: {len(processed_data['y_train'])} samples")
    print(f"  Test dataset created: {len(processed_data['y_test'])} samples")
    
    # Add preprocessing functions
    def add_noise(features, labels, noise_factor=0.05):
        """Return ``(features + noise, labels)`` with zero-mean Gaussian noise.

        ``noise_factor`` is the standard deviation of the noise; ``labels``
        are passed through untouched.
        """
        noise_shape = tf.shape(features)
        perturbed = features + tf.random.normal(noise_shape, stddev=noise_factor)
        return perturbed, labels
    
    def feature_dropout(features, labels, dropout_rate=0.1):
        """Return ``(masked_features, labels)`` with random entries zeroed.

        An entry is kept when its uniform draw exceeds ``dropout_rate``; kept
        entries are not rescaled and ``labels`` are passed through untouched.
        """
        draws = tf.random.uniform(tf.shape(features))
        keep = tf.cast(draws > dropout_rate, tf.float32)
        return features * keep, labels
    
    # Create different pipeline configurations
    batch_size = 64

    def _count_batches(dataset):
        """Return the number of batches in ``dataset``, avoiding a full pass when possible."""
        # cardinality() reports the statically known size, so the pipeline
        # (including its shuffle and augmentation map) is not executed just
        # to print a count.
        batch_count = int(dataset.cardinality())
        if batch_count == tf.data.UNKNOWN_CARDINALITY:
            # Size not statically known: fall back to one counting pass.
            batch_count = sum(1 for _ in dataset)
        return batch_count

    # Basic pipeline: shuffle -> batch -> prefetch.
    # NOTE(review): the shuffle buffer (1000) is presumably smaller than the
    # training set, which makes the shuffle approximate — confirm the sample
    # count and raise buffer_size if a full shuffle is wanted.
    train_pipeline_basic = (
        train_dataset_tf
        .shuffle(buffer_size=1000)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    # Augmented pipeline: per-sample Gaussian noise is added before
    # shuffling and batching.
    train_pipeline_augmented = (
        train_dataset_tf
        .map(lambda x, y: add_noise(x, y, 0.05), num_parallel_calls=tf.data.AUTOTUNE)
        .shuffle(buffer_size=1000)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    # Test pipeline: no shuffling and no augmentation.
    test_pipeline = (
        test_dataset_tf
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    print("\n  Pipelines created:")
    print(f"    Basic training pipeline: {_count_batches(train_pipeline_basic)} batches")
    print(f"    Augmented training pipeline: {_count_batches(train_pipeline_augmented)} batches")
    print(f"    Test pipeline: {_count_batches(test_pipeline)} batches")
    
    # Demonstrate batch processing
    print("\n  Sample batch:")
    for batch_features, batch_targets in train_pipeline_basic.take(1):
        print(f"    Features shape: {batch_features.shape}")
        print(f"    Targets shape: {batch_targets.shape}")
        # Convert the scalar tensors to Python floats before formatting.
        print(f"    Feature range: [{float(tf.reduce_min(batch_features)):.3f}, {float(tf.reduce_max(batch_features)):.3f}]")
        # tf.unique returns values in order of first appearance; sort them so
        # the listing matches the sorted output of torch.unique in the
        # PyTorch cell.
        print(f"    Target classes: {sorted(tf.unique(batch_targets)[0].numpy().tolist())}")

    # Compare augmented vs non-augmented
    print("\n  Augmentation comparison:")

    # Take one batch from the *unshuffled* source dataset and add noise to
    # that very batch. Drawing the two batches from the independently
    # shuffled training pipelines would compare two unrelated rows, which
    # says nothing about the effect of the augmentation.
    basic_batch = next(iter(train_dataset_tf.batch(batch_size)))
    augmented_batch = add_noise(basic_batch[0], basic_batch[1], 0.05)

    basic_features = basic_batch[0][0]  # First sample, features
    augmented_features = augmented_batch[0][0]  # Same sample after noise

    print(f"    Original sample (first 5 features): {basic_features[:5].numpy()}")
    print(f"    Augmented sample (first 5 features): {augmented_features[:5].numpy()}")
    print(f"    Difference: {tf.abs(augmented_features[:5] - basic_features[:5]).numpy()}")
    
    # Performance optimization example
    print(f"\n  Performance optimization:")
    
    # Optimized pipeline with caching
    # cache() comes before the noise map, so the map is applied to elements
    # coming out of the cache.
    # NOTE(review): presumably this means only clean samples are cached and
    # fresh noise is drawn on every pass — confirm against tf.data semantics.
    train_pipeline_optimized = (
        train_dataset_tf
        .cache()  # Cache dataset in memory
        .map(lambda x, y: add_noise(x, y, 0.05), num_parallel_calls=tf.data.AUTOTUNE)
        .shuffle(buffer_size=1000)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )
    
    print(f"    Optimized pipeline with caching created")
    print(f"    Features: prefetching, parallel mapping, caching")
    
    # Store TensorFlow data
    # Mirrors the PyTorch `pytorch_data` dict: pipelines plus the input width
    # and the number of distinct training labels.
    tensorflow_data = {
        'train_pipeline_basic': train_pipeline_basic,
        'train_pipeline_augmented': train_pipeline_augmented,
        'train_pipeline_optimized': train_pipeline_optimized,
        'test_pipeline': test_pipeline,
        'num_features': processed_data['X_train_selected'].shape[1],
        'num_classes': len(np.unique(processed_data['y_train']))
    }
    
    print(f"\n✅ TensorFlow pipeline ready!")
    print(f"  Input features: {tensorflow_data['num_features']}")
    print(f"  Output classes: {tensorflow_data['num_classes']}")

else:
    print("TensorFlow not available - skipping TensorFlow pipeline")
    tensorflow_data = None

## 5. Feature Engineering Best Practices

Comparing approaches and summarizing best practices.

In [None]:
print("\n" + "=" * 60)
print("FEATURE ENGINEERING BEST PRACTICES")
print("=" * 60)

# Visualization of feature importance
# Draws one figure with two panels: F-scores of the selected features (left)
# and their pairwise correlation matrix (right).
if 'selected_features' in processed_data:
    print("\n📊 Feature Importance Visualization:")
    
    plt.figure(figsize=(12, 6))
    
    # Plot feature scores
    # Scores and names are re-derived from processed_data, rebinding the
    # `feature_scores` / `selected_features` globals.
    feature_scores = processed_data['selector'].scores_[processed_data['selector'].get_support()]
    selected_features = processed_data['selected_features']
    
    plt.subplot(1, 2, 1)
    plt.barh(range(len(selected_features)), feature_scores, color='skyblue', alpha=0.7)
    plt.yticks(range(len(selected_features)), selected_features)
    plt.xlabel('ANOVA F-Score')
    plt.title('Selected Feature Importance')
    # Put the first feature at the top of the horizontal bar chart.
    plt.gca().invert_yaxis()
    
    # Plot correlation matrix of selected features
    plt.subplot(1, 2, 2)
    # Correlations are computed on df_for_scaling, i.e. the unscaled frame
    # holding all rows (train and test together).
    selected_data = df_for_scaling[selected_features]
    correlation_matrix = selected_data.corr()
    
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
                square=True, fmt='.2f', cbar_kws={'shrink': 0.8})
    plt.title('Feature Correlation Matrix')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    
    plt.tight_layout()
    plt.show()

# Side-by-side framework comparison
# The two strings below are code listings that are only displayed, never
# executed here. They are kept self-contained so a reader can copy them:
# the PyTorch Dataset defines __len__ (DataLoader needs it to build its
# sampler) and the TensorFlow listing imports numpy for its `np` usage.
pytorch_pipeline_code = """
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

class TabularDataset(Dataset):
    def __init__(self, features, targets, transform=None):
        self.features = torch.FloatTensor(features)
        self.targets = torch.LongTensor(targets)
        self.transform = transform
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = self.features[idx]
        target = self.targets[idx]
        
        if self.transform:
            feature = self.transform(feature)
        
        return feature, target

# Preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# Create dataset and dataloader
dataset = TabularDataset(X_scaled, y_train)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Data augmentation transform
class AddNoise:
    def __init__(self, noise_factor=0.1):
        self.noise_factor = noise_factor
    
    def __call__(self, tensor):
        noise = torch.randn_like(tensor) * self.noise_factor
        return tensor + noise
"""

tensorflow_pipeline_code = """
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# Create tf.data.Dataset
dataset = tf.data.Dataset.from_tensor_slices((
    X_scaled.astype(np.float32),
    y_train.astype(np.int32)
))

# Data augmentation function
def add_noise(features, labels, noise_factor=0.1):
    noise = tf.random.normal(tf.shape(features), stddev=noise_factor)
    return features + noise, labels

# Create optimized pipeline
pipeline = (
    dataset
    .cache()  # Cache in memory
    .map(lambda x, y: add_noise(x, y), 
         num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(buffer_size=1000)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)
"""

# NOTE(review): create_side_by_side_comparison is assumed to return a string
# (it is concatenated with "\n" below) — confirm in utils.comparison_tools.
print("\n" + create_side_by_side_comparison(
    pytorch_pipeline_code, tensorflow_pipeline_code, "Data Pipeline Implementation"
))

# Best practices summary
# Prints the recommendations grouped by category, then the key takeaways,
# then a closing line describing which framework pipelines were built.
print("\n💡 Feature Engineering Best Practices:")

best_practices = {
    "Data Preprocessing": [
        "Handle missing values before any other processing",
        "Apply feature scaling (StandardScaler, MinMaxScaler)",
        "Encode categorical variables appropriately",
        "Split data before preprocessing to avoid data leakage"
    ],
    "Feature Selection": [
        "Use statistical tests (ANOVA, chi-square) for initial selection",
        "Consider correlation between features",
        "Apply domain knowledge for feature engineering",
        "Use cross-validation for robust feature selection"
    ],
    "Framework-Specific": [
        "PyTorch: Use custom Dataset classes for complex preprocessing",
        "TensorFlow: Leverage tf.data for optimized pipelines",
        "Both: Implement data augmentation for better generalization",
        "Both: Use appropriate batch sizes for your hardware"
    ],
    "Performance": [
        "Cache preprocessed data when possible",
        "Use parallel processing for data loading",
        "Prefetch batches to overlap computation and I/O",
        "Monitor memory usage with large datasets"
    ]
}

for category in best_practices:
    practices = best_practices[category]
    print(f"\n{category}:")
    for practice in practices:
        print(f"  • {practice}")

print("\n🎯 Key Takeaways:")
for takeaway_line in (
    "  • Proper feature engineering is crucial for neural network performance",
    "  • Both frameworks offer powerful data pipeline capabilities",
    "  • PyTorch provides more flexibility with custom Dataset classes",
    "  • TensorFlow's tf.data API offers excellent optimization features",
    "  • Data augmentation can improve model generalization",
    "  • Always validate preprocessing steps with domain experts",
):
    print(takeaway_line)

print("\n✅ Feature engineering tutorial completed!")
# Each *_data global is None when its framework cell was skipped.
has_pytorch = bool(pytorch_data)
has_tensorflow = bool(tensorflow_data)
if has_pytorch and has_tensorflow:
    status_line = "  Both PyTorch and TensorFlow pipelines are ready for model training"
elif has_pytorch:
    status_line = "  PyTorch pipeline is ready for model training"
elif has_tensorflow:
    status_line = "  TensorFlow pipeline is ready for model training"
else:
    status_line = "  Preprocessing completed - frameworks not available for pipeline creation"
print(status_line)