# Data Preparation: NumPy/Pandas to PyTorch/TensorFlow

**Learning Objectives:**
- Understand how PyTorch and TensorFlow consume NumPy/Pandas data
- Learn data loading patterns and tensor conversion techniques
- Compare performance of different data loading approaches

**Prerequisites:** NumPy essentials, Pandas for ML

**Estimated Time:** 40 minutes

In [None]:
import os
import sys

import numpy as np

# Add src to path for our utilities
sys.path.append(os.path.join('..', '..', 'src'))

# Optional frameworks: each *_AVAILABLE flag records whether the import
# succeeded, so later cells can skip a framework that is not installed.
try:
    import torch
    from torch.utils.data import DataLoader, TensorDataset
except ImportError:
    PYTORCH_AVAILABLE = False
    print("❌ PyTorch not available")
else:
    PYTORCH_AVAILABLE = True
    print(f"✅ PyTorch {torch.__version__} available")

try:
    import tensorflow as tf
except ImportError:
    TENSORFLOW_AVAILABLE = False
    print("❌ TensorFlow not available")
else:
    TENSORFLOW_AVAILABLE = True
    print(f"✅ TensorFlow {tf.__version__} available")

# Seed NumPy and every framework that imported with the same value (42)
np.random.seed(42)
if PYTORCH_AVAILABLE:
    torch.manual_seed(42)
if TENSORFLOW_AVAILABLE:
    tf.random.set_seed(42)

## 1. Basic NumPy to Tensor Conversion

In [None]:
# A small 2x3 float32 array used as the conversion source for both frameworks
np_data = np.asarray([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
print(f"NumPy array: {np_data.shape}, dtype: {np_data.dtype}", np_data, sep="\n")

# NumPy -> PyTorch (skipped when PyTorch is not installed).
# Per the PyTorch docs, torch.from_numpy returns a tensor that shares memory
# with the source array rather than copying it.
if PYTORCH_AVAILABLE:
    torch_tensor = torch.from_numpy(np_data)
    print(
        f"\nPyTorch tensor: {torch_tensor.shape}, dtype: {torch_tensor.dtype}",
        torch_tensor,
        sep="\n",
    )

# NumPy -> TensorFlow (skipped when TensorFlow is not installed)
if TENSORFLOW_AVAILABLE:
    tf_tensor = tf.constant(np_data)
    print(
        f"\nTensorFlow tensor: {tf_tensor.shape}, dtype: {tf_tensor.dtype}",
        tf_tensor,
        sep="\n",
    )

## 2. Pandas DataFrame Integration

In [None]:
# Tutorial helper that supplies the sample tabular dataset
from foundations.data_utils import get_tutorial_tabular_data

# Request 100 samples; the DataFrame is read from the 'dataframe' key
data = get_tutorial_tabular_data(num_samples=100, return_as_dataframe=True)
df = data['dataframe']

print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst 5 rows:", df.head(), sep="\n")

# Every column except 'target' is treated as a feature.
# Features are cast to float32 and the target column to int64.
feature_columns = [name for name in df.columns if name != 'target']
X = df[feature_columns].values.astype(np.float32)
y = df['target'].values.astype(np.int64)

print(f"\nFeatures: {X.shape}, dtype: {X.dtype}")
print(f"Targets: {y.shape}, dtype: {y.dtype}")

## 3. Framework-Specific Data Loading

In [None]:
# Build an equivalent mini-batch pipeline (batch size 16, shuffled samples)
# in each available framework from the NumPy arrays X and y.

# PyTorch DataLoader
if PYTORCH_AVAILABLE:
    print("🔥 PyTorch DataLoader:")
    X_torch = torch.from_numpy(X)
    y_torch = torch.from_numpy(y)
    dataset = TensorDataset(X_torch, y_torch)
    # shuffle=True reshuffles the individual samples on every pass
    dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

    print(f"  Dataset size: {len(dataset)}")
    print(f"  Number of batches: {len(dataloader)}")

    # Show first batch
    for batch_X, batch_y in dataloader:
        print(f"  First batch - X: {batch_X.shape}, y: {batch_y.shape}")
        break

# TensorFlow Dataset
if TENSORFLOW_AVAILABLE:
    print("\n🟠 TensorFlow Dataset:")
    # NOTE: this rebinds `dataset`, replacing the PyTorch TensorDataset above.
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    # Shuffle BEFORE batching so individual samples are mixed, matching
    # DataLoader(shuffle=True). Calling .batch() first would only permute the
    # order of whole batches, leaving each batch's contents fixed.
    # A buffer covering the whole dataset gives a full uniform shuffle; the
    # max() guard keeps buffer_size positive if X is empty.
    dataset = dataset.shuffle(buffer_size=max(len(X), 1)).batch(16)

    print("  Dataset created successfully")

    # Show first batch
    for batch_X, batch_y in dataset.take(1):
        print(f"  First batch - X: {batch_X.shape}, y: {batch_y.shape}")

print("\n✅ Data preparation comparison complete!")