In [None]:
import os
import numpy as np

# Check CUDA availability
try:
    import cupy as cp
    import cudf
    CUDA_AVAILABLE = cp.cuda.is_available()
    if CUDA_AVAILABLE:
        %load_ext cudf.pandas
        print("🎉 CUDA is available! Using GPU acceleration")
        print(f"CUDA Version: {cp.cuda.runtime.runtimeGetVersion()}")
        print(f"cuDF Version: {cudf.__version__}")
    else:
        print("⚠️ CUDA is available but no GPU was detected")
except ImportError:
    CUDA_AVAILABLE = False
    print("⚠️ CUDA tools not available. Installing required packages:")
    print("pip install cupy-cuda12x cudf-cuda12x")  # Adjust version as needed

# For non-GPU operations we'll still need pandas
import pandas as pd

# Set pandas to use cuDF when possible if CUDA is available
if CUDA_AVAILABLE:
    # This will make pd.read_csv, pd.DataFrame etc. use GPU automatically
    pd.set_option('compute.use_cudf', True)

# CUDA Acceleration for BCI Data Compression

This notebook demonstrates how to use CUDA acceleration via cuDF to improve the performance of our BCI data compression pipeline. We'll cover:

1. Setting up the CUDA environment
2. Initializing cuDF
3. Converting operations to use GPU acceleration
4. Benchmarking performance improvements
5. Managing GPU memory effectively

## 1. Setting Up the CUDA Environment

First, let's verify our CUDA environment and confirm that the necessary dependencies are installed. We'll need:
- CUDA Toolkit
- cuDF
- CuPy
- Numba

We'll check the GPU availability and CUDA version first.

In [None]:
import sys
import numpy as np

# The imports are tried separately from the device queries so that a missing
# package and a missing/unusable GPU are reported differently and neither
# aborts the notebook.
try:
    import cupy as cp
    import cudf
    from numba import cuda
except ImportError as e:
    print(f"CUDA environment not available: {e}")
    print("Will fall back to CPU implementation")
    CUDA_AVAILABLE = False
else:
    # Check CUDA availability. The flag reflects the actual device check, not
    # merely the fact that the packages could be imported.
    CUDA_AVAILABLE = cp.cuda.is_available()
    print(f"CUDA available: {CUDA_AVAILABLE}")

    if CUDA_AVAILABLE:
        # Runtime queries are only issued once a device is known to exist;
        # without one they raise a CUDA runtime error instead of ImportError.
        print(f"CUDA version: {cp.cuda.runtime.runtimeGetVersion()}")
        print(f"Number of GPUs: {cp.cuda.runtime.getDeviceCount()}")

        # Get GPU device information
        device = cp.cuda.runtime.getDevice()
        props = cp.cuda.runtime.getDeviceProperties(device)
        print(f"\nGPU Device: {props['name'].decode()}")
        print(f"Compute Capability: {props['major']}.{props['minor']}")
        print(f"Total Memory: {props['totalGlobalMem'] / 1e9:.2f} GB")
    else:
        print("No usable CUDA device detected")
        print("Will fall back to CPU implementation")

## 2. Initializing cuDF

Now that we've verified our CUDA environment, let's initialize cuDF and configure it to work with our existing pandas code. We'll:
1. Load the cuDF extension
2. Configure pandas API compatibility
3. Set up memory management preferences

In [None]:
if CUDA_AVAILABLE:
    # Load cuDF extension for pandas API compatibility
    %load_ext cudf.pandas

    # Configure pandas API settings
    import pandas as pd
    pd.set_option('compute.use_numba', True)
    pd.set_option('compute.use_cudf', True)

    # Set up cuDF memory pool
    import rmm
    rmm.reinitialize(
        pool_allocator=True,
        initial_pool_size=1 << 30  # 1GB initial pool
    )

    print("✅ cuDF initialized successfully")
    print("✅ Pandas API compatibility enabled")
    print("✅ Memory pool configured")

## 3. Converting Operations to Use GPU Acceleration

Let's convert some common BCI data operations to use GPU acceleration. We'll demonstrate with:
1. Loading and preprocessing neural data
2. Feature extraction
3. Signal filtering (not yet included in the example cell below)
4. Data compression operations (not yet included in the example cell below)

In [None]:
def create_sample_data(n_channels=32, n_samples=10000, seed=None):
    """Generate synthetic neural data drawn from a standard normal distribution.

    Args:
        n_channels: Number of rows (channels) in the returned array.
        n_samples: Number of columns (samples per channel).
        seed: Optional seed for reproducible output. When None (the default)
            the values come from NumPy's global random state, exactly as
            before, so `np.random.seed(...)` is still honoured.

    Returns:
        A float NumPy array of shape (n_channels, n_samples).
    """
    if seed is None:
        return np.random.randn(n_channels, n_samples)
    # A dedicated generator keeps seeded calls from disturbing global state.
    return np.random.default_rng(seed).standard_normal((n_channels, n_samples))

# Create sample data
data = create_sample_data()
print("Created sample data:", data.shape)

if CUDA_AVAILABLE:
    # Convert to GPU arrays
    gpu_data = cp.asarray(data)
    print("\nData transferred to GPU:", gpu_data.shape)

    # Example 1: Basic preprocessing
    def preprocess_gpu(data):
        """Z-score normalize each row of a 2-D CuPy array.

        Args:
            data: CuPy array; mean and standard deviation are taken along
                axis 1, i.e. one pair per row.

        Returns:
            CuPy array of the same shape with each row centred and scaled.
            Rows with zero standard deviation are returned as zeros.
        """
        mean = cp.mean(data, axis=1, keepdims=True)
        std = cp.std(data, axis=1, keepdims=True)
        # A constant row has std == 0; divide by 1 there so it becomes zeros
        # instead of NaN/inf.
        safe_std = cp.where(std == 0, 1.0, std)
        return (data - mean) / safe_std

    normalized_data = preprocess_gpu(gpu_data)
    print("Normalized data stats:")
    print(f"Mean: {float(cp.mean(normalized_data)):.3f}")
    print(f"Std: {float(cp.std(normalized_data)):.3f}")

    # Example 2: Feature extraction using cuDF
    def extract_features_gpu(data):
        """Compute column-wise summary statistics with cuDF.

        Args:
            data: 2-D array accepted by `cudf.DataFrame`. Every column is
                summarized, so pass the data with one column per channel to
                obtain per-channel features.

        Returns:
            Dict with keys 'mean', 'std', 'max' and 'min', each holding one
            value per column of `data`.
        """
        df = cudf.DataFrame(data)
        features = {
            'mean': df.mean().values,
            'std': df.std().values,
            'max': df.max().values,
            'min': df.min().values
        }
        return features

    # The data is laid out as (n_channels, n_samples) while the DataFrame
    # statistics are per column, so transpose to put channels on the columns.
    # Without this, index 0 below would be the first sample, not the first
    # channel.
    features = extract_features_gpu(cp.ascontiguousarray(normalized_data.T))
    print("\nExtracted features (first channel):")
    for name, value in features.items():
        print(f"{name}: {float(value[0]):.3f}")
else:
    print("Running on CPU - GPU acceleration not available")

## 4. Benchmarking Performance

Let's compare the performance between CPU and GPU implementations for some common operations:
1. Data normalization
2. Feature extraction
3. Signal filtering (not yet benchmarked in the cell below)

In [None]:
if CUDA_AVAILABLE:
    import time
    from scipy import signal

    # Create larger dataset for meaningful benchmarks
    large_data = create_sample_data(64, 100000)

    def benchmark_operation(name, cpu_func, gpu_func, data, n_runs=5):
        """Time a CPU and a GPU implementation of the same operation.

        Args:
            name: Label printed in the report.
            cpu_func: Callable taking the NumPy array `data`.
            gpu_func: Callable taking the CuPy copy of `data`.
            data: NumPy array used as input for both implementations.
            n_runs: Number of timed repetitions per implementation (>= 1).

        Returns:
            Tuple `(avg_cpu_seconds, avg_gpu_seconds, speedup)`.

        Raises:
            ValueError: If `n_runs` is smaller than 1.
        """
        if n_runs < 1:
            raise ValueError("n_runs must be at least 1")

        # CPU timing (perf_counter is monotonic and high-resolution)
        cpu_times = []
        for _ in range(n_runs):
            start = time.perf_counter()
            cpu_func(data)
            cpu_times.append(time.perf_counter() - start)

        # GPU timing; the host-to-device copy is done once and not timed
        gpu_data = cp.asarray(data)

        # Untimed warm-up so one-time start-up cost on the first call does not
        # distort the average.
        gpu_func(gpu_data)
        cp.cuda.Stream.null.synchronize()

        gpu_times = []
        for _ in range(n_runs):
            start = time.perf_counter()
            gpu_func(gpu_data)
            # GPU work is asynchronous; wait for completion before stopping
            # the clock.
            cp.cuda.Stream.null.synchronize()
            gpu_times.append(time.perf_counter() - start)

        avg_cpu = float(np.mean(cpu_times))
        avg_gpu = float(np.mean(gpu_times))
        speedup = avg_cpu / avg_gpu if avg_gpu > 0 else float("inf")

        print(f"\n{name} Benchmark:")
        print(f"CPU time: {avg_cpu:.4f}s")
        print(f"GPU time: {avg_gpu:.4f}s")
        print(f"Speedup: {speedup:.1f}x")
        return avg_cpu, avg_gpu, speedup

    # 1. Normalization benchmark
    def cpu_normalize(data):
        """Z-score normalize each row of a NumPy array (CPU baseline)."""
        mean = np.mean(data, axis=1, keepdims=True)
        std = np.std(data, axis=1, keepdims=True)
        # Constant rows have std == 0; divide by 1 there to avoid NaN/inf.
        safe_std = np.where(std == 0, 1.0, std)
        return (data - mean) / safe_std

    benchmark_operation(
        "Normalization",
        cpu_normalize,
        preprocess_gpu,
        large_data
    )

    # 2. Feature extraction benchmark
    def cpu_extract_features(data):
        """Column-wise mean/std/max/min computed with NumPy (CPU baseline).

        NumPy is used instead of `pd.DataFrame` because, once the cudf.pandas
        extension is loaded, pandas calls run on the GPU and would no longer
        be a CPU baseline. `ddof=1` matches the default of `DataFrame.std()`.
        """
        return {
            'mean': np.mean(data, axis=0),
            'std': np.std(data, axis=0, ddof=1),
            'max': np.max(data, axis=0),
            'min': np.min(data, axis=0)
        }

    # Both implementations reduce column-wise, so feed (n_samples, n_channels):
    # that yields one feature per channel and keeps the DataFrame at 64
    # columns instead of 100000.
    benchmark_operation(
        "Feature Extraction",
        cpu_extract_features,
        extract_features_gpu,
        np.ascontiguousarray(large_data.T)
    )

else:
    print("GPU benchmarking not available - CUDA required")

## 5. Memory Management

Proper GPU memory management is crucial for optimal performance. Here's how to:
1. Monitor memory usage
2. Clear unused memory
3. Handle large datasets efficiently

In [None]:
if CUDA_AVAILABLE:
    def print_memory_stats():
        """Print used, free and total memory of the current GPU in GB.

        The numbers come from the CUDA runtime (`memGetInfo`), so memory that
        a pool has reserved but not handed out still counts as used.
        """
        free, total = cp.cuda.runtime.memGetInfo()
        used = total - free
        print("GPU Memory Usage:")
        print(f"Used: {used / 1e9:.2f} GB")
        print(f"Free: {free / 1e9:.2f} GB")
        print(f"Total: {total / 1e9:.2f} GB")

    # Initial memory state
    print("Initial memory state:")
    print_memory_stats()

    # Create some large arrays (10000 x 10000 float64 is about 0.8 GB each)
    print("\nAllocating large arrays...")
    arrays = []
    try:
        for i in range(3):
            arrays.append(cp.random.randn(10000, 10000))
            print(f"\nAfter allocating array {i+1}:")
            print_memory_stats()
    except MemoryError as e:
        # Smaller GPUs cannot hold all three arrays; keep the demo running.
        print(f"\nStopped allocating - GPU out of memory: {e}")

    # Clear memory: dropping the references returns the blocks to CuPy's pool,
    # free_all_blocks() then releases the pooled blocks.
    # NOTE(review): if CuPy's allocator has been redirected to RMM, the CuPy
    # pool is not the one holding this memory - confirm for this environment.
    print("\nClearing memory...")
    arrays.clear()
    cp.get_default_memory_pool().free_all_blocks()
    print_memory_stats()

    # `cp.cuda.Device` as a context manager only selects the device for the
    # enclosed code; it does not free arrays on exit. Temporary arrays must be
    # released explicitly.
    print("\nUsing a device context with explicit cleanup...")
    with cp.cuda.Device(0):
        # Work with temporary arrays
        temp_array = cp.random.randn(5000, 5000)
        print("\nInside context:")
        print_memory_stats()
        del temp_array
        cp.get_default_memory_pool().free_all_blocks()

    print("\nAfter context exit:")
    print_memory_stats()

else:
    print("Memory management demo not available - CUDA required")

## Conclusion

We've demonstrated how to:
1. Set up CUDA acceleration with cuDF
2. Convert existing operations to use GPU processing
3. Achieve significant speedups for common operations
4. Properly manage GPU memory

For best results:
- Monitor memory usage carefully
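- Release GPU arrays explicitly (drop references, then call `free_all_blocks()`); a `cp.cuda.Device` context manager selects a device but does not free memory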
- Batch operations when possible
- Profile performance to identify bottlenecks

Remember to check `CUDA_AVAILABLE` before using GPU operations and provide CPU fallbacks for compatibility.

## 6. Scikit-learn Acceleration with cuML

We can accelerate scikit-learn operations using RAPIDS cuML, which provides GPU-accelerated versions of common machine learning algorithms. This is particularly useful for:
- Dimensionality reduction (PCA, UMAP)
- Clustering (K-means, DBSCAN)
- Classification and regression
- Cross-validation

In [None]:
if CUDA_AVAILABLE:
    try:
        # The cuML estimators are imported and used directly, so the
        # `cuml.accel` extension is intentionally not loaded: it redirects
        # scikit-learn usage later in the session to cuML, which would turn
        # any later scikit-learn "CPU" baseline into a GPU run.
        import cuml
        from cuml.preprocessing import StandardScaler
        from cuml.decomposition import PCA
        from cuml.cluster import KMeans
        print("✅ cuML successfully loaded")

        # Generate sample data for ML tasks (seeded, local generator so the
        # global CuPy random state is left untouched)
        n_samples = 10000
        n_features = 32
        X = cp.random.RandomState(42).randn(n_samples, n_features)

        # Example 1: Standardization and PCA
        print("\n🔄 Testing preprocessing and dimensionality reduction...")
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        pca = PCA(n_components=8)
        X_pca = pca.fit_transform(X_scaled)

        print(f"Original shape: {X.shape}")
        print(f"After PCA: {X_pca.shape}")
        print(f"Explained variance ratio: {float(pca.explained_variance_ratio_.sum()):.3f}")

        # Example 2: Clustering
        print("\n🔍 Testing clustering...")
        kmeans = KMeans(n_clusters=5, random_state=42)
        clusters = kmeans.fit_predict(X_scaled)

        print("Number of samples in each cluster:")
        unique, counts = cp.unique(cp.asarray(clusters), return_counts=True)
        # Copy both small arrays to the host once instead of reading them
        # element by element from the device inside the loop.
        for cluster, count in zip(cp.asnumpy(unique).tolist(), cp.asnumpy(counts).tolist()):
            print(f"Cluster {cluster}: {count} samples")

    except ImportError as e:
        print(f"cuML not available: {e}")
        print("Install cuML with: pip install --extra-index-url=https://pypi.nvidia.com cuml-cu12")
else:
    print("CUDA required for cuML acceleration")

In [None]:
if CUDA_AVAILABLE:
    try:
        # Import everything this cell uses so that it does not depend on names
        # left behind by earlier cells. A missing package then surfaces as the
        # ImportError handled below instead of an uncaught NameError.
        import time

        import sklearn
        from sklearn.preprocessing import StandardScaler as CPUStandardScaler
        from sklearn.decomposition import PCA as CPUPca
        from sklearn.cluster import KMeans as CPUKMeans

        from cuml.preprocessing import StandardScaler
        from cuml.decomposition import PCA
        from cuml.cluster import KMeans

        # NOTE(review): if the `cuml.accel` extension was loaded earlier in
        # this kernel, the scikit-learn estimators above may be redirected to
        # cuML and the CPU timings are then not a CPU baseline - verify.
        print("🔥 Benchmarking scikit-learn vs cuML...")

        # Create larger dataset for meaningful comparison (seeded so repeated
        # runs time the same input)
        n_samples = 50000
        n_features = 64
        X_cpu = np.random.default_rng(42).standard_normal((n_samples, n_features))
        X_gpu = cp.asarray(X_cpu)

        def benchmark_ml(name, cpu_func, gpu_func, cpu_data, gpu_data):
            """Time one CPU and one GPU estimator call and print the speedup.

            Args:
                name: Label printed in the report.
                cpu_func: Callable applied to `cpu_data` (NumPy array).
                gpu_func: Callable applied to `gpu_data` (CuPy array).
                cpu_data: Host input for the CPU implementation.
                gpu_data: Device input for the GPU implementation.

            Returns:
                Tuple `(cpu_result, gpu_result)` with the outputs of the timed
                calls.
            """
            # CPU timing (perf_counter is monotonic and high-resolution)
            cpu_start = time.perf_counter()
            cpu_result = cpu_func(cpu_data)
            cpu_time = time.perf_counter() - cpu_start

            # Untimed warm-up on a small slice so one-time GPU start-up cost
            # is not charged to the measurement; the timed call below refits
            # on the full data.
            gpu_func(gpu_data[:1000])
            cp.cuda.Stream.null.synchronize()

            # GPU timing
            gpu_start = time.perf_counter()
            gpu_result = gpu_func(gpu_data)
            cp.cuda.Stream.null.synchronize()
            gpu_time = time.perf_counter() - gpu_start

            speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf")
            print(f"\n{name}:")
            print(f"CPU time: {cpu_time:.3f}s")
            print(f"GPU time: {gpu_time:.3f}s")
            print(f"Speedup: {speedup:.1f}x")
            return cpu_result, gpu_result

        # Test 1: StandardScaler
        print("\n📊 Testing StandardScaler...")
        cpu_scaler = CPUStandardScaler()
        gpu_scaler = StandardScaler()

        benchmark_ml(
            "StandardScaler",
            cpu_scaler.fit_transform,
            gpu_scaler.fit_transform,
            X_cpu, X_gpu
        )

        # Test 2: PCA
        print("\n🔄 Testing PCA...")
        cpu_pca = CPUPca(n_components=16)
        gpu_pca = PCA(n_components=16)

        benchmark_ml(
            "PCA",
            cpu_pca.fit_transform,
            gpu_pca.fit_transform,
            X_cpu, X_gpu
        )

        # Test 3: KMeans
        print("\n🔍 Testing KMeans...")
        cpu_kmeans = CPUKMeans(n_clusters=8, random_state=42)
        gpu_kmeans = KMeans(n_clusters=8, random_state=42)

        benchmark_ml(
            "KMeans",
            cpu_kmeans.fit_predict,
            gpu_kmeans.fit_predict,
            X_cpu, X_gpu
        )

    except ImportError as e:
        print(f"Benchmarking skipped - required packages not available: {e}")
else:
    print("GPU benchmarking not available - CUDA required")