# Environment Setup and Verification

## Overview
This notebook sets up and verifies the computational environment for the **Spatial-scRNA-seq Immunotherapy Resistance Atlas** project.

### Objectives
1. Verify all required packages are installed
2. Check GPU availability for deep learning models
3. Configure project paths and settings
4. Set up logging and reproducibility

### Prerequisites
- Conda/Mamba environment created from `environment.yml`
- NVIDIA GPU with CUDA support (recommended for scVI, cell2location)

---

## 1. Environment Installation

If you haven't created the conda environment yet, run:

```bash
# Using mamba (faster)
mamba env create -f environment.yml

# Or using conda
conda env create -f environment.yml

# Activate environment
conda activate spatial-immunoresist
```

## 2. Import Core Packages and Verify Installation

In [None]:
# Standard library
import sys
import os
import warnings
from pathlib import Path

# Set up warnings
warnings.filterwarnings('ignore')

print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")

In [None]:
# Core scverse ecosystem
import scanpy as sc
import anndata as ad
import squidpy as sq
import muon as mu

# One report line per package, in import order
for pkg_label, pkg in (("scanpy", sc), ("anndata", ad), ("squidpy", sq), ("muon", mu)):
    print(f"{pkg_label} version: {pkg.__version__}")

In [None]:
# Deep learning
import torch
import scvi

# One report line per package, in import order
for pkg_label, pkg in (("PyTorch", torch), ("scvi-tools", scvi)):
    print(f"{pkg_label} version: {pkg.__version__}")

In [None]:
# Data handling
import numpy as np
import pandas as pd
import scipy

# One report line per package, in import order
for pkg_label, pkg in (("NumPy", np), ("pandas", pd), ("SciPy", scipy)):
    print(f"{pkg_label} version: {pkg.__version__}")

In [None]:
# Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Read the version from the top-level matplotlib package instead of reaching
# through pyplot's namespace (`plt.matplotlib`), which only resolves because
# pyplot happens to import matplotlib internally.
print(f"Matplotlib version: {matplotlib.__version__}")
print(f"Seaborn version: {sns.__version__}")

## 3. GPU Availability Check

GPU acceleration is critical for:
- **scVI/scANVI**: Variational inference for batch correction
- **cell2location**: Spatial deconvolution

Without a GPU, these methods will be significantly slower.

In [None]:
# Report CUDA status and choose the default torch device
banner = "=" * 50
print(banner)
print("GPU CONFIGURATION")
print(banner)

if not torch.cuda.is_available():
    # No CUDA device visible: fall back to the CPU
    print("CUDA available: False")
    print("WARNING: GPU not available. Deep learning models will run on CPU (slower).")
    device = torch.device("cpu")
else:
    print("CUDA available: True")
    print(f"CUDA version: {torch.version.cuda}")
    gpu_count = torch.cuda.device_count()
    print(f"Number of GPUs: {gpu_count}")

    # One entry per visible GPU: its name and total memory (bytes / 1e9)
    for i in range(gpu_count):
        print(f"\nGPU {i}: {torch.cuda.get_device_name(i)}")
        total_bytes = torch.cuda.get_device_properties(i).total_memory
        print(f"  Memory: {total_bytes / 1e9:.1f} GB")

    # The first GPU becomes the default device
    device = torch.device("cuda:0")
    print(f"\nDefault device set to: {device}")

In [None]:
# Seed scvi-tools and report which kind of training it is set up for.
# NOTE(review): the seed is hard-coded to 42 here, while the project-wide seed
# is read from config['random_seed'] later in this notebook — confirm they agree.
scvi.settings.seed = 42

if not torch.cuda.is_available():
    print("scvi-tools configured for CPU training")
else:
    # Turn on the pin-memory data-loader setting only when a GPU is present
    scvi.settings.dl_pin_memory_gpu_training = True
    print("scvi-tools configured for GPU training")

## 4. Configure Project Paths

In [None]:
# Project root: two levels above the current working directory, made absolute
PROJECT_ROOT = Path("../..").resolve()
print(f"Project root: {PROJECT_ROOT}")

# Directory layout as (key, path components relative to PROJECT_ROOT)
_LAYOUT = (
    ('data_raw', ('data', 'raw')),
    ('data_raw_scrna', ('data', 'raw', 'scrna')),
    ('data_raw_spatial', ('data', 'raw', 'spatial')),
    ('data_processed', ('data', 'processed')),
    ('data_processed_scrna', ('data', 'processed', 'scrna')),
    ('data_processed_spatial', ('data', 'processed', 'spatial')),
    ('data_external', ('data', 'external')),
    ('results', ('results',)),
    ('figures', ('results', 'figures')),
    ('tables', ('results', 'tables')),
    ('models', ('results', 'models')),
    ('config', ('config',)),
)
PATHS = {key: PROJECT_ROOT.joinpath(*parts) for key, parts in _LAYOUT}

# Report which directories are present; nothing is created here
print("\nVerifying project directories:")
for name, path in PATHS.items():
    print(f"  {name}: {'OK' if path.exists() else 'MISSING'}")

## 5. Load Configuration

In [None]:
import yaml

# Load analysis parameters
config_path = PATHS['config'] / 'analysis_params.yaml'

# PROJECT_ROOT is resolved relative to the current working directory, so a
# kernel started elsewhere lands here with a wrong path; say so explicitly.
if not config_path.is_file():
    raise FileNotFoundError(
        f"Configuration file not found: {config_path}. "
        "PROJECT_ROOT is resolved relative to the current working directory; "
        "check where the kernel was started."
    )

# Explicit UTF-8 so parsing does not depend on the platform's locale encoding
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

print(f"Configuration loaded from: {config_path}")
print(f"\nProject: {config['project']['name']}")
print(f"Version: {config['project']['version']}")

In [None]:
# Display dataset configuration: one section per modality, each entry shown as
# "id: cancer_type (<modality-specific field>)"
print("\nDatasets to analyze:")
for heading, group, detail_key in (
    ("\nscRNA-seq datasets:", 'scrna', 'treatment'),
    ("\nSpatial datasets:", 'spatial', 'platform'),
):
    print(heading)
    for dataset in config['datasets'][group]:
        print(f"  - {dataset['id']}: {dataset['cancer_type']} ({dataset[detail_key]})")

## 6. Configure Scanpy Settings

In [None]:
# Scanpy verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3

# Figure defaults; the DPI comes from the loaded project configuration
figure_dpi = config['visualization']['figure_dpi']
sc.settings.set_figure_params(dpi=figure_dpi, facecolor='white', frameon=False)

# Point scanpy's figure and cache directories at project locations
sc.settings.figdir = str(PATHS['figures'])
sc.settings.cachedir = str(PROJECT_ROOT / '.cache')

print("Scanpy settings configured")
print(f"  Figure directory: {sc.settings.figdir}")
print(f"  Cache directory: {sc.settings.cachedir}")

## 7. Set Random Seed for Reproducibility

In [None]:
import random

# Single source of truth for the seed: the project configuration
SEED = config['random_seed']

# Seed Python, NumPy and PyTorch (CPU) generators
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# manual_seed_all seeds every visible GPU, including the current one, so a
# separate torch.cuda.manual_seed call is not needed
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# scvi-tools was seeded with a literal 42 earlier in this notebook; re-seed it
# here so it follows the configured value instead of silently diverging
scvi.settings.seed = SEED

# For deterministic operations (may slow down)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

print(f"Random seed set to: {SEED}")
print("All random number generators initialized for reproducibility")

## 8. Test Additional Packages

In [None]:
# Probe optional analysis packages; a missing package is reported, not fatal
print("Testing additional packages...\n")

# Ligand-receptor / cell-cell communication
try:
    import liana
except ImportError as err:
    print(f"liana: NOT INSTALLED - {err}")
else:
    print(f"liana version: {liana.__version__}")

# decoupler, imported under the alias `dc`
try:
    import decoupler as dc
except ImportError as err:
    print(f"decoupler: NOT INSTALLED - {err}")
else:
    print(f"decoupler version: {dc.__version__}")

# celltypist
try:
    import celltypist
except ImportError as err:
    print(f"celltypist: NOT INSTALLED - {err}")
else:
    print(f"celltypist version: {celltypist.__version__}")

# cell2location: only presence is reported, no version string
try:
    import cell2location
except ImportError as err:
    print(f"cell2location: NOT INSTALLED - {err}")
else:
    print("cell2location: INSTALLED")

## 9. Create Utility Functions

In [None]:
def save_figure(fig, name, formats=('pdf', 'png'), dpi=300, outdir=None):
    """
    Save figure in multiple formats.

    Parameters
    ----------
    fig : matplotlib.figure.Figure
        Figure to save (any object exposing a compatible ``savefig`` method)
    name : str
        Filename without extension
    formats : sequence of str or str
        Formats (file extensions) to save; a single string such as ``"png"``
        is treated as one format
    dpi : int
        Resolution passed to ``fig.savefig`` (default 300)
    outdir : str or pathlib.Path, optional
        Directory to write into; defaults to ``PATHS['figures']``

    Notes
    -----
    The output directory is created if it does not exist yet. One
    ``Saved: <path>`` line is printed per file written.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(formats, str):
        formats = (formats,)

    target_dir = Path(outdir) if outdir is not None else PATHS['figures']
    # Saving should not fail just because the figures directory is missing
    target_dir.mkdir(parents=True, exist_ok=True)

    for fmt in formats:
        filepath = target_dir / f"{name}.{fmt}"
        fig.savefig(filepath, dpi=dpi, bbox_inches='tight')
        print(f"Saved: {filepath}")


def load_config(path=None):
    """
    Load analysis configuration.

    Parameters
    ----------
    path : str or pathlib.Path, optional
        YAML file to read; defaults to ``PATHS['config'] / 'analysis_params.yaml'``

    Returns
    -------
    dict
        Configuration dictionary (empty if the YAML file contains no document)

    Raises
    ------
    FileNotFoundError
        If the configuration file does not exist
    """
    config_file = Path(path) if path is not None else PATHS['config'] / 'analysis_params.yaml'
    # Explicit UTF-8 so parsing does not depend on the platform's locale encoding
    with open(config_file, 'r', encoding='utf-8') as f:
        loaded = yaml.safe_load(f)
    # An empty YAML file parses to None; return an empty dict as documented
    return {} if loaded is None else loaded


print("Utility functions defined")

## 10. Environment Summary

In [None]:
# Final recap of the values established earlier in this notebook
rule = "=" * 60
print(rule)
print("ENVIRONMENT SETUP COMPLETE")
print(rule)

print(f"\nProject: {config['project']['name']}")
print(f"Python: {sys.version.split()[0]}")
gpu_status = 'Available' if torch.cuda.is_available() else 'Not available'
print(f"GPU: {gpu_status}")
print(f"Random seed: {SEED}")

# Versions of the core packages, one per line
print("\nCore packages:")
for pkg_label, pkg in (("scanpy", sc), ("anndata", ad), ("scvi-tools", scvi), ("squidpy", sq)):
    print(f"  {pkg_label}: {pkg.__version__}")

print("\nReady to proceed with data acquisition!")
print(rule)

---

## Next Steps

Proceed to the data acquisition notebooks:
- `01a_scrna_data_download.ipynb` - Download scRNA-seq datasets from GEO
- `01b_spatial_data_download.ipynb` - Download spatial transcriptomics datasets