# scRNA-seq Data Download from GEO

## Overview
This notebook downloads and organizes single-cell RNA sequencing datasets from the Gene Expression Omnibus (GEO) for the immunotherapy resistance atlas.

### Datasets
| GEO ID | Cancer Type | Treatment | Patients |
|--------|-------------|-----------|----------|
| GSE115978 | Melanoma | anti-PD-1 | 48 |
| GSE123813 | BCC | anti-PD-1 | 10 |
| GSE212966 | PDAC | TME | - |
| GSE149614 | HCC | ICIs | - |
| GSE197177 | PDAC | TME | - |
| GSE206785 | Gastric | Metastasis | - |
| GSE183904 | Gastric | T cell | - |
| GSE130000 | Ovarian | Relapse | - |
| GSE202642 | HCC | Adjacent | - |

### Output
- Raw count matrices organized by GEO ID
- Metadata files with sample information

---

## 1. Setup

In [None]:
import os
import sys
from pathlib import Path
import yaml
import requests
import gzip
import shutil
import tarfile
from tqdm import tqdm
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad

# GEOparse is treated as optional here: report its version when the import
# succeeds, otherwise print the install hint and carry on.
try:
    import GEOparse
except ImportError:
    print("GEOparse not installed. Install with: pip install GEOparse")
else:
    print(f"GEOparse version: {GEOparse.__version__}")

# Project layout, resolved two levels above the current working directory
PROJECT_ROOT = Path("../..").resolve()
DATA_RAW = PROJECT_ROOT.joinpath('data', 'raw', 'scrna')
CONFIG_PATH = PROJECT_ROOT.joinpath('config', 'analysis_params.yaml')

print(f"Data will be saved to: {DATA_RAW}")

In [None]:
# Load configuration
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the platform's default locale encoding.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Get dataset list (each entry is read below for its 'id' and 'cancer_type')
datasets = config['datasets']['scrna']

print(f"Datasets to download: {len(datasets)}")
for ds in datasets:
    print(f"  - {ds['id']}: {ds['cancer_type']}")

## 2. Define Download Functions

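There are several ways to obtain GEO data:
1. **GEOparse**: Parse GEO SOFT files and download supplementary data
2. **Direct FTP**: Download from the NCBI FTP server (the `download_file` helper below fetches files over HTTP(S) using `requests`)
3. **SRA Tools**: For raw sequencing data (if needed; no helper for this is defined in the cell below)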

In [None]:
def download_file(url, output_path, chunk_size=8192, timeout=60):
    """
    Download file with progress bar.

    The payload is streamed into a temporary ``<name>.part`` file next to
    ``output_path`` and only renamed to ``output_path`` once the transfer
    has finished, so an interrupted download never leaves a truncated file
    under the final name.

    Parameters
    ----------
    url : str
        URL to download
    output_path : Path or str
        Output file path
    chunk_size : int
        Chunk size for streaming download
    timeout : float or tuple
        Timeout passed to ``requests.get`` (seconds)

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    """
    output_path = Path(output_path)
    partial_path = output_path.with_name(output_path.name + '.part')

    try:
        # The context manager releases the connection even on failure
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTTP error page as data
            response.raise_for_status()

            # A missing/zero content-length means the size is unknown;
            # tqdm then shows a counter without a total.
            total_size = int(response.headers.get('content-length', 0)) or None

            with open(partial_path, 'wb') as f, tqdm(
                total=total_size, unit='B', unit_scale=True, desc=output_path.name
            ) as pbar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))
    except BaseException:
        # Remove the incomplete file (also on Ctrl-C), then re-raise
        if partial_path.exists():
            partial_path.unlink()
        raise

    # Publish the completed download under its final name
    partial_path.replace(output_path)


def get_geo_supplementary_files(geo_id, output_dir):
    """
    Download supplementary files from GEO using GEOparse.

    The series is parsed with ``GEOparse.get_GEO`` and every sample (GSM)
    is scanned for supplementary-file entries. Metadata keys beginning with
    ``'supplementary_file'`` are all considered, so numbered variants such
    as ``'supplementary_file_1'`` are included. Empty entries and the
    literal placeholder ``'NONE'`` are skipped. Links of the form
    ``ftp://ftp.ncbi.nlm.nih.gov/...`` are rewritten to the equivalent
    ``https://`` address because ``download_file`` uses ``requests``, which
    does not handle the FTP scheme.

    Parameters
    ----------
    geo_id : str
        GEO accession (e.g., 'GSE115978')
    output_dir : Path
        Directory to save files

    Returns
    -------
    list
        List of downloaded file paths (files that already existed locally
        are included without being downloaded again)
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nDownloading {geo_id}...")

    # Parse GEO entry
    gse = GEOparse.get_GEO(geo=geo_id, destdir=str(output_dir), silent=True)

    ncbi_ftp_prefix = 'ftp://ftp.ncbi.nlm.nih.gov/'
    ncbi_https_prefix = 'https://ftp.ncbi.nlm.nih.gov/'

    downloaded_files = []

    # Get supplementary files
    for gsm in gse.gsms.values():
        for key, values in gsm.metadata.items():
            if not key.startswith('supplementary_file'):
                continue

            for supp_file in values:
                # Skip blanks and the 'NONE' placeholder
                if not supp_file or supp_file.strip().upper() == 'NONE':
                    continue

                url = supp_file.strip()
                if url.startswith(ncbi_ftp_prefix):
                    url = ncbi_https_prefix + url[len(ncbi_ftp_prefix):]

                filename = url.split('/')[-1]
                filepath = output_dir / filename

                if not filepath.exists():
                    print(f"  Downloading: {filename}")
                    download_file(url, filepath)
                else:
                    print(f"  Already exists: {filename}")

                downloaded_files.append(filepath)

    return downloaded_files


def extract_archive(archive_path, output_dir):
    """
    Extract tar, tar.gz/tgz or gz files.

    Tar archives (plain or gzip-compressed) are unpacked into
    ``output_dir``. A single gzip file is decompressed to
    ``output_dir / <name without .gz>``. Any other file type is left
    untouched and the function returns without doing anything.

    Parameters
    ----------
    archive_path : Path
        Path to archive
    output_dir : Path
        Extraction directory (created if it does not exist)
    """
    archive_path = Path(archive_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    lowered_name = archive_path.name.lower()

    if lowered_name.endswith(('.tar', '.tar.gz', '.tgz')):
        # 'r:*' lets tarfile detect the compression, covering plain .tar too
        with tarfile.open(archive_path, 'r:*') as tar:
            if hasattr(tarfile, 'data_filter'):
                # Downloaded archives are untrusted: the 'data' filter rejects
                # members that would escape output_dir (absolute paths, '..',
                # unsafe links). Only available on newer Python releases.
                tar.extractall(output_dir, filter='data')
            else:
                tar.extractall(output_dir)
    elif lowered_name.endswith('.gz'):
        # gzip file: strip the final suffix to get the decompressed name
        output_file = output_dir / archive_path.stem
        with gzip.open(archive_path, 'rb') as f_in, open(output_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

# Cell-level confirmation that the helper definitions above were executed
print("Download functions defined")

## 3. Download Datasets

### Important Notes:
- Some datasets may require manual download due to size or access restrictions
- Check each GEO page for data format (10x Genomics, Smart-seq2, etc.)
- Large datasets may take significant time to download

In [None]:
# Build one tracking record per configured dataset
download_status = []

for dataset in datasets:
    geo_id, cancer_type = dataset['id'], dataset['cancer_type']

    # Each dataset gets its own folder under the raw-data root
    output_dir = DATA_RAW / geo_id
    output_dir.mkdir(parents=True, exist_ok=True)

    # Any entry already inside the folder marks the dataset as present
    has_content = any(output_dir.glob('*'))

    download_status.append({
        'geo_id': geo_id,
        'cancer_type': cancer_type,
        'output_dir': str(output_dir),
        'status': 'exists' if has_content else 'pending',
    })

# Show the resulting table in the notebook
status_df = pd.DataFrame(download_status)
print("Download status:")
display(status_df)

### 3.1 GSE115978 - Melanoma anti-PD-1

This is a key dataset with 48 patients treated with anti-PD-1, including pre- and post-treatment samples.

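**Reference**: Sade-Feldman et al., Cell 2018 (please verify against the GEO record: accession GSE115978 appears to correspond to Jerby-Arnon et al., Cell 2018, whereas the Sade-Feldman et al. dataset is deposited as GSE120575)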

In [None]:
# GSE115978: point the user at the target folder and the GEO record
geo_id = "GSE115978"
output_dir = DATA_RAW / geo_id

# This dataset has processed count matrices
# Files typically include:
# - TPM matrix
# - Metadata with response information

for message in (
    f"Downloading {geo_id}...",
    f"Output directory: {output_dir}",
    "\nNote: This dataset may require manual download from GEO.",
    f"GEO URL: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={geo_id}",
):
    print(message)

# Uncomment to download using GEOparse:
# files = get_geo_supplementary_files(geo_id, output_dir)

### 3.2 Manual Download Instructions

For large datasets or those with complex structures, manual download is recommended:

1. Go to https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSEXXX (replace `GSEXXX` with the dataset's GEO accession)
2. Scroll to "Supplementary file" section
3. Download the count matrix files (usually .txt.gz or .h5 format)
4. Place in the corresponding `data/raw/scrna/GSEXXX/` directory

In [None]:
# Alternative: shell commands (wget) for fetching the archives by hand.
# They are only printed here; run them in a terminal or via subprocess.

download_commands = {
    # Example entries - adjust based on actual GEO supplementary files
    'GSE115978': [
        'wget -P data/raw/scrna/GSE115978/ "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE115nnn/GSE115978/suppl/GSE115978_RAW.tar"',
    ],
    'GSE123813': [
        'wget -P data/raw/scrna/GSE123813/ "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE123nnn/GSE123813/suppl/GSE123813_RAW.tar"',
    ],
}

# Assemble the full listing first, then emit it in one go
listing = ["Manual download commands (run in terminal):", "=" * 60]
for geo_id, commands in download_commands.items():
    listing.append(f"\n# {geo_id}")
    listing.extend(commands)
print("\n".join(listing))

## 4. Process Downloaded Data

After downloading, we need to:
1. Extract archives
2. Identify file formats (10x, Smart-seq2, etc.)
3. Convert to AnnData format for downstream analysis

In [None]:
def identify_data_format(directory):
    """
    Identify the scRNA-seq data format in a directory.

    All regular files below ``directory`` are inspected (recursively) by
    file name only, case-insensitively. A trailing compression suffix
    (``.gz``, ``.bz2``, ``.xz``) is ignored when determining a file's
    extension, so e.g. ``counts.txt.gz`` counts as a ``.txt`` file.

    Checks are applied in this order; the first match wins:

    1. any file name containing ``.h5ad``                       -> 'h5ad'
    2. any file name containing ``filtered_feature_bc_matrix.h5`` -> '10x_h5'
    3. any file name containing ``matrix.mtx``                  -> '10x_mtx'
    4. any file with extension ``.h5``                          -> '10x_h5'
    5. any file with extension ``.csv`` or ``.txt``             -> 'csv_txt'

    Parameters
    ----------
    directory : Path
        Directory to inspect

    Returns
    -------
    str
        Data format: 'h5ad', '10x_h5', '10x_mtx', 'csv_txt', or 'unknown'
    """
    directory = Path(directory)

    # Only file names are matched, so directory names (including the path
    # leading to `directory` itself) cannot trigger a false positive.
    names = [path.name.lower() for path in directory.rglob('*') if path.is_file()]

    compression_suffixes = ('.gz', '.bz2', '.xz')
    extensions = set()
    for name in names:
        for suffix in compression_suffixes:
            if name.endswith(suffix):
                name = name[:-len(suffix)]
                break
        extensions.add(Path(name).suffix)

    if any('.h5ad' in name for name in names):
        return 'h5ad'
    if any('filtered_feature_bc_matrix.h5' in name for name in names):
        return '10x_h5'
    if any('matrix.mtx' in name for name in names):
        return '10x_mtx'
    if '.h5' in extensions:
        return '10x_h5'
    if '.csv' in extensions or '.txt' in extensions:
        return 'csv_txt'
    return 'unknown'


def load_10x_data(directory):
    """
    Load 10x Genomics data from directory.

    The directory is searched recursively. ``.h5`` files take priority:
    a file whose name contains ``filtered_feature_bc_matrix.h5`` is
    preferred, otherwise the first ``.h5`` file in sorted path order is
    read with ``sc.read_10x_h5``. If no ``.h5`` file exists, the folder
    holding the first ``matrix.mtx*`` file (sorted path order) is read with
    ``sc.read_10x_mtx``. Candidates are sorted so that the selection does
    not depend on filesystem listing order. Only one matrix is loaded even
    when several candidates are present.

    Parameters
    ----------
    directory : Path
        Directory containing 10x output

    Returns
    -------
    AnnData
        Loaded data

    Raises
    ------
    FileNotFoundError
        If neither an ``.h5`` file nor a ``matrix.mtx*`` file is found.
    """
    directory = Path(directory)

    # Try h5 file first
    h5_files = sorted(path for path in directory.rglob('*.h5') if path.is_file())
    if h5_files:
        filtered = [
            path for path in h5_files
            if 'filtered_feature_bc_matrix.h5' in path.name
        ]
        return sc.read_10x_h5((filtered or h5_files)[0])

    # Try mtx directory
    mtx_files = sorted(directory.rglob('matrix.mtx*'))
    if mtx_files:
        return sc.read_10x_mtx(mtx_files[0].parent)

    raise FileNotFoundError(f"No 10x data found in {directory}")


# Cell-level confirmation that the loader definitions above were executed
print("Data loading functions defined")

In [None]:
# Check downloaded data formats
print("Checking data formats in downloaded directories:\n")

for dataset in datasets:
    geo_id = dataset['id']
    data_dir = DATA_RAW / geo_id

    # is_dir() (rather than exists()) keeps iterdir() from failing when a
    # regular file happens to sit at the dataset path.
    if data_dir.is_dir() and any(data_dir.iterdir()):
        format_type = identify_data_format(data_dir)
        # Count regular files only: rglob('*') also yields sub-directories,
        # which would inflate the number reported as "files".
        n_files = sum(1 for path in data_dir.rglob('*') if path.is_file())
        print(f"{geo_id}: {format_type} ({n_files} files)")
    else:
        print(f"{geo_id}: NOT DOWNLOADED")

## 5. Create Dataset Metadata Registry

We'll create a metadata file tracking all datasets and their properties.

In [None]:
# Create comprehensive metadata: one record per configured dataset
metadata_records = []

for dataset in datasets:
    geo_id = dataset['id']
    data_dir = DATA_RAW / geo_id

    # A dataset counts as downloaded when its folder exists and is non-empty
    is_downloaded = data_dir.is_dir() and any(data_dir.iterdir())

    # 'timepoints' may be absent, null, a single value, or a list in the
    # config; normalise it to a list so the join below is always valid.
    timepoints = dataset.get('timepoints') or []
    if isinstance(timepoints, str) or not isinstance(timepoints, (list, tuple)):
        timepoints = [timepoints]

    record = {
        'geo_id': geo_id,
        'cancer_type': dataset['cancer_type'],
        'treatment': dataset.get('treatment', 'N/A'),
        'n_patients': dataset.get('n_patients', 'Unknown'),
        'timepoints': ', '.join(str(tp) for tp in timepoints),
        'data_path': str(data_dir),
        'downloaded': is_downloaded,
        'format': identify_data_format(data_dir) if data_dir.exists() else 'N/A'
    }

    metadata_records.append(record)

# Create DataFrame
metadata_df = pd.DataFrame(metadata_records)

# Save metadata (make sure the target folder exists before writing)
DATA_RAW.mkdir(parents=True, exist_ok=True)
metadata_path = DATA_RAW / 'dataset_metadata.csv'
metadata_df.to_csv(metadata_path, index=False)

print(f"Metadata saved to: {metadata_path}")
display(metadata_df)

## 6. Summary and Next Steps

### Completed
- Set up download infrastructure
- Created dataset metadata registry
- Identified data formats

### Next Steps
1. Complete manual downloads for remaining datasets
2. Proceed to `01b_spatial_data_download.ipynb` for spatial transcriptomics data
3. Continue to `02_preprocessing/` notebooks for quality control

### Data Access Notes
- Some datasets may require dbGaP access for controlled data
- Check GEO page for any usage restrictions
- Large datasets (>10GB) may take hours to download

In [None]:
# Final status summary: banner, totals, then the datasets still missing
banner = "=" * 60
print("\n" + banner)
print("DOWNLOAD STATUS SUMMARY")
print(banner)

downloaded = metadata_df['downloaded'].sum()
total = len(metadata_df)

print(f"\nDownloaded: {downloaded}/{total} datasets")
print("\nPending downloads:")

# Rows whose 'downloaded' flag is False are the ones still to fetch
pending = metadata_df.loc[~metadata_df['downloaded']]
for geo_id, cancer_type in zip(pending['geo_id'], pending['cancer_type']):
    print(f"  - {geo_id}: {cancer_type}")