# Data Preprocessing for Organoid pySCENIC Analysis

This notebook prepares single-cell RNA-seq data from organoids for pySCENIC gene regulatory network inference.

## Overview

The preprocessing steps include:
1. Data loading and quality control
2. Cell type filtering and annotation validation
3. Data splitting by cell line
4. Export for pySCENIC analysis

## Input Data

- Single-cell count matrix in H5AD format (`../data/exp1_counts_for_scenic.h5ad`)
- Cell metadata including:
  - Cell line information
  - Morphogen treatment conditions
  - Time points
  - Cell type annotations

In [None]:
# Import required libraries
import json
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

# Reset matplotlib to its default style FIRST.
# plt.style.use('default') restores matplotlib's default rcParams, so calling it
# after scanpy's set_figure_params would discard the dpi/facecolor configured there.
plt.style.use('default')

# Set up scanpy settings
sc.settings.verbosity = 3  # verbosity level
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Set the seaborn colour palette last so it is the palette that stays in effect
sns.set_palette("husl")

print("Libraries imported successfully")

## 1. Load Input Data

In [None]:
# Input location (relative path) of the count matrix
data_dir = Path('../data')
input_file = data_dir.joinpath('exp1_counts_for_scenic.h5ad')

# Read the .h5ad file from disk
print("Loading data from:", input_file)
adata = sc.read_h5ad(input_file)

# Report the dimensions of the loaded object
for label, value in (
    ("Data shape", adata.shape),
    ("Number of cells", adata.n_obs),
    ("Number of genes", adata.n_vars),
):
    print(f"{label}: {value}")

In [None]:
# List the annotation columns stored in `adata.obs`
obs_columns = adata.obs.columns.tolist()
print("Available metadata columns:", obs_columns, sep="\n")

# Preview the metadata table; the trailing expression is rendered by Jupyter
print("\nFirst few rows of metadata:")
adata.obs.head()

## 2. Quality Control and Filtering

In [None]:
# Count cells per cell line once and reuse the result for both text and plot
cell_line_counts = adata.obs['cell_line'].value_counts()
print("Cell line distribution:")
print(cell_line_counts)

# Bar chart of the same counts, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(cell_line_counts.index, cell_line_counts.values)
ax.set_xlabel('Cell Line')
ax.set_ylabel('Number of Cells')
ax.set_title('Cell Distribution by Cell Line')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Filter to specific cell lines of interest
cell_lines_of_interest = ['H1', 'WTC', 'H9', 'WIBJ2']

# Surface requested cell lines that do not occur in the data (e.g. a typo or a
# different naming scheme); `isin` would otherwise drop them without any notice.
observed_cell_lines = set(adata.obs['cell_line'].unique())
missing_cell_lines = [cl for cl in cell_lines_of_interest if cl not in observed_cell_lines]
if missing_cell_lines:
    print(f"WARNING: cell lines not found in the data: {missing_cell_lines}")

print(f"Filtering to cell lines: {cell_lines_of_interest}")
# .copy() turns the subset into an independent object rather than a view of `adata`
adata_filtered = adata[adata.obs['cell_line'].isin(cell_lines_of_interest)].copy()

print(f"Filtered data shape: {adata_filtered.shape}")
print(f"Cells removed: {adata.n_obs - adata_filtered.n_obs}")

In [None]:
# Inspect the cell type annotation column when the dataset provides one
annotation_col = 'fullname_v4'
excluded_label = 'Non-neurectodermal tissues'

if annotation_col in adata_filtered.obs.columns:
    print("Cell type distribution:")
    print(adata_filtered.obs[annotation_col].value_counts())

    # Drop cells carrying the excluded label, but only if any are present
    is_excluded = adata_filtered.obs[annotation_col] == excluded_label
    if is_excluded.any():
        print("\nRemoving non-neurectodermal tissues...")
        adata_filtered = adata_filtered[~is_excluded].copy()
        print(f"Data shape after filtering: {adata_filtered.shape}")

In [None]:
# Determine which of the expected morphogen columns exist in the metadata
morphogen_columns = ['FGF8', 'SHH', 'CHIR', 'RA']
obs_column_names = set(adata_filtered.obs.columns)
available_morphogens = []
for col in morphogen_columns:
    if col in obs_column_names:
        available_morphogens.append(col)

print(f"Available morphogen columns: {available_morphogens}")

# Print the value distribution of every morphogen column that was found
if available_morphogens:
    print("\nMorphogen treatment distribution:")
    for morphogen in available_morphogens:
        counts = adata_filtered.obs[morphogen].value_counts().to_dict()
        print(f"{morphogen}: {counts}")

In [None]:
# Summarise the time point column, if the metadata has one
if 'Time' in adata_filtered.obs.columns:
    # Count once; the same Series feeds the printout and the bar chart
    time_counts = adata_filtered.obs['Time'].value_counts()
    print("Time point distribution:")
    print(time_counts)

    # Visualize time point distribution
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(time_counts.index, time_counts.values)
    ax.set_xlabel('Time Point')
    ax.set_ylabel('Number of Cells')
    ax.set_title('Cell Distribution by Time Point')
    fig.tight_layout()
    plt.show()

## 3. Data Exploration and Visualization

In [None]:
# Basic statistics
# The matrix is cells x genes (n_obs x n_vars), so reductions along axis=1 are per cell:
#   - summing the values gives the total counts per cell (UMI, assuming X holds raw counts)
#   - counting the non-zero entries gives the number of detected genes per cell
# Each quantity must be printed under its own label; the two are easy to mix up.
umi_per_cell = adata_filtered.X.sum(axis=1)
genes_per_cell = (adata_filtered.X > 0).sum(axis=1)

print("Data statistics:")
print(f"Total genes: {adata_filtered.n_vars}")
print(f"Total cells: {adata_filtered.n_obs}")
print(f"Mean genes per cell: {genes_per_cell.mean():.1f}")
print(f"Mean UMI per cell: {umi_per_cell.mean():.1f}")

In [None]:
# Create a comprehensive metadata summary: one record per requested cell line
summary_stats = []

for cell_line in cell_lines_of_interest:
    cl_data = adata_filtered[adata_filtered.obs['cell_line'] == cell_line]

    # Per-cell reductions along axis=1 (the gene axis):
    #   non-zero entries per row -> number of detected genes per cell
    #   row sums                 -> total counts (UMI) per cell
    stats = {
        'cell_line': cell_line,
        'n_cells': cl_data.n_obs,
        'mean_genes_per_cell': (cl_data.X > 0).sum(axis=1).mean(),
        'mean_umi_per_cell': cl_data.X.sum(axis=1).mean()
    }

    # Add time point information if available
    if 'Time' in cl_data.obs.columns:
        time_values = cl_data.obs['Time']
        for time_point in time_values.unique():
            # Count matching cells on the metadata column directly instead of
            # building an AnnData subset just to take its length.
            n_cells_time = int((time_values == time_point).sum())
            stats[f'n_cells_{time_point}'] = n_cells_time

    summary_stats.append(stats)

# Build the table in one go from the list of records
summary_df = pd.DataFrame(summary_stats)
print("Summary statistics by cell line:")
summary_df

## 4. Prepare Data for pySCENIC

In [None]:
# Collect candidate clustering columns: names containing 'res' (case-sensitive)
# or 'cluster' (case-insensitive)
clustering_columns = []
for col in adata_filtered.obs.columns:
    if 'res' in col or 'cluster' in col.lower():
        clustering_columns.append(col)
print(f"Available clustering columns: {clustering_columns}")

if clustering_columns:
    # Use the first available clustering column
    cluster_col = clustering_columns[0]
    cluster_labels = adata_filtered.obs[cluster_col]
    print(f"\nUsing clustering column: {cluster_col}")
    print(f"Number of clusters: {len(cluster_labels.unique())}")
    print("Cluster distribution:")
    print(cluster_labels.value_counts().sort_index())

In [None]:
# Side-by-side heatmaps of clusters versus cell lines (counts and row proportions)
if clustering_columns:
    cluster_col = clustering_columns[0]

    # Cross-tabulation: rows are clusters, columns are cell lines
    crosstab = pd.crosstab(adata_filtered.obs[cluster_col], adata_filtered.obs['cell_line'])
    # Proportional view: divide every row by its row total
    crosstab_prop = crosstab.div(crosstab.sum(axis=1), axis=0)

    fig, (ax_counts, ax_prop) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: absolute cell counts
    sns.heatmap(crosstab, annot=True, fmt='d', cmap='Blues', ax=ax_counts)
    ax_counts.set_title('Cells per Cluster and Cell Line')
    ax_counts.set_xlabel('Cell Line')
    ax_counts.set_ylabel('Cluster')

    # Right panel: per-cluster proportions on a fixed 0-1 colour scale
    sns.heatmap(crosstab_prop, annot=True, fmt='.2f', cmap='Blues', vmin=0, vmax=1, ax=ax_prop)
    ax_prop.set_title('Proportion of Cell Lines per Cluster')
    ax_prop.set_xlabel('Cell Line')
    ax_prop.set_ylabel('Cluster')

    fig.tight_layout()
    plt.show()

## 5. Export Data

In [None]:
# Write the filtered, combined dataset into the data directory
output_file = data_dir.joinpath('exp1_counts_for_scenic_filtered.h5ad')
print("Saving filtered dataset to:", output_file)
adata_filtered.write_h5ad(output_file)

print("Saved dataset with shape:", adata_filtered.shape)

In [None]:
# Split data by cell line and save separately
print("Splitting data by cell line...")

for cell_line in cell_lines_of_interest:
    adata_cellline = adata_filtered[adata_filtered.obs['cell_line'] == cell_line].copy()

    # A requested cell line may have no cells in the filtered data; writing it would
    # produce an empty .h5ad file, so skip it and report that instead.
    if adata_cellline.n_obs == 0:
        print(f"Skipping {cell_line}: no cells available after filtering")
        continue

    output_file_cl = data_dir / f'exp1_counts_for_scenic_{cell_line}.h5ad'
    adata_cellline.write_h5ad(output_file_cl)

    print(f"Saved {cell_line}: {adata_cellline.shape} -> {output_file_cl}")

## 6. Generate Processing Summary Report

In [None]:
# Create a processing summary describing what was filtered and which files were targeted
processing_summary = {
    'original_data_shape': adata.shape,
    'filtered_data_shape': adata_filtered.shape,
    'cells_removed': adata.n_obs - adata_filtered.n_obs,
    'cell_lines_included': cell_lines_of_interest,
    'available_morphogens': available_morphogens,
    'clustering_column': clustering_columns[0] if clustering_columns else None,
    'output_files': {
        'combined': 'exp1_counts_for_scenic_filtered.h5ad',
        'by_cell_line': [f'exp1_counts_for_scenic_{cl}.h5ad' for cl in cell_lines_of_interest]
    }
}

print("Processing Summary:")
print("=" * 50)
for key, value in processing_summary.items():
    print(f"{key}: {value}")

# Save summary to file (`json` is imported with the other libraries at the top of the notebook)
summary_file = data_dir / 'preprocessing_summary.json'
# Explicit encoding keeps the file identical across platforms; default=str is the
# fallback for any value the json module cannot serialise natively.
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(processing_summary, f, indent=2, default=str)

print(f"\nSummary saved to: {summary_file}")

## Summary

This notebook has successfully:

1. ✅ Loaded the original single-cell dataset
2. ✅ Filtered to cell lines of interest (H1, H9, WTC, WIBJ2)
3. ✅ Removed non-neurectodermal tissues (when annotated in `fullname_v4`)
4. ✅ Summarized morphogen treatment and time point information (where the corresponding metadata columns exist)
5. ✅ Explored data distribution across cell lines and clusters
6. ✅ Exported filtered data for pySCENIC analysis
7. ✅ Generated data split by cell line
8. ✅ Created processing summary report

## Next Steps

The preprocessed data is now ready for:
- pySCENIC gene regulatory network inference
- Consensus regulon building across multiple runs
- Morphogen correlation analysis

Proceed to the next notebook: `02_pyscenic_consensus.ipynb`