# Enhanced Spatial Statistics Analysis
Comprehensive spatial metrics for MedGemma clinical reports

**Analyses:**
1. Ripley's K function (multi-scale clustering)
2. Neighborhood enrichment (cell type co-location)
3. Spatial entropy (local heterogeneity)
4. Nearest neighbor distances
5. Extended spatial autocorrelation
6. Cluster compactness metrics

In [1]:
import scanpy as sc
import squidpy as sq
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.stats import entropy
import json
import warnings
warnings.filterwarnings('ignore')

sc.settings.verbosity = 1
sc.settings.set_figure_params(dpi=150, facecolor='white')

  from pkg_resources import DistributionNotFound, get_distribution


## Load Data

In [2]:
adata = sc.read_h5ad('/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/annotated_visium_enhanced.h5ad')
print(f"Loaded: {adata.shape[0]} spots, {adata.shape[1]} genes")
print(f"Cell types: {adata.obs['cell_type'].value_counts().to_dict()}")
print(f"Leiden clusters: {adata.obs['leiden'].nunique()}")

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = '/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/annotated_visium_enhanced.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

## 1. Ripley's K Function
Measure spatial clustering at multiple distance scales

In [3]:
sq.gr.ripley(adata, cluster_key='cell_type', mode='K')

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

cell_types = adata.obs['cell_type'].unique()
for idx, ct in enumerate(cell_types):
    if idx >= 6:
        break
    sq.pl.ripley(adata, cluster_key='cell_type', mode='K', ax=axes[idx])
    axes[idx].set_title(f"Ripley's K: {ct}")

plt.tight_layout()
plt.savefig('/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/ripley_k_function.png', dpi=300, bbox_inches='tight')
plt.show()

ripley_stats = {}
if 'ripley_K' in adata.uns:
    for ct in cell_types:
        if ct in adata.uns['ripley_K']:
            k_values = adata.uns['ripley_K'][ct]
            ripley_stats[ct] = {
                'max_k': float(np.max(k_values)) if len(k_values) > 0 else 0,
                'mean_k': float(np.mean(k_values)) if len(k_values) > 0 else 0,
                'clustering_pattern': 'clustered' if np.max(k_values) > 0 else 'random'
            }

print("Ripley's K statistics:")
print(json.dumps(ripley_stats, indent=2))

NameError: name 'adata' is not defined

## 2. Neighborhood Enrichment Analysis
Which cell types are spatially enriched as neighbors

In [4]:
sq.gr.nhood_enrichment(adata, cluster_key='cell_type')

fig, ax = plt.subplots(figsize=(10, 8))
sq.pl.nhood_enrichment(adata, cluster_key='cell_type', ax=ax, title='Neighborhood Enrichment')
plt.savefig('/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/neighborhood_enrichment.png', dpi=300, bbox_inches='tight')
plt.show()

nhood_matrix = adata.uns['cell_type_nhood_enrichment']['zscore']
nhood_pvals = adata.uns['cell_type_nhood_enrichment']['pvalues']

enrichment_summary = {}
for i, ct1 in enumerate(cell_types):
    enriched_neighbors = []
    depleted_neighbors = []
    
    for j, ct2 in enumerate(cell_types):
        if i != j:
            zscore = nhood_matrix[i, j]
            pval = nhood_pvals[i, j]
            
            if pval < 0.05:
                if zscore > 2:
                    enriched_neighbors.append(ct2)
                elif zscore < -2:
                    depleted_neighbors.append(ct2)
    
    enrichment_summary[ct1] = {
        'enriched_neighbors': enriched_neighbors,
        'depleted_neighbors': depleted_neighbors,
        'n_enriched': len(enriched_neighbors),
        'n_depleted': len(depleted_neighbors)
    }

print("Neighborhood enrichment summary:")
print(json.dumps(enrichment_summary, indent=2))

NameError: name 'adata' is not defined

## 3. Spatial Entropy
Calculate local diversity/heterogeneity for each spot

In [5]:
def calculate_spatial_entropy(adata, cluster_key='cell_type', n_neighbors=30):
    """Calculate Shannon entropy of cell type diversity in local neighborhoods."""
    from sklearn.neighbors import NearestNeighbors
    
    coords = adata.obsm['spatial']
    nbrs = NearestNeighbors(n_neighbors=n_neighbors+1, algorithm='ball_tree').fit(coords)
    distances, indices = nbrs.kneighbors(coords)
    
    entropy_values = []
    for neighbors in indices:
        neighbor_types = adata.obs[cluster_key].iloc[neighbors[1:]].values
        unique, counts = np.unique(neighbor_types, return_counts=True)
        probs = counts / counts.sum()
        ent = entropy(probs)
        entropy_values.append(ent)
    
    return np.array(entropy_values)

spatial_entropy = calculate_spatial_entropy(adata, cluster_key='cell_type', n_neighbors=30)
adata.obs['spatial_entropy'] = spatial_entropy

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

sc.pl.spatial(adata, color='spatial_entropy', ax=axes[0], show=False, title='Spatial Entropy')
axes[1].hist(spatial_entropy, bins=50, edgecolor='black')
axes[1].set_xlabel('Spatial Entropy')
axes[1].set_ylabel('Count')
axes[1].set_title('Distribution of Spatial Entropy')
axes[1].axvline(np.median(spatial_entropy), color='red', linestyle='--', label=f'Median: {np.median(spatial_entropy):.2f}')
axes[1].legend()

plt.tight_layout()
plt.savefig('/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/spatial_entropy.png', dpi=300, bbox_inches='tight')
plt.show()

entropy_stats = {
    'mean': float(np.mean(spatial_entropy)),
    'median': float(np.median(spatial_entropy)),
    'std': float(np.std(spatial_entropy)),
    'min': float(np.min(spatial_entropy)),
    'max': float(np.max(spatial_entropy)),
    'interpretation': 'high' if np.median(spatial_entropy) > 1.0 else 'moderate' if np.median(spatial_entropy) > 0.5 else 'low'
}

entropy_by_type = {}
for ct in cell_types:
    mask = adata.obs['cell_type'] == ct
    entropy_by_type[ct] = {
        'mean_entropy': float(np.mean(spatial_entropy[mask])),
        'std_entropy': float(np.std(spatial_entropy[mask]))
    }

print("Spatial entropy statistics:")
print(json.dumps(entropy_stats, indent=2))
print("\nEntropy by cell type:")
print(json.dumps(entropy_by_type, indent=2))

NameError: name 'adata' is not defined

## 4. Nearest Neighbor Distances
Distance to nearest neighbor of same cell type

In [6]:
def calculate_nn_distances(adata, cluster_key='cell_type'):
    """Calculate nearest neighbor distances within cell types."""
    coords = adata.obsm['spatial']
    cell_types = adata.obs[cluster_key].values
    
    nn_distances = np.full(len(adata), np.nan)
    
    for ct in np.unique(cell_types):
        mask = cell_types == ct
        if mask.sum() < 2:
            continue
        
        ct_coords = coords[mask]
        dist_matrix = cdist(ct_coords, ct_coords)
        np.fill_diagonal(dist_matrix, np.inf)
        min_distances = np.min(dist_matrix, axis=1)
        
        nn_distances[mask] = min_distances
    
    return nn_distances

nn_distances = calculate_nn_distances(adata, cluster_key='cell_type')
adata.obs['nn_distance'] = nn_distances

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

nn_df = pd.DataFrame({
    'cell_type': adata.obs['cell_type'],
    'nn_distance': nn_distances
}).dropna()

nn_df.boxplot(column='nn_distance', by='cell_type', ax=axes[0])
axes[0].set_title('Nearest Neighbor Distance by Cell Type')
axes[0].set_xlabel('Cell Type')
axes[0].set_ylabel('Distance (pixels)')
plt.sca(axes[0])
plt.xticks(rotation=45, ha='right')

axes[1].hist(nn_distances[~np.isnan(nn_distances)], bins=50, edgecolor='black')
axes[1].set_xlabel('Nearest Neighbor Distance')
axes[1].set_ylabel('Count')
axes[1].set_title('Distribution of NN Distances')
axes[1].axvline(np.nanmedian(nn_distances), color='red', linestyle='--', label=f'Median: {np.nanmedian(nn_distances):.1f}')
axes[1].legend()

plt.tight_layout()
plt.savefig('/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/nearest_neighbor_distances.png', dpi=300, bbox_inches='tight')
plt.show()

nn_stats = {}
for ct in cell_types:
    mask = adata.obs['cell_type'] == ct
    ct_distances = nn_distances[mask]
    ct_distances = ct_distances[~np.isnan(ct_distances)]
    
    if len(ct_distances) > 0:
        nn_stats[ct] = {
            'mean_distance': float(np.mean(ct_distances)),
            'median_distance': float(np.median(ct_distances)),
            'std_distance': float(np.std(ct_distances)),
            'spatial_pattern': 'tightly_clustered' if np.median(ct_distances) < 50 else 'dispersed' if np.median(ct_distances) > 150 else 'moderate'
        }

print("Nearest neighbor distance statistics:")
print(json.dumps(nn_stats, indent=2))

NameError: name 'adata' is not defined

## 5. Extended Spatial Autocorrelation
Run Moran's I on all highly variable genes

In [7]:
sq.gr.spatial_neighbors(adata, coord_type='generic', n_neighs=6)

if 'highly_variable' in adata.var.columns:
    hvg_genes = adata.var_names[adata.var['highly_variable']].tolist()
else:
    hvg_genes = adata.var_names[:500].tolist()

print(f"Running spatial autocorrelation on {len(hvg_genes)} genes...")
sq.gr.spatial_autocorr(adata, mode='moran', genes=hvg_genes, n_perms=100, n_jobs=1)

morans_df = adata.uns['moranI'].copy()
morans_df = morans_df.sort_values('I', ascending=False)

top_spatially_variable = morans_df[morans_df['pval_norm'] < 0.05].head(20)

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

axes[0, 0].scatter(morans_df.index, morans_df['I'], alpha=0.3)
axes[0, 0].axhline(0, color='red', linestyle='--')
axes[0, 0].set_xlabel('Genes')
axes[0, 0].set_ylabel("Moran's I")
axes[0, 0].set_title("Spatial Autocorrelation for All HVGs")

axes[0, 1].hist(morans_df['I'], bins=50, edgecolor='black')
axes[0, 1].axvline(morans_df['I'].median(), color='red', linestyle='--', label=f"Median: {morans_df['I'].median():.3f}")
axes[0, 1].set_xlabel("Moran's I")
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_title("Distribution of Moran's I")
axes[0, 1].legend()

y_pos = np.arange(len(top_spatially_variable))
axes[1, 0].barh(y_pos, top_spatially_variable['I'])
axes[1, 0].set_yticks(y_pos)
axes[1, 0].set_yticklabels(top_spatially_variable.index)
axes[1, 0].set_xlabel("Moran's I")
axes[1, 0].set_title('Top 20 Spatially Variable Genes')
axes[1, 0].invert_yaxis()

axes[1, 1].scatter(morans_df['I'], -np.log10(morans_df['pval_norm'] + 1e-300), alpha=0.3)
axes[1, 1].axhline(-np.log10(0.05), color='red', linestyle='--', label='p=0.05')
axes[1, 1].set_xlabel("Moran's I")
axes[1, 1].set_ylabel('-log10(p-value)')
axes[1, 1].set_title('Volcano Plot: Spatial Autocorrelation')
axes[1, 1].legend()

plt.tight_layout()
plt.savefig('/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/spatial_autocorrelation_extended.png', dpi=300, bbox_inches='tight')
plt.show()

autocorr_stats = {
    'n_genes_tested': len(morans_df),
    'n_significant': int((morans_df['pval_norm'] < 0.05).sum()),
    'mean_morans_i': float(morans_df['I'].mean()),
    'median_morans_i': float(morans_df['I'].median()),
    'top_genes': top_spatially_variable.index.tolist()[:10],
    'top_morans_i': top_spatially_variable['I'].head(10).tolist()
}

print("Extended spatial autocorrelation statistics:")
print(json.dumps(autocorr_stats, indent=2))

NameError: name 'adata' is not defined

## 6. Cluster Compactness
Measure spatial spread and density of leiden clusters

In [8]:
def calculate_cluster_compactness(adata, cluster_key='leiden'):
    """Calculate spatial compactness metrics for clusters."""
    coords = adata.obsm['spatial']
    clusters = adata.obs[cluster_key].values
    
    compactness = {}
    
    for cluster in np.unique(clusters):
        mask = clusters == cluster
        cluster_coords = coords[mask]
        
        if len(cluster_coords) < 2:
            continue
        
        centroid = cluster_coords.mean(axis=0)
        distances = np.linalg.norm(cluster_coords - centroid, axis=1)
        
        pairwise_dist = pdist(cluster_coords)
        
        convex_hull_area = 0
        try:
            from scipy.spatial import ConvexHull
            if len(cluster_coords) >= 3:
                hull = ConvexHull(cluster_coords)
                convex_hull_area = hull.volume
        except:
            pass
        
        compactness[str(cluster)] = {
            'n_spots': int(mask.sum()),
            'mean_distance_to_centroid': float(distances.mean()),
            'std_distance_to_centroid': float(distances.std()),
            'max_pairwise_distance': float(pairwise_dist.max()),
            'convex_hull_area': float(convex_hull_area),
            'density': float(mask.sum() / (convex_hull_area + 1)),
            'compactness_score': float(distances.mean() / (distances.std() + 1))
        }
    
    return compactness

cluster_compactness = calculate_cluster_compactness(adata, cluster_key='leiden')

compactness_df = pd.DataFrame(cluster_compactness).T

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

axes[0, 0].bar(compactness_df.index, compactness_df['mean_distance_to_centroid'])
axes[0, 0].set_xlabel('Cluster')
axes[0, 0].set_ylabel('Mean Distance to Centroid')
axes[0, 0].set_title('Cluster Spread')
axes[0, 0].tick_params(axis='x', rotation=45)

axes[0, 1].bar(compactness_df.index, compactness_df['density'])
axes[0, 1].set_xlabel('Cluster')
axes[0, 1].set_ylabel('Density (spots/area)')
axes[0, 1].set_title('Cluster Density')
axes[0, 1].tick_params(axis='x', rotation=45)

axes[1, 0].scatter(compactness_df['convex_hull_area'], compactness_df['n_spots'], s=100, alpha=0.6)
for idx, cluster in enumerate(compactness_df.index):
    axes[1, 0].annotate(cluster, (compactness_df['convex_hull_area'].iloc[idx], compactness_df['n_spots'].iloc[idx]))
axes[1, 0].set_xlabel('Convex Hull Area')
axes[1, 0].set_ylabel('Number of Spots')
axes[1, 0].set_title('Cluster Size vs Area')

axes[1, 1].bar(compactness_df.index, compactness_df['compactness_score'])
axes[1, 1].set_xlabel('Cluster')
axes[1, 1].set_ylabel('Compactness Score')
axes[1, 1].set_title('Cluster Compactness')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/cluster_compactness.png', dpi=300, bbox_inches='tight')
plt.show()

print("Cluster compactness statistics:")
print(json.dumps(cluster_compactness, indent=2))

NameError: name 'adata' is not defined

## 7. Comprehensive Spatial Metrics JSON
Combine all statistics for MedGemma prompts

In [9]:
comprehensive_metrics = {
    'sample_info': {
        'n_spots': int(adata.shape[0]),
        'n_genes': int(adata.shape[1]),
        'n_cell_types': len(cell_types),
        'cell_types': cell_types.tolist(),
        'n_clusters': int(adata.obs['leiden'].nunique())
    },
    'ripley_k_function': ripley_stats,
    'neighborhood_enrichment': enrichment_summary,
    'spatial_entropy': {
        'overall': entropy_stats,
        'by_cell_type': entropy_by_type
    },
    'nearest_neighbor_distances': nn_stats,
    'spatial_autocorrelation': autocorr_stats,
    'cluster_compactness': cluster_compactness,
    'cell_type_distribution': adata.obs['cell_type'].value_counts().to_dict(),
    'cluster_distribution': adata.obs['leiden'].value_counts().to_dict()
}

output_path = '/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/spatial_statistics_enhanced.json'
with open(output_path, 'w') as f:
    json.dump(comprehensive_metrics, f, indent=2)

print(f"\nComprehensive spatial metrics saved to: {output_path}")
print(f"\nFile size: {len(json.dumps(comprehensive_metrics))} characters")

NameError: name 'adata' is not defined

## 8. Clinical Interpretation Summary
Generate human-readable summary for pathologists

In [10]:
def generate_clinical_summary(metrics):
    """Generate clinical interpretation of spatial statistics."""
    summary = []
    
    summary.append("SPATIAL ANALYSIS SUMMARY")
    summary.append("=" * 50)
    summary.append(f"\nSample: {metrics['sample_info']['n_spots']} spots, {metrics['sample_info']['n_cell_types']} cell types")
    
    summary.append("\n1. SPATIAL HETEROGENEITY:")
    entropy_interp = metrics['spatial_entropy']['overall']['interpretation']
    summary.append(f"   - Overall heterogeneity: {entropy_interp.upper()}")
    summary.append(f"   - Median entropy: {metrics['spatial_entropy']['overall']['median']:.3f}")
    
    summary.append("\n2. CELL TYPE SPATIAL PATTERNS:")
    for ct, stats in metrics['nearest_neighbor_distances'].items():
        pattern = stats['spatial_pattern']
        summary.append(f"   - {ct}: {pattern.replace('_', ' ').upper()}")
    
    summary.append("\n3. NEIGHBORHOOD INTERACTIONS:")
    for ct, enrichment in metrics['neighborhood_enrichment'].items():
        if enrichment['n_enriched'] > 0:
            summary.append(f"   - {ct} enriched near: {', '.join(enrichment['enriched_neighbors'])}")
    
    summary.append("\n4. SPATIALLY VARIABLE GENES:")
    n_sig = metrics['spatial_autocorrelation']['n_significant']
    top_genes = metrics['spatial_autocorrelation']['top_genes'][:5]
    summary.append(f"   - {n_sig} genes show significant spatial patterns")
    summary.append(f"   - Top spatial genes: {', '.join(top_genes)}")
    
    summary.append("\n5. CLUSTER ORGANIZATION:")
    for cluster, comp in metrics['cluster_compactness'].items():
        if comp['n_spots'] > 100:
            score = comp['compactness_score']
            org = "tightly organized" if score > 1.5 else "dispersed" if score < 0.8 else "moderate"
            summary.append(f"   - Cluster {cluster}: {org} ({comp['n_spots']} spots)")
    
    return "\n".join(summary)

clinical_summary = generate_clinical_summary(comprehensive_metrics)
print(clinical_summary)

with open('/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/clinical_spatial_summary.txt', 'w') as f:
    f.write(clinical_summary)

NameError: name 'comprehensive_metrics' is not defined

## 9. Save Updated h5ad
Store new spatial columns in AnnData object

In [11]:
print("\nNew columns added to adata.obs:")
print("  - spatial_entropy")
print("  - nn_distance")

output_h5ad = '/Users/sriharshameghadri/randomAIProjects/kaggle/medGemma/outputs/annotated_visium_spatial_stats.h5ad'
adata.write_h5ad(output_h5ad)
print(f"\nUpdated h5ad saved to: {output_h5ad}")


New columns added to adata.obs:
  - spatial_entropy
  - nn_distance


NameError: name 'adata' is not defined

## Summary

**Generated files:**
1. `outputs/spatial_statistics_enhanced.json` - Comprehensive metrics
2. `outputs/clinical_spatial_summary.txt` - Human-readable interpretation
3. `outputs/ripley_k_function.png` - Multi-scale clustering
4. `outputs/neighborhood_enrichment.png` - Cell type co-location
5. `outputs/spatial_entropy.png` - Local heterogeneity
6. `outputs/nearest_neighbor_distances.png` - Dispersion patterns
7. `outputs/spatial_autocorrelation_extended.png` - Gene spatial patterns
8. `outputs/cluster_compactness.png` - Cluster organization
9. `outputs/annotated_visium_spatial_stats.h5ad` - Updated AnnData

**Ready for MedGemma integration!**