# Lab 1: Exploring Population Genomic Data

This notebook introduces the fundamental concepts of population genomics through the analysis of data from the 1000 Genomes Project. You'll explore genetic variation across global populations and learn how to visualize and interpret population structure.

## Setup

First, we'll import the necessary libraries and set up our environment.

In [None]:
# Check if running in JupyterLite (browser) or local environment.
# The test is whether a module named 'pyodide' is already loaded in this
# interpreter; the resulting IN_BROWSER flag is read by later cells to choose
# between the browser code path and the local one.
import sys
IN_BROWSER = 'pyodide' in sys.modules

# Install required packages if running in browser.
# NOTE: %pip is an IPython line magic, not Python syntax, so this cell only
# runs under an IPython/Jupyter kernel.
if IN_BROWSER:
    %pip install -q numpy pandas scikit-learn matplotlib seaborn
    print("Running in JupyterLite browser environment")
else:
    print("Running in standard Jupyter environment")

In [None]:
# Import standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import io
import requests
import json
import random
from IPython.display import HTML, display

# Set plot styles.
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever spelling this
# installation ships. If neither is present, let seaborn apply its own
# whitegrid theme instead of failing the whole cell.
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
else:
    sns.set_style('whitegrid')
sns.set_context("notebook", font_scale=1.2)

# Configure matplotlib for high-quality plots.
# These assignments come last so they override what the style/context set.
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['savefig.dpi'] = 100
plt.rcParams['font.size'] = 12

## Data Loading

In a browser environment, we'll use a small synthetic dataset modeled on the 1000 Genomes Project so that it runs comfortably in JupyterLite. It simulates genotypes at 1,000 chromosome 22 variants (SNPs) for 504 individuals across five super-populations. Note: the cell that loads the data is located at the end of this notebook; run it before the analysis cells below.

## Exploratory Data Analysis

Now that we have loaded our data, let's explore the patterns of genetic variation across populations. We'll start by calculating some basic summary statistics and then visualize the data using Principal Component Analysis (PCA).

In [ ]:
def calculate_allele_frequencies(genotypes, metadata):
    """Calculate allele frequencies for each population.

    Parameters
    ----------
    genotypes : array-like, shape (n_samples, n_variants)
        Alternate-allele counts per sample and variant. Each entry is divided
        by two alleles per sample, i.e. diploid 0/1/2 coding is assumed.
    metadata : pandas.DataFrame
        Must contain a 'population' column whose i-th row describes the i-th
        row of ``genotypes``. Rows are matched by position, so the DataFrame
        index may hold any labels.

    Returns
    -------
    dict
        Maps each population label (in order of first appearance) to a 1-D
        array with one allele frequency per variant.
    """
    genotypes = np.asarray(genotypes)
    labels = metadata['population'].to_numpy()

    pop_frequencies = {}
    for pop in metadata['population'].unique():
        # A boolean mask selects rows by position; index labels would only
        # coincide with row numbers for a default 0..n-1 index.
        in_pop = labels == pop

        # Divide by 2 because each genotype carries 2 alleles.
        pop_frequencies[pop] = genotypes[in_pop].sum(axis=0) / (2 * in_pop.sum())

    return pop_frequencies

# Per-population allele frequencies: table preview plus density plot
if genotypes is not None:
    pop_frequencies = calculate_allele_frequencies(genotypes, metadata)

    # Dict of per-population arrays -> one DataFrame column per population
    freq_df = pd.DataFrame(pop_frequencies)

    print("Allele frequencies for first 5 variants:")
    display(freq_df.head())

    # Overlay one density curve per population on a single set of axes
    plt.figure(figsize=(12, 6))
    population_labels = metadata['population'].unique()
    for pop in population_labels:
        sns.kdeplot(pop_frequencies[pop], label=pop)

    ax = plt.gca()
    ax.set_xlabel('Allele Frequency')
    ax.set_ylabel('Density')
    ax.set_title('Distribution of Allele Frequencies Across Populations')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

    # Header for the population-differentiation (FST) output that follows
    print("\nPairwise FST values (measure of population differentiation):")
    
    # Per-SNP FST between two populations, averaged over all SNPs.
    # This is the heterozygosity-based definition FST = (H_T - H_S) / H_T
    # computed from allele frequencies alone. It is NOT Weir & Cockerham's
    # estimator: no sample-size correction is applied.
    def calculate_fst(freq1, freq2):
        """Return the mean per-SNP FST between two populations.

        Parameters
        ----------
        freq1, freq2 : array-like
            Allele frequencies (values in [0, 1]) of the same SNPs in the
            two populations.

        Returns
        -------
        float
            Average over SNPs of (H_T - H_S) / H_T. Each per-SNP value lies
            in [0, 1]: 0 for identical frequencies, 1 when the populations
            are fixed for different alleles. SNPs that are monomorphic for
            the same allele in both populations (H_T == 0) contribute 0.
        """
        freq1 = np.asarray(freq1, dtype=float)
        freq2 = np.asarray(freq2, dtype=float)

        # H_T: expected heterozygosity at the mean allele frequency.
        mean_freq = (freq1 + freq2) / 2
        h_total = 2 * mean_freq * (1 - mean_freq)

        # H_T - H_S simplifies algebraically to (p1 - p2)^2 / 2; this form
        # cannot go negative through rounding.
        h_between = (freq1 - freq2) ** 2 / 2

        # Avoid division by zero: leave uninformative SNPs at 0.
        fst = np.zeros_like(h_total)
        np.divide(h_between, h_total, out=fst, where=h_total > 0)
        return np.mean(fst)  # Average across all SNPs
    
    # Fill a symmetric population-by-population FST matrix (diagonal stays 0)
    pops = list(pop_frequencies.keys())
    n_pops = len(pops)
    fst_matrix = np.zeros((n_pops, n_pops))

    for i, pop1 in enumerate(pops):
        # Visit only the upper triangle, then mirror each value
        for j in range(i + 1, n_pops):
            pop2 = pops[j]
            fst = calculate_fst(pop_frequencies[pop1], pop_frequencies[pop2])
            fst_matrix[i, j] = fst_matrix[j, i] = fst

    # Label rows and columns with population names, then show table and heatmap
    fst_df = pd.DataFrame(fst_matrix, index=pops, columns=pops)
    display(fst_df)

    plt.figure(figsize=(10, 8))
    heatmap_options = {
        'annot': True,
        'cmap': 'YlGnBu',
        'fmt': '.4f',
        'linewidths': 0.5,
        'cbar_kws': {'label': 'FST Value'},
    }
    sns.heatmap(fst_df, **heatmap_options)
    plt.title('Pairwise FST Values Between Populations')
    plt.tight_layout()
    plt.show()

## Principal Component Analysis (PCA)

PCA is a powerful technique for visualizing the genetic structure of populations. It can reveal patterns of genetic relatedness and differentiation that might not be obvious from individual SNPs.

In [ ]:
def run_pca(genotypes, metadata, n_components=2):
    """Standardize the genotype matrix and project it onto principal components.

    Parameters
    ----------
    genotypes : array-like or None
        Sample-by-variant matrix passed to ``StandardScaler`` and ``PCA``.
    metadata : object
        Accepted for interface symmetry with the other helpers; not read here.
    n_components : int, default 2
        Number of principal components to keep.

    Returns
    -------
    None
        When ``genotypes`` is None.
    tuple
        ``(projections, fitted_pca, percent_variance)`` otherwise, where
        ``percent_variance`` is ``explained_variance_ratio_`` scaled to percent.
    """
    if genotypes is None:
        return None

    # Scale every variant column before decomposition
    standardized = StandardScaler().fit_transform(genotypes)

    model = PCA(n_components=n_components)
    projections = model.fit_transform(standardized)

    # Fractions -> percentages for display
    percent_variance = model.explained_variance_ratio_ * 100

    return projections, model, percent_variance

# Run PCA if genotypes data is available, then report and plot the result
if genotypes is not None:
    # Compute 5 components; the plots below use PC1 through PC4.
    n_components = 5
    pca_projections, pca, variance_explained = run_pca(genotypes, metadata, n_components)

    # Create a DataFrame with PCA projections and metadata.
    # .values attaches the columns by position rather than by index label.
    pc_names = [f'PC{i+1}' for i in range(n_components)]
    pca_df = pd.DataFrame(pca_projections, columns=pc_names)
    pca_df['sample_id'] = metadata['sample_id'].values
    pca_df['population'] = metadata['population'].values

    # Print the percentage of variance explained
    print("Percentage of variance explained by each principal component:")
    for i, var in enumerate(variance_explained):
        print(f"PC{i+1}: {var:.2f}%")

    # Fixed colour per super-population so every figure uses the same coding
    palette = {'AFR': '#1f77b4', 'AMR': '#ff7f0e', 'EAS': '#2ca02c',
               'EUR': '#d62728', 'SAS': '#9467bd'}

    # Two scatter plots sharing PC1 on the x-axis: PC1 vs PC2, then PC1 vs PC3
    scatter_specs = [
        (2, 'PCA of Population Genetic Structure'),
        (3, 'PCA of Population Genetic Structure (PC1 vs PC3)'),
    ]
    for y_pc, title in scatter_specs:
        plt.figure(figsize=(12, 8))

        # Plot each population with a different color
        for pop in pca_df['population'].unique():
            subset = pca_df[pca_df['population'] == pop]
            plt.scatter(subset['PC1'], subset[f'PC{y_pc}'], label=pop, alpha=0.7,
                        edgecolor='w', linewidth=0.5, s=80, color=palette[pop])

        # variance_explained is 0-based, PC labels are 1-based
        plt.xlabel(f'PC1 ({variance_explained[0]:.2f}% variance explained)')
        plt.ylabel(f'PC{y_pc} ({variance_explained[y_pc - 1]:.2f}% variance explained)')
        plt.title(title)
        plt.legend(title='Population')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

    # Pairplot for the first 4 PCs.
    # sns.pairplot is figure-level: it creates its own figure, so calling
    # plt.figure() first would only leave an empty figure behind. The size
    # is set through `height` instead (4 panels x 3.75 in = 15 in per side).
    sns.pairplot(pca_df, vars=pc_names[:4], hue='population', palette=palette,
                 height=3.75,
                 plot_kws={'alpha': 0.6, 's': 70, 'edgecolor': 'w', 'linewidth': 0.5})
    plt.suptitle('Pairwise Relationships Between Principal Components', y=1.02, size=16)
    plt.tight_layout()
    plt.show()

## Browser Storage Integration

When running in JupyterLite, we can save our data to browser storage for use in subsequent labs. This will allow us to continue our analysis across multiple lab sessions.

In [ ]:
def _encode_for_storage(value):
    """Wrap *value* in a JSON-serializable record, or return None if unsupported."""
    if isinstance(value, np.ndarray):
        # Float, signed/unsigned integer and boolean arrays all become plain
        # Python lists via tolist(); other dtypes (object, complex, ...) do not
        # round-trip through JSON.
        if value.dtype.kind not in 'fiub':
            return None
        return {'type': 'ndarray', 'dtype': str(value.dtype),
                'shape': list(value.shape), 'data': value.tolist()}

    if isinstance(value, pd.DataFrame):
        return {'type': 'dataframe',
                'data': value.to_dict(orient='records'),
                'index': value.index.tolist(),
                'columns': value.columns.tolist()}

    if isinstance(value, pd.Series):
        return {'type': 'series',
                'data': value.tolist(),
                'index': value.index.tolist(),
                'name': value.name}

    if isinstance(value, (list, dict, str, int, float, bool)) or value is None:
        # These types are directly JSON serializable
        return {'type': type(value).__name__, 'data': value}

    return None


def save_to_browser_storage(data_dict, key_prefix='genetic_genealogy_'):
    """Save data to browser localStorage for use in subsequent notebooks.

    Parameters
    ----------
    data_dict : dict
        Maps a short name to the object to store. Supported values are NumPy
        arrays (float, integer or boolean dtype), pandas DataFrame/Series,
        and list, dict, str, int, float, bool or None. Unsupported values
        are reported and skipped.
    key_prefix : str
        Prepended to every name to form the localStorage key.

    Returns
    -------
    bool
        True once every supported item has been written; False when not
        running in the browser or when any error occurs. Errors are printed,
        never raised.
    """
    if not IN_BROWSER:
        print("Not running in a browser environment, skipping storage")
        return False

    try:
        import json
        from js import localStorage

        store_dict = {}
        for key, value in data_dict.items():
            record = _encode_for_storage(value)
            if record is None:
                if isinstance(value, np.ndarray):
                    print(f"Unsupported numpy dtype: {value.dtype} for key {key}")
                else:
                    print(f"Unsupported data type: {type(value)} for key {key}")
                continue
            store_dict[f"{key_prefix}{key}"] = record

        # Serialize everything before touching localStorage so that a
        # serialization failure cannot leave a half-written save behind.
        payloads = {storage_key: json.dumps(record)
                    for storage_key, record in store_dict.items()}

        # A Python str is handed to JavaScript as a string as-is, so no
        # explicit conversion step is needed.
        for storage_key, payload in payloads.items():
            localStorage.setItem(storage_key, payload)

        print(f"Successfully saved {len(store_dict)} items to browser storage")
        return True

    except Exception as e:
        # Deliberate catch-all: storage is best-effort (missing browser APIs,
        # quota exceeded, unserializable contents) and must not abort the lab.
        print(f"Error saving to browser storage: {str(e)}")
        return False

# Persist results for the next lab, but only in the browser and only with data loaded
if not IN_BROWSER or genotypes is None:
    print("\nNot saving data (either not in browser environment or no data available).")
else:
    # Everything the next lab needs; the PCA result is optional because the
    # PCA cell may not have been run
    data_to_save = {
        'pca_projections': locals().get('pca_projections'),
        'metadata': metadata,
        'variant_info': variant_info,
        'lab_progress': 100  # Mark this lab as completed
    }

    save_result = save_to_browser_storage(data_to_save)

    if not save_result:
        print("\nError saving progress. Please make sure your browser supports localStorage.")
    else:
        print("\nProgress saved successfully! You can now continue to Lab 2.")

        # Show a completion banner summarizing what comes next
        from IPython.display import HTML
        completion_banner = """
        <div style="background-color: #e2f0d9; padding: 10px; border-radius: 5px; border: 1px solid #a8d08d;">
            <h3 style="color: #548235;">Lab 1 Completed</h3>
            <p>Your data and progress have been saved to browser storage. In the next lab, you'll explore:</p>
            <ul>
                <li>Processing raw DNA profiles</li>
                <li>Applying quality control measures</li>
                <li>Preparing data for subsequent genetic genealogy analysis</li>
            </ul>
            <p>You can return to the course page and proceed to Lab 2.</p>
        </div>
        """
        display(HTML(completion_banner))

## Conclusion

In this lab, you've explored the fundamentals of population genomics using simulated 1000 Genomes Project data. You've learned how to:

1. Load and inspect genetic data
2. Calculate and visualize allele frequencies across populations
3. Compute FST values to quantify population differentiation
4. Apply PCA to visualize population structure
5. Save your progress for subsequent labs

These concepts form the foundation for understanding genetic genealogy and interpreting relationships between individuals. In the next lab, you'll learn how to process raw DNA profiles and prepare them for analysis.

In [None]:
def load_sample_data():
    """Load demo genetic data for JupyterLite.

    In the browser (``IN_BROWSER`` is true) a synthetic dataset with
    population structure is generated; elsewhere this is a stub.

    Returns
    -------
    tuple
        ``(genotypes, metadata, variant_info)`` in the browser:

        * ``genotypes`` - float array, shape (504, 1000), alternate-allele
          counts 0/1/2; rows are grouped by population in the order
          AFR, AMR, EAS, EUR, SAS.
        * ``metadata`` - DataFrame with 'sample_id' and 'population', one
          row per genotype row.
        * ``variant_info`` - DataFrame with 'variant_id', 'chrom',
          'position' (sorted, unique), 'ref' and 'alt' (always different).

        ``(None, None, None)`` outside the browser.
    """
    if not IN_BROWSER:
        # For non-browser environments, load the real 1000 Genomes data.
        # This is just a stub - replace with actual data loading code
        # (e.g. using scikit-allel) when running locally.
        print("In a non-browser environment, you would load the actual 1000 Genomes data here.")
        return None, None, None

    # Fixed seed so every run produces the same dataset.
    # Note that this reseeds NumPy's global random state.
    np.random.seed(42)

    # Samples per super-population; iteration order fixes the row order.
    pop_sizes = {'AFR': 99, 'AMR': 85, 'EAS': 103, 'EUR': 107, 'SAS': 110}

    sample_ids = [f"HG{i:05d}_{pop}"
                  for pop, size in pop_sizes.items() for i in range(size)]
    sample_pops = [pop for pop, size in pop_sizes.items() for _ in range(size)]

    metadata = pd.DataFrame({
        'sample_id': sample_ids,
        'population': sample_pops
    })

    n_samples = len(sample_ids)
    n_variants = 1000  # Reduced number for browser performance

    # Population-specific allele frequencies simulate population structure
    pop_frequencies = {
        'AFR': np.random.beta(a=0.5, b=0.5, size=n_variants),  # More diverse
        'EUR': np.random.beta(a=0.3, b=0.7, size=n_variants),
        'EAS': np.random.beta(a=0.4, b=0.6, size=n_variants),
        'SAS': np.random.beta(a=0.35, b=0.65, size=n_variants),
        'AMR': np.random.beta(a=0.3, b=0.5, size=n_variants)   # Admixed
    }

    # Genotypes are counts of the alternate allele (0, 1 or 2). Each
    # population occupies a contiguous block of rows, so a single vectorized
    # draw per population replaces one Python-level call per matrix cell.
    genotypes = np.zeros((n_samples, n_variants))
    row_start = 0
    for pop, size in pop_sizes.items():
        genotypes[row_start:row_start + size] = np.random.binomial(
            2, pop_frequencies[pop], size=(size, n_variants))
        row_start += size

    variant_ids = [f"rs{i+1000}" for i in range(n_variants)]

    # Unique, sorted positions in [1, 51000000). Drawing candidates and
    # topping up after de-duplication avoids building (and permuting) an
    # array with one entry per base pair just to pick a thousand of them.
    variant_pos = np.unique(np.random.randint(1, 51000000, size=n_variants))
    while variant_pos.size < n_variants:
        top_up = np.random.randint(1, 51000000, size=n_variants - variant_pos.size)
        variant_pos = np.union1d(variant_pos, top_up)

    # Pick the alternate allele as a non-zero offset from the reference
    # within the base list, so ref and alt can never be the same base.
    bases = np.array(['A', 'C', 'G', 'T'])
    ref_idx = np.random.randint(0, len(bases), size=n_variants)
    alt_idx = (ref_idx + np.random.randint(1, len(bases), size=n_variants)) % len(bases)

    variant_info = pd.DataFrame({
        'variant_id': variant_ids,
        'chrom': 22,
        'position': variant_pos,
        'ref': bases[ref_idx],
        'alt': bases[alt_idx]
    })

    print(f"Loaded synthetic data: {n_samples} samples, {n_variants} variants")
    return genotypes, metadata, variant_info

# Load the dataset into the notebook's namespace
genotypes, metadata, variant_info = load_sample_data()

# Preview each piece of the dataset (skipped when nothing was loaded)
if genotypes is not None:
    # Number of samples per population, ordered by population label
    pop_counts = metadata['population'].value_counts().sort_index()

    # Top-left 5x5 corner of the genotype matrix, labelled with sample and variant IDs
    genotype_subset = pd.DataFrame(
        genotypes[:5, :5],
        index=metadata['sample_id'].iloc[:5],
        columns=variant_info['variant_id'].iloc[:5]
    )

    preview_sections = [
        ("\nSample metadata:", metadata.head()),
        ("\nVariant information:", variant_info.head()),
        ("\nPopulation sample counts:", pop_counts),
        ("\nGenotype matrix (first 5 samples, first 5 variants):", genotype_subset),
    ]
    for heading, table in preview_sections:
        print(heading)
        display(table)