In [11]:
# Data locations, given relative to the working directory.
# NOTE(review): none of these names is referenced by the code visible in this
# cell; presumably they are consumed by the CB_02704 helpers or other cells —
# confirm.
hapmap_path = '../data/filtered_hapmap3'  # directory of per-population HapMap3 .geno files
kg_path = '../data/1000g/populations'  # directory of per-population 1000G .geno files
output_path = '../data/pca'  # presumably where PCA outputs are written — TODO confirm

In [12]:
from CB_02704 import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

def merge_genotype_data(populations):
    """Stack the genotype matrices of several populations into one array.

    For each population label, the genotype matrix is loaded with
    ``read_geno_pop`` and the individual table with ``read_ind_pop``. Each
    genotype matrix is transposed before stacking (presumably from
    SNPs x samples to samples x SNPs — confirm against ``read_geno_pop``).

    Returns a tuple ``(merged_geno, all_samples, sample_pops)``: the vertically
    stacked transposed matrices, the concatenated index entries of the
    individual tables, and one population label per individual-table row.
    """
    transposed_blocks = []
    sample_ids = []
    pop_labels = []

    for label in populations:
        # Load the genotype matrix first, then the individual table.
        genotypes = read_geno_pop(label)
        individuals = read_ind_pop(label)

        # Transpose now so the final stack is a plain vstack.
        transposed_blocks.append(genotypes.T)
        sample_ids.extend(individuals.index)
        pop_labels.extend(label for _ in range(len(individuals)))

    merged = np.vstack(transposed_blocks)

    return merged, sample_ids, pop_labels

def perform_pca(geno_data, n_components=10):
    """Standardize the genotype matrix and project it onto principal components.

    The input is passed through ``StandardScaler`` and then ``PCA`` with
    ``n_components`` components.

    Returns a tuple ``(pca_result, var_explained)``: the transformed data and
    the explained-variance ratio of each component expressed as a percentage.
    """
    standardized = StandardScaler().fit_transform(geno_data)

    model = PCA(n_components=n_components)
    scores = model.fit_transform(standardized)

    # explained_variance_ratio_ is a fraction; scale to percent.
    percent_explained = model.explained_variance_ratio_ * 100

    return scores, percent_explained

def plot_pca(pca_result, populations, components=(0, 1), var_explained=None):
    """Scatter-plot two principal components, coloured by population group.

    Args:
        pca_result: 2-D array of PC scores, indexed as ``[row, component]``.
        populations: Population label for each row of ``pca_result``.
        components: Zero-based indices of the two components drawn on the
            x and y axes.
        var_explained: Optional per-component explained variance in percent
            (as returned by ``perform_pca``). When given, the percentage is
            appended to each axis label; when omitted, the axes are labelled
            with the component name only.

    Returns:
        The matplotlib figure holding the plot.
    """
    x_idx, y_idx = components[0], components[1]

    df = pd.DataFrame({
        'x': pca_result[:, x_idx],
        'y': pca_result[:, y_idx],
        'Population': populations,
    })

    # Population groups and the colour used for each.
    pop_groups = {
        'AFR': ['YRI', 'LWK', 'ASW'],
        'EUR': ['CEU', 'TSI'],
        'EAS': ['CHB', 'JPT'],
        'AMR': ['MXL'],
    }
    colors = {'AFR': 'blue', 'EUR': 'green', 'EAS': 'red', 'AMR': 'purple'}

    def axis_label(idx):
        # Label follows the component actually plotted, not a fixed "PC1"/"PC2".
        name = f'PC{idx + 1}'
        if var_explained is None:
            return name
        return f'{name} ({var_explained[idx]:.1f}%)'

    fig, ax = plt.subplots(figsize=(12, 8))

    # Track which rows were drawn so unlisted populations are not dropped silently.
    drawn = pd.Series(False, index=df.index)
    for group, pops in pop_groups.items():
        mask = df['Population'].isin(pops)
        drawn |= mask
        ax.scatter(df.loc[mask, 'x'],
                   df.loc[mask, 'y'],
                   label=group,
                   color=colors[group],
                   alpha=0.7)

    leftover = ~drawn
    if leftover.any():
        ax.scatter(df.loc[leftover, 'x'],
                   df.loc[leftover, 'y'],
                   label='Other',
                   color='gray',
                   alpha=0.7)

    ax.set_xlabel(axis_label(x_idx))
    ax.set_ylabel(axis_label(y_idx))
    ax.set_title('PCA of Population Structure')
    ax.legend()
    ax.grid(True, alpha=0.3)

    return fig

def main():
    """Merge genotypes, run PCA, plot the first two PCs and save the figure.

    Prints progress messages and the variance explained by the first five
    components, then writes the plot to ``population_pca.png``.
    """
    # Populations to analyze
    populations = ['ASW', 'CEU', 'CHB', 'JPT', 'LWK', 'MXL', 'TSI', 'YRI']

    print("Merging genotype data...")
    geno_data, _samples, sample_pops = merge_genotype_data(populations)

    print("Performing PCA...")
    pca_result, var_explained = perform_pca(geno_data)

    # var_explained is local to main, so it must be handed to plot_pca
    # explicitly for the axis labels.
    print("Creating plot...")
    fig = plot_pca(pca_result, sample_pops, var_explained=var_explained)

    print("\nVariance explained by each PC:")
    for i, var in enumerate(var_explained[:5], 1):
        print(f"PC{i}: {var:.1f}%")

    # Save and close the specific figure rather than relying on pyplot state.
    fig.savefig('population_pca.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

# Entry point: run the full pipeline only when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()

Loading data...
Loading HapMap3 data...
Loading ../data/filtered_hapmap3/ASW.geno...
Loading ../data/filtered_hapmap3/CEU.geno...
Loading ../data/filtered_hapmap3/CHB.geno...
Loading ../data/filtered_hapmap3/JPT.geno...
Loading ../data/filtered_hapmap3/LWK.geno...
Loading ../data/filtered_hapmap3/MXL.geno...
Loading ../data/filtered_hapmap3/TSI.geno...
Loading ../data/filtered_hapmap3/YRI.geno...

Loading 1000G data...
Loading ../data/1000g/populations/ASW.geno...
Loaded 1000G ASW: 61 samples
Loading ../data/1000g/populations/CEU.geno...
Loaded 1000G CEU: 99 samples
Loading ../data/1000g/populations/CHB.geno...
Loaded 1000G CHB: 103 samples
Loading ../data/1000g/populations/JPT.geno...
Loaded 1000G JPT: 104 samples
Loading ../data/1000g/populations/LWK.geno...
Loaded 1000G LWK: 99 samples
Loading ../data/1000g/populations/MXL.geno...
Loaded 1000G MXL: 64 samples
Loading ../data/1000g/populations/TSI.geno...
Loaded 1000G TSI: 107 samples
Loading ../data/1000g/populations/YRI.geno...
Loa

  plt.style.use('seaborn')



Plot saved as pca_plot.png

PC scores saved to pca_results.tsv


Loading HapMap3 SNPs...

Processing chromosome 1
Chromosome 1 statistics:
HapMap3 SNPs: 55983
1000G SNPs: 6468094
Common positions: 1616

Processing chromosome 2
Chromosome 2 statistics:
HapMap3 SNPs: 58409
1000G SNPs: 7081600
Common positions: 1784

Processing chromosome 3
Chromosome 3 statistics:
HapMap3 SNPs: 48655
1000G SNPs: 5832276
Common positions: 1445

Processing chromosome 4
Chromosome 4 statistics:
HapMap3 SNPs: 43809
1000G SNPs: 5732585
Common positions: 1366

Processing chromosome 5
Chromosome 5 statistics:
HapMap3 SNPs: 45045
1000G SNPs: 5265763
Common positions: 1323

Processing chromosome 6
Chromosome 6 statistics:
HapMap3 SNPs: 46016
1000G SNPs: 5024119
Common positions: 1421

Processing chromosome 7
Chromosome 7 statistics:
HapMap3 SNPs: 38578
1000G SNPs: 4716715
Common positions: 1211

Processing chromosome 8
Chromosome 8 statistics:
HapMap3 SNPs: 38291
1000G SNPs: 4597105
Common positions: 1381

Processing chromosome 9
Chromosome 9 statistics:
HapMap3 SNPs: 32252
10

KeyError: 'chromosome'