In [1]:
import pandas as pd
import subprocess
import os
from typing import Dict, Set, List, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def get_hapmap_positions(hapmap_file: str) -> Dict[int, Set[str]]:
    """Read HapMap3 SNP positions by chromosome"""
    print("Reading HapMap3 positions...")
    hapmap_df = pd.read_csv(hapmap_file, sep='\s+', header=None,
                           names=['SNP', 'chromosome', 'morgans', 'position', 'ref', 'alt'])
    
    positions_by_chr = {}
    for chrom in range(1, 23):
        chr_data = hapmap_df[hapmap_df['chromosome'] == chrom]
        positions_by_chr[chrom] = set(f"{chrom}:{pos}" for pos in chr_data['position'])
    
    return positions_by_chr

def get_1000g_positions(vcf_file: str) -> Set[str]:
    """Extract SNP positions from VCF file"""
    cmd = f"bcftools query -f '%CHROM:%POS\n' {vcf_file}"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return set(result.stdout.strip().split('\n'))

def filter_vcf_by_positions(input_vcf: str, output_vcf: str, positions: Set[str]):
    """Filter VCF file to keep only specified positions"""
    # Create temporary positions file
    temp_pos_file = f"temp_positions_{os.path.basename(input_vcf)}.txt"
    with open(temp_pos_file, 'w') as f:
        for pos in positions:
            chrom, pos = pos.split(':')
            f.write(f"{chrom}\t{pos}\n")
    
    # Filter VCF
    cmd = f'bcftools view -T {temp_pos_file} -Oz -o {output_vcf} {input_vcf}'
    subprocess.run(cmd, shell=True, check=True)
    
    # Index filtered file
    subprocess.run(f'bcftools index {output_vcf}', shell=True, check=True)
    
    # Remove temporary file
    os.remove(temp_pos_file)

def process_chromosome(chrom: int, hapmap_positions: Set[str], 
                      data_dir: str, output_dir: str) -> Tuple[int, Dict]:
    """Process a single chromosome to find and filter common SNPs"""
    try:
        # Input files
        vcf_file = os.path.join(data_dir, f'filtered.chr{chrom}.phase3.vcf.gz')
        
        # Get 1000G positions
        kg_positions = get_1000g_positions(vcf_file)
        
        # Find common positions
        common_positions = hapmap_positions.intersection(kg_positions)
        
        # Create output directory if needed
        os.makedirs(output_dir, exist_ok=True)
        
        # Filter VCF file
        output_vcf = os.path.join(output_dir, f'common_snps.chr{chrom}.vcf.gz')
        filter_vcf_by_positions(vcf_file, output_vcf, common_positions)
        
        stats = {
            'chromosome': chrom,
            'hapmap_count': len(hapmap_positions),
            'kg_count': len(kg_positions),
            'common_count': len(common_positions)
        }
        
        return chrom, stats
        
    except Exception as e:
        print(f"Error processing chromosome {chrom}: {e}")
        return chrom, None

def main():
    # Configuration
    DATA_DIR = '../data/1000g'
    OUTPUT_DIR = '../data/common_snps'
    HAPMAP_FILE = '/Users/jingl1/Desktop/CMU/02704/hw_data/02704_data/HapMap3.snp'  # Update with your HapMap3 file path
    
    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    # Get HapMap positions by chromosome
    hapmap_positions = get_hapmap_positions(HAPMAP_FILE)
    
    # Process all chromosomes
    stats_all = []
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_chrom = {}
        for chrom in range(1, 23):
            future = executor.submit(
                process_chromosome,
                chrom,
                hapmap_positions[chrom],
                DATA_DIR,
                OUTPUT_DIR
            )
            future_to_chrom[future] = chrom
        
        for future in tqdm(as_completed(future_to_chrom), total=22):
            chrom = future_to_chrom[future]
            _, stats = future.result()
            if stats:
                stats_all.append(stats)
    
    # Create summary
    stats_df = pd.DataFrame(stats_all)
    stats_df['hapmap_pct'] = (stats_df['common_count'] / stats_df['hapmap_count'] * 100).round(2)
    stats_df['kg_pct'] = (stats_df['common_count'] / stats_df['kg_count'] * 100).round(2)
    
    # Save summary
    summary_file = os.path.join(OUTPUT_DIR, 'common_snps_summary.csv')
    stats_df.to_csv(summary_file, index=False)
    
    # Print summary
    print("\nSummary of common SNPs:")
    print(f"Total HapMap SNPs: {stats_df['hapmap_count'].sum():,}")
    print(f"Total 1000G SNPs: {stats_df['kg_count'].sum():,}")
    print(f"Total common SNPs: {stats_df['common_count'].sum():,}")
    print(f"\nDetailed statistics saved to: {summary_file}")

if __name__ == "__main__":
    main()

Reading HapMap3 positions...


100%|███████████████████████████████████████████| 22/22 [05:49<00:00, 15.89s/it]


Summary of common SNPs:
Total HapMap SNPs: 697,393
Total 1000G SNPs: 81,192,381
Total common SNPs: 21,643

Detailed statistics saved to: ../data/common_snps/common_snps_summary.csv





In [6]:
import os
import subprocess
import pandas as pd
import numpy as np
from tqdm import tqdm
import tempfile

class VCFPopulationConverter:
    def __init__(self, data_dir: str = '../data/1000g', common_snps_dir: str = '../data/common_snps'):
        self.data_dir = data_dir
        self.common_snps_dir = common_snps_dir
        self.output_dir = os.path.join(data_dir, 'populations')
        os.makedirs(self.output_dir, exist_ok=True)
        
        # Load population mapping
        self.sample_pop_map = self._load_population_mapping()
    
    def _load_population_mapping(self):
        panel_file = os.path.join(self.data_dir, 'integrated_call_samples.panel')
        panel_df = pd.read_csv(panel_file, sep='\t')
        return dict(zip(panel_df['sample'], panel_df['pop']))
    
    def _get_population_samples(self, population):
        samples = [sample for sample, pop in self.sample_pop_map.items() 
                  if pop == population]
        return sorted(samples)
    
    def _convert_chromosome(self, chrom, population, samples):
        """Convert a single chromosome for a population"""
        print(f"Processing chromosome {chrom} for {population}...")
        
        # Input VCF file
        vcf_file = os.path.join(self.common_snps_dir, f'common_snps.chr{chrom}.vcf.gz')
        
        # Create temporary directory
        with tempfile.TemporaryDirectory() as temp_dir:
            # Create sample file
            sample_file = os.path.join(temp_dir, 'samples.txt')
            with open(sample_file, 'w') as f:
                for sample in samples:
                    f.write(f"{sample}\n")
            
            # Extract population VCF
            temp_vcf = os.path.join(temp_dir, f'temp_{population}.vcf.gz')
            cmd = f"bcftools view -S {sample_file} -Oz -o {temp_vcf} {vcf_file}"
            subprocess.run(cmd, shell=True, check=True)
            subprocess.run(f"bcftools index {temp_vcf}", shell=True, check=True)
            
            # Get variant info
            variant_file = os.path.join(temp_dir, 'variants.txt')
            cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\n' {temp_vcf} > {variant_file}"
            subprocess.run(cmd, shell=True, check=True)
            
            variant_data = pd.read_csv(variant_file, sep='\t',
                                     names=['chromosome', 'position', 'ref', 'alt'])
            n_variants = len(variant_data)
            
            # Get and convert genotypes
            geno_file = os.path.join(temp_dir, 'genotypes.txt')
            cmd = f"bcftools query -f '%CHROM\t%POS[\t%GT]\n' {temp_vcf} > {geno_file}"
            subprocess.run(cmd, shell=True, check=True)
            
            # Read and convert genotypes
            n_samples = len(samples)
            geno_array = np.zeros((n_variants, n_samples), dtype=np.uint8)
            
            print(f"Converting genotypes for {n_variants} variants and {n_samples} samples...")
            with open(geno_file, 'r') as f:
                for i, line in enumerate(tqdm(f, total=n_variants)):
                    parts = line.strip().split('\t')
                    gts = parts[2:]  # Skip CHROM and POS
                    
                    for j, gt in enumerate(gts):
                        if gt in ['0|0', '0/0']:
                            code = 0
                        elif gt in ['0|1', '1|0', '0/1', '1/0']:
                            code = 1
                        elif gt in ['1|1', '1/1']:
                            code = 2
                        else:
                            code = 9
                        geno_array[i, j] = code
            
            # Verify genotype conversion
            unique_gts = np.unique(geno_array, return_counts=True)
            print("\nGenotype distribution:")
            for gt, count in zip(*unique_gts):
                print(f"Genotype {gt}: {count} ({count/(n_variants*n_samples)*100:.2f}%)")
        
        return geno_array, variant_data
    
    def convert_population(self, population):
        """Convert all chromosomes for a population to HapMap format"""
        print(f"Converting data for population: {population}")
        
        # Get samples
        samples = self._get_population_samples(population)
        if not samples:
            raise ValueError(f"No samples found for population {population}")
        
        print(f"Found {len(samples)} samples for {population}")
        
        # Process chromosomes
        all_genotypes = []
        all_variants = []
        
        for chrom in range(1, 23):
            geno_array, variant_data = self._convert_chromosome(chrom, population, samples)
            all_genotypes.append(geno_array)
            all_variants.append(variant_data)
        
        # Combine data
        final_genotypes = np.vstack(all_genotypes)
        final_variants = pd.concat(all_variants, ignore_index=True)
        
        # Save output files
        output_prefix = os.path.join(self.output_dir, population)
        
        # 1. SNP file
        final_variants['morgans'] = 0.0
        snp_file = f"{output_prefix}.snp"
        final_variants.to_csv(snp_file, sep='\t', index=False, header=False)
        
        # 2. Sample file
        ind_data = pd.DataFrame({
            'sample': samples,
            'sex': 'U',
            'pop': population
        })
        ind_file = f"{output_prefix}.ind"
        ind_data.to_csv(ind_file, sep='\t', index=False, header=False)
        
        # 3. Genotype file
        geno_file = f"{output_prefix}.geno"
        print(f"\nWriting genotypes to {geno_file}...")
        with open(geno_file, 'w') as f:
            for row in tqdm(final_genotypes):
                f.write(''.join(map(str, row)) + '\n')
        
        print(f"\nConversion complete for {population}!")
        print(f"Files created:")
        print(f"1. {snp_file} - SNP information ({len(final_variants)} variants)")
        print(f"2. {ind_file} - Sample information ({len(samples)} samples)")
        print(f"3. {geno_file} - Genotype data")
        
        # Print final statistics
        total_gts = final_genotypes.size
        gt_counts = np.unique(final_genotypes, return_counts=True)
        print("\nFinal genotype distribution:")
        for gt, count in zip(*gt_counts):
            print(f"Genotype {gt}: {count} ({count/total_gts*100:.2f}%)")
        
        return final_genotypes, final_variants, samples

    def convert_all_populations(self, populations=None):
        if populations is None:
            populations = ['ASW', 'CEU', 'CHB', 'JPT', 'LWK', 'MXL', 'TSI', 'YRI']
        
        for population in populations:
            try:
                print(f"\nProcessing population: {population}")
                self.convert_population(population)
            except Exception as e:
                print(f"Error processing population {population}: {e}")

if __name__ == "__main__":
    converter = VCFPopulationConverter()
    converter.convert_all_populations()


Processing population: ASW
Converting data for population: ASW
Found 61 samples for ASW
Processing chromosome 1 for ASW...
Converting genotypes for 1617 variants and 61 samples...


100%|███████████████████████████████████| 1617/1617 [00:00<00:00, 145872.36it/s]



Genotype distribution:
Genotype 0: 92256 (93.53%)
Genotype 1: 3976 (4.03%)
Genotype 2: 2354 (2.39%)
Genotype 9: 51 (0.05%)
Processing chromosome 2 for ASW...
Converting genotypes for 1786 variants and 61 samples...


100%|███████████████████████████████████| 1786/1786 [00:00<00:00, 142331.08it/s]



Genotype distribution:
Genotype 0: 102249 (93.85%)
Genotype 1: 4523 (4.15%)
Genotype 2: 2055 (1.89%)
Genotype 9: 119 (0.11%)
Processing chromosome 3 for ASW...
Converting genotypes for 1446 variants and 61 samples...


100%|███████████████████████████████████| 1446/1446 [00:00<00:00, 104859.41it/s]



Genotype distribution:
Genotype 0: 82774 (93.84%)
Genotype 1: 3615 (4.10%)
Genotype 2: 1809 (2.05%)
Genotype 9: 8 (0.01%)
Processing chromosome 4 for ASW...
Converting genotypes for 1366 variants and 61 samples...


100%|███████████████████████████████████| 1366/1366 [00:00<00:00, 135932.51it/s]



Genotype distribution:
Genotype 0: 78472 (94.17%)
Genotype 1: 3429 (4.12%)
Genotype 2: 1378 (1.65%)
Genotype 9: 47 (0.06%)
Processing chromosome 5 for ASW...
Converting genotypes for 1325 variants and 61 samples...


100%|███████████████████████████████████| 1325/1325 [00:00<00:00, 136188.71it/s]



Genotype distribution:
Genotype 0: 75478 (93.38%)
Genotype 1: 3605 (4.46%)
Genotype 2: 1707 (2.11%)
Genotype 9: 35 (0.04%)
Processing chromosome 6 for ASW...
Converting genotypes for 1423 variants and 61 samples...


100%|███████████████████████████████████| 1423/1423 [00:00<00:00, 137024.07it/s]


Genotype distribution:
Genotype 0: 80509 (92.75%)
Genotype 1: 4029 (4.64%)
Genotype 2: 2227 (2.57%)
Genotype 9: 38 (0.04%)
Processing chromosome 7 for ASW...





Converting genotypes for 1215 variants and 61 samples...


100%|███████████████████████████████████| 1215/1215 [00:00<00:00, 136547.24it/s]


Genotype distribution:
Genotype 0: 69518 (93.80%)
Genotype 1: 3289 (4.44%)
Genotype 2: 1248 (1.68%)
Genotype 9: 60 (0.08%)
Processing chromosome 8 for ASW...





Converting genotypes for 1382 variants and 61 samples...


100%|███████████████████████████████████| 1382/1382 [00:00<00:00, 142315.94it/s]


Genotype distribution:
Genotype 0: 79894 (94.77%)
Genotype 1: 3016 (3.58%)
Genotype 2: 1273 (1.51%)
Genotype 9: 119 (0.14%)
Processing chromosome 9 for ASW...





Converting genotypes for 1038 variants and 61 samples...


100%|███████████████████████████████████| 1038/1038 [00:00<00:00, 135262.30it/s]


Genotype distribution:
Genotype 0: 59525 (94.01%)
Genotype 1: 2694 (4.25%)
Genotype 2: 1043 (1.65%)
Genotype 9: 56 (0.09%)
Processing chromosome 10 for ASW...





Converting genotypes for 1128 variants and 61 samples...


100%|███████████████████████████████████| 1128/1128 [00:00<00:00, 135637.59it/s]


Genotype distribution:
Genotype 0: 64744 (94.09%)
Genotype 1: 2696 (3.92%)
Genotype 2: 1368 (1.99%)
Processing chromosome 11 for ASW...





Converting genotypes for 1154 variants and 61 samples...


100%|███████████████████████████████████| 1154/1154 [00:00<00:00, 135927.06it/s]


Genotype distribution:
Genotype 0: 66529 (94.51%)
Genotype 1: 2560 (3.64%)
Genotype 2: 1274 (1.81%)
Genotype 9: 31 (0.04%)
Processing chromosome 12 for ASW...





Converting genotypes for 1046 variants and 61 samples...


100%|███████████████████████████████████| 1046/1046 [00:00<00:00, 140567.17it/s]


Genotype distribution:
Genotype 0: 59448 (93.17%)
Genotype 1: 2806 (4.40%)
Genotype 2: 1436 (2.25%)
Genotype 9: 116 (0.18%)
Processing chromosome 13 for ASW...





Converting genotypes for 773 variants and 61 samples...


100%|█████████████████████████████████████| 773/773 [00:00<00:00, 122522.75it/s]


Genotype distribution:
Genotype 0: 44128 (93.58%)
Genotype 1: 1859 (3.94%)
Genotype 2: 1112 (2.36%)
Genotype 9: 54 (0.11%)
Processing chromosome 14 for ASW...





Converting genotypes for 667 variants and 61 samples...


100%|█████████████████████████████████████| 667/667 [00:00<00:00, 132186.77it/s]


Genotype distribution:
Genotype 0: 37917 (93.19%)
Genotype 1: 1794 (4.41%)
Genotype 2: 881 (2.17%)
Genotype 9: 95 (0.23%)
Processing chromosome 15 for ASW...





Converting genotypes for 637 variants and 61 samples...


100%|█████████████████████████████████████| 637/637 [00:00<00:00, 132135.10it/s]


Genotype distribution:
Genotype 0: 36760 (94.60%)
Genotype 1: 1399 (3.60%)
Genotype 2: 698 (1.80%)
Processing chromosome 16 for ASW...





Converting genotypes for 835 variants and 61 samples...


100%|█████████████████████████████████████| 835/835 [00:00<00:00, 139559.43it/s]



Genotype distribution:
Genotype 0: 47539 (93.33%)
Genotype 1: 2330 (4.57%)
Genotype 2: 1052 (2.07%)
Genotype 9: 14 (0.03%)
Processing chromosome 17 for ASW...
Converting genotypes for 664 variants and 61 samples...


100%|█████████████████████████████████████| 664/664 [00:00<00:00, 129795.30it/s]



Genotype distribution:
Genotype 0: 34451 (85.06%)
Genotype 1: 3752 (9.26%)
Genotype 2: 2257 (5.57%)
Genotype 9: 44 (0.11%)
Processing chromosome 18 for ASW...
Converting genotypes for 598 variants and 61 samples...


100%|█████████████████████████████████████| 598/598 [00:00<00:00, 130621.49it/s]



Genotype distribution:
Genotype 0: 34675 (95.06%)
Genotype 1: 1119 (3.07%)
Genotype 2: 653 (1.79%)
Genotype 9: 31 (0.08%)
Processing chromosome 19 for ASW...
Converting genotypes for 367 variants and 61 samples...


100%|█████████████████████████████████████| 367/367 [00:00<00:00, 129093.39it/s]


Genotype distribution:
Genotype 0: 21352 (95.38%)
Genotype 1: 746 (3.33%)
Genotype 2: 288 (1.29%)
Genotype 9: 1 (0.00%)
Processing chromosome 20 for ASW...





Converting genotypes for 571 variants and 61 samples...


100%|█████████████████████████████████████| 571/571 [00:00<00:00, 140088.18it/s]



Genotype distribution:
Genotype 0: 32377 (92.95%)
Genotype 1: 1505 (4.32%)
Genotype 2: 948 (2.72%)
Genotype 9: 1 (0.00%)
Processing chromosome 21 for ASW...
Converting genotypes for 340 variants and 61 samples...


100%|█████████████████████████████████████| 340/340 [00:00<00:00, 116957.55it/s]



Genotype distribution:
Genotype 0: 19549 (94.26%)
Genotype 1: 842 (4.06%)
Genotype 2: 342 (1.65%)
Genotype 9: 7 (0.03%)
Processing chromosome 22 for ASW...
Converting genotypes for 286 variants and 61 samples...


100%|█████████████████████████████████████| 286/286 [00:00<00:00, 112826.46it/s]


Genotype distribution:
Genotype 0: 16158 (92.62%)
Genotype 1: 769 (4.41%)
Genotype 2: 460 (2.64%)
Genotype 9: 59 (0.34%)






Writing genotypes to ../data/1000g/populations/ASW.geno...


100%|██████████████████████████████████| 21664/21664 [00:00<00:00, 72421.41it/s]



Conversion complete for ASW!
Files created:
1. ../data/1000g/populations/ASW.snp - SNP information (21664 variants)
2. ../data/1000g/populations/ASW.ind - Sample information (61 samples)
3. ../data/1000g/populations/ASW.geno - Genotype data

Final genotype distribution:
Genotype 0: 1236302 (93.55%)
Genotype 1: 56353 (4.26%)
Genotype 2: 27863 (2.11%)
Genotype 9: 986 (0.07%)

Processing population: CEU
Converting data for population: CEU
Found 99 samples for CEU
Processing chromosome 1 for CEU...
Converting genotypes for 1617 variants and 99 samples...


100%|████████████████████████████████████| 1617/1617 [00:00<00:00, 89170.11it/s]



Genotype distribution:
Genotype 0: 151309 (94.52%)
Genotype 1: 4827 (3.02%)
Genotype 2: 3867 (2.42%)
Genotype 9: 80 (0.05%)
Processing chromosome 2 for CEU...
Converting genotypes for 1786 variants and 99 samples...


100%|████████████████████████████████████| 1786/1786 [00:00<00:00, 91877.24it/s]



Genotype distribution:
Genotype 0: 168296 (95.18%)
Genotype 1: 5325 (3.01%)
Genotype 2: 2973 (1.68%)
Genotype 9: 220 (0.12%)
Processing chromosome 3 for CEU...
Converting genotypes for 1446 variants and 99 samples...


100%|████████████████████████████████████| 1446/1446 [00:00<00:00, 89441.87it/s]



Genotype distribution:
Genotype 0: 136064 (95.05%)
Genotype 1: 4194 (2.93%)
Genotype 2: 2896 (2.02%)
Processing chromosome 4 for CEU...
Converting genotypes for 1366 variants and 99 samples...


100%|████████████████████████████████████| 1366/1366 [00:00<00:00, 89450.89it/s]



Genotype distribution:
Genotype 0: 128312 (94.88%)
Genotype 1: 4237 (3.13%)
Genotype 2: 2631 (1.95%)
Genotype 9: 54 (0.04%)
Processing chromosome 5 for CEU...
Converting genotypes for 1325 variants and 99 samples...


100%|████████████████████████████████████| 1325/1325 [00:00<00:00, 91246.39it/s]



Genotype distribution:
Genotype 0: 123692 (94.30%)
Genotype 1: 4686 (3.57%)
Genotype 2: 2758 (2.10%)
Genotype 9: 39 (0.03%)
Processing chromosome 6 for CEU...
Converting genotypes for 1423 variants and 99 samples...


100%|████████████████████████████████████| 1423/1423 [00:00<00:00, 87193.68it/s]



Genotype distribution:
Genotype 0: 131233 (93.15%)
Genotype 1: 5929 (4.21%)
Genotype 2: 3708 (2.63%)
Genotype 9: 7 (0.00%)
Processing chromosome 7 for CEU...
Converting genotypes for 1215 variants and 99 samples...


100%|████████████████████████████████████| 1215/1215 [00:00<00:00, 90132.28it/s]


Genotype distribution:
Genotype 0: 114441 (95.14%)
Genotype 1: 3980 (3.31%)
Genotype 2: 1852 (1.54%)
Genotype 9: 12 (0.01%)
Processing chromosome 8 for CEU...





Converting genotypes for 1382 variants and 99 samples...


100%|████████████████████████████████████| 1382/1382 [00:00<00:00, 89184.22it/s]


Genotype distribution:
Genotype 0: 130900 (95.67%)
Genotype 1: 3632 (2.65%)
Genotype 2: 2112 (1.54%)
Genotype 9: 174 (0.13%)
Processing chromosome 9 for CEU...





Converting genotypes for 1038 variants and 99 samples...


100%|████████████████████████████████████| 1038/1038 [00:00<00:00, 91467.87it/s]


Genotype distribution:
Genotype 0: 97639 (95.01%)
Genotype 1: 3206 (3.12%)
Genotype 2: 1853 (1.80%)
Genotype 9: 64 (0.06%)
Processing chromosome 10 for CEU...





Converting genotypes for 1128 variants and 99 samples...


100%|████████████████████████████████████| 1128/1128 [00:00<00:00, 89426.06it/s]


Genotype distribution:
Genotype 0: 106238 (95.13%)
Genotype 1: 3150 (2.82%)
Genotype 2: 2284 (2.05%)
Processing chromosome 11 for CEU...





Converting genotypes for 1154 variants and 99 samples...


100%|████████████████████████████████████| 1154/1154 [00:00<00:00, 89781.80it/s]


Genotype distribution:
Genotype 0: 108817 (95.25%)
Genotype 1: 3166 (2.77%)
Genotype 2: 2249 (1.97%)
Genotype 9: 14 (0.01%)
Processing chromosome 12 for CEU...





Converting genotypes for 1046 variants and 99 samples...


100%|████████████████████████████████████| 1046/1046 [00:00<00:00, 86862.32it/s]


Genotype distribution:
Genotype 0: 97050 (93.72%)
Genotype 1: 3735 (3.61%)
Genotype 2: 2684 (2.59%)
Genotype 9: 85 (0.08%)
Processing chromosome 13 for CEU...





Converting genotypes for 773 variants and 99 samples...


100%|██████████████████████████████████████| 773/773 [00:00<00:00, 86015.89it/s]


Genotype distribution:
Genotype 0: 71811 (93.84%)
Genotype 1: 2525 (3.30%)
Genotype 2: 2080 (2.72%)
Genotype 9: 111 (0.15%)
Processing chromosome 14 for CEU...





Converting genotypes for 667 variants and 99 samples...


100%|██████████████████████████████████████| 667/667 [00:00<00:00, 92626.59it/s]


Genotype distribution:
Genotype 0: 62167 (94.15%)
Genotype 1: 2226 (3.37%)
Genotype 2: 1473 (2.23%)
Genotype 9: 167 (0.25%)
Processing chromosome 15 for CEU...





Converting genotypes for 637 variants and 99 samples...


100%|██████████████████████████████████████| 637/637 [00:00<00:00, 90583.88it/s]


Genotype distribution:
Genotype 0: 60077 (95.27%)
Genotype 1: 1974 (3.13%)
Genotype 2: 1012 (1.60%)
Processing chromosome 16 for CEU...





Converting genotypes for 835 variants and 99 samples...


100%|██████████████████████████████████████| 835/835 [00:00<00:00, 92439.20it/s]


Genotype distribution:
Genotype 0: 78211 (94.61%)
Genotype 1: 2835 (3.43%)
Genotype 2: 1588 (1.92%)
Genotype 9: 31 (0.04%)
Processing chromosome 17 for CEU...





Converting genotypes for 664 variants and 99 samples...


100%|██████████████████████████████████████| 664/664 [00:00<00:00, 81712.81it/s]


Genotype distribution:
Genotype 0: 55890 (85.02%)
Genotype 1: 5112 (7.78%)
Genotype 2: 4660 (7.09%)
Genotype 9: 74 (0.11%)
Processing chromosome 18 for CEU...





Converting genotypes for 598 variants and 99 samples...


100%|██████████████████████████████████████| 598/598 [00:00<00:00, 84049.12it/s]


Genotype distribution:
Genotype 0: 56613 (95.63%)
Genotype 1: 1428 (2.41%)
Genotype 2: 1142 (1.93%)
Genotype 9: 19 (0.03%)
Processing chromosome 19 for CEU...
Converting genotypes for 367 variants and 99 samples...



100%|██████████████████████████████████████| 367/367 [00:00<00:00, 75887.87it/s]



Genotype distribution:
Genotype 0: 34894 (96.04%)
Genotype 1: 963 (2.65%)
Genotype 2: 476 (1.31%)
Processing chromosome 20 for CEU...
Converting genotypes for 571 variants and 99 samples...


100%|██████████████████████████████████████| 571/571 [00:00<00:00, 85692.99it/s]



Genotype distribution:
Genotype 0: 53244 (94.19%)
Genotype 1: 1929 (3.41%)
Genotype 2: 1355 (2.40%)
Genotype 9: 1 (0.00%)
Processing chromosome 21 for CEU...
Converting genotypes for 340 variants and 99 samples...


100%|██████████████████████████████████████| 340/340 [00:00<00:00, 78251.94it/s]



Genotype distribution:
Genotype 0: 31933 (94.87%)
Genotype 1: 1033 (3.07%)
Genotype 2: 650 (1.93%)
Genotype 9: 44 (0.13%)
Processing chromosome 22 for CEU...
Converting genotypes for 286 variants and 99 samples...


100%|██████████████████████████████████████| 286/286 [00:00<00:00, 79378.70it/s]



Genotype distribution:
Genotype 0: 26573 (93.85%)
Genotype 1: 962 (3.40%)
Genotype 2: 691 (2.44%)
Genotype 9: 88 (0.31%)

Writing genotypes to ../data/1000g/populations/CEU.geno...


100%|██████████████████████████████████| 21664/21664 [00:00<00:00, 45436.04it/s]



Conversion complete for CEU!
Files created:
1. ../data/1000g/populations/CEU.snp - SNP information (21664 variants)
2. ../data/1000g/populations/CEU.ind - Sample information (99 samples)
3. ../data/1000g/populations/CEU.geno - Genotype data

Final genotype distribution:
Genotype 0: 2025404 (94.44%)
Genotype 1: 71054 (3.31%)
Genotype 2: 46994 (2.19%)
Genotype 9: 1284 (0.06%)

Processing population: CHB
Converting data for population: CHB
Found 103 samples for CHB
Processing chromosome 1 for CHB...
Converting genotypes for 1617 variants and 103 samples...


100%|████████████████████████████████████| 1617/1617 [00:00<00:00, 84329.37it/s]



Genotype distribution:
Genotype 0: 156840 (94.17%)
Genotype 1: 4818 (2.89%)
Genotype 2: 4754 (2.85%)
Genotype 9: 139 (0.08%)
Processing chromosome 2 for CHB...
Converting genotypes for 1786 variants and 103 samples...


100%|████████████████████████████████████| 1786/1786 [00:00<00:00, 87007.84it/s]



Genotype distribution:
Genotype 0: 174885 (95.07%)
Genotype 1: 5096 (2.77%)
Genotype 2: 3699 (2.01%)
Genotype 9: 278 (0.15%)
Processing chromosome 3 for CHB...
Converting genotypes for 1446 variants and 103 samples...


100%|████████████████████████████████████| 1446/1446 [00:00<00:00, 88848.31it/s]



Genotype distribution:
Genotype 0: 141676 (95.12%)
Genotype 1: 4134 (2.78%)
Genotype 2: 3128 (2.10%)
Processing chromosome 4 for CHB...
Converting genotypes for 1366 variants and 103 samples...


100%|████████████████████████████████████| 1366/1366 [00:00<00:00, 85246.53it/s]



Genotype distribution:
Genotype 0: 133614 (94.97%)
Genotype 1: 4128 (2.93%)
Genotype 2: 2905 (2.06%)
Genotype 9: 51 (0.04%)
Processing chromosome 5 for CHB...
Converting genotypes for 1325 variants and 103 samples...


100%|████████████████████████████████████| 1325/1325 [00:00<00:00, 86313.27it/s]



Genotype distribution:
Genotype 0: 129099 (94.60%)
Genotype 1: 4332 (3.17%)
Genotype 2: 3021 (2.21%)
Genotype 9: 23 (0.02%)
Processing chromosome 6 for CHB...
Converting genotypes for 1423 variants and 103 samples...


100%|████████████████████████████████████| 1423/1423 [00:00<00:00, 86314.78it/s]



Genotype distribution:
Genotype 0: 136930 (93.42%)
Genotype 1: 5116 (3.49%)
Genotype 2: 4522 (3.09%)
Genotype 9: 1 (0.00%)
Processing chromosome 7 for CHB...
Converting genotypes for 1215 variants and 103 samples...


100%|████████████████████████████████████| 1215/1215 [00:00<00:00, 86656.23it/s]



Genotype distribution:
Genotype 0: 118534 (94.72%)
Genotype 1: 4318 (3.45%)
Genotype 2: 2291 (1.83%)
Genotype 9: 2 (0.00%)
Processing chromosome 8 for CHB...
Converting genotypes for 1382 variants and 103 samples...


100%|████████████████████████████████████| 1382/1382 [00:00<00:00, 85865.59it/s]



Genotype distribution:
Genotype 0: 135979 (95.53%)
Genotype 1: 3750 (2.63%)
Genotype 2: 2397 (1.68%)
Genotype 9: 220 (0.15%)
Processing chromosome 9 for CHB...
Converting genotypes for 1038 variants and 103 samples...


100%|████████████████████████████████████| 1038/1038 [00:00<00:00, 82908.43it/s]



Genotype distribution:
Genotype 0: 101124 (94.58%)
Genotype 1: 3430 (3.21%)
Genotype 2: 2276 (2.13%)
Genotype 9: 84 (0.08%)
Processing chromosome 10 for CHB...
Converting genotypes for 1128 variants and 103 samples...


100%|████████████████████████████████████| 1128/1128 [00:00<00:00, 85745.42it/s]


Genotype distribution:
Genotype 0: 110293 (94.93%)
Genotype 1: 3162 (2.72%)
Genotype 2: 2729 (2.35%)
Processing chromosome 11 for CHB...





Converting genotypes for 1154 variants and 103 samples...


100%|████████████████████████████████████| 1154/1154 [00:00<00:00, 88320.47it/s]


Genotype distribution:
Genotype 0: 113880 (95.81%)
Genotype 1: 2900 (2.44%)
Genotype 2: 2042 (1.72%)
Genotype 9: 40 (0.03%)
Processing chromosome 12 for CHB...





Converting genotypes for 1046 variants and 103 samples...


100%|████████████████████████████████████| 1046/1046 [00:00<00:00, 87655.43it/s]


Genotype distribution:
Genotype 0: 101390 (94.11%)
Genotype 1: 3248 (3.01%)
Genotype 2: 2955 (2.74%)
Genotype 9: 145 (0.13%)
Processing chromosome 13 for CHB...





Converting genotypes for 773 variants and 103 samples...


100%|██████████████████████████████████████| 773/773 [00:00<00:00, 84553.32it/s]


Genotype distribution:
Genotype 0: 74869 (94.03%)
Genotype 1: 2473 (3.11%)
Genotype 2: 2173 (2.73%)
Genotype 9: 104 (0.13%)
Processing chromosome 14 for CHB...





Converting genotypes for 667 variants and 103 samples...


100%|██████████████████████████████████████| 667/667 [00:00<00:00, 88982.21it/s]


Genotype distribution:
Genotype 0: 65078 (94.73%)
Genotype 1: 1996 (2.91%)
Genotype 2: 1495 (2.18%)
Genotype 9: 132 (0.19%)
Processing chromosome 15 for CHB...





Converting genotypes for 637 variants and 103 samples...


100%|██████████████████████████████████████| 637/637 [00:00<00:00, 82429.01it/s]


Genotype distribution:
Genotype 0: 62482 (95.23%)
Genotype 1: 1845 (2.81%)
Genotype 2: 1284 (1.96%)
Processing chromosome 16 for CHB...





Converting genotypes for 835 variants and 103 samples...


100%|██████████████████████████████████████| 835/835 [00:00<00:00, 88100.11it/s]


Genotype distribution:
Genotype 0: 81298 (94.53%)
Genotype 1: 2971 (3.45%)
Genotype 2: 1734 (2.02%)
Genotype 9: 2 (0.00%)
Processing chromosome 17 for CHB...





Converting genotypes for 664 variants and 103 samples...


100%|██████████████████████████████████████| 664/664 [00:00<00:00, 80671.37it/s]


Genotype distribution:
Genotype 0: 58412 (85.41%)
Genotype 1: 5212 (7.62%)
Genotype 2: 4649 (6.80%)
Genotype 9: 119 (0.17%)
Processing chromosome 18 for CHB...





Converting genotypes for 598 variants and 103 samples...


100%|██████████████████████████████████████| 598/598 [00:00<00:00, 87271.88it/s]


Genotype distribution:
Genotype 0: 58655 (95.23%)
Genotype 1: 1629 (2.64%)
Genotype 2: 1250 (2.03%)
Genotype 9: 60 (0.10%)
Processing chromosome 19 for CHB...
Converting genotypes for 367 variants and 103 samples...



100%|██████████████████████████████████████| 367/367 [00:00<00:00, 75779.53it/s]



Genotype distribution:
Genotype 0: 36230 (95.84%)
Genotype 1: 1144 (3.03%)
Genotype 2: 427 (1.13%)
Processing chromosome 20 for CHB...
Converting genotypes for 571 variants and 103 samples...


100%|██████████████████████████████████████| 571/571 [00:00<00:00, 83871.39it/s]


Genotype distribution:
Genotype 0: 55347 (94.11%)
Genotype 1: 1969 (3.35%)
Genotype 2: 1497 (2.55%)
Processing chromosome 21 for CHB...





Converting genotypes for 340 variants and 103 samples...


100%|██████████████████████████████████████| 340/340 [00:00<00:00, 83910.76it/s]


Genotype distribution:
Genotype 0: 33415 (95.42%)
Genotype 1: 1028 (2.94%)
Genotype 2: 566 (1.62%)
Genotype 9: 11 (0.03%)
Processing chromosome 22 for CHB...





Converting genotypes for 286 variants and 103 samples...


100%|██████████████████████████████████████| 286/286 [00:00<00:00, 80562.19it/s]



Genotype distribution:
Genotype 0: 27859 (94.57%)
Genotype 1: 837 (2.84%)
Genotype 2: 713 (2.42%)
Genotype 9: 49 (0.17%)

Writing genotypes to ../data/1000g/populations/CHB.geno...


100%|██████████████████████████████████| 21664/21664 [00:00<00:00, 44079.57it/s]



Conversion complete for CHB!
Files created:
1. ../data/1000g/populations/CHB.snp - SNP information (21664 variants)
2. ../data/1000g/populations/CHB.ind - Sample information (103 samples)
3. ../data/1000g/populations/CHB.geno - Genotype data

Final genotype distribution:
Genotype 0: 2107889 (94.47%)
Genotype 1: 69536 (3.12%)
Genotype 2: 52507 (2.35%)
Genotype 9: 1460 (0.07%)

Processing population: JPT
Converting data for population: JPT
Found 104 samples for JPT
Processing chromosome 1 for JPT...
Converting genotypes for 1617 variants and 104 samples...


100%|████████████████████████████████████| 1617/1617 [00:00<00:00, 85101.82it/s]



Genotype distribution:
Genotype 0: 158456 (94.22%)
Genotype 1: 4802 (2.86%)
Genotype 2: 4763 (2.83%)
Genotype 9: 147 (0.09%)
Processing chromosome 2 for JPT...
Converting genotypes for 1786 variants and 104 samples...


100%|████████████████████████████████████| 1786/1786 [00:00<00:00, 87515.06it/s]



Genotype distribution:
Genotype 0: 176541 (95.05%)
Genotype 1: 5147 (2.77%)
Genotype 2: 3768 (2.03%)
Genotype 9: 288 (0.16%)
Processing chromosome 3 for JPT...
Converting genotypes for 1446 variants and 104 samples...


100%|████████████████████████████████████| 1446/1446 [00:00<00:00, 82760.85it/s]



Genotype distribution:
Genotype 0: 142893 (95.02%)
Genotype 1: 4101 (2.73%)
Genotype 2: 3390 (2.25%)
Processing chromosome 4 for JPT...
Converting genotypes for 1366 variants and 104 samples...


100%|████████████████████████████████████| 1366/1366 [00:00<00:00, 85604.44it/s]



Genotype distribution:
Genotype 0: 134878 (94.94%)
Genotype 1: 4180 (2.94%)
Genotype 2: 2959 (2.08%)
Genotype 9: 47 (0.03%)
Processing chromosome 5 for JPT...
Converting genotypes for 1325 variants and 104 samples...


100%|████████████████████████████████████| 1325/1325 [00:00<00:00, 86039.34it/s]



Genotype distribution:
Genotype 0: 130196 (94.48%)
Genotype 1: 4358 (3.16%)
Genotype 2: 3214 (2.33%)
Genotype 9: 32 (0.02%)
Processing chromosome 6 for JPT...
Converting genotypes for 1423 variants and 104 samples...


100%|████████████████████████████████████| 1423/1423 [00:00<00:00, 84425.98it/s]



Genotype distribution:
Genotype 0: 138460 (93.56%)
Genotype 1: 5004 (3.38%)
Genotype 2: 4524 (3.06%)
Genotype 9: 4 (0.00%)
Processing chromosome 7 for JPT...
Converting genotypes for 1215 variants and 104 samples...


100%|████████████████████████████████████| 1215/1215 [00:00<00:00, 86195.99it/s]



Genotype distribution:
Genotype 0: 119574 (94.63%)
Genotype 1: 4358 (3.45%)
Genotype 2: 2426 (1.92%)
Genotype 9: 2 (0.00%)
Processing chromosome 8 for JPT...
Converting genotypes for 1382 variants and 104 samples...


100%|████████████████████████████████████| 1382/1382 [00:00<00:00, 85630.92it/s]



Genotype distribution:
Genotype 0: 137114 (95.40%)
Genotype 1: 3873 (2.69%)
Genotype 2: 2509 (1.75%)
Genotype 9: 232 (0.16%)
Processing chromosome 9 for JPT...
Converting genotypes for 1038 variants and 104 samples...


100%|████████████████████████████████████| 1038/1038 [00:00<00:00, 81957.94it/s]


Genotype distribution:
Genotype 0: 102141 (94.62%)
Genotype 1: 3519 (3.26%)
Genotype 2: 2238 (2.07%)
Genotype 9: 54 (0.05%)
Processing chromosome 10 for JPT...





Converting genotypes for 1128 variants and 104 samples...


100%|████████████████████████████████████| 1128/1128 [00:00<00:00, 87625.71it/s]


Genotype distribution:
Genotype 0: 111326 (94.90%)
Genotype 1: 3233 (2.76%)
Genotype 2: 2753 (2.35%)
Processing chromosome 11 for JPT...





Converting genotypes for 1154 variants and 104 samples...


100%|████████████████████████████████████| 1154/1154 [00:00<00:00, 84777.94it/s]


Genotype distribution:
Genotype 0: 114906 (95.74%)
Genotype 1: 2970 (2.47%)
Genotype 2: 2109 (1.76%)
Genotype 9: 31 (0.03%)
Processing chromosome 12 for JPT...





Converting genotypes for 1046 variants and 104 samples...


100%|████████████████████████████████████| 1046/1046 [00:00<00:00, 83407.64it/s]


Genotype distribution:
Genotype 0: 102469 (94.19%)
Genotype 1: 3211 (2.95%)
Genotype 2: 2978 (2.74%)
Genotype 9: 126 (0.12%)
Processing chromosome 13 for JPT...





Converting genotypes for 773 variants and 104 samples...


100%|██████████████████████████████████████| 773/773 [00:00<00:00, 79346.98it/s]


Genotype distribution:
Genotype 0: 75528 (93.95%)
Genotype 1: 2572 (3.20%)
Genotype 2: 2207 (2.75%)
Genotype 9: 85 (0.11%)
Processing chromosome 14 for JPT...





Converting genotypes for 667 variants and 104 samples...


100%|██████████████████████████████████████| 667/667 [00:00<00:00, 87017.13it/s]


Genotype distribution:
Genotype 0: 65781 (94.83%)
Genotype 1: 1934 (2.79%)
Genotype 2: 1537 (2.22%)
Genotype 9: 116 (0.17%)
Processing chromosome 15 for JPT...





Converting genotypes for 637 variants and 104 samples...


100%|██████████████████████████████████████| 637/637 [00:00<00:00, 87838.11it/s]


Genotype distribution:
Genotype 0: 63085 (95.23%)
Genotype 1: 1854 (2.80%)
Genotype 2: 1309 (1.98%)
Processing chromosome 16 for JPT...





Converting genotypes for 835 variants and 104 samples...


100%|██████████████████████████████████████| 835/835 [00:00<00:00, 83516.01it/s]


Genotype distribution:
Genotype 0: 82137 (94.58%)
Genotype 1: 2915 (3.36%)
Genotype 2: 1783 (2.05%)
Genotype 9: 5 (0.01%)
Processing chromosome 17 for JPT...





Converting genotypes for 664 variants and 104 samples...


100%|██████████████████████████████████████| 664/664 [00:00<00:00, 81063.51it/s]


Genotype distribution:
Genotype 0: 58966 (85.39%)
Genotype 1: 4668 (6.76%)
Genotype 2: 5308 (7.69%)
Genotype 9: 114 (0.17%)
Processing chromosome 18 for JPT...





Converting genotypes for 598 variants and 104 samples...


100%|██████████████████████████████████████| 598/598 [00:00<00:00, 87821.91it/s]



Genotype distribution:
Genotype 0: 59164 (95.13%)
Genotype 1: 1671 (2.69%)
Genotype 2: 1289 (2.07%)
Genotype 9: 68 (0.11%)
Processing chromosome 19 for JPT...
Converting genotypes for 367 variants and 104 samples...


100%|██████████████████████████████████████| 367/367 [00:00<00:00, 79674.41it/s]



Genotype distribution:
Genotype 0: 36609 (95.92%)
Genotype 1: 1109 (2.91%)
Genotype 2: 450 (1.18%)
Processing chromosome 20 for JPT...
Converting genotypes for 571 variants and 104 samples...


100%|██████████████████████████████████████| 571/571 [00:00<00:00, 80580.99it/s]


Genotype distribution:
Genotype 0: 55764 (93.90%)
Genotype 1: 2071 (3.49%)
Genotype 2: 1549 (2.61%)
Processing chromosome 21 for JPT...





Converting genotypes for 340 variants and 104 samples...


100%|██████████████████████████████████████| 340/340 [00:00<00:00, 81872.97it/s]


Genotype distribution:
Genotype 0: 33756 (95.46%)
Genotype 1: 1080 (3.05%)
Genotype 2: 519 (1.47%)
Genotype 9: 5 (0.01%)
Processing chromosome 22 for JPT...





Converting genotypes for 286 variants and 104 samples...


100%|██████████████████████████████████████| 286/286 [00:00<00:00, 82615.08it/s]



Genotype distribution:
Genotype 0: 28163 (94.68%)
Genotype 1: 836 (2.81%)
Genotype 2: 705 (2.37%)
Genotype 9: 40 (0.13%)

Writing genotypes to ../data/1000g/populations/JPT.geno...


100%|██████████████████████████████████| 21664/21664 [00:00<00:00, 42836.91it/s]



Conversion complete for JPT!
Files created:
1. ../data/1000g/populations/JPT.snp - SNP information (21664 variants)
2. ../data/1000g/populations/JPT.ind - Sample information (104 samples)
3. ../data/1000g/populations/JPT.geno - Genotype data

Final genotype distribution:
Genotype 0: 2127907 (94.45%)
Genotype 1: 69466 (3.08%)
Genotype 2: 54287 (2.41%)
Genotype 9: 1396 (0.06%)

Processing population: LWK
Converting data for population: LWK
Found 99 samples for LWK
Processing chromosome 1 for LWK...
Converting genotypes for 1617 variants and 99 samples...


100%|████████████████████████████████████| 1617/1617 [00:00<00:00, 89322.78it/s]



Genotype distribution:
Genotype 0: 149453 (93.36%)
Genotype 1: 6622 (4.14%)
Genotype 2: 3941 (2.46%)
Genotype 9: 67 (0.04%)
Processing chromosome 2 for LWK...
Converting genotypes for 1786 variants and 99 samples...


100%|████████████████████████████████████| 1786/1786 [00:00<00:00, 92842.87it/s]



Genotype distribution:
Genotype 0: 165684 (93.71%)
Genotype 1: 7296 (4.13%)
Genotype 2: 3670 (2.08%)
Genotype 9: 164 (0.09%)
Processing chromosome 3 for LWK...
Converting genotypes for 1446 variants and 99 samples...


100%|████████████████████████████████████| 1446/1446 [00:00<00:00, 90669.35it/s]



Genotype distribution:
Genotype 0: 134142 (93.70%)
Genotype 1: 5881 (4.11%)
Genotype 2: 3108 (2.17%)
Genotype 9: 23 (0.02%)
Processing chromosome 4 for LWK...
Converting genotypes for 1366 variants and 99 samples...


100%|████████████████████████████████████| 1366/1366 [00:00<00:00, 89417.39it/s]



Genotype distribution:
Genotype 0: 127193 (94.05%)
Genotype 1: 5595 (4.14%)
Genotype 2: 2359 (1.74%)
Genotype 9: 87 (0.06%)
Processing chromosome 5 for LWK...
Converting genotypes for 1325 variants and 99 samples...


100%|████████████████████████████████████| 1325/1325 [00:00<00:00, 90591.93it/s]



Genotype distribution:
Genotype 0: 122140 (93.11%)
Genotype 1: 5940 (4.53%)
Genotype 2: 3020 (2.30%)
Genotype 9: 75 (0.06%)
Processing chromosome 6 for LWK...
Converting genotypes for 1423 variants and 99 samples...


100%|████████████████████████████████████| 1423/1423 [00:00<00:00, 91866.81it/s]



Genotype distribution:
Genotype 0: 130775 (92.83%)
Genotype 1: 6122 (4.35%)
Genotype 2: 3897 (2.77%)
Genotype 9: 83 (0.06%)
Processing chromosome 7 for LWK...
Converting genotypes for 1215 variants and 99 samples...


100%|████████████████████████████████████| 1215/1215 [00:00<00:00, 87922.56it/s]



Genotype distribution:
Genotype 0: 112634 (93.64%)
Genotype 1: 5337 (4.44%)
Genotype 2: 2235 (1.86%)
Genotype 9: 79 (0.07%)
Processing chromosome 8 for LWK...
Converting genotypes for 1382 variants and 99 samples...


100%|████████████████████████████████████| 1382/1382 [00:00<00:00, 88499.31it/s]


Genotype distribution:
Genotype 0: 129612 (94.73%)
Genotype 1: 4843 (3.54%)
Genotype 2: 2167 (1.58%)
Genotype 9: 196 (0.14%)





Processing chromosome 9 for LWK...
Converting genotypes for 1038 variants and 99 samples...


100%|████████████████████████████████████| 1038/1038 [00:00<00:00, 87829.08it/s]


Genotype distribution:
Genotype 0: 96362 (93.77%)
Genotype 1: 4518 (4.40%)
Genotype 2: 1805 (1.76%)
Genotype 9: 77 (0.07%)
Processing chromosome 10 for LWK...





Converting genotypes for 1128 variants and 99 samples...


100%|████████████████████████████████████| 1128/1128 [00:00<00:00, 88352.26it/s]


Genotype distribution:
Genotype 0: 104857 (93.90%)
Genotype 1: 4453 (3.99%)
Genotype 2: 2362 (2.12%)
Processing chromosome 11 for LWK...





Converting genotypes for 1154 variants and 99 samples...


100%|████████████████████████████████████| 1154/1154 [00:00<00:00, 91893.74it/s]


Genotype distribution:
Genotype 0: 107998 (94.53%)
Genotype 1: 4030 (3.53%)
Genotype 2: 2175 (1.90%)
Genotype 9: 43 (0.04%)
Processing chromosome 12 for LWK...





Converting genotypes for 1046 variants and 99 samples...


100%|████████████████████████████████████| 1046/1046 [00:00<00:00, 92688.86it/s]


Genotype distribution:
Genotype 0: 96625 (93.31%)
Genotype 1: 4425 (4.27%)
Genotype 2: 2291 (2.21%)
Genotype 9: 213 (0.21%)
Processing chromosome 13 for LWK...





Converting genotypes for 773 variants and 99 samples...


100%|██████████████████████████████████████| 773/773 [00:00<00:00, 87812.06it/s]


Genotype distribution:
Genotype 0: 71633 (93.60%)
Genotype 1: 3042 (3.98%)
Genotype 2: 1783 (2.33%)
Genotype 9: 69 (0.09%)
Processing chromosome 14 for LWK...





Converting genotypes for 667 variants and 99 samples...


100%|██████████████████████████████████████| 667/667 [00:00<00:00, 85139.56it/s]


Genotype distribution:
Genotype 0: 61356 (92.92%)
Genotype 1: 3037 (4.60%)
Genotype 2: 1493 (2.26%)
Genotype 9: 147 (0.22%)
Processing chromosome 15 for LWK...





Converting genotypes for 637 variants and 99 samples...


100%|██████████████████████████████████████| 637/637 [00:00<00:00, 86358.90it/s]


Genotype distribution:
Genotype 0: 59474 (94.31%)
Genotype 1: 2335 (3.70%)
Genotype 2: 1254 (1.99%)
Processing chromosome 16 for LWK...





Converting genotypes for 835 variants and 99 samples...


100%|██████████████████████████████████████| 835/835 [00:00<00:00, 90027.35it/s]


Genotype distribution:
Genotype 0: 76944 (93.08%)
Genotype 1: 3810 (4.61%)
Genotype 2: 1873 (2.27%)
Genotype 9: 38 (0.05%)
Processing chromosome 17 for LWK...





Converting genotypes for 664 variants and 99 samples...


100%|██████████████████████████████████████| 664/664 [00:00<00:00, 84512.29it/s]


Genotype distribution:
Genotype 0: 56117 (85.37%)
Genotype 1: 5824 (8.86%)
Genotype 2: 3739 (5.69%)
Genotype 9: 56 (0.09%)
Processing chromosome 18 for LWK...





Converting genotypes for 598 variants and 99 samples...


100%|██████████████████████████████████████| 598/598 [00:00<00:00, 85246.02it/s]


Genotype distribution:
Genotype 0: 56248 (95.01%)
Genotype 1: 1857 (3.14%)
Genotype 2: 1062 (1.79%)
Genotype 9: 35 (0.06%)
Processing chromosome 19 for LWK...





Converting genotypes for 367 variants and 99 samples...


100%|██████████████████████████████████████| 367/367 [00:00<00:00, 79349.94it/s]


Genotype distribution:
Genotype 0: 34604 (95.24%)
Genotype 1: 1220 (3.36%)
Genotype 2: 509 (1.40%)
Processing chromosome 20 for LWK...





Converting genotypes for 571 variants and 99 samples...


100%|██████████████████████████████████████| 571/571 [00:00<00:00, 82873.03it/s]



Genotype distribution:
Genotype 0: 52451 (92.79%)
Genotype 1: 2493 (4.41%)
Genotype 2: 1583 (2.80%)
Genotype 9: 2 (0.00%)
Processing chromosome 21 for LWK...
Converting genotypes for 340 variants and 99 samples...


100%|██████████████████████████████████████| 340/340 [00:00<00:00, 77786.69it/s]


Genotype distribution:
Genotype 0: 31722 (94.24%)
Genotype 1: 1344 (3.99%)
Genotype 2: 585 (1.74%)
Genotype 9: 9 (0.03%)
Processing chromosome 22 for LWK...





Converting genotypes for 286 variants and 99 samples...


100%|██████████████████████████████████████| 286/286 [00:00<00:00, 77257.10it/s]


Genotype distribution:
Genotype 0: 26226 (92.63%)
Genotype 1: 1263 (4.46%)
Genotype 2: 740 (2.61%)
Genotype 9: 85 (0.30%)






Writing genotypes to ../data/1000g/populations/LWK.geno...


100%|██████████████████████████████████| 21664/21664 [00:00<00:00, 44540.29it/s]



Conversion complete for LWK!
Files created:
1. ../data/1000g/populations/LWK.snp - SNP information (21664 variants)
2. ../data/1000g/populations/LWK.ind - Sample information (99 samples)
3. ../data/1000g/populations/LWK.geno - Genotype data

Final genotype distribution:
Genotype 0: 2004250 (93.45%)
Genotype 1: 91287 (4.26%)
Genotype 2: 47651 (2.22%)
Genotype 9: 1548 (0.07%)

Processing population: MXL
Converting data for population: MXL
Found 64 samples for MXL
Processing chromosome 1 for MXL...
Converting genotypes for 1617 variants and 64 samples...


100%|███████████████████████████████████| 1617/1617 [00:00<00:00, 140217.69it/s]



Genotype distribution:
Genotype 0: 97713 (94.42%)
Genotype 1: 3145 (3.04%)
Genotype 2: 2574 (2.49%)
Genotype 9: 56 (0.05%)
Processing chromosome 2 for MXL...
Converting genotypes for 1786 variants and 64 samples...


100%|███████████████████████████████████| 1786/1786 [00:00<00:00, 137778.68it/s]



Genotype distribution:
Genotype 0: 108584 (95.00%)
Genotype 1: 3545 (3.10%)
Genotype 2: 2006 (1.75%)
Genotype 9: 169 (0.15%)
Processing chromosome 3 for MXL...
Converting genotypes for 1446 variants and 64 samples...


100%|███████████████████████████████████| 1446/1446 [00:00<00:00, 137101.60it/s]



Genotype distribution:
Genotype 0: 87835 (94.91%)
Genotype 1: 2862 (3.09%)
Genotype 2: 1847 (2.00%)
Processing chromosome 4 for MXL...
Converting genotypes for 1366 variants and 64 samples...


100%|███████████████████████████████████| 1366/1366 [00:00<00:00, 134106.20it/s]



Genotype distribution:
Genotype 0: 82948 (94.88%)
Genotype 1: 2734 (3.13%)
Genotype 2: 1709 (1.95%)
Genotype 9: 33 (0.04%)
Processing chromosome 5 for MXL...
Converting genotypes for 1325 variants and 64 samples...


100%|███████████████████████████████████| 1325/1325 [00:00<00:00, 131630.81it/s]



Genotype distribution:
Genotype 0: 79906 (94.23%)
Genotype 1: 3108 (3.67%)
Genotype 2: 1770 (2.09%)
Genotype 9: 16 (0.02%)
Processing chromosome 6 for MXL...
Converting genotypes for 1423 variants and 64 samples...


100%|███████████████████████████████████| 1423/1423 [00:00<00:00, 135743.24it/s]


Genotype distribution:
Genotype 0: 84910 (93.23%)
Genotype 1: 3674 (4.03%)
Genotype 2: 2481 (2.72%)
Genotype 9: 7 (0.01%)
Processing chromosome 7 for MXL...





Converting genotypes for 1215 variants and 64 samples...


100%|███████████████████████████████████| 1215/1215 [00:00<00:00, 136547.24it/s]


Genotype distribution:
Genotype 0: 73861 (94.99%)
Genotype 1: 2610 (3.36%)
Genotype 2: 1280 (1.65%)
Genotype 9: 9 (0.01%)
Processing chromosome 8 for MXL...





Converting genotypes for 1382 variants and 64 samples...


100%|███████████████████████████████████| 1382/1382 [00:00<00:00, 142298.47it/s]


Genotype distribution:
Genotype 0: 84502 (95.54%)
Genotype 1: 2422 (2.74%)
Genotype 2: 1397 (1.58%)
Genotype 9: 127 (0.14%)
Processing chromosome 9 for MXL...





Converting genotypes for 1038 variants and 64 samples...


100%|███████████████████████████████████| 1038/1038 [00:00<00:00, 138344.06it/s]


Genotype distribution:
Genotype 0: 63061 (94.93%)
Genotype 1: 2089 (3.14%)
Genotype 2: 1243 (1.87%)
Genotype 9: 39 (0.06%)
Processing chromosome 10 for MXL...





Converting genotypes for 1128 variants and 64 samples...


100%|███████████████████████████████████| 1128/1128 [00:00<00:00, 136956.87it/s]


Genotype distribution:
Genotype 0: 68390 (94.73%)
Genotype 1: 2234 (3.09%)
Genotype 2: 1564 (2.17%)
Genotype 9: 4 (0.01%)
Processing chromosome 11 for MXL...





Converting genotypes for 1154 variants and 64 samples...


100%|███████████████████████████████████| 1154/1154 [00:00<00:00, 136244.63it/s]


Genotype distribution:
Genotype 0: 70496 (95.45%)
Genotype 1: 2030 (2.75%)
Genotype 2: 1315 (1.78%)
Genotype 9: 15 (0.02%)
Processing chromosome 12 for MXL...





Converting genotypes for 1046 variants and 64 samples...


100%|███████████████████████████████████| 1046/1046 [00:00<00:00, 127543.52it/s]


Genotype distribution:
Genotype 0: 62758 (93.75%)
Genotype 1: 2345 (3.50%)
Genotype 2: 1771 (2.65%)
Genotype 9: 70 (0.10%)
Processing chromosome 13 for MXL...





Converting genotypes for 773 variants and 64 samples...


100%|█████████████████████████████████████| 773/773 [00:00<00:00, 126514.89it/s]


Genotype distribution:
Genotype 0: 46350 (93.69%)
Genotype 1: 1727 (3.49%)
Genotype 2: 1334 (2.70%)
Genotype 9: 61 (0.12%)
Processing chromosome 14 for MXL...





Converting genotypes for 667 variants and 64 samples...


100%|█████████████████████████████████████| 667/667 [00:00<00:00, 125893.29it/s]


Genotype distribution:
Genotype 0: 40282 (94.36%)
Genotype 1: 1440 (3.37%)
Genotype 2: 902 (2.11%)
Genotype 9: 64 (0.15%)
Processing chromosome 15 for MXL...





Converting genotypes for 637 variants and 64 samples...


100%|█████████████████████████████████████| 637/637 [00:00<00:00, 136544.78it/s]


Genotype distribution:
Genotype 0: 38855 (95.31%)
Genotype 1: 1194 (2.93%)
Genotype 2: 719 (1.76%)
Processing chromosome 16 for MXL...





Converting genotypes for 835 variants and 64 samples...


100%|█████████████████████████████████████| 835/835 [00:00<00:00, 136865.21it/s]


Genotype distribution:
Genotype 0: 50420 (94.35%)
Genotype 1: 2013 (3.77%)
Genotype 2: 992 (1.86%)
Genotype 9: 15 (0.03%)
Processing chromosome 17 for MXL...





Converting genotypes for 664 variants and 64 samples...


100%|█████████████████████████████████████| 664/664 [00:00<00:00, 133817.89it/s]


Genotype distribution:
Genotype 0: 36259 (85.32%)
Genotype 1: 3402 (8.01%)
Genotype 2: 2764 (6.50%)
Genotype 9: 71 (0.17%)
Processing chromosome 18 for MXL...





Converting genotypes for 598 variants and 64 samples...


100%|█████████████████████████████████████| 598/598 [00:00<00:00, 135292.83it/s]



Genotype distribution:
Genotype 0: 36555 (95.51%)
Genotype 1: 1005 (2.63%)
Genotype 2: 702 (1.83%)
Genotype 9: 10 (0.03%)
Processing chromosome 19 for MXL...
Converting genotypes for 367 variants and 64 samples...


100%|█████████████████████████████████████| 367/367 [00:00<00:00, 126556.74it/s]


Genotype distribution:
Genotype 0: 22521 (95.88%)
Genotype 1: 683 (2.91%)
Genotype 2: 283 (1.20%)
Genotype 9: 1 (0.00%)
Processing chromosome 20 for MXL...





Converting genotypes for 571 variants and 64 samples...


100%|█████████████████████████████████████| 571/571 [00:00<00:00, 118385.94it/s]



Genotype distribution:
Genotype 0: 34283 (93.81%)
Genotype 1: 1363 (3.73%)
Genotype 2: 898 (2.46%)
Processing chromosome 21 for MXL...
Converting genotypes for 340 variants and 64 samples...


100%|█████████████████████████████████████| 340/340 [00:00<00:00, 120008.70it/s]


Genotype distribution:
Genotype 0: 20711 (95.18%)
Genotype 1: 669 (3.07%)
Genotype 2: 364 (1.67%)
Genotype 9: 16 (0.07%)
Processing chromosome 22 for MXL...





Converting genotypes for 286 variants and 64 samples...


100%|█████████████████████████████████████| 286/286 [00:00<00:00, 116928.64it/s]



Genotype distribution:
Genotype 0: 17206 (94.00%)
Genotype 1: 596 (3.26%)
Genotype 2: 445 (2.43%)
Genotype 9: 57 (0.31%)

Writing genotypes to ../data/1000g/populations/MXL.geno...


100%|██████████████████████████████████| 21664/21664 [00:00<00:00, 69352.05it/s]



Conversion complete for MXL!
Files created:
1. ../data/1000g/populations/MXL.snp - SNP information (21664 variants)
2. ../data/1000g/populations/MXL.ind - Sample information (64 samples)
3. ../data/1000g/populations/MXL.geno - Genotype data

Final genotype distribution:
Genotype 0: 1308406 (94.37%)
Genotype 1: 46890 (3.38%)
Genotype 2: 30360 (2.19%)
Genotype 9: 840 (0.06%)

Processing population: TSI
Converting data for population: TSI
Found 107 samples for TSI
Processing chromosome 1 for TSI...
Converting genotypes for 1617 variants and 107 samples...


100%|████████████████████████████████████| 1617/1617 [00:00<00:00, 84402.83it/s]



Genotype distribution:
Genotype 0: 163716 (94.62%)
Genotype 1: 5087 (2.94%)
Genotype 2: 4126 (2.38%)
Genotype 9: 90 (0.05%)
Processing chromosome 2 for TSI...
Converting genotypes for 1786 variants and 107 samples...


100%|████████████████████████████████████| 1786/1786 [00:00<00:00, 85171.76it/s]



Genotype distribution:
Genotype 0: 181658 (95.06%)
Genotype 1: 5788 (3.03%)
Genotype 2: 3410 (1.78%)
Genotype 9: 246 (0.13%)
Processing chromosome 3 for TSI...
Converting genotypes for 1446 variants and 107 samples...


100%|████████████████████████████████████| 1446/1446 [00:00<00:00, 86748.91it/s]



Genotype distribution:
Genotype 0: 147121 (95.09%)
Genotype 1: 4428 (2.86%)
Genotype 2: 3172 (2.05%)
Genotype 9: 1 (0.00%)
Processing chromosome 4 for TSI...
Converting genotypes for 1366 variants and 107 samples...


100%|████████████████████████████████████| 1366/1366 [00:00<00:00, 83979.53it/s]



Genotype distribution:
Genotype 0: 138724 (94.91%)
Genotype 1: 4533 (3.10%)
Genotype 2: 2841 (1.94%)
Genotype 9: 64 (0.04%)
Processing chromosome 5 for TSI...
Converting genotypes for 1325 variants and 107 samples...


100%|████████████████████████████████████| 1325/1325 [00:00<00:00, 83712.97it/s]



Genotype distribution:
Genotype 0: 133711 (94.31%)
Genotype 1: 5057 (3.57%)
Genotype 2: 2981 (2.10%)
Genotype 9: 26 (0.02%)
Processing chromosome 6 for TSI...
Converting genotypes for 1423 variants and 107 samples...


100%|████████████████████████████████████| 1423/1423 [00:00<00:00, 84908.80it/s]



Genotype distribution:
Genotype 0: 141780 (93.12%)
Genotype 1: 6448 (4.23%)
Genotype 2: 4019 (2.64%)
Genotype 9: 14 (0.01%)
Processing chromosome 7 for TSI...
Converting genotypes for 1215 variants and 107 samples...


100%|████████████████████████████████████| 1215/1215 [00:00<00:00, 84526.11it/s]



Genotype distribution:
Genotype 0: 123413 (94.93%)
Genotype 1: 4410 (3.39%)
Genotype 2: 2169 (1.67%)
Genotype 9: 13 (0.01%)
Processing chromosome 8 for TSI...
Converting genotypes for 1382 variants and 107 samples...


100%|████████████████████████████████████| 1382/1382 [00:00<00:00, 83363.70it/s]



Genotype distribution:
Genotype 0: 141470 (95.67%)
Genotype 1: 3916 (2.65%)
Genotype 2: 2293 (1.55%)
Genotype 9: 195 (0.13%)
Processing chromosome 9 for TSI...
Converting genotypes for 1038 variants and 107 samples...


100%|████████████████████████████████████| 1038/1038 [00:00<00:00, 84056.14it/s]


Genotype distribution:
Genotype 0: 105494 (94.98%)
Genotype 1: 3577 (3.22%)
Genotype 2: 1925 (1.73%)
Genotype 9: 70 (0.06%)
Processing chromosome 10 for TSI...





Converting genotypes for 1128 variants and 107 samples...


100%|████████████████████████████████████| 1128/1128 [00:00<00:00, 83210.36it/s]


Genotype distribution:
Genotype 0: 114855 (95.16%)
Genotype 1: 3511 (2.91%)
Genotype 2: 2330 (1.93%)
Processing chromosome 11 for TSI...





Converting genotypes for 1154 variants and 107 samples...


100%|████████████████████████████████████| 1154/1154 [00:00<00:00, 84001.09it/s]


Genotype distribution:
Genotype 0: 117558 (95.21%)
Genotype 1: 3408 (2.76%)
Genotype 2: 2494 (2.02%)
Genotype 9: 18 (0.01%)
Processing chromosome 12 for TSI...





Converting genotypes for 1046 variants and 107 samples...


100%|████████████████████████████████████| 1046/1046 [00:00<00:00, 84292.23it/s]


Genotype distribution:
Genotype 0: 104791 (93.63%)
Genotype 1: 4127 (3.69%)
Genotype 2: 2910 (2.60%)
Genotype 9: 94 (0.08%)
Processing chromosome 13 for TSI...





Converting genotypes for 773 variants and 107 samples...


100%|██████████████████████████████████████| 773/773 [00:00<00:00, 83152.45it/s]


Genotype distribution:
Genotype 0: 77583 (93.80%)
Genotype 1: 2756 (3.33%)
Genotype 2: 2252 (2.72%)
Genotype 9: 120 (0.15%)
Processing chromosome 14 for TSI...





Converting genotypes for 667 variants and 107 samples...


100%|██████████████████████████████████████| 667/667 [00:00<00:00, 78841.19it/s]


Genotype distribution:
Genotype 0: 67178 (94.13%)
Genotype 1: 2426 (3.40%)
Genotype 2: 1596 (2.24%)
Genotype 9: 169 (0.24%)
Processing chromosome 15 for TSI...





Converting genotypes for 637 variants and 107 samples...


100%|██████████████████████████████████████| 637/637 [00:00<00:00, 80989.77it/s]


Genotype distribution:
Genotype 0: 64854 (95.15%)
Genotype 1: 2199 (3.23%)
Genotype 2: 1105 (1.62%)
Genotype 9: 1 (0.00%)
Processing chromosome 16 for TSI...





Converting genotypes for 835 variants and 107 samples...


100%|██████████████████████████████████████| 835/835 [00:00<00:00, 85055.47it/s]


Genotype distribution:
Genotype 0: 84462 (94.53%)
Genotype 1: 3185 (3.56%)
Genotype 2: 1659 (1.86%)
Genotype 9: 39 (0.04%)
Processing chromosome 17 for TSI...





Converting genotypes for 664 variants and 107 samples...


100%|██████████████████████████████████████| 664/664 [00:00<00:00, 81772.80it/s]


Genotype distribution:
Genotype 0: 60512 (85.17%)
Genotype 1: 5611 (7.90%)
Genotype 2: 4853 (6.83%)
Genotype 9: 72 (0.10%)
Processing chromosome 18 for TSI...





Converting genotypes for 598 variants and 107 samples...


100%|██████████████████████████████████████| 598/598 [00:00<00:00, 85580.52it/s]


Genotype distribution:
Genotype 0: 61042 (95.40%)
Genotype 1: 1554 (2.43%)
Genotype 2: 1374 (2.15%)
Genotype 9: 16 (0.03%)
Processing chromosome 19 for TSI...





Converting genotypes for 367 variants and 107 samples...


100%|██████████████████████████████████████| 367/367 [00:00<00:00, 77687.98it/s]



Genotype distribution:
Genotype 0: 37725 (96.07%)
Genotype 1: 1035 (2.64%)
Genotype 2: 509 (1.30%)
Processing chromosome 20 for TSI...
Converting genotypes for 571 variants and 107 samples...


100%|██████████████████████████████████████| 571/571 [00:00<00:00, 76995.58it/s]



Genotype distribution:
Genotype 0: 57508 (94.13%)
Genotype 1: 2099 (3.44%)
Genotype 2: 1490 (2.44%)
Processing chromosome 21 for TSI...
Converting genotypes for 340 variants and 107 samples...


100%|██████████████████████████████████████| 340/340 [00:00<00:00, 78540.69it/s]


Genotype distribution:
Genotype 0: 34491 (94.81%)
Genotype 1: 1169 (3.21%)
Genotype 2: 671 (1.84%)
Genotype 9: 49 (0.13%)
Processing chromosome 22 for TSI...





Converting genotypes for 286 variants and 107 samples...


100%|██████████████████████████████████████| 286/286 [00:00<00:00, 75302.63it/s]


Genotype distribution:
Genotype 0: 28769 (94.01%)
Genotype 1: 950 (3.10%)
Genotype 2: 785 (2.57%)
Genotype 9: 98 (0.32%)






Writing genotypes to ../data/1000g/populations/TSI.geno...


100%|██████████████████████████████████| 21664/21664 [00:00<00:00, 41089.76it/s]



Conversion complete for TSI!
Files created:
1. ../data/1000g/populations/TSI.snp - SNP information (21664 variants)
2. ../data/1000g/populations/TSI.ind - Sample information (107 samples)
3. ../data/1000g/populations/TSI.geno - Genotype data

Final genotype distribution:
Genotype 0: 2188415 (94.41%)
Genotype 1: 77274 (3.33%)
Genotype 2: 50964 (2.20%)
Genotype 9: 1395 (0.06%)

Processing population: YRI
Converting data for population: YRI
Found 108 samples for YRI
Processing chromosome 1 for YRI...
Converting genotypes for 1617 variants and 108 samples...


100%|████████████████████████████████████| 1617/1617 [00:00<00:00, 85813.57it/s]



Genotype distribution:
Genotype 0: 162983 (93.33%)
Genotype 1: 7279 (4.17%)
Genotype 2: 4295 (2.46%)
Genotype 9: 79 (0.05%)
Processing chromosome 2 for YRI...
Converting genotypes for 1786 variants and 108 samples...


100%|████████████████████████████████████| 1786/1786 [00:00<00:00, 82505.75it/s]



Genotype distribution:
Genotype 0: 180716 (93.69%)
Genotype 1: 7876 (4.08%)
Genotype 2: 4107 (2.13%)
Genotype 9: 189 (0.10%)
Processing chromosome 3 for YRI...
Converting genotypes for 1446 variants and 108 samples...


100%|████████████████████████████████████| 1446/1446 [00:00<00:00, 80279.60it/s]


Genotype distribution:
Genotype 0: 146244 (93.65%)
Genotype 1: 6480 (4.15%)
Genotype 2: 3419 (2.19%)
Genotype 9: 25 (0.02%)
Processing chromosome 4 for YRI...





Converting genotypes for 1366 variants and 108 samples...


100%|████████████████████████████████████| 1366/1366 [00:00<00:00, 81914.38it/s]


Genotype distribution:
Genotype 0: 138777 (94.07%)
Genotype 1: 6197 (4.20%)
Genotype 2: 2459 (1.67%)
Genotype 9: 95 (0.06%)
Processing chromosome 5 for YRI...





Converting genotypes for 1325 variants and 108 samples...


100%|████████████████████████████████████| 1325/1325 [00:00<00:00, 79862.23it/s]


Genotype distribution:
Genotype 0: 133211 (93.09%)
Genotype 1: 6500 (4.54%)
Genotype 2: 3297 (2.30%)
Genotype 9: 92 (0.06%)





Processing chromosome 6 for YRI...
Converting genotypes for 1423 variants and 108 samples...


100%|████████████████████████████████████| 1423/1423 [00:00<00:00, 83147.51it/s]



Genotype distribution:
Genotype 0: 142443 (92.69%)
Genotype 1: 6979 (4.54%)
Genotype 2: 4165 (2.71%)
Genotype 9: 97 (0.06%)
Processing chromosome 7 for YRI...
Converting genotypes for 1215 variants and 108 samples...


100%|████████████████████████████████████| 1215/1215 [00:00<00:00, 82860.38it/s]


Genotype distribution:
Genotype 0: 122763 (93.56%)
Genotype 1: 5757 (4.39%)
Genotype 2: 2571 (1.96%)
Genotype 9: 129 (0.10%)





Processing chromosome 8 for YRI...
Converting genotypes for 1382 variants and 108 samples...


100%|████████████████████████████████████| 1382/1382 [00:00<00:00, 84032.01it/s]



Genotype distribution:
Genotype 0: 141321 (94.68%)
Genotype 1: 5309 (3.56%)
Genotype 2: 2404 (1.61%)
Genotype 9: 222 (0.15%)
Processing chromosome 9 for YRI...
Converting genotypes for 1038 variants and 108 samples...


100%|████████████████████████████████████| 1038/1038 [00:00<00:00, 75370.26it/s]



Genotype distribution:
Genotype 0: 105208 (93.85%)
Genotype 1: 4820 (4.30%)
Genotype 2: 1995 (1.78%)
Genotype 9: 81 (0.07%)
Processing chromosome 10 for YRI...
Converting genotypes for 1128 variants and 108 samples...


100%|████████████████████████████████████| 1128/1128 [00:00<00:00, 81163.37it/s]



Genotype distribution:
Genotype 0: 114383 (93.89%)
Genotype 1: 4861 (3.99%)
Genotype 2: 2580 (2.12%)
Processing chromosome 11 for YRI...
Converting genotypes for 1154 variants and 108 samples...


100%|████████████████████████████████████| 1154/1154 [00:00<00:00, 83871.54it/s]


Genotype distribution:
Genotype 0: 117717 (94.45%)
Genotype 1: 4467 (3.58%)
Genotype 2: 2383 (1.91%)
Genotype 9: 65 (0.05%)
Processing chromosome 12 for YRI...





Converting genotypes for 1046 variants and 108 samples...


100%|████████████████████████████████████| 1046/1046 [00:00<00:00, 81494.23it/s]


Genotype distribution:
Genotype 0: 105264 (93.18%)
Genotype 1: 4916 (4.35%)
Genotype 2: 2547 (2.25%)
Genotype 9: 241 (0.21%)
Processing chromosome 13 for YRI...





Converting genotypes for 773 variants and 108 samples...


100%|██████████████████████████████████████| 773/773 [00:00<00:00, 79151.34it/s]


Genotype distribution:
Genotype 0: 78211 (93.68%)
Genotype 1: 3193 (3.82%)
Genotype 2: 2003 (2.40%)
Genotype 9: 77 (0.09%)
Processing chromosome 14 for YRI...





Converting genotypes for 667 variants and 108 samples...


100%|██████████████████████████████████████| 667/667 [00:00<00:00, 81861.03it/s]


Genotype distribution:
Genotype 0: 66953 (92.94%)
Genotype 1: 3290 (4.57%)
Genotype 2: 1647 (2.29%)
Genotype 9: 146 (0.20%)
Processing chromosome 15 for YRI...





Converting genotypes for 637 variants and 108 samples...


100%|██████████████████████████████████████| 637/637 [00:00<00:00, 77636.18it/s]


Genotype distribution:
Genotype 0: 65095 (94.62%)
Genotype 1: 2355 (3.42%)
Genotype 2: 1346 (1.96%)
Processing chromosome 16 for YRI...





Converting genotypes for 835 variants and 108 samples...


100%|██████████████████████████████████████| 835/835 [00:00<00:00, 80491.00it/s]


Genotype distribution:
Genotype 0: 83947 (93.09%)
Genotype 1: 4131 (4.58%)
Genotype 2: 2080 (2.31%)
Genotype 9: 22 (0.02%)
Processing chromosome 17 for YRI...





Converting genotypes for 664 variants and 108 samples...


100%|██████████████████████████████████████| 664/664 [00:00<00:00, 78842.09it/s]


Genotype distribution:
Genotype 0: 61242 (85.40%)
Genotype 1: 6224 (8.68%)
Genotype 2: 4186 (5.84%)
Genotype 9: 60 (0.08%)
Processing chromosome 18 for YRI...





Converting genotypes for 598 variants and 108 samples...


100%|██████████████████████████████████████| 598/598 [00:00<00:00, 83520.16it/s]


Genotype distribution:
Genotype 0: 61315 (94.94%)
Genotype 1: 2099 (3.25%)
Genotype 2: 1119 (1.73%)
Genotype 9: 51 (0.08%)
Processing chromosome 19 for YRI...





Converting genotypes for 367 variants and 108 samples...


100%|██████████████████████████████████████| 367/367 [00:00<00:00, 78272.63it/s]


Genotype distribution:
Genotype 0: 37855 (95.51%)
Genotype 1: 1246 (3.14%)
Genotype 2: 535 (1.35%)
Processing chromosome 20 for YRI...





Converting genotypes for 571 variants and 108 samples...


100%|██████████████████████████████████████| 571/571 [00:00<00:00, 78561.51it/s]


Genotype distribution:
Genotype 0: 57329 (92.96%)
Genotype 1: 2550 (4.14%)
Genotype 2: 1789 (2.90%)
Processing chromosome 21 for YRI...





Converting genotypes for 340 variants and 108 samples...


100%|██████████████████████████████████████| 340/340 [00:00<00:00, 82747.09it/s]


Genotype distribution:
Genotype 0: 34555 (94.10%)
Genotype 1: 1518 (4.13%)
Genotype 2: 639 (1.74%)
Genotype 9: 8 (0.02%)
Processing chromosome 22 for YRI...
Converting genotypes for 286 variants and 108 samples...



100%|██████████████████████████████████████| 286/286 [00:00<00:00, 75302.63it/s]



Genotype distribution:
Genotype 0: 28586 (92.55%)
Genotype 1: 1324 (4.29%)
Genotype 2: 881 (2.85%)
Genotype 9: 97 (0.31%)

Writing genotypes to ../data/1000g/populations/YRI.geno...


100%|██████████████████████████████████| 21664/21664 [00:00<00:00, 41390.28it/s]


Conversion complete for YRI!
Files created:
1. ../data/1000g/populations/YRI.snp - SNP information (21664 variants)
2. ../data/1000g/populations/YRI.ind - Sample information (108 samples)
3. ../data/1000g/populations/YRI.geno - Genotype data

Final genotype distribution:
Genotype 0: 2186118 (93.44%)
Genotype 1: 99371 (4.25%)
Genotype 2: 52447 (2.24%)
Genotype 9: 1776 (0.08%)





In [7]:
import os
import subprocess
from tqdm import tqdm
import pandas as pd

class MAFFilter:
    def __init__(self, data_dir='../data/1000g', output_dir=None):
        self.data_dir = data_dir
        self.output_dir = output_dir or os.path.join(data_dir, 'maf_filtered')
        os.makedirs(self.output_dir, exist_ok=True)
    
    def calculate_maf(self, vcf_file: str, output_file: str):
        """Calculate MAF for each variant and filter those with MAF < 5%"""
        print(f"Processing {vcf_file}")
        
        # Calculate allele frequencies using bcftools
        cmd = (f"bcftools +fill-tags {vcf_file} -- -t AF,MAF | "
               f"bcftools view -i 'MAF >= 0.05' -Oz -o {output_file}")
        
        print("Filtering variants with MAF < 5%...")
        subprocess.run(cmd, shell=True, check=True)
        
        # Index the filtered file
        subprocess.run(f"bcftools index {output_file}", shell=True, check=True)
        
        # Get statistics
        stats_cmd = f"bcftools stats {output_file}"
        stats = subprocess.run(stats_cmd, shell=True, capture_output=True, text=True).stdout
        
        return output_file, stats
    
    def filter_all_chromosomes(self):
        """Filter all chromosomes"""
        results = []
        
        for chrom in range(1, 23):
            input_vcf = os.path.join(self.data_dir, 
                                   f'filtered.chr{chrom}.phase3.vcf.gz')
            output_vcf = os.path.join(self.output_dir, 
                                    f'maf_filtered.chr{chrom}.phase3.vcf.gz')
            
            if not os.path.exists(input_vcf):
                print(f"Warning: Input file not found: {input_vcf}")
                continue
            
            # Calculate MAF and filter
            filtered_file, stats = self.calculate_maf(input_vcf, output_vcf)
            
            # Extract key statistics
            variant_counts = {}
            for line in stats.split('\n'):
                if line.startswith('SN'):
                    if 'number of records:' in line:
                        variant_counts['total'] = int(line.split()[-1])
            
            results.append({
                'chromosome': chrom,
                'filtered_variants': variant_counts.get('total', 0),
                'output_file': filtered_file
            })
        
        # Create summary
        summary_df = pd.DataFrame(results)
        summary_file = os.path.join(self.output_dir, 'maf_filtering_summary.csv')
        summary_df.to_csv(summary_file, index=False)
        
        print("\nMAF Filtering Summary:")
        print(summary_df)
        print(f"\nTotal variants after filtering: {summary_df['filtered_variants'].sum():,}")
        print(f"\nDetailed summary saved to: {summary_file}")
        
        return summary_df

def main():
    # Initialize filter
    maf_filter = MAFFilter()
    
    # Run filtering
    print("Starting MAF filtering for all chromosomes...")
    summary = maf_filter.filter_all_chromosomes()
    
    print("\nFiltering complete!")
    print("Filtered VCF files are saved in:", maf_filter.output_dir)

if __name__ == "__main__":
    main()

Starting MAF filtering for all chromosomes...
Processing ../data/1000g/filtered.chr1.phase3.vcf.gz
Filtering variants with MAF < 5%...
Processing ../data/1000g/filtered.chr2.phase3.vcf.gz
Filtering variants with MAF < 5%...
Processing ../data/1000g/filtered.chr3.phase3.vcf.gz
Filtering variants with MAF < 5%...
Processing ../data/1000g/filtered.chr4.phase3.vcf.gz
Filtering variants with MAF < 5%...
Processing ../data/1000g/filtered.chr5.phase3.vcf.gz
Filtering variants with MAF < 5%...
Processing ../data/1000g/filtered.chr6.phase3.vcf.gz
Filtering variants with MAF < 5%...
Processing ../data/1000g/filtered.chr7.phase3.vcf.gz
Filtering variants with MAF < 5%...
Processing ../data/1000g/filtered.chr8.phase3.vcf.gz
Filtering variants with MAF < 5%...
Processing ../data/1000g/filtered.chr9.phase3.vcf.gz
Filtering variants with MAF < 5%...
Processing ../data/1000g/filtered.chr10.phase3.vcf.gz
Filtering variants with MAF < 5%...
Processing ../data/1000g/filtered.chr11.phase3.vcf.gz
Filtering

In [11]:

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Dict, Set, Tuple

class HapMapFilter:
    def __init__(self, hapmap_dir: str, common_snps_dir: str, output_dir: str):
        self.hapmap_dir = hapmap_dir
        self.common_snps_dir = common_snps_dir
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        
    def _load_common_snps(self) -> Dict[int, pd.DataFrame]:
        """Load common SNP information by chromosome"""
        common_snps = {}
        
        print("Loading common SNP positions...")
        for chrom in range(1, 23):
            # Read both position and alleles from common SNPs VCF
            vcf_file = os.path.join(self.common_snps_dir, f'common_snps.chr{chrom}.vcf.gz')
            cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\n' {vcf_file}"
            import subprocess
            result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
            
            # Parse the output into a DataFrame
            snp_info = []
            for line in result.stdout.strip().split('\n'):
                if line:
                    chrom, pos, ref, alt = line.split('\t')
                    snp_info.append({
                        'chromosome': int(chrom),
                        'position': int(pos),
                        'ref': ref,
                        'alt': alt
                    })
            
            if snp_info:
                common_snps[chrom] = pd.DataFrame(snp_info)
                print(f"Chromosome {chrom}: {len(snp_info)} common SNPs")
            else:
                print(f"Warning: No SNPs found for chromosome {chrom}")
        
        return common_snps
    
    def _read_hapmap_data(self, population: str) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray]:
        """Read HapMap3 data files for a population"""
        base_path = os.path.join(self.hapmap_dir, population)
        
        print(f"Reading data from {base_path}")
        
        # Read SNP information
        snp_data = pd.read_csv(f"{base_path}.snp", sep='\s+', header=None,
                              names=['SNP', 'chromosome', 'morgans', 'position', 'ref', 'alt'])
        print(f"Read {len(snp_data)} SNPs")
        print("Sample SNP data:")
        print(snp_data.head())
        
        # Read sample information
        ind_data = pd.read_csv(f"{base_path}.ind", sep='\s+', header=None,
                              names=['sample', 'sex', 'population'])
        print(f"Read {len(ind_data)} samples")
        
        # Read genotype data
        print("Reading genotype data...")
        geno_data = []
        with open(f"{base_path}.geno", 'r') as f:
            for line in tqdm(f):
                geno_data.append([int(g) for g in line.strip()])
        geno_array = np.array(geno_data)
        print(f"Genotype array shape: {geno_array.shape}")
        
        return snp_data, ind_data, geno_array
    
    def filter_population(self, population: str):
        """Filter HapMap3 data for a population to keep only common SNPs"""
        print(f"\nProcessing population: {population}")
        
        # Load common SNP positions
        common_snps = self._load_common_snps()
        
        # Read HapMap3 data
        print("\nReading HapMap3 data...")
        snp_data, ind_data, geno_array = self._read_hapmap_data(population)
        
        # Debug: Print chromosome distribution in HapMap data
        print("\nChromosome distribution in HapMap data:")
        print(snp_data['chromosome'].value_counts().sort_index())
        
        # Create mask for common SNPs
        print("\nFinding common SNPs...")
        keep_mask = np.zeros(len(snp_data), dtype=bool)
        
        for chrom in range(1, 23):
            # Get HapMap SNPs for this chromosome
            chr_mask = snp_data['chromosome'] == chrom
            chr_snps = snp_data[chr_mask]
            
            if chrom in common_snps:
                common_chr_snps = common_snps[chrom]
                
                # Debug information
                print(f"\nChromosome {chrom}:")
                print(f"HapMap SNPs: {len(chr_snps)}")
                print(f"Common SNPs: {len(common_chr_snps)}")
                
                # Match positions
                matches = chr_snps['position'].isin(common_chr_snps['position'])
                keep_mask[chr_mask] = matches
                
                print(f"Matching positions: {sum(matches)}")
                
                # Debug: Show some matching examples
                if sum(matches) > 0:
                    print("\nSample matching SNPs:")
                    matched_hapmap = chr_snps[matches].head()
                    matched_common = common_chr_snps[common_chr_snps['position'].isin(matched_hapmap['position'])].head()
                    print("HapMap SNPs:")
                    print(matched_hapmap[['position', 'ref', 'alt']])
                    print("Common SNPs:")
                    print(matched_common[['position', 'ref', 'alt']])
        
        # Apply filtering
        filtered_snps = snp_data[keep_mask]
        filtered_geno = geno_array[keep_mask]
        
        if len(filtered_snps) == 0:
            raise ValueError("No matching SNPs found! Please check the chromosome numbering and position matching logic.")
        
        # Save filtered data
        output_prefix = os.path.join(self.output_dir, f"{population}_filtered")
        
        filtered_snps.to_csv(f"{output_prefix}.snp", sep='\t', index=False, header=False)
        ind_data.to_csv(f"{output_prefix}.ind", sep='\t', index=False, header=False)
        
        print("\nSaving filtered genotypes...")
        with open(f"{output_prefix}.geno", 'w') as f:
            for row in tqdm(filtered_geno):
                f.write(''.join(map(str, row)) + '\n')
        
        print(f"\nFiltering statistics for {population}:")
        print(f"Original SNPs: {len(snp_data)}")
        print(f"Filtered SNPs: {len(filtered_snps)}")
        print(f"Retention rate: {len(filtered_snps)/len(snp_data)*100:.2f}%")
        
        return filtered_snps, ind_data, filtered_geno

    def filter_all_populations(self, populations=None):
        if populations is None:
            populations = ['ASW', 'CEU', 'CHB', 'JPT', 'LWK', 'MXL', 'TSI', 'YRI']
        
        for population in populations:
            try:
                self.filter_population(population)
            except Exception as e:
                print(f"Error processing population {population}: {e}")
                import traceback
                traceback.print_exc()

if __name__ == "__main__":
    # Configure paths
    HAPMAP_DIR = '/Users/jingl1/Desktop/CMU/02704/hw_data/02704_data'  # HapMap3种群数据目录
    COMMON_SNPS_DIR = "../data/common_snps"      # 共同SNP目录
    OUTPUT_DIR = "../data/hapmap3_filtered"      # 输出目录
    
    # Initialize and run filter
    hapmap_filter = HapMapFilter(HAPMAP_DIR, COMMON_SNPS_DIR, OUTPUT_DIR)
    hapmap_filter.filter_all_populations()


Processing population: ASW
Loading common SNP positions...
Chromosome 1: 1617 common SNPs
Chromosome 2: 1786 common SNPs
Chromosome 3: 1446 common SNPs
Chromosome 4: 1366 common SNPs
Chromosome 5: 1325 common SNPs
Chromosome 6: 1423 common SNPs
Chromosome 7: 1215 common SNPs
Chromosome 8: 1382 common SNPs
Chromosome 9: 1038 common SNPs
Chromosome 10: 1128 common SNPs
Chromosome 11: 1154 common SNPs
Chromosome 12: 1046 common SNPs
Chromosome 13: 773 common SNPs
Chromosome 14: 667 common SNPs
Chromosome 15: 637 common SNPs
Chromosome 16: 835 common SNPs
Chromosome 17: 664 common SNPs
Chromosome 18: 598 common SNPs
Chromosome 19: 367 common SNPs
Chromosome 20: 571 common SNPs
Chromosome 21: 340 common SNPs
Chromosome 22: 286 common SNPs

Reading HapMap3 data...
Reading data from /Users/jingl1/Desktop/CMU/02704/hw_data/02704_data/ASW
Read 718848 SNPs
Sample SNP data:
          SNP  chromosome  morgans  position ref alt
0   rs3131972           1      0.0    742584   G   A
1   rs3131969    

718848it [00:02, 264627.22it/s]
Traceback (most recent call last):
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 154, in filter_all_populations
    self.filter_population(population)
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 128, in filter_population
    raise ValueError("No matching SNPs found! Please check the chromosome numbering and position matching logic.")
ValueError: No matching SNPs found! Please check the chromosome numbering and position matching logic.


Genotype array shape: (718848, 49)

Chromosome distribution in HapMap data:
chromosome
1     55983
2     58409
3     48655
4     43809
5     45045
6     46016
7     38578
8     38291
9     32252
10    36847
11    36514
12    33528
13    26595
14    22956
15    21459
16    22191
17    19210
18    21196
19    13472
20    18359
21    10281
22     9447
23    19755
Name: count, dtype: int64

Finding common SNPs...
Error processing population ASW: No matching SNPs found! Please check the chromosome numbering and position matching logic.

Processing population: CEU
Loading common SNP positions...
Chromosome 1: 1617 common SNPs
Chromosome 2: 1786 common SNPs
Chromosome 3: 1446 common SNPs
Chromosome 4: 1366 common SNPs
Chromosome 5: 1325 common SNPs
Chromosome 6: 1423 common SNPs
Chromosome 7: 1215 common SNPs
Chromosome 8: 1382 common SNPs
Chromosome 9: 1038 common SNPs
Chromosome 10: 1128 common SNPs
Chromosome 11: 1154 common SNPs
Chromosome 12: 1046 common SNPs
Chromosome 13: 773 common SN

718848it [00:05, 124886.73it/s]
Traceback (most recent call last):
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 154, in filter_all_populations
    self.filter_population(population)
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 128, in filter_population
    raise ValueError("No matching SNPs found! Please check the chromosome numbering and position matching logic.")
ValueError: No matching SNPs found! Please check the chromosome numbering and position matching logic.


Genotype array shape: (718848, 112)

Chromosome distribution in HapMap data:
chromosome
1     55983
2     58409
3     48655
4     43809
5     45045
6     46016
7     38578
8     38291
9     32252
10    36847
11    36514
12    33528
13    26595
14    22956
15    21459
16    22191
17    19210
18    21196
19    13472
20    18359
21    10281
22     9447
23    19755
Name: count, dtype: int64

Finding common SNPs...
Error processing population CEU: No matching SNPs found! Please check the chromosome numbering and position matching logic.

Processing population: CHB
Loading common SNP positions...
Chromosome 1: 1617 common SNPs
Chromosome 2: 1786 common SNPs
Chromosome 3: 1446 common SNPs
Chromosome 4: 1366 common SNPs
Chromosome 5: 1325 common SNPs
Chromosome 6: 1423 common SNPs
Chromosome 7: 1215 common SNPs
Chromosome 8: 1382 common SNPs
Chromosome 9: 1038 common SNPs
Chromosome 10: 1128 common SNPs
Chromosome 11: 1154 common SNPs
Chromosome 12: 1046 common SNPs
Chromosome 13: 773 common S

718848it [00:04, 163231.68it/s]
Traceback (most recent call last):
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 154, in filter_all_populations
    self.filter_population(population)
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 128, in filter_population
    raise ValueError("No matching SNPs found! Please check the chromosome numbering and position matching logic.")
ValueError: No matching SNPs found! Please check the chromosome numbering and position matching logic.


Genotype array shape: (718848, 84)

Chromosome distribution in HapMap data:
chromosome
1     55983
2     58409
3     48655
4     43809
5     45045
6     46016
7     38578
8     38291
9     32252
10    36847
11    36514
12    33528
13    26595
14    22956
15    21459
16    22191
17    19210
18    21196
19    13472
20    18359
21    10281
22     9447
23    19755
Name: count, dtype: int64

Finding common SNPs...
Error processing population CHB: No matching SNPs found! Please check the chromosome numbering and position matching logic.

Processing population: JPT
Loading common SNP positions...
Chromosome 1: 1617 common SNPs
Chromosome 2: 1786 common SNPs
Chromosome 3: 1446 common SNPs
Chromosome 4: 1366 common SNPs
Chromosome 5: 1325 common SNPs
Chromosome 6: 1423 common SNPs
Chromosome 7: 1215 common SNPs
Chromosome 8: 1382 common SNPs
Chromosome 9: 1038 common SNPs
Chromosome 10: 1128 common SNPs
Chromosome 11: 1154 common SNPs
Chromosome 12: 1046 common SNPs
Chromosome 13: 773 common SN

718848it [00:04, 167225.68it/s]
Traceback (most recent call last):
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 154, in filter_all_populations
    self.filter_population(population)
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 128, in filter_population
    raise ValueError("No matching SNPs found! Please check the chromosome numbering and position matching logic.")
ValueError: No matching SNPs found! Please check the chromosome numbering and position matching logic.


Genotype array shape: (718848, 86)

Chromosome distribution in HapMap data:
chromosome
1     55983
2     58409
3     48655
4     43809
5     45045
6     46016
7     38578
8     38291
9     32252
10    36847
11    36514
12    33528
13    26595
14    22956
15    21459
16    22191
17    19210
18    21196
19    13472
20    18359
21    10281
22     9447
23    19755
Name: count, dtype: int64

Finding common SNPs...
Error processing population JPT: No matching SNPs found! Please check the chromosome numbering and position matching logic.

Processing population: LWK
Loading common SNP positions...
Chromosome 1: 1617 common SNPs
Chromosome 2: 1786 common SNPs
Chromosome 3: 1446 common SNPs
Chromosome 4: 1366 common SNPs
Chromosome 5: 1325 common SNPs
Chromosome 6: 1423 common SNPs
Chromosome 7: 1215 common SNPs
Chromosome 8: 1382 common SNPs
Chromosome 9: 1038 common SNPs
Chromosome 10: 1128 common SNPs
Chromosome 11: 1154 common SNPs
Chromosome 12: 1046 common SNPs
Chromosome 13: 773 common SN

718848it [00:04, 159643.60it/s]
Traceback (most recent call last):
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 154, in filter_all_populations
    self.filter_population(population)
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 128, in filter_population
    raise ValueError("No matching SNPs found! Please check the chromosome numbering and position matching logic.")
ValueError: No matching SNPs found! Please check the chromosome numbering and position matching logic.


Genotype array shape: (718848, 90)

Chromosome distribution in HapMap data:
chromosome
1     55983
2     58409
3     48655
4     43809
5     45045
6     46016
7     38578
8     38291
9     32252
10    36847
11    36514
12    33528
13    26595
14    22956
15    21459
16    22191
17    19210
18    21196
19    13472
20    18359
21    10281
22     9447
23    19755
Name: count, dtype: int64

Finding common SNPs...
Error processing population LWK: No matching SNPs found! Please check the chromosome numbering and position matching logic.

Processing population: MXL
Loading common SNP positions...
Chromosome 1: 1617 common SNPs
Chromosome 2: 1786 common SNPs
Chromosome 3: 1446 common SNPs
Chromosome 4: 1366 common SNPs
Chromosome 5: 1325 common SNPs
Chromosome 6: 1423 common SNPs
Chromosome 7: 1215 common SNPs
Chromosome 8: 1382 common SNPs
Chromosome 9: 1038 common SNPs
Chromosome 10: 1128 common SNPs
Chromosome 11: 1154 common SNPs
Chromosome 12: 1046 common SNPs
Chromosome 13: 773 common SN

718848it [00:02, 257880.10it/s]
Traceback (most recent call last):
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 154, in filter_all_populations
    self.filter_population(population)
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 128, in filter_population
    raise ValueError("No matching SNPs found! Please check the chromosome numbering and position matching logic.")
ValueError: No matching SNPs found! Please check the chromosome numbering and position matching logic.


Genotype array shape: (718848, 50)

Chromosome distribution in HapMap data:
chromosome
1     55983
2     58409
3     48655
4     43809
5     45045
6     46016
7     38578
8     38291
9     32252
10    36847
11    36514
12    33528
13    26595
14    22956
15    21459
16    22191
17    19210
18    21196
19    13472
20    18359
21    10281
22     9447
23    19755
Name: count, dtype: int64

Finding common SNPs...
Error processing population MXL: No matching SNPs found! Please check the chromosome numbering and position matching logic.

Processing population: TSI
Loading common SNP positions...
Chromosome 1: 1617 common SNPs
Chromosome 2: 1786 common SNPs
Chromosome 3: 1446 common SNPs
Chromosome 4: 1366 common SNPs
Chromosome 5: 1325 common SNPs
Chromosome 6: 1423 common SNPs
Chromosome 7: 1215 common SNPs
Chromosome 8: 1382 common SNPs
Chromosome 9: 1038 common SNPs
Chromosome 10: 1128 common SNPs
Chromosome 11: 1154 common SNPs
Chromosome 12: 1046 common SNPs
Chromosome 13: 773 common SN

718848it [00:04, 157288.55it/s]
Traceback (most recent call last):
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 154, in filter_all_populations
    self.filter_population(population)
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 128, in filter_population
    raise ValueError("No matching SNPs found! Please check the chromosome numbering and position matching logic.")
ValueError: No matching SNPs found! Please check the chromosome numbering and position matching logic.


Genotype array shape: (718848, 88)

Chromosome distribution in HapMap data:
chromosome
1     55983
2     58409
3     48655
4     43809
5     45045
6     46016
7     38578
8     38291
9     32252
10    36847
11    36514
12    33528
13    26595
14    22956
15    21459
16    22191
17    19210
18    21196
19    13472
20    18359
21    10281
22     9447
23    19755
Name: count, dtype: int64

Finding common SNPs...
Error processing population TSI: No matching SNPs found! Please check the chromosome numbering and position matching logic.

Processing population: YRI
Loading common SNP positions...
Chromosome 1: 1617 common SNPs
Chromosome 2: 1786 common SNPs
Chromosome 3: 1446 common SNPs
Chromosome 4: 1366 common SNPs
Chromosome 5: 1325 common SNPs
Chromosome 6: 1423 common SNPs
Chromosome 7: 1215 common SNPs
Chromosome 8: 1382 common SNPs
Chromosome 9: 1038 common SNPs
Chromosome 10: 1128 common SNPs
Chromosome 11: 1154 common SNPs
Chromosome 12: 1046 common SNPs
Chromosome 13: 773 common SN

718848it [00:05, 124894.56it/s]


Genotype array shape: (718848, 113)

Chromosome distribution in HapMap data:
chromosome
1     55983
2     58409
3     48655
4     43809
5     45045
6     46016
7     38578
8     38291
9     32252
10    36847
11    36514
12    33528
13    26595
14    22956
15    21459
16    22191
17    19210
18    21196
19    13472
20    18359
21    10281
22     9447
23    19755
Name: count, dtype: int64

Finding common SNPs...
Error processing population YRI: No matching SNPs found! Please check the chromosome numbering and position matching logic.


Traceback (most recent call last):
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 154, in filter_all_populations
    self.filter_population(population)
  File "/var/folders/96/2s64979965l0x1hmvz6rctxh0000gp/T/ipykernel_35761/819443221.py", line 128, in filter_population
    raise ValueError("No matching SNPs found! Please check the chromosome numbering and position matching logic.")
ValueError: No matching SNPs found! Please check the chromosome numbering and position matching logic.


In [23]:
from CB_02704 import *
import pandas as pd
import numpy as np
from typing import Dict, Set, Tuple
import subprocess
import os

def get_1000g_positions(vcf_file: str) -> Set[str]:
    """Get SNP positions from filtered 1000G VCF file"""
    cmd = f"bcftools query -f '%CHROM:%POS\n' {vcf_file}"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return set(result.stdout.strip().split('\n'))

def get_all_1000g_positions() -> Set[str]:
    """Get all SNP positions from filtered 1000G VCF files"""
    all_positions = set()
    for chr in range(1, 23):
        vcf_file = f"../data/common_snps/common_snps.chr{chr}.vcf.gz"
        positions = get_1000g_positions(vcf_file)
        all_positions.update(positions)
    return all_positions

def filter_hapmap3(population: str, kg_positions: Set[str], data_dir: str) -> Tuple[np.ndarray, pd.DataFrame]:
    """Filter HapMap3 data for a population to keep only common SNPs"""
    # Read HapMap3 data
    print(f"Reading {population} data...")
    snp_data = read_snp_pop(population)
    geno_data = read_geno_pop(population)
    ind_data = read_ind_pop(population)
    
    # Create position strings for comparison
    snp_positions = set(f"{row['chromosome']}:{row['position']}" 
                       for _, row in snp_data.iterrows())
    
    # Find common positions
    common_positions = snp_positions.intersection(kg_positions)
    
    # Get indices of SNPs to keep
    keep_idx = [i for i, (_, row) in enumerate(snp_data.iterrows())
                if f"{row['chromosome']}:{row['position']}" in common_positions]
    
    # Filter data
    filtered_geno = geno_data[keep_idx]
    filtered_snps = snp_data.iloc[keep_idx]
    
    # Print statistics
    print(f"\nFiltering statistics for {population}:")
    print(f"Original SNPs: {len(snp_data):,}")
    print(f"Retained SNPs: {len(filtered_snps):,}")
    print(f"Retention rate: {(len(filtered_snps)/len(snp_data)*100):.2f}%")
    
    return filtered_geno, filtered_snps, ind_data

def save_hapmap3_files(population: str, geno_data: np.ndarray, 
                      snp_data: pd.DataFrame, ind_data: pd.DataFrame,
                      output_dir: str):
    """Save filtered HapMap3 data in original format"""
    os.makedirs(output_dir, exist_ok=True)
    
    # Save .geno file
    geno_path = os.path.join(output_dir, f"{population}.geno")
    with open(geno_path, 'w') as f:
        for row in geno_data:
            f.write(''.join(map(str, row)) + '\n')
    
    # Save .snp file
    snp_path = os.path.join(output_dir, f"{population}.snp")
    snp_data.to_csv(snp_path, sep='\t', index=True, header=False)
    
    # Save .ind file (unchanged)
    ind_path = os.path.join(output_dir, f"{population}.ind")
    ind_data.to_csv(ind_path, sep='\t', index=True, header=False)
    
    print(f"\nSaved filtered files for {population}:")
    print(f"Genotype data: {geno_path}")
    print(f"SNP data: {snp_path}")
    print(f"Individual data: {ind_path}")

def main():
    # Configuration
    DATA_DIR = "/Users/jingl1/Desktop/CMU/02704/hw_data/02704_data"
    OUTPUT_DIR = "filtered_hapmap3"
    POPULATIONS = ['ASW', 'CEU', 'CHB', 'JPT', 'LWK', 'MXL', 'TSI', 'YRI']
    
    # Get all 1000G positions
    print("Getting 1000G SNP positions...")
    kg_positions = get_all_1000g_positions()
    print(f"Found {len(kg_positions):,} SNPs in filtered 1000G data")
    
    # Process each population
    for population in POPULATIONS:
        print(f"\nProcessing {population}...")
        
        # Filter population data
        filtered_geno, filtered_snps, ind_data = filter_hapmap3(
            population, kg_positions, DATA_DIR)
        
        # Save filtered data
        save_hapmap3_files(population, filtered_geno, filtered_snps, 
                          ind_data, OUTPUT_DIR)

if __name__ == "__main__":
    main()

Getting 1000G SNP positions...
Found 21,643 SNPs in filtered 1000G data

Processing ASW...
Reading ASW data...

Filtering statistics for ASW:
Original SNPs: 718,848
Retained SNPs: 21,694
Retention rate: 3.02%

Saved filtered files for ASW:
Genotype data: filtered_hapmap3/ASW.geno
SNP data: filtered_hapmap3/ASW.snp
Individual data: filtered_hapmap3/ASW.ind

Processing CEU...
Reading CEU data...

Filtering statistics for CEU:
Original SNPs: 718,848
Retained SNPs: 21,694
Retention rate: 3.02%

Saved filtered files for CEU:
Genotype data: filtered_hapmap3/CEU.geno
SNP data: filtered_hapmap3/CEU.snp
Individual data: filtered_hapmap3/CEU.ind

Processing CHB...
Reading CHB data...

Filtering statistics for CHB:
Original SNPs: 718,848
Retained SNPs: 21,694
Retention rate: 3.02%

Saved filtered files for CHB:
Genotype data: filtered_hapmap3/CHB.geno
SNP data: filtered_hapmap3/CHB.snp
Individual data: filtered_hapmap3/CHB.ind

Processing JPT...
Reading JPT data...

Filtering statistics for JPT:
