In [1]:
import os
import subprocess
import tempfile

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
class VCFConverter:
    """Convert per-chromosome VCF files into .ind/.snp/.geno text files via bcftools.

    Input files are looked up as ``filtered.chr{chrom}.phase3.vcf.gz`` inside
    ``data_dir``; outputs are written to ``<data_dir>/converted``.
    """

    # Genotype call -> code written to the .geno file (the number of '1'
    # alleles in a diploid call). Anything not listed here is written as
    # the missing code.
    _GT_CODES = {'0|0': '0', '0|1': '1', '1|0': '1', '1|1': '2'}
    _MISSING_CODE = '9'

    def __init__(self, data_dir='../data/1000g'):
        """Remember the data directory and create the output directory.

        Args:
            data_dir: Directory containing the input VCF files.
        """
        self.data_dir = data_dir
        self.output_dir = os.path.join(data_dir, 'converted')
        os.makedirs(self.output_dir, exist_ok=True)

    def convert_vcf_to_hapmap(self, chrom, population):
        """Convert VCF file to HapMap3 format (.snp, .ind, .geno) for a specific chromosome.

        Args:
            chrom: Chromosome identifier used to build the input file name.
            population: Label used as the output file prefix and as the
                population column of the .ind file.

        Returns:
            The output path prefix; files are ``<prefix>.ind``,
            ``<prefix>.snp`` and ``<prefix>.geno``.

        Raises:
            subprocess.CalledProcessError: If a bcftools invocation fails.
        """
        vcf_file = os.path.join(self.data_dir, f'filtered.chr{chrom}.phase3.vcf.gz')
        base_out = os.path.join(self.output_dir, f'{population}')

        # NOTE(review): the output prefix depends only on `population`, so the
        # .snp/.geno files of a previous chromosome are overwritten by the
        # next call -- confirm whether per-chromosome outputs are intended.

        # 1. Create .ind file (only need to do once for all chromosomes)
        if not os.path.exists(f'{base_out}.ind'):
            self._write_ind_file(vcf_file, base_out, population)

        # Intermediate bcftools dumps live in a private directory next to the
        # outputs so that concurrent runs cannot clobber each other and the
        # files are removed even when a step fails.
        with tempfile.TemporaryDirectory(dir=self.output_dir) as tmp_dir:
            # 2. Create .snp file
            self._write_snp_file(vcf_file, base_out, tmp_dir)
            # 3. Create .geno file
            self._write_geno_file(vcf_file, base_out, tmp_dir, chrom)

        return base_out

    @staticmethod
    def _run_query(query_args, out_path):
        """Run ``bcftools query`` with the given arguments, writing stdout to out_path."""
        with open(out_path, 'w') as out:
            # Argument list (no shell) so paths with spaces or shell
            # metacharacters are passed through verbatim.
            subprocess.run(['bcftools', 'query', *query_args], stdout=out, check=True)

    @staticmethod
    def _write_ind_file(vcf_file, base_out, population):
        """Write ``<base_out>.ind`` with one row per sample listed in the VCF."""
        listing = subprocess.run(
            ['bcftools', 'query', '-l', vcf_file],
            stdout=subprocess.PIPE, text=True, check=True,
        )
        samples = [name.strip() for name in listing.stdout.splitlines() if name.strip()]

        # NOTE(review): every sample in the VCF is labelled with `population`;
        # no filtering by population membership happens here -- confirm that
        # the input VCF is already restricted to the intended samples.
        ind_data = pd.DataFrame({
            'sample': samples,
            'sex': 'U',  # Unknown sex, can be updated if we have the info
            'pop': population
        })
        ind_data.to_csv(f'{base_out}.ind', sep='\t', index=False, header=False)

    def _write_snp_file(self, vcf_file, base_out, tmp_dir):
        """Write ``<base_out>.snp`` with one row per variant in the VCF."""
        snp_dump = os.path.join(tmp_dir, 'snps.txt')
        self._run_query(['-f', '%CHROM\t%POS\t%ID\t%REF\t%ALT\n', vcf_file], snp_dump)

        snp_data = pd.read_csv(snp_dump, sep='\t',
                               names=['chromosome', 'position', 'rsid', 'ref', 'alt'])
        snp_data['morgans'] = 0.0  # Set morgans to 0 as in HapMap3

        # NOTE(review): columns are written as chromosome, position, rsid,
        # ref, alt, morgans -- verify this order against whatever tool
        # consumes the .snp file.
        snp_data.to_csv(f'{base_out}.snp', sep='\t', index=False, header=False)

    def _write_geno_file(self, vcf_file, base_out, tmp_dir, chrom):
        """Write ``<base_out>.geno`` with one line of genotype codes per variant."""
        geno_dump = os.path.join(tmp_dir, 'geno.txt')
        # The tab inside the brackets separates the per-sample calls; without
        # it bcftools concatenates them and they cannot be split apart again.
        self._run_query(['-f', '[%GT\t]\n', vcf_file], geno_dump)

        with open(geno_dump, 'r') as src, open(f'{base_out}.geno', 'w') as out:
            for line in tqdm(src, desc=f"Converting genotypes for chr{chrom}"):
                out.write(self._encode_genotype_line(line) + '\n')

    @classmethod
    def _encode_genotype_line(cls, line):
        """Turn one tab-separated line of GT calls into a string of genotype codes.

        Phased (``|``) and unphased (``/``) calls are treated alike. Calls
        that are not one of 0/0, 0/1, 1/0, 1/1 (missing, haploid,
        multi-allelic) are encoded as the missing code. A blank line yields
        an empty string.
        """
        stripped = line.strip()
        if not stripped:
            return ''
        return ''.join(
            cls._GT_CODES.get(call.replace('/', '|'), cls._MISSING_CODE)
            for call in stripped.split('\t')
        )

def main():
    """Run the VCF conversion for every population label and autosome.

    Builds a VCFConverter with its default data directory, then calls
    convert_vcf_to_hapmap for chromosomes 1 through 22 under each population
    label, printing progress along the way.
    """
    population_labels = ['CHB', 'JPT', 'CEU', 'YRI', 'ASW', 'LWK', 'MXL', 'TSI']
    autosomes = range(1, 23)
    vcf_converter = VCFConverter()

    for label in population_labels:
        print(f"\nProcessing population: {label}")
        for chromosome in autosomes:
            print(f"Converting chromosome {chromosome}")
            prefix = vcf_converter.convert_vcf_to_hapmap(chromosome, label)
            print(f"Created files: {prefix}.snp/ind/geno")

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Processing population: CHB
Converting chromosome 1


Converting genotypes for chr1: 6468094it [00:13, 485780.83it/s]


Created files: ../data/1000g/converted/CHB.snp/ind/geno
Converting chromosome 2
