## READme

I want to convert the format from VCF to HapMap3(.snp .ind .geno)

But the results are super large and storege consuming

So we drop the idea

Start analysis from VCF

In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:

def examine_first_line():
    """Examine the first line of genotype data in detail"""
    # 1. Get the raw genotype string
    cmd = "bcftools query -f '[%GT]\t' ../data/1000g/filtered.chr22.phase3.vcf.gz | head -n 1"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    raw_gts = result.stdout.strip()
    
    print("Raw genotype string:")
    print(raw_gts[:100], "...")  
    
    # 2. Parse and show the conversion process
    gts = raw_gts[1:-1].split('][')
    
    print("\nFirst 10 individual genotypes and their conversion:")
    for i, gt in enumerate(gts[:10]):
        print(f"Original: '{gt}'", end=" -> ")
        if gt in ['0|0', '0/0']:
            print("0")
        elif gt in ['0|1', '1|0', '0/1', '1/0']:
            print("1")
        elif gt in ['1|1', '1/1']:
            print("2")
        else:
            print("9")
    
    print("\nTotal number of genotypes:", len(gts))
    
    # 3. Count each type of genotype
    genotype_counts = {}
    for gt in gts:
        if gt not in genotype_counts:
            genotype_counts[gt] = 0
        genotype_counts[gt] += 1
    
    print("\nRaw genotype counts:")
    for gt, count in genotype_counts.items():
        print(f"'{gt}': {count}")

if __name__ == "__main__":
    examine_first_line()

Raw genotype string:
0|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00 ...

First 10 individual genotypes and their conversion:
Original: '|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00

In [8]:
class VCFConverter:
    def __init__(self, data_dir='../data/1000g'):
        self.data_dir = data_dir
        self.output_dir = os.path.join(data_dir, 'converted')
        os.makedirs(self.output_dir, exist_ok=True)
    
    def convert_vcf_to_hapmap(self, population, output_array=True):
        """
        Convert merged_common_snps.vcf.gz to HapMap3 format
        
        Parameters:
        -----------
        population : str
            Population code (e.g., 'CHB')
        output_array : bool
            If True, also returns the genotype data as a numpy masked array
        """
        vcf_file = os.path.join(self.data_dir, 'merged_common_snps.vcf.gz')
        base_out = os.path.join(self.output_dir, f'{population}')
        
        # 1. Get number of variants and samples first
        print("Counting variants and samples...")
        cmd = f"bcftools view -H {vcf_file} | wc -l"
        n_variants = int(subprocess.check_output(cmd, shell=True).decode().strip())
        
        cmd = f"bcftools query -l {vcf_file} | wc -l"
        n_samples = int(subprocess.check_output(cmd, shell=True).decode().strip())
        
        print(f"Found {n_variants} variants and {n_samples} samples")
        
        # 2. Create .ind file if not exists
        if not os.path.exists(f'{base_out}.ind'):
            print("Creating .ind file...")
            cmd = f"bcftools query -l {vcf_file} > temp_samples.txt"
            subprocess.run(cmd, shell=True, check=True)
            
            with open('temp_samples.txt', 'r') as f:
                samples = [line.strip() for line in f]
            
            ind_data = pd.DataFrame({
                'sample': samples,
                'sex': 'U',
                'pop': population
            })
            ind_data.to_csv(f'{base_out}.ind', sep='\t', index=False, header=False)
            os.remove('temp_samples.txt')
        
        # 3. Create .snp file
        print("Creating .snp file...")
        cmd = f"""bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\n' {vcf_file} > temp_snps.txt"""
        subprocess.run(cmd, shell=True, check=True)
        
        snp_data = pd.read_csv('temp_snps.txt', sep='\t', 
                              names=['chromosome', 'position', 'ref', 'alt'])
        snp_data['morgans'] = 0.0
        snp_data.to_csv(f'{base_out}.snp', sep='\t', index=False, header=False)
        os.remove('temp_snps.txt')
        
        # 4. Create genotype array and .geno file
        print("Creating genotype array...")
        # Initialize masked array
        if output_array:
            geno_array = np.ma.empty((n_variants, n_samples), dtype=np.uint8)
            geno_array.mask = False  # Initialize mask
        
        # Extract and convert genotypes
        cmd = f"bcftools query -f '%CHROM\t%POS[\t%GT]' {vcf_file} > temp_geno.txt"
        subprocess.run(cmd, shell=True, check=True)
        
        print("Converting genotypes...")
        with open('temp_geno.txt', 'r') as f, open(f'{base_out}.geno', 'w') as out:
            for i, line in enumerate(tqdm(f, total=n_variants)):
                parts = line.strip().split('\t')
                gts = parts[2:]  # Skip CHROM and POS
                
                converted = []
                for j, gt in enumerate(gts):
                    if gt in ['0|0', '0/0']:
                        code = 0
                    elif gt in ['0|1', '1|0', '0/1', '1/0']:
                        code = 1
                    elif gt in ['1|1', '1/1']:
                        code = 2
                    else:
                        code = 9
                        if output_array:
                            geno_array.mask[i,j] = True
                    
                    converted.append(str(code))
                    if output_array:
                        geno_array[i,j] = code
                
                out.write(''.join(converted) + '\n')
        
        os.remove('temp_geno.txt')
        
        # Verify the output
        print("\nVerifying conversion:")
        if output_array:
            print("Array shape:", geno_array.shape)
            print("Missing genotypes:", geno_array.mask.sum())
            print("Genotype counts:")
            for code in [0, 1, 2]:
                count = np.sum(geno_array.data[~geno_array.mask] == code)
                print(f"    {code}: {count}")
        
        if output_array:
            return base_out, geno_array
        return base_out



In [9]:
def test_conversion():
    """Test conversion with array output"""
    converter = VCFConverter(data_dir='../data/1000g')  # 适配你的路径
    print("\nTesting conversion with CHB population, chromosome 22...")
    base_out, geno_array = converter.convert_vcf_to_hapmap(population='CHB', output_array=True)
    
    print("\nArray verification:")
    print("Shape:", geno_array.shape)
    print("Data type:", geno_array.dtype)
    print("First few rows:")
    print(geno_array[:5,:10])  
    
    print("\nChecking consistency with .geno file:")
    with open(f'{base_out}.geno', 'r') as f:
        first_line = f.readline().strip()
        print("First line from file:", first_line[:20], "...")
        print("Same line from array:", ''.join(map(str, geno_array[0,:20])), "...")

if __name__ == "__main__":
    test_conversion()



Testing conversion with CHB population, chromosome 22...
Counting variants and samples...
Found 26395 variants and 745 samples
Creating .snp file...
Creating genotype array...
Converting genotypes...


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 26395/26395 [00:49<00:00, 537.33it/s]



Verifying conversion:
Array shape: (26395, 745)
Missing genotypes: 0
Genotype counts:
    0: 18667429
    1: 597587
    2: 381822

Array verification:
Shape: (26395, 745)
Data type: uint8
First few rows:
[[2 2 2 2 2 2 2 2 2 2]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]

Checking consistency with .geno file:
First line from file: 22222222222222222222 ...
Same line from array: 22222222222222222222 ...


In [10]:
# 读取 .geno 文件并与数组对比
with open('../data/1000g/converted/CHB.geno', 'r') as f:
    for i, line in enumerate(f):
        geno_file_line = line.strip()
        geno_array_line = ''.join(map(str, geno_array[i, :]))
        assert geno_file_line == geno_array_line, f"Mismatch at SNP {i}"
        
print("File and array match!")
import matplotlib.pyplot as plt

# 绘制基因型频率分布
unique_values, counts = np.unique(geno_array.compressed(), return_counts=True)
plt.bar(unique_values, counts)
plt.xlabel('Genotype')
plt.ylabel('Frequency')
plt.title('Genotype Frequency Distribution')
plt.show()

NameError: name 'geno_array' is not defined

In [16]:
# convert and test on 22

class VCFConverter:
    def __init__(self, data_dir='../data/1000g'):
        self.data_dir = data_dir
        self.output_dir = os.path.join(data_dir, 'converted')
        os.makedirs(self.output_dir, exist_ok=True)
    
    def convert_vcf_to_hapmap(self, chrom, population, output_array=True):
        """
        Convert VCF file to HapMap3 format
        
        Parameters:
        -----------
        chrom : int
            Chromosome number
        population : str
            Population code (e.g., 'CHB')
        output_array : bool
            If True, also returns the genotype data as a numpy masked array
        """
        vcf_file = os.path.join(self.data_dir, f'filtered.chr{chrom}.phase3.vcf.gz')
        base_out = os.path.join(self.output_dir, f'{population}')
        
        # 1. Get number of variants and samples first
        print("Counting variants and samples...")
        cmd = f"bcftools view -H {vcf_file} | wc -l"
        n_variants = int(subprocess.check_output(cmd, shell=True).decode().strip())
        
        cmd = f"bcftools query -l {vcf_file} | wc -l"
        n_samples = int(subprocess.check_output(cmd, shell=True).decode().strip())
        
        print(f"Found {n_variants} variants and {n_samples} samples")
        
        # 2. Create .ind file if not exists
        if not os.path.exists(f'{base_out}.ind'):
            print("Creating .ind file...")
            cmd = f"bcftools query -l {vcf_file} > temp_samples.txt"
            subprocess.run(cmd, shell=True, check=True)
            
            with open('temp_samples.txt', 'r') as f:
                samples = [line.strip() for line in f]
            
            ind_data = pd.DataFrame({
                'sample': samples,
                'sex': 'U',
                'pop': population
            })
            ind_data.to_csv(f'{base_out}.ind', sep='\t', index=False, header=False)
            os.remove('temp_samples.txt')
        
        # 3. Create .snp file
        print("Creating .snp file...")
        cmd = f"""bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\n' {vcf_file} > temp_snps.txt"""
        subprocess.run(cmd, shell=True, check=True)
        
        snp_data = pd.read_csv('temp_snps.txt', sep='\t', 
                              names=['chromosome', 'position', 'ref', 'alt'])
        snp_data['morgans'] = 0.0
        snp_data.to_csv(f'{base_out}.snp', sep='\t', index=False, header=False)
        os.remove('temp_snps.txt')
        
        # 4. Create genotype array and .geno file
        print("Creating genotype array...")
        # Initialize masked array
        if output_array:
            geno_array = np.ma.empty((n_variants, n_samples), dtype=np.uint8)
            geno_array.mask = False  # Initialize mask
        
        # Extract and convert genotypes
        cmd = f"bcftools query -f '%CHROM\t%POS[\t%GT]' {vcf_file} > temp_geno.txt"
        subprocess.run(cmd, shell=True, check=True)
        
        print("Converting genotypes...")
        with open('temp_geno.txt', 'r') as f, open(f'{base_out}.geno', 'w') as out:
            for i, line in enumerate(tqdm(f, total=n_variants)):
                parts = line.strip().split('\t')
                gts = parts[2:]  # Skip CHROM and POS
                
                converted = []
                for j, gt in enumerate(gts):
                    if gt in ['0|0', '0/0']:
                        code = 0
                    elif gt in ['0|1', '1|0', '0/1', '1/0']:
                        code = 1
                    elif gt in ['1|1', '1/1']:
                        code = 2
                    else:
                        code = 9
                        if output_array:
                            geno_array.mask[i,j] = True
                    
                    converted.append(str(code))
                    if output_array:
                        geno_array[i,j] = code
                
                out.write(''.join(converted) + '\n')
        
        os.remove('temp_geno.txt')
        
        # Verify the output
        print("\nVerifying conversion:")
        if output_array:
            print("Array shape:", geno_array.shape)
            print("Missing genotypes:", geno_array.mask.sum())
            print("Genotype counts:")
            for code in [0, 1, 2]:
                count = np.sum(geno_array.data[~geno_array.mask] == code)
                print(f"    {code}: {count}")
        
        if output_array:
            return base_out, geno_array
        return base_out

def test_conversion():
    """Test conversion with array output"""
    converter = VCFConverter()
    print("\nTesting conversion with CHB population, chromosome 22...")
    base_out, geno_array = converter.convert_vcf_to_hapmap(22, 'CHB', output_array=True)
    
    print("\nArray verification:")
    print("Shape:", geno_array.shape)
    print("Data type:", geno_array.dtype)
    print("First few rows:")
    print(geno_array[:5,:10])  
    
    print("\nChecking consistency with .geno file:")
    with open(f'{base_out}.geno', 'r') as f:
        first_line = f.readline().strip()
        print("First line from file:", first_line[:20], "...")
        print("Same line from array:", ''.join(map(str, geno_array[0,:20])), "...")

if __name__ == "__main__":
    test_conversion()


Testing conversion with CHB population, chromosome 22...
Counting variants and samples...
Found 1103547 variants and 745 samples
Creating .snp file...
Creating genotype array...
Converting genotypes...


100%|████████████████████████████████| 1103547/1103547 [25:57<00:00, 708.36it/s]



Verifying conversion:
Array shape: (1103547, 745)
Missing genotypes: 0
Genotype counts:
    0: 777007268
    1: 28191540
    2: 16477403

Array verification:
Shape: (1103547, 745)
Data type: uint8
First few rows:
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]

Checking consistency with .geno file:
First line from file: 00000000000000000000 ...
Same line from array: 00000000000000000000 ...


In [17]:
# convert all chromosomes
class VCFConverter:
    def __init__(self, data_dir='../data/1000g'):
        self.data_dir = data_dir
        self.output_dir = os.path.join(data_dir, 'converted')
        os.makedirs(self.output_dir, exist_ok=True)
    
    def convert_all_chromosomes(self, population):
        """Convert all chromosomes for a population"""
        # 1. First collect all SNPs and their positions across chromosomes
        all_snp_data = []
        total_variants = 0
        
        print(f"Processing all chromosomes for {population}...")
        for chrom in range(1, 23):
            vcf_file = os.path.join(self.data_dir, f'filtered.chr{chrom}.phase3.vcf.gz')
            
            # Get SNP info
            print(f"\nProcessing chromosome {chrom}")
            cmd = f"""bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\n' {vcf_file}"""
            subprocess.run(cmd, shell=True, stdout=open('temp_snps.txt', 'w'), check=True)
            
            snp_data = pd.read_csv('temp_snps.txt', sep='\t', 
                                  names=['chromosome', 'position', 'ref', 'alt'])
            snp_data['morgans'] = 0.0
            all_snp_data.append(snp_data)
            
            current_variants = len(snp_data)
            print(f"Found {current_variants} variants on chromosome {chrom}")
            total_variants += current_variants
        
        # Combine all SNP data
        print("\nCombining SNP data...")
        combined_snps = pd.concat(all_snp_data, axis=0)
        combined_snps.to_csv(os.path.join(self.output_dir, f'{population}.snp'), 
                            sep='\t', index=False, header=False)
        
        # Create chromosome range index
        chr_ranges = {}
        start = 0
        for chrom in range(1, 23):
            chr_len = len(all_snp_data[chrom-1])
            chr_ranges[chrom] = (start, start + chr_len)
            start += chr_len
        
        # Save chromosome ranges for future reference
        with open(os.path.join(self.output_dir, f'{population}.chr_ranges.txt'), 'w') as f:
            for chrom, (start, end) in chr_ranges.items():
                f.write(f"{chrom}\t{start}\t{end}\n")
        
        # 2. Create .ind file (only need to do once)
        print("\nCreating .ind file...")
        vcf_file = os.path.join(self.data_dir, f'filtered.chr1.phase3.vcf.gz')  # Use chr1 for sample info
        cmd = f"bcftools query -l {vcf_file} > temp_samples.txt"
        subprocess.run(cmd, shell=True, check=True)
        
        with open('temp_samples.txt', 'r') as f:
            samples = [line.strip() for line in f]
        
        ind_data = pd.DataFrame({
            'sample': samples,
            'sex': 'U',
            'pop': population
        })
        ind_data.to_csv(os.path.join(self.output_dir, f'{population}.ind'), 
                        sep='\t', index=False, header=False)
        
        # 3. Create combined .geno file
        print("\nCreating combined genotype file...")
        n_samples = len(samples)
        print(f"Total variants: {total_variants}, Samples: {n_samples}")
        
        with open(os.path.join(self.output_dir, f'{population}.geno'), 'w') as out:
            for chrom in range(1, 23):
                print(f"\nProcessing chromosome {chrom} genotypes...")
                vcf_file = os.path.join(self.data_dir, f'filtered.chr{chrom}.phase3.vcf.gz')
                
                cmd = f"bcftools query -f '%CHROM\t%POS[\t%GT]' {vcf_file}"
                proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
                
                for line in tqdm(proc.stdout, 
                               total=chr_ranges[chrom][1]-chr_ranges[chrom][0],
                               desc=f"Chr{chrom}"):
                    parts = line.strip().split('\t')
                    gts = parts[2:]  # Skip CHROM and POS
                    
                    converted = []
                    for gt in gts:
                        if gt in ['0|0', '0/0']:
                            converted.append('0')
                        elif gt in ['0|1', '1|0', '0/1', '1/0']:
                            converted.append('1')
                        elif gt in ['1|1', '1/1']:
                            converted.append('2')
                        else:
                            converted.append('9')
                    
                    out.write(''.join(converted) + '\n')
        
        # Clean up
        os.remove('temp_snps.txt')
        os.remove('temp_samples.txt')
        
        print("\nConversion complete!")
        print(f"Files created in {self.output_dir}:")
        print(f"1. {population}.snp - SNP information")
        print(f"2. {population}.ind - Sample information")
        print(f"3. {population}.geno - Genotype data")
        print(f"4. {population}.chr_ranges.txt - Chromosome ranges")
        
        return chr_ranges

def test_full_conversion():
    """Test full conversion for one population"""
    converter = VCFConverter()
    print("\nConverting all chromosomes for CHB population...")
    chr_ranges = converter.convert_all_chromosomes('CHB')
    
    # Verify the results
    print("\nChromosome ranges:")
    for chrom, (start, end) in chr_ranges.items():
        print(f"Chromosome {chrom}: {start}-{end} ({end-start} variants)")

if __name__ == "__main__":
    test_full_conversion()


Converting all chromosomes for CHB population...
Processing all chromosomes for CHB...

Processing chromosome 1
Found 6468094 variants on chromosome 1

Processing chromosome 2
Found 7081600 variants on chromosome 2

Processing chromosome 3
Found 5832276 variants on chromosome 3

Processing chromosome 4
Found 5732585 variants on chromosome 4

Processing chromosome 5
Found 5265763 variants on chromosome 5

Processing chromosome 6
Found 5024119 variants on chromosome 6

Processing chromosome 7
Found 4716715 variants on chromosome 7

Processing chromosome 8
Found 4597105 variants on chromosome 8

Processing chromosome 9
Found 3560687 variants on chromosome 9

Processing chromosome 10
Found 3992219 variants on chromosome 10

Processing chromosome 11
Found 4045628 variants on chromosome 11

Processing chromosome 12
Found 3868428 variants on chromosome 12

Processing chromosome 13
Found 2857916 variants on chromosome 13

Processing chromosome 14
Found 2655067 variants on chromosome 14

Proce

Chr1: 100%|████████████████████████| 6468094/6468094 [04:47<00:00, 22534.31it/s]



Processing chromosome 2 genotypes...


Chr2: 100%|████████████████████████| 7081600/7081600 [05:18<00:00, 22215.36it/s]



Processing chromosome 3 genotypes...


Chr3: 100%|████████████████████████| 5832276/5832276 [04:20<00:00, 22413.46it/s]



Processing chromosome 4 genotypes...


Chr4: 100%|████████████████████████| 5732585/5732585 [04:16<00:00, 22376.26it/s]



Processing chromosome 5 genotypes...


Chr5: 100%|████████████████████████| 5265763/5265763 [03:55<00:00, 22348.00it/s]



Processing chromosome 6 genotypes...


Chr6: 100%|████████████████████████| 5024119/5024119 [03:45<00:00, 22246.52it/s]



Processing chromosome 7 genotypes...


Chr7: 100%|████████████████████████| 4716715/4716715 [03:34<00:00, 21989.69it/s]



Processing chromosome 8 genotypes...


Chr8: 100%|████████████████████████| 4597105/4597105 [03:29<00:00, 21930.62it/s]



Processing chromosome 9 genotypes...


Chr9: 100%|████████████████████████| 3560687/3560687 [02:43<00:00, 21748.85it/s]



Processing chromosome 10 genotypes...


Chr10:  48%|███████████            | 1913847/3992219 [01:28<01:36, 21513.47it/s]


KeyboardInterrupt: 

In [None]:
import os
import subprocess
import pandas as pd
import numpy as np
from tqdm import tqdm

class VCFConverter:
    def __init__(self, data_dir='../data/1000g'):
        self.data_dir = data_dir
        self.output_dir = os.path.join(data_dir, 'converted')
        os.makedirs(self.output_dir, exist_ok=True)
    
    def convert_all_chromosomes(self, population):
        """Convert all chromosomes for a population with numpy array support"""
        # 1. First collect all variants and sample counts
        total_variants = 0
        chr_counts = {}
        
        print(f"Counting variants in each chromosome for {population}...")
        for chrom in range(1, 23):
            vcf_file = os.path.join(self.data_dir, f'filtered.chr{chrom}.phase3.vcf.gz')
            cmd = f"bcftools view -H {vcf_file} | wc -l"
            n_variants = int(subprocess.check_output(cmd, shell=True).decode().strip())
            chr_counts[chrom] = n_variants
            total_variants += n_variants
            print(f"Chromosome {chrom}: {n_variants} variants")
        
        # Get sample count
        vcf_file = os.path.join(self.data_dir, f'filtered.chr1.phase3.vcf.gz')
        cmd = f"bcftools query -l {vcf_file} | wc -l"
        n_samples = int(subprocess.check_output(cmd, shell=True).decode().strip())
        print(f"\nTotal variants: {total_variants}, Samples: {n_samples}")
        
        # 2. Initialize the masked array for all chromosomes
        print("\nInitializing genotype array...")
        geno_array = np.ma.empty((total_variants, n_samples), dtype=np.uint8)
        geno_array.mask = False
        
        # 3. Process each chromosome
        current_pos = 0
        chr_ranges = {}
        all_snp_data = []
        
        for chrom in range(1, 23):
            print(f"\nProcessing chromosome {chrom}")
            vcf_file = os.path.join(self.data_dir, f'filtered.chr{chrom}.phase3.vcf.gz')
            
            # Get SNP info
            cmd = f"""bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\n' {vcf_file}"""
            subprocess.run(cmd, shell=True, stdout=open('temp_snps.txt', 'w'), check=True)
            
            snp_data = pd.read_csv('temp_snps.txt', sep='\t', 
                                  names=['chromosome', 'position', 'ref', 'alt'])
            snp_data['morgans'] = 0.0
            all_snp_data.append(snp_data)
            
            # Record chromosome range
            chr_len = len(snp_data)
            chr_ranges[chrom] = (current_pos, current_pos + chr_len)
            
            # Get genotypes
            print(f"Converting genotypes for chromosome {chrom}...")
            cmd = f"bcftools query -f '%CHROM\t%POS[\t%GT]' {vcf_file}"
            proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
            
            for row_idx, line in enumerate(tqdm(proc.stdout, total=chr_len)):
                parts = line.strip().split('\t')
                gts = parts[2:]  # Skip CHROM and POS
                
                for col_idx, gt in enumerate(gts):
                    if gt in ['0|0', '0/0']:
                        code = 0
                    elif gt in ['0|1', '1|0', '0/1', '1/0']:
                        code = 1
                    elif gt in ['1|1', '1/1']:
                        code = 2
                    else:
                        code = 9
                        geno_array.mask[current_pos + row_idx, col_idx] = True
                    
                    geno_array[current_pos + row_idx, col_idx] = code
            
            current_pos += chr_len
        
        # 4. Save all files
        # Save chromosome ranges
        with open(os.path.join(self.output_dir, f'{population}.chr_ranges.txt'), 'w') as f:
            for chrom, (start, end) in chr_ranges.items():
                f.write(f"{chrom}\t{start}\t{end}\n")
        
        # Save SNP info
        print("\nSaving SNP information...")
        combined_snps = pd.concat(all_snp_data, axis=0)
        combined_snps.to_csv(os.path.join(self.output_dir, f'{population}.snp'), 
                            sep='\t', index=False, header=False)
        
        # Save sample info
        print("Saving sample information...")
        cmd = f"bcftools query -l {vcf_file} > temp_samples.txt"
        subprocess.run(cmd, shell=True, check=True)
        
        with open('temp_samples.txt', 'r') as f:
            samples = [line.strip() for line in f]
        
        ind_data = pd.DataFrame({
            'sample': samples,
            'sex': 'U',
            'pop': population
        })
        ind_data.to_csv(os.path.join(self.output_dir, f'{population}.ind'), 
                        sep='\t', index=False, header=False)
        
        # Save genotype data
        print("Saving genotype data...")
        # Save as text file
        with open(os.path.join(self.output_dir, f'{population}.geno'), 'w') as f:
            for i in range(geno_array.shape[0]):
                line = ''.join(map(str, np.where(geno_array.mask[i], '9', geno_array.data[i])))
                f.write(line + '\n')
        
        # Clean up
        os.remove('temp_snps.txt')
        os.remove('temp_samples.txt')
        
        print("\nConversion complete!")
        print(f"Files created in {self.output_dir}:")
        print(f"1. {population}.snp - SNP information")
        print(f"2. {population}.ind - Sample information")
        print(f"3. {population}.geno - Genotype data")
        print(f"4. {population}.chr_ranges.txt - Chromosome ranges")
        
        return geno_array, chr_ranges

def verify_conversion(geno_array, chr_ranges, population, output_dir):
    print("\nVerifying conversion:")

    print(f"Array shape: {geno_array.shape}")
    print(f"Missing genotypes: {geno_array.mask.sum()}")
    
    for chrom, (start, end) in chr_ranges.items():
        chr_data = geno_array[start:end]
        print(f"\nChromosome {chrom}:")
        print(f"Variants: {end-start}")
        print("Genotype counts:")
        for code in [0, 1, 2]:
            count = np.sum(chr_data.data[~chr_data.mask] == code)
            print(f"    {code}: {count}")
        print(f"Missing: {chr_data.mask.sum()}")

if __name__ == "__main__":
    converter = VCFConverter()
    print("Converting all chromosomes for CHB population...")
    geno_array, chr_ranges = converter.convert_all_chromosomes('CHB')
    verify_conversion(geno_array, chr_ranges, 'CHB', converter.output_dir)