In [2]:
import os
import subprocess


vcf_dir = "../data/1000g/" 
output_file = "1000g_snps_combined.txt"  

with open(output_file, "w") as combined_file:
    vcf_files = [f for f in os.listdir(vcf_dir) if f.endswith(".vcf.gz")]
    
    for vcf_file in vcf_files:
        vcf_path = os.path.join(vcf_dir, vcf_file)
        print(f"Processing {vcf_file}...")
        try:
            subprocess.run(
                ["bcftools", "query", "-f", "%CHROM:%POS\n", vcf_path],
                stdout=combined_file,
                check=True
            )
        except subprocess.CalledProcessError as e:
            print(f"Error processing {vcf_file}: {e}")

print(f"All 1000 Genomes SNPs saved to {output_file}")


Processing ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz...
Processing filtered.chr22.phase3.vcf.gz...
Processing filtered.chr3.phase3.vcf.gz...
Processing filtered.chr2.phase3.vcf.gz...
Processing filtered.chr1.phase3.vcf.gz...
Processing filtered.chr4.phase3.vcf.gz...
Processing ALL.chr5.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz...
Processing ALL.chr6.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz...
Processing filtered.chr5.phase3.vcf.gz...
Processing filtered.chr6.phase3.vcf.gz...
Processing ALL.chr7.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz...


[W::bcf_sr_add_reader] No BGZF EOF marker; file '../data/1000g/ALL.chr7.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz' may be truncated
[E::bgzf_read_block] Failed to read BGZF block data at offset 784325627 expected 1560 bytes; hread returned 1011


Processing filtered.chr7.phase3.vcf.gz...
Processing filtered.chr9.phase3.vcf.gz...
Processing filtered.chr8.phase3.vcf.gz...
Processing filtered.chr10.phase3.vcf.gz...
Processing filtered.chr11.phase3.vcf.gz...
Processing filtered.chr12.phase3.vcf.gz...
Processing filtered.chr13.phase3.vcf.gz...
Processing filtered.chr14.phase3.vcf.gz...
Processing filtered.chr15.phase3.vcf.gz...
Processing filtered.chr17.phase3.vcf.gz...
Processing filtered.chr16.phase3.vcf.gz...
Processing filtered.chr18.phase3.vcf.gz...
Processing filtered.chr19.phase3.vcf.gz...
Processing filtered.chr20.phase3.vcf.gz...
Processing filtered.chr21.phase3.vcf.gz...
All 1000 Genomes SNPs saved to 1000g_snps_combined.txt


In [1]:
# 配置路径
file1 = "1000g_snps_combined.txt"
file2 = "../data/1000g/hapmap3_snps_formatted.txt"
output_file = "mutual_snps.txt"

# 加载 HapMap3 SNPs 到集合中
with open(file2, "r") as f2:
    snps2 = set(f2.read().splitlines())

# 逐块处理 1000 Genomes 文件
common_snps = set()
with open(file1, "r") as f1:
    for line in f1:
        snp = line.strip()
        if snp in snps2:
            common_snps.add(snp)

# 保存结果
with open(output_file, "w") as out_file:
    out_file.write("\n".join(common_snps))

print(f"Common SNPs saved to {output_file}")
print(f"Number of common SNPs: {len(common_snps)}")


Common SNPs saved to mutual_snps.txt
Number of common SNPs: 21505


In [9]:
import os
import gzip


vcf_dir = "../data/1000g/" 
common_snps_file = "mutual_snps.txt"  
output_file = "1000g_common_snps_1129.vcf"


def load_common_snps(common_snps_file):
    with open(common_snps_file, "r") as f:
        return set(line.strip() for line in f)


def filter_vcf_by_snps(vcf_file, common_snps, output_handle):
    with gzip.open(vcf_file, "rt") as f:
        header = []
        for line in f:
            # Collect the header lines
            if line.startswith("#"):
                header.append(line)
            else:
                # Process the VCF body
                fields = line.split("\t")
                snp_id = fields[2]  # SNP ID is in the 3rd column
                if snp_id in common_snps:
                    output_handle.write(line)
        
        # Write the header only once to the output file
        output_handle.writelines(header)

def process_all_vcf_files(vcf_dir, common_snps_file, output_file):
    common_snps = load_common_snps(common_snps_file)
    print(f"Loaded {len(common_snps)} common SNPs from {common_snps_file}.")

    with open(output_file, "w") as output_handle:
        for filename in os.listdir(vcf_dir):
            if filename.startswith("filtered.chr") and filename.endswith(".vcf.gz"):
                vcf_file = os.path.join(vcf_dir, filename)
                print(f"Processing {vcf_file}...")
                filter_vcf_by_snps(vcf_file, common_snps, output_handle)
                print(f"Finished {vcf_file}.")

    print(f"Filtered VCF data saved to {output_file}.")

# 执行脚本
process_all_vcf_files(vcf_dir, common_snps_file, output_file)


Loaded 21505 common SNPs from mutual_snps.txt.
Processing ../data/1000g/filtered.chr22.phase3.vcf.gz...
Finished ../data/1000g/filtered.chr22.phase3.vcf.gz.
Processing ../data/1000g/filtered.chr3.phase3.vcf.gz...
Finished ../data/1000g/filtered.chr3.phase3.vcf.gz.
Processing ../data/1000g/filtered.chr2.phase3.vcf.gz...
Finished ../data/1000g/filtered.chr2.phase3.vcf.gz.
Processing ../data/1000g/filtered.chr1.phase3.vcf.gz...
Finished ../data/1000g/filtered.chr1.phase3.vcf.gz.
Processing ../data/1000g/filtered.chr4.phase3.vcf.gz...
Finished ../data/1000g/filtered.chr4.phase3.vcf.gz.
Processing ../data/1000g/filtered.chr5.phase3.vcf.gz...
Finished ../data/1000g/filtered.chr5.phase3.vcf.gz.
Processing ../data/1000g/filtered.chr6.phase3.vcf.gz...
Finished ../data/1000g/filtered.chr6.phase3.vcf.gz.
Processing ../data/1000g/filtered.chr7.phase3.vcf.gz...
Finished ../data/1000g/filtered.chr7.phase3.vcf.gz.
Processing ../data/1000g/filtered.chr9.phase3.vcf.gz...
Finished ../data/1000g/filtered

In [5]:
import os
import subprocess


data_dir = "../data/1000g/" 
output_dir = "../data/1000g_processed/"
common_snps_file = "mutual_snps.txt" 


os.makedirs(output_dir, exist_ok=True)

chromosomes = [f"chr{i}" for i in range(1, 23)]  

def process_chromosome(chromosome):
    input_vcf = os.path.join(data_dir, f"filtered.{chromosome}.phase3.vcf.gz")
    output_vcf = os.path.join(output_dir, f"{chromosome}_common_snps.vcf.gz")
    plink_prefix = os.path.join(output_dir, f"{chromosome}_common_snps")
    plink_qc_prefix = os.path.join(output_dir, f"{chromosome}_common_snps_qc")

    # 检查输入文件是否存在
    if not os.path.exists(input_vcf):
        print(f"Input file for {chromosome} not found: {input_vcf}")
        return

    try:
        # 1. 使用 bcftools 提取交集 SNP
        print(f"Processing {chromosome}: Extracting common SNPs...")
        subprocess.run(
            ["bcftools", "view", "--regions-file", common_snps_file, "-Oz", "-o", output_vcf, input_vcf],
            check=True
        )

        # 2. 转换 VCF 为 PLINK 格式
        print(f"Processing {chromosome}: Converting to PLINK format...")
        subprocess.run(
            ["plink", "--vcf", output_vcf, "--make-bed", "--out", plink_prefix],
            check=True
        )

        # 3. 质量控制
        print(f"Processing {chromosome}: Running quality control...")
        subprocess.run(
            ["plink", "--bfile", plink_prefix, "--geno", "0.05", "--mind", "0.05", "--make-bed", "--out", plink_qc_prefix],
            check=True
        )

        print(f"{chromosome} processing complete!")
    except subprocess.CalledProcessError as e:
        print(f"Error processing {chromosome}: {e}")

# 批量处理所有染色体
for chromosome in chromosomes:
    process_chromosome(chromosome)

print("All chromosomes processed!")


Processing chr1: Extracting common SNPs...
Error processing chr1: Command '['bcftools', 'view', '--regions-file', 'mutual_snps.txt', '-Oz', '-o', '../data/1000g_processed/chr1_common_snps.vcf.gz', '../data/1000g/filtered.chr1.phase3.vcf.gz']' returned non-zero exit status 255.
Processing chr2: Extracting common SNPs...
Error processing chr2: Command '['bcftools', 'view', '--regions-file', 'mutual_snps.txt', '-Oz', '-o', '../data/1000g_processed/chr2_common_snps.vcf.gz', '../data/1000g/filtered.chr2.phase3.vcf.gz']' returned non-zero exit status 255.
Processing chr3: Extracting common SNPs...
Error processing chr3: Command '['bcftools', 'view', '--regions-file', 'mutual_snps.txt', '-Oz', '-o', '../data/1000g_processed/chr3_common_snps.vcf.gz', '../data/1000g/filtered.chr3.phase3.vcf.gz']' returned non-zero exit status 255.
Processing chr4: Extracting common SNPs...
Error processing chr4: Command '['bcftools', 'view', '--regions-file', 'mutual_snps.txt', '-Oz', '-o', '../data/1000g_proce

[E::bcf_sr_regions_init] Could not parse 1-th line of file mutual_snps.txt, using the columns 1,2[,-1]
Failed to read the regions: mutual_snps.txt
[E::bcf_sr_regions_init] Could not parse 1-th line of file mutual_snps.txt, using the columns 1,2[,-1]
Failed to read the regions: mutual_snps.txt
[E::bcf_sr_regions_init] Could not parse 1-th line of file mutual_snps.txt, using the columns 1,2[,-1]
Failed to read the regions: mutual_snps.txt
[E::bcf_sr_regions_init] Could not parse 1-th line of file mutual_snps.txt, using the columns 1,2[,-1]
Failed to read the regions: mutual_snps.txt
[E::bcf_sr_regions_init] Could not parse 1-th line of file mutual_snps.txt, using the columns 1,2[,-1]
Failed to read the regions: mutual_snps.txt
[E::bcf_sr_regions_init] Could not parse 1-th line of file mutual_snps.txt, using the columns 1,2[,-1]
Failed to read the regions: mutual_snps.txt
[E::bcf_sr_regions_init] Could not parse 1-th line of file mutual_snps.txt, using the columns 1,2[,-1]
Failed to read 

In [8]:
with open("mutual_snps.txt", "r") as f:
    print(f.lines)

AttributeError: '_io.TextIOWrapper' object has no attribute 'lines'