# Extract imuted VCF file
Extract the imputed VCF files on 1240k SNPs.

For reproducibilty of ancIBD results and allow others to double-check.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import socket as socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py
import allel

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM O2 Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Harvard Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)

print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

compute-b-16-177.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 96


In [2]:
### Version
vrs = 64.0

### Load Meta Data
df_meta = pd.read_csv(f"/n/groups/reich/hringbauer/git/ibd_euro/data/hdf5/1240k_v{vrs}/meta_maxgp.tsv", sep="\t")
print(f"Loaded imputed Meta File of {len(df_meta)} iids")

dfp = pd.read_csv("/n/groups/reich/hringbauer/git/punic_aDNA/data/final_new_genomes210.v54.1.tsv", sep="\t") 
#df = pd.read_csv("/n/groups/reich/hringbauer/git/punic_aDNA/data/final_new_genomes210.v56.3.tsv", sep="\t") 
print(f"Loaded Punic table of {len(dfp)} iids to upload.")

### Intersect
idx = df_meta["Master ID"].isin(dfp["Master ID"])
print(f"Found {np.sum(idx)}/{len(idx)} Punic iids in master df")

### Keep only the highest maxGP
df_upload = df_meta[idx].sort_values(by="frac_gp", ascending=False)
idx = df_upload["Master ID"].duplicated()
df_upload=df_upload[~idx].reset_index(drop=True)
print(f"Kept {np.sum(~idx)}/{len(idx)} unique Master IDs.")

iids = df_upload["iid"].values
iids_all = ",".join(iids)

Loaded imputed Meta File of 57929 iids
Loaded Punic table of 210 iids to upload.
Found 302/57929 Punic iids in master df
Kept 210/302 unique Master IDs.


## 1a) Extract all Punic iids in Punic VCF (faster)
Takes about 30min per long Chromosome

In [None]:
iids_all

In [None]:
for ch in range(1,23):
    path_main_imputed_vcf = f"/n/groups/reich/hringbauer/git/ibd_euro/data/vcf/1240k_v{vrs}/ch{ch}.vcf.gz"
    print(f"Running Chr.: {ch}")
    out_vcf_path = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/v{vrs}_imputed/punic210_ch{ch}.vcf.gz"
        
    !bcftools view -s $iids_all -Oz -o  $out_vcf_path $path_main_imputed_vcf
    !bcftools index $out_vcf_path
print(f"Finished!")

Running Chr.: 1


In [None]:
print("Finished!")

# Extract the individual VCFs in target list From this smaller VCF: 
Takes about 5-10s per long Chromosome

In [None]:
for ch in range(1,23):
    path_main_imputed_vcf = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/v{vrs}_imputed/punic210_ch{ch}.vcf.gz"
    for iid in iids:
        print(f"Running Chr.: {ch}; iid: {iid}")
        out_vcf_path = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/v{vrs}_imputed/{iid}/1240k.imputed/{iid}.ch{ch}.vcf.gz"
        out_folder = os.path.dirname(out_vcf_path)
        ### Create Output folder if not there
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)
            
        !bcftools view -s $iid -Oz -o  $out_vcf_path $path_main_imputed_vcf
        !bcftools index $out_vcf_path
print(f"Finished!")

In [None]:
iids

In [None]:
print('Finished')

# 2) Extract Pablo's imputed data
### 2a) Find all IIDs

In [2]:
### Version
vrs = 65.0

### Load Meta Data
df_meta = pd.read_csv(f"/n/groups/reich/hringbauer/git/ibd_euro/data/hdf5/1240k_v{vrs}/meta_maxgp.tsv", sep="\t")
print(f"Loaded imputed Meta File of {len(df_meta)} iids")

dfp = pd.read_csv("/n/groups/reich/hringbauer/git/punic_aDNA/data/tmp.Visi_ID_60ok.tsv", sep="\t") 
dfp["Master ID"] = dfp["iid"].str.split(".").str[0]
print(f"Loaded Visigoth table of {len(dfp)} iids to upload.")

### Intersect
idx = df_meta["Master ID"].isin(dfp["Master ID"])
print(f"Found {np.sum(idx)}/{len(dfp)} iids with matching Visigoth master id")

### Keep only the highest maxGP
df_upload = df_meta[idx].sort_values(by="frac_gp", ascending=False)
idx = df_upload["Master ID"].duplicated()
df_upload=df_upload[~idx].reset_index(drop=True)
print(f"Kept {np.sum(~idx)}/{len(idx)} unique Master IDs.")

iids = df_upload["iid"].values
iids_all = ",".join(iids)

Loaded imputed Meta File of 61237 iids
Loaded Visigoth table of 65 iids to upload.
Found 69/65 iids with matching Visigoth master id
Kept 65/69 unique Master IDs.


## 2b) Extract first all IIDs into one subset imputed VCF
Takes about 15-30min per long Chromosome

In [None]:
for ch in range(1,23):
    path_main_imputed_vcf = f"/n/groups/reich/hringbauer/git/ibd_euro/data/vcf/1240k_v{vrs}/ch{ch}.vcf.gz"
    print(f"Running Chr.: {ch}")
    out_vcf_path = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/v{vrs}_imputed/pablo65_ch{ch}.vcf.gz"
    !module load bcftools; bcftools view -s $iids_all -Oz -o  $out_vcf_path $path_main_imputed_vcf
    !module load bcftools; bcftools index $out_vcf_path
print(f"Finished!")

Running Chr.: 1
Running Chr.: 2
Running Chr.: 3
Running Chr.: 4


In [12]:
#df_upload.sort_values(by="frac_gp")

# 2c) Extract Pablo's iids into indivdiual 1240 Files
These are alrady formatted in ancIBD main run format

In [None]:
for ch in range(1,23):
    path_main_imputed_vcf = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/v{vrs}_imputed/pablo65_ch{ch}.vcf.gz"
    for iid in iids:
        print(f"Running Chr.: {ch}; iid: {iid}")
        out_vcf_path = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/v{vrs}_imputed_pablo/{iid}/1240k.imputed/{iid}.ch{ch}.vcf.gz"
        out_folder = os.path.dirname(out_vcf_path)
        ### Create Output folder if not there
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)
            
        !bcftools view -s $iid -Oz -o  $out_vcf_path $path_main_imputed_vcf
        !bcftools index $out_vcf_path
print(f"Finished!")

### 2d) Zip the folder into one file
Run from shell:

`zip -r v65.0_imputed_pablo.zip ./v65.0_imputed_pablo/`

## Now you can download this file!

# Area51

## Browse VCF files

In [5]:
vrs = 64.0
ch = 3
#path_vcf = f"/n/groups/reich/hringbauer/git/ibd_euro/data/vcf/1240k_v{vrs}/ch{ch}.vcf.gz"

### Intermediate VCF:
path_vcf = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/v65.0_imputed_pablo/I12031.AG/1240k.imputed/I12031.AG.ch3.vcf.gz"

In [9]:
!bcftools view $path_vcf | head -50

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##fileDate=15/07/2024 - 12:33:28
##source=GLIMPSE_phase v1.0.0
##contig=<ID=3>
##INFO=<ID=RAF,Number=A,Type=Float,Description="ALT allele frequency in the reference panel">
##INFO=<ID=AF,Number=A,Type=Float,Description="ALT allele frequency computed from DS/GP field across target samples">
##INFO=<ID=INFO,Number=A,Type=Float,Description="Imputation information or quality score">
##INFO=<ID=BUF,Number=A,Type=Integer,Description="Is it a variant site falling within buffer regions? (0=no/1=yes)">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Unphased genotypes">
##FORMAT=<ID=DS,Number=1,Type=Float,Description="Genotype dosage">
##FORMAT=<ID=GP,Number=3,Type=Float,Description="Genotype posteriors">
##FORMAT=<ID=HS,Number=1,Type=Integer,Description="Sampled haplotype pairs packed into intergers (max: 15 pairs, see NMAIN header line)">
##NMAIN=10
##bcftools_annotateVersion=1.14+htslib-1.14
##bcftools_annotateCommand

In [2]:
### Should be 88407 variantsb
!bcftools query -f '%POS\n' $path_vcf | wc -l 

77345


In [4]:
### Gives number of individuals
!bcftools query -l $path_vcf #| wc -l # 19260

I12031.AG
