# Process the v43.4 Data into hdf5
### Prepare Imputed Genotype Files in HDF5 Format
Uses the function `vcf_to_1240K_hdf` imported from the `prepare_h5` module.
The original version can be found in `process_alis_imputed_v43.ipynb`

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import socket as socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py
import allel

# Identify the host: the repository location below is only known for O2 compute nodes.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: stop right away on any host whose name does not start with "compute-".
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM O2 Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # Repository root on the Harvard cluster

# Work from the repository root (in line with Atom default) so relative ./data/... paths resolve.
os.chdir(path)
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

# Hack: put the local `prepare` folder first on the import path so the local package wins.
sys.path.insert(0, "/n/groups/reich/hringbauer/git/hapBLOCK/python3/prepare")
from prepare_h5 import vcf_to_1240K_hdf

# Optional helpers, currently disabled:
#from hapsburg.PackagesSupport.h5_python.h5_functions import merge_in_ld_map
#sys.path.append("/n/groups/reich/hringbauer/git/hapBLOCK/python3/")
#from IO.h5_modify import merge_in_af, get_af, get_af1000G, lift_af

compute-e-16-233.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.
/n/groups/reich/hringbauer/git/hapBLOCK
CPU Count: 28


# Run all steps of the transformation for whole Chromosomes bundled up
Ultimately loop over multiple chromosomes. 

Can **run in parallel**: See `./cluster/vcf_to_hdf5.py`


Takes about 2 hours for a long chromosome.

In [None]:
/n/groups/reich/ali/WholeGenomeImputation/imputed/v49.2

./chr*.bcf files contain all 1000GP phase 3 biallelic variants (SNPs and indels).
./1240k/autosomes.bcf file contains 1240k SNPs for 22 autosomes combined in a single file.

In [4]:
%%time
ch = 22
vrs = "49.2"
v0 = vrs.split(".")[0]

base_path = f"/n/groups/reich/hringbauer/git/hapBLOCK"
vcf_to_1240K_hdf(in_vcf_path = f"/n/groups/reich/ali/WholeGenomeImputation/imputed/v{vrs}/chr{ch}.bcf",
                 path_vcf = f"{base_path}/data/vcf/1240k_v{vrs}/ch{ch}.vcf.gz",
                 path_h5 = f"{base_path}/data/hdf5/1240k_v{vrs}/ch{ch}.h5",
                 marker_path = f"{base_path}/data/filters/1240K_1000G/snps_bcftools_ch{ch}.csv",
                 map_path = f"/n/groups/reich/DAVID/V{v0}/V{vrs}/v{vrs}.snp", buffer_size=20000,
                 chunk_width=8, chunk_length=20000,
                 ch=ch)

print(f"Finished running chromosome {ch}")

Print downsampling to 1240K...
Finished BCF tools filtering.
Converting to HDF5...
Finished conversion to hdf5!
Merging in LD Map..
Lifting LD Map from eigenstrat to HDF5...
Loaded 15793 variants.
Loaded 21255 individuals.
Loaded 16420 Chr.22 1240K SNPs.
Intersection 15793 out of 15793 HDF5 SNPs
Finished Chromosome 22.
Adding map to HDF5...
We did it. Finished.
Merging in Allele Frequencies
Adding map to HDF5...
Loaded 15793 variants.
Finshed merged in allele frequencies into /n/groups/reich/hringbauer/git/hapBLOCK/data/hdf5/1240k_v49.2/ch22.h5
Finished running chromosome 22
CPU times: user 7min 35s, sys: 44.3 s, total: 8min 20s
Wall time: 41min 31s


In [5]:
# Marker output only; presumably run to confirm the kernel responds after the long conversion cell — TODO confirm.
print("Hello? Blizzard?")

Hello? Blizzard?


# Bonus Task: Merge all vcfs into master vcf and create one master hdf5
Needed e.g. for Fst calculation
Takes about 5 min

In [3]:
def merge_vcfs(in_vcf_paths=None, out_vcf_path=""):
    """Concatenate a set of VCFs into one VCF with `bcftools concat -n`.

    in_vcf_paths: List of paths of the VCFs to concatenate, in the order given.
    out_vcf_path: Path of the concatenated VCF to write.

    Paths are handed to bcftools verbatim (no shell expansion of globs or `~`).

    Raises ValueError if no input paths or no output path are given, and
    subprocess.CalledProcessError if bcftools exits with a non-zero status."""
    import subprocess  # local import: keeps the function self-contained

    paths_merge = [str(p) for p in (in_vcf_paths or [])]
    if not paths_merge:
        raise ValueError("in_vcf_paths is empty: nothing to merge.")
    if not out_vcf_path:
        raise ValueError("out_vcf_path is empty: no output file given.")

    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters cannot break (or inject into) the command line.
    cmd = ["bcftools", "concat", "-n", "-o", str(out_vcf_path)] + paths_merge
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          universal_newlines=True) as proc:
        for line in proc.stdout:  # relay bcftools progress to the cell output as it arrives
            print(line, end="")

    # Leaving the `with` block waits for the process, so returncode is set here.
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd)
    print("Finished BCF tools concatenation.")

In [4]:
%%time
### Step 1: Merge all VCFs
base_folder_vcf = "./data/vcf/1240k_v46.2/ch"
out_vcf_path = "./data/vcf/1240k_v49.2/all_ch.vcf.gz"
paths_vcf = [base_folder_vcf + str(ch) + ".vcf.gz" for ch in range(1,23)]

merge_vcfs(in_vcf_paths=paths_vcf, out_vcf_path=out_vcf_path)

Checking the headers of 22 files.
Done, the headers are compatible.
Concatenating ./data/vcf/1240k_v46.2/ch1.vcf.gz	57.157292 seconds
Concatenating ./data/vcf/1240k_v46.2/ch2.vcf.gz	53.625306 seconds
Concatenating ./data/vcf/1240k_v46.2/ch3.vcf.gz	50.910228 seconds
Concatenating ./data/vcf/1240k_v46.2/ch4.vcf.gz	32.858806 seconds
Concatenating ./data/vcf/1240k_v46.2/ch5.vcf.gz	63.243209 seconds
Concatenating ./data/vcf/1240k_v46.2/ch6.vcf.gz	43.311915 seconds
Concatenating ./data/vcf/1240k_v46.2/ch7.vcf.gz	26.023809 seconds
Concatenating ./data/vcf/1240k_v46.2/ch8.vcf.gz	60.150605 seconds
Concatenating ./data/vcf/1240k_v46.2/ch9.vcf.gz	34.420143 seconds
Concatenating ./data/vcf/1240k_v46.2/ch10.vcf.gz	31.067965 seconds
Concatenating ./data/vcf/1240k_v46.2/ch11.vcf.gz	48.944901 seconds
Concatenating ./data/vcf/1240k_v46.2/ch12.vcf.gz	21.056237 seconds
Concatenating ./data/vcf/1240k_v46.2/ch21.vcf.gz	9.016085 seconds
Concatenating ./data/vcf/1240k_v46.2/ch22.vcf.gz	7.188044 seconds
Finis

### And now transform the whole data to hdf5
Takes 8 hours. Ouch.

In [None]:
%%time
out_path_h5="./data/hdf5/1240k_v49.2/all_ch.h5"
allel.vcf_to_hdf5(input=out_vcf_path, output=out_path_h5, chunk_length=10000, chunk_width=8,
                  fields = ['variants/*', 'calldata/*', "samples"], compression="gzip") # Do the conversion to hdf5. Takes 7h30

# Bonus: Create Variant only VCF

In [15]:
### Index the merged VCF with bcftools; -f overwrites an existing index (add -t for a tabix .tbi index instead of the default .csi)
# NOTE(review): this path is under 1240k_v43, whereas the merge cell writes all_ch.vcf.gz under 1240k_v49.2 — confirm the intended version.
vcf_all = "./data/vcf/1240k_v43/all_ch.vcf.gz"
!bcftools index -f $vcf_all

In [1]:
# Prints a plain completion marker.
print("Finished")

Finished


In [17]:

# Input: merged all-chromosome VCF. Output: VCF meant to hold the variant records only.
vcf_all = "./data/vcf/1240k_v43/all_ch.vcf.gz"
vcf_var_only = "./data/vcf/1240k_v43/1240k_vars.vcf.gz"

In [18]:
# -G drops the per-sample genotype columns, so only the site-level records are written to $vcf_var_only.
!bcftools view -G -o $vcf_var_only $vcf_all 

# Area 51
Test code here.

### Test vcf

In [19]:
### index vcf file  -t
test = "/n/groups/reich/ali/WholeGenomeImputation/imputed/v43.4/chr3.bcf"

ch = 21
base_path = f"/n/groups/reich/hringbauer/git/hapBLOCK"
test = f"{base_path}/data/vcf/1240k_v43/ch{ch}.vcf.gz" #v46.2
!bcftools view $test | head -25

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##fileDate=15/07/2020 - 20:20:41
##source=GLIMPSE_phase v1.0.0
##contig=<ID=21>
##INFO=<ID=RAF,Number=A,Type=Float,Description="ALT allele frequency in the reference panel">
##INFO=<ID=AF,Number=A,Type=Float,Description="ALT allele frequency computed from DS/GP field across target samples">
##INFO=<ID=INFO,Number=A,Type=Float,Description="Imputation information or quality score">
##INFO=<ID=BUF,Number=A,Type=Integer,Description="Is it a variant site falling within buffer regions? (0=no/1=yes)">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Unphased genotypes">
##FORMAT=<ID=DS,Number=1,Type=Float,Description="Genotype dosage">
##FORMAT=<ID=GP,Number=3,Type=Float,Description="Genotype posteriors">
##FORMAT=<ID=HS,Number=1,Type=Integer,Description="Sampled haplotype pairs packed into intergers (max: 15 pairs, see NMAIN header line)">
##NMAIN=10
##bcftools_annotateVersion=1.10.2+htslib-1.10.2
##bcftools_annotateCo

In [20]:
# Count variant records: one POS line is emitted per record and the lines are counted with wc -l.
!bcftools query -f '%POS\n' $test | wc -l

16031


In [21]:
# Count samples: -l lists the sample names one per line. The trailing "# 19260" is a shell comment,
# presumably the sample count seen for another release — TODO confirm.
!bcftools query -l $test | wc -l # 19260

14523


### Test Created HDF5

In [3]:
%%time
ch=22
with h5py.File(f"./data/hdf5/1240k_v46.2/ch{ch}.h5", "r") as f: # Load for Sanity Check. See below!
    gp = f["calldata/GP"][:,0,:]
    gt = f["calldata/GT"][:,0,:]
    print(list(f["variants"]))
    print(list(f["calldata"]))
    print(np.shape(f["calldata/GT"]))

['AF', 'AF_ALL', 'ALT', 'BUF', 'CHROM', 'FILTER_PASS', 'ID', 'INFO', 'MAP', 'POS', 'QUAL', 'RAF', 'REF', 'altlen', 'is_snp', 'numalt']
['AD', 'DS', 'GP', 'GT', 'HS', 'PL']
(15793, 19260, 2)
CPU times: user 24.9 ms, sys: 12.4 ms, total: 37.3 ms
Wall time: 300 ms


In [10]:
%%time

with h5py.File(f"./data/hdf5/1240k_v49.2/all_ch.h5", "r") as f: # Load for Sanity Check. See below!
    #gp = f["calldata/GP"][:,0,:]
    #gt = f["calldata/GT"][:,0,:]
    ad = f["calldata/AD"][:,0,:]
    print(list(f["variants"]))
    print(list(f["calldata"]))
    print(np.shape(f["calldata/GT"]))

['AF', 'ALT', 'BUF', 'CHROM', 'FILTER_PASS', 'ID', 'INFO', 'POS', 'QUAL', 'RAF', 'REF', 'altlen', 'is_snp', 'numalt']
['AD', 'DS', 'GP', 'GT', 'HS', 'PL']
(1100313, 19260, 2)
CPU times: user 343 ms, sys: 27 ms, total: 370 ms
Wall time: 11.1 s


In [79]:
# Pull AD, GT and GP for index `j` along axis 1 over a window of SNP rows and tabulate them.
# The file is opened here explicitly: the only `f` bound by the visible cells comes from a
# `with` block and is already closed by the time this cell runs.
# NOTE(review): the merged all_ch.h5 is assumed (it is the last file bound to `f`, and the
# SNP window lies beyond the 15793 variants reported for ch22) — confirm.
h5_path = "./data/hdf5/1240k_v49.2/all_ch.h5"
snps = range(30000,30200)
j = 12483
with h5py.File(h5_path, "r") as f:
    ads = f["calldata/AD"][snps, j, :2]  # first two AD columns, used below as ref / alt
    gts = f["calldata/GT"][snps, j, :]
    gp = f["calldata/GP"][snps, j, :]
# "gt0" is the sum over the last GT axis for each SNP row.
df = pd.DataFrame({"ref":ads[:,0], "alt":ads[:,1], "gt0":np.sum(gts, axis=1)})

In [7]:
# Closes the HDF5 handle `f`.
# NOTE(review): the visible cells only bind `f` inside `with` blocks (which close it already),
# so this presumably pairs with a since-removed `f = h5py.File(...)` cell — confirm.
f.close()