# Prepare Imputed Genotype Files in HDF5 Format
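Uses the function `vcf_to_1240K_hdf`, imported from `prepare_h5.py` (the script for cluster runs is `./cluster/vcf_to_hdf5.py`).  
The original version of this notebook can be found in `process_alis_imputed_v43.ipynb`.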

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import socket as socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py
import allel

# Report which host the notebook is running on.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: stop right away on anything that is not a cluster compute node.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM O2 Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # The Path on Harvard Cluster

# Work from the repository root, so that the relative "./data/..." paths in later cells resolve.
os.chdir(path)
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

# Hack: put the local `prepare` folder first on sys.path so its modules are found first.
sys.path.insert(0, "/n/groups/reich/hringbauer/git/hapBLOCK/python3/prepare")
from prepare_h5 import vcf_to_1240K_hdf
#from hapsburg.PackagesSupport.h5_python.h5_functions import merge_in_ld_map
#sys.path.append("/n/groups/reich/hringbauer/git/hapBLOCK/python3/")
#from IO.h5_modify import merge_in_af, get_af, get_af1000G, lift_af

# Run all steps of the transformation for whole Chromosomes bundled up
Ultimately, loop over multiple chromosomes or run them in parallel (see `./cluster/vcf_to_hdf5.py`).
Takes about 2 hours for a long chromosome.

In [22]:
%%time
ch = 22
base_path = f"/n/groups/reich/hringbauer/git/hapBLOCK"
vcf_to_1240K_hdf(in_vcf_path = f"/n/groups/reich/ali/WholeGenomeImputation/imputed/v46.2/chr{ch}.bcf",
                 path_vcf = f"{base_path}/data/vcf/1240k_v46.2/ch{ch}.vcf.gz",
                 path_h5 = f"{base_path}/data/hdf5/1240k_v46.2/ch{ch}.h5",
                 marker_path = f"{base_path}/data/filters/1240K_1000G/snps_bcftools_ch{ch}.csv",
                 map_path = f"/n/groups/reich/DAVID/V46/V46.2/v46.2.snp", buffer_size=20000,
                 chunk_width=8, chunk_length=20000,
                 ch=ch)

print(f"Finished running chromosome {ch}")

Print downsampling to 1240K...
Converting to HDF5...
Finished conversion to hdf5!
Merging in LD Map..
Lifting LD Map from eigenstrat to HDF5...
Loaded 15793 variants.
Loaded 19260 individuals.
Loaded 16420 Chr.22 1240K SNPs.
Intersection 15793 out of 15793 HDF5 SNPs
Finished Chromosome 22.
Adding map to HDF5...
We did it. Finished.
Merging in Allele Frequencies
Adding map to HDF5...
Loaded 15793 variants.
Finshed merged in allele frequencies into /n/groups/reich/hringbauer/git/hapBLOCK/data/hdf5/1240k_v46.2/ch22.h5
Finished running chromosome 22
CPU times: user 6min 45s, sys: 11.3 s, total: 6min 57s
Wall time: 7min 2s


In [24]:
# Trivial print — presumably a check that the kernel responds again after the long conversion cell.
print("Hello? Blizzard?")

Hello? Blizzard?


# Bonus Task: Merge all vcfs into master vcf and create one master hdf5
Takes about 5 min.

In [5]:
%%time
base_folder_vcf = "./data/vcf/1240k_v43/ch"
out_vcf_path = "./data/vcf/1240k_v43/all_ch.vcf.gz"
paths_vcf = [base_folder_vcf + str(ch) + ".vcf.gz" for ch in range(1,23)]

merge_vcfs(in_vcf_paths=paths_vcf, out_vcf_path=out_vcf_path)

Checking the headers of 22 files.
Done, the headers are compatible.
Concatenating ./data/vcf/1240k_v43/ch1.vcf.gz	30.639150 seconds
Concatenating ./data/vcf/1240k_v43/ch2.vcf.gz	45.643698 seconds
Concatenating ./data/vcf/1240k_v43/ch3.vcf.gz	36.244412 seconds
Concatenating ./data/vcf/1240k_v43/ch4.vcf.gz	24.110753 seconds
Concatenating ./data/vcf/1240k_v43/ch5.vcf.gz	15.251328 seconds
Concatenating ./data/vcf/1240k_v43/ch6.vcf.gz	38.759324 seconds
Concatenating ./data/vcf/1240k_v43/ch7.vcf.gz	22.824363 seconds
Concatenating ./data/vcf/1240k_v43/ch8.vcf.gz	29.042413 seconds
Concatenating ./data/vcf/1240k_v43/ch9.vcf.gz	14.210376 seconds
Concatenating ./data/vcf/1240k_v43/ch10.vcf.gz	13.008295 seconds
Concatenating ./data/vcf/1240k_v43/ch11.vcf.gz	26.994599 seconds
Concatenating ./data/vcf/1240k_v43/ch12.vcf.gz	25.384168 seconds
Concatenating ./data/vcf/1240k_v43/ch13.vcf.gz	15.094192 seconds
Concatenating ./data/vcf/1240k_v43/ch14.vcf.gz	9.998397 seconds
Concatenating ./data/vcf/1240k_v

### And now transform the whole data to hdf5

In [12]:
%%time
out_path_h5="./data/hdf5/1240k_v43/all_ch.h5"
allel.vcf_to_hdf5(input=out_vcf_path, output=out_path_h5, chunk_length=10000, chunk_width=8,
                  fields = ['variants/*', 'calldata/*', "samples"], compression="gzip") # Do the conversion to hdf5. Takes 7h30

CPU times: user 6h 59min 46s, sys: 23min, total: 7h 22min 47s
Wall time: 7h 24min 39s


# Bonus: Create Variant only VCF

In [15]:
### index vcf file  -t
vcf_all = "./data/vcf/1240k_v43/all_ch.vcf.gz"
!bcftools index -f $vcf_all

In [16]:
# Completion marker — presumably printed to show that the preceding indexing cell has returned.
print("Finished")

Finished


In [17]:

# Output: VCF without genotype columns, written by the following `bcftools view -G` cell.
vcf_var_only = "./data/vcf/1240k_v43/1240k_vars.vcf.gz"
# Input: the merged all-chromosome VCF.
vcf_all = "./data/vcf/1240k_v43/all_ch.vcf.gz"

In [18]:
!bcftools view -G -o $vcf_var_only $vcf_all 

# Area 51
Test code here.

### Test vcf

In [19]:
### index vcf file  -t
test = "/n/groups/reich/ali/WholeGenomeImputation/imputed/v43.4/chr3.bcf"

ch = 21
base_path = f"/n/groups/reich/hringbauer/git/hapBLOCK"
test = f"{base_path}/data/vcf/1240k_v43/ch{ch}.vcf.gz" #v46.2
!bcftools view $test | head -25

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##fileDate=15/07/2020 - 20:20:41
##source=GLIMPSE_phase v1.0.0
##contig=<ID=21>
##INFO=<ID=RAF,Number=A,Type=Float,Description="ALT allele frequency in the reference panel">
##INFO=<ID=AF,Number=A,Type=Float,Description="ALT allele frequency computed from DS/GP field across target samples">
##INFO=<ID=INFO,Number=A,Type=Float,Description="Imputation information or quality score">
##INFO=<ID=BUF,Number=A,Type=Integer,Description="Is it a variant site falling within buffer regions? (0=no/1=yes)">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Unphased genotypes">
##FORMAT=<ID=DS,Number=1,Type=Float,Description="Genotype dosage">
##FORMAT=<ID=GP,Number=3,Type=Float,Description="Genotype posteriors">
##FORMAT=<ID=HS,Number=1,Type=Integer,Description="Sampled haplotype pairs packed into intergers (max: 15 pairs, see NMAIN header line)">
##NMAIN=10
##bcftools_annotateVersion=1.10.2+htslib-1.10.2
##bcftools_annotateCo

In [20]:
# Count the variant records in `test`: print one POS per record and count the lines.
!bcftools query -f '%POS\n' $test | wc -l

16031


In [21]:
!bcftools query -l $test | wc -l # 19260

14523


### Test Created HDF5

In [28]:
%%time
ch=22
with h5py.File(f"./data/hdf5/1240k_v46.2/ch{ch}.h5", "r") as f: # Load for Sanity Check. See below!
#g = h5py.File("./data/hdf5/HO_v43/ch3.h5", "r")
    gt = f["calldata/GT"][:,0,:]
    print(list(f["variants"]))
    print(list(f["calldata"]))
    print(np.shape(f["calldata/GT"]))

['AF', 'AF_ALL', 'ALT', 'BUF', 'CHROM', 'FILTER_PASS', 'ID', 'INFO', 'MAP', 'POS', 'QUAL', 'RAF', 'REF', 'altlen', 'is_snp', 'numalt']
['AD', 'DS', 'GP', 'GT', 'HS', 'PL']
(15793, 19260, 2)
CPU times: user 9.2 ms, sys: 2.1 ms, total: 11.3 ms
Wall time: 12.9 ms


In [1]:
%%time
ch=22
with h5py.File(f"./data/hdf5/1240k_v46.2/ch{ch}.h5", "r") as f: # Load for Sanity Check. See below!
#g = h5py.File("./data/hdf5/HO_v43/ch3.h5", "r")
    gp = f["calldata/GP"][:,0,:]
    gt = f["calldata/GT"][:,0,:]
    print(list(f["variants"]))
    print(list(f["calldata"]))
    print(np.shape(f["calldata/GT"]))

NameError: name 'h5py' is not defined

In [11]:
# Print the shape of the genotype array in the HO_v43 HDF5 of chromosome `ch`.
# NOTE(review): `ch` is whatever an earlier cell set last — confirm the intended chromosome.
with h5py.File(f"./data/hdf5/HO_v43/ch{ch}.h5", mode="r") as f:
    print(f["calldata/GT"].shape)

(47705, 14523, 2)


In [13]:
# Look up the row index of every sample whose ID contains "MA89".
# NOTE(review): needs an open HDF5 handle `f`; the visible cells above only open `f` inside
# `with` blocks (closed on exit), so confirm which cell is meant to provide it.
samples = pd.Series(f["samples"][:])
samples.loc[samples.str.contains("MA89")]

12483    MA89
dtype: object

In [79]:
# Pull AD, GT and GP for one individual over a window of 200 SNPs and tabulate them.
# NOTE(review): reads from the open HDF5 handle `f` left by an earlier cell.
snps = range(30000, 30200)
j = 12483  # index reported for sample "MA89" by the lookup cell
ads = f["calldata/AD"][snps, j, :2]  # first two AD entries, used below as ref / alt
gts = f["calldata/GT"][snps, j, :]
gp = f["calldata/GP"][snps, j, :]
df = pd.DataFrame(
    {
        "ref": ads[:, 0],
        "alt": ads[:, 1],
        "gt0": gts.sum(axis=1),  # per-SNP sum over the last GT axis
    }
)

In [7]:
# Close the HDF5 handle currently bound to `f`.
# NOTE(review): which file this closes depends on the cell that last assigned `f` — confirm.
f.close()

In [30]:
# Open the all-chromosome HDF5 read-only; the handle stays open for the cells that follow.
f = h5py.File("./data/hdf5/1240k_v43/all_ch.h5", mode="r")

In [31]:
# Shape of the genotype array in the all-chromosome HDF5.
f["calldata/GT"].shape

(1100313, 14523, 2)

In [None]:
# Number of variants per chromosome in the all-chromosome HDF5.
# Read the CHROM dataset into memory and use Series.value_counts; the top-level
# pd.value_counts helper is deprecated in recent pandas releases.
pd.Series(f["variants/CHROM"][:]).value_counts()