# Python wrapper for bcftools
Contains additional functions that mimic the output of HAPSBURG for downstream analysis.
@Harald Ringbauer, October 2019

In [1]:
import pandas as pd
import numpy as np
import socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py

# Select the project root from the hostname, so that the relative paths used
# below ("./PackagesSupport/...", "./Data/...", "./Simulated/...") resolve.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Unknown host: stop here, since `path` would be undefined for os.chdir below.
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

# The helper packages are located relative to the project root set above,
# so these imports only work after the os.chdir call.
sys.path.append("./PackagesSupport/h5_python/")
from h5_functions import hdf5_to_vcf, load_h5   # Import Function to convert hdf5 to vcf
sys.path.append("./PackagesSupport/parallel_runs/")
from helper_functions import prepare_path, create_folders, postprocess_iid  # To split up ground truth ROH

print(os.getcwd()) # Show the current working directory. After os.chdir(path) this is the HAPSBURG root folder
print(f"CPU Count: {mp.cpu_count()}")

VioletQueen
/home/harald/git/HAPSBURG
CPU Count: 4


### Functions to run bcftools for a single sample

In [12]:
def _read_bcftools_table(table_path, columns):
    """Load one tab-separated bcftools record file, skipping the leading tag column.

    Return an empty DataFrame with the given column names if the file has no rows."""
    try:
        df = pd.read_csv(table_path, sep='\t', header=None,
                         usecols=range(1, len(columns) + 1))
    except pd.errors.EmptyDataError:
        # No record of this type was written (e.g. no ROH region at all)
        return pd.DataFrame(columns=columns)
    df.columns = columns
    return df


def post_process_bcftools(outfile):
    """Post Process bcftools output.
    Read outfile.txt, write its lines starting with "ST" to outfileST.txt
    and its lines starting with "RG" to outfileRG.txt, and load both.
    outfile: Path prefix (without ".txt") of the bcftools roh output.
    Return 2 Dataframes:
    df_pos (per site; columns iid, ch, pos, state, qual) and
    df_rohs (per ROH block; columns iid, ch, Start, End, length, markers, qual).
    A Dataframe is empty (but has its columns) if no such line exists."""
    outtxt = outfile + ".txt"
    outST = outfile + "ST.txt"
    outRG = outfile + "RG.txt"

    # Split the records in pure Python rather than via `!grep`: this also works
    # outside of IPython, and for paths that contain spaces or shell characters.
    with open(outtxt) as f_in, open(outST, "w") as f_st, open(outRG, "w") as f_rg:
        for line in f_in:
            if line.startswith("ST"):
                f_st.write(line)
            elif line.startswith("RG"):
                f_rg.write(line)

    df_pos = _read_bcftools_table(outST, ["iid", "ch", "pos", "state", "qual"])
    df_rohs = _read_bcftools_table(
        outRG, ["iid", "ch", "Start", "End", "length", "markers", "qual"])
    return df_pos, df_rohs

def run_bcftools_roh(vcf_file, outfile, mp="./Data/1000Genomes/Markers/rec_map_bcf.chr3.txt",
                    af="./Data/1000Genomes/Markers/af_1000G_EUR_bcf.chr3.txt.gz"):
    """Run the bcftools ROH Caller on vcf_file, and save its raw output in outfile.txt.
    Uses Map File mp and Allele Frequency File af (prepared in prep_map_af_bcftools.ipynb).
    Both defaults point to chromosome 3 files.
    vcf_file: Path of the input VCF.
    outfile: Path prefix of the output; ".txt" gets appended.
    Return nothing; load the results with post_process_bcftools(outfile).
    Raise FileNotFoundError if the bcftools executable is not found, and
    subprocess.CalledProcessError if bcftools exits with a non-zero status.
    NOTE: Within this function the parameter mp shadows the module-level
    alias `mp` (multiprocessing)."""
    import subprocess  # Local import: only this function needs it

    outtxt = outfile + ".txt"
    # Pass the arguments as a list (no shell), so that paths with spaces or
    # shell meta-characters are handed to bcftools unchanged.
    cmd = ["bcftools", "roh", "-G30", "--AF-dflt", "0.3",
           "-m", mp, "--AF-file", af, vcf_file]
    with open(outtxt, "w") as f_out:
        result = subprocess.run(cmd, stdout=f_out, stderr=subprocess.PIPE,
                                universal_newlines=True)

    # Echo the bcftools log (written to stderr), so it still shows in the notebook
    if result.stderr:
        print(result.stderr, end="")
    # Fail here rather than later on a missing or truncated output file
    result.check_returncode()
    
def create_hapsburg_df(df_t, map_dct):
    """Modify bcftools output to HAPSBURG format.
    df_t: Dataframe of ROH blocks; needs the columns Start, End, length, iid, ch.
    map_dct: Dictionary that maps each position found in Start/End to its map value.
    Positions that are not in map_dct yield NaN.
    Return new Dataframe with the columns
    Start, End, StartM, EndM, length, lengthM, iid, ch.
    The input df_t is left unmodified."""
    # assign() works on a copy, so no columns get added to the caller's Dataframe
    df_out = df_t.assign(StartM=df_t["Start"].map(map_dct),
                         EndM=df_t["End"].map(map_dct))
    df_out["lengthM"] = df_out["EndM"] - df_out["StartM"]

    # Keep all fields for roh.csv, in the order expected there
    return df_out[["Start", "End", "StartM", "EndM", "length", "lengthM", "iid", "ch"]]

### Quick Single Example Run on one VCF

In [4]:
%%time
# Quick test: Run bcftools roh on one simulated VCF and load the raw result tables
vcf_file = "./Simulated/1000G_Mosaic/TSI5/ch3_6cm/data.vcf"
outfile  = "./Diverse/bcfroh_out"

run_bcftools_roh(vcf_file, outfile)  # Uses the default chr3 map and allele frequency files
df_pos, df_rohs = post_process_bcftools(outfile)
# Left disabled: map_dct is not defined in this cell (it gets built inside full_bcftools_mosaic)
#df_rohs = create_hapsburg_df(df_rohs, map_dct)

Number of target samples: 100
Number of --estimate-AF samples: 0
Number of sites in the buffer/overlap: unlimited
Number of lines total/processed: 77652/77651
Number of lines filtered/no AF/not biallelic/dup: 0/0/0/1


NameError: name 'map_dct' is not defined

In [7]:
#############################################
### Combine all subfunctions

def full_bcftools_mosaic(input_base_folder, ch=3, prefix_out="bcftools/"):
    """Run bcftools roh on Mosaic Data Set in ./Simulated.
    Converts the HDF5 of the folder to VCF, runs bcftools roh on it, brings the
    ROH blocks into HAPSBURG format and hands them to postprocess_iid.
    input_base_folder: Folder of the Mosaic Data Set.
    ch: Chromosome; passed on to hdf5_to_vcf and postprocess_iid.
    prefix_out: Passed on to postprocess_iid.
    Return nothing."""
    
    print("Converting HDF5 into VCF...")
    input_h5, input_vcf, bcf_folder = create_folders(input_base_folder, outfolder="bcf_out/")
    hdf5_to_vcf(input_h5, input_vcf, chrom=ch) # Convert to VCF
    
    print("Running bcftools/ROH...")
    outfile = bcf_folder + "bcfroh_out"
    # NOTE(review): ch is not passed on here, so run_bcftools_roh uses its default
    # map and allele frequency files (chr3). Confirm before running with ch != 3.
    run_bcftools_roh(input_vcf, outfile)   # Run BCF tools on VCF
    _, df_rohs = post_process_bcftools(outfile) # Load the output Data (per site table not needed)
    
    ### Create the Mapping Dictionary
    print("Creating Map Dict...")
    f = load_h5(path=input_h5, output=False)
    # Load both fields in one go (as done for the samples below), instead of
    # iterating over them element by element
    positions = f["variants/POS"][:]
    map_values = f["variants/MAP"][:]
    map_dct = dict(zip(positions, map_values))
    iids = f["samples"][:] # Get the IIDs
    
    print("Splitting up BCF results and GT...")
    df_rohs = create_hapsburg_df(df_rohs, map_dct)
    postprocess_iid(df_rohs, input_base_folder, iids, ch, prefix_out)
    print(f"Finished {len(iids)} Individuals!")

### Run bcftools on all Individuals for Mosaic Folder

In [13]:
%%time
# Run the full pipeline (HDF5 -> VCF -> bcftools roh -> split up per individual) on the TSI5 6cm Mosaic folder
full_bcftools_mosaic(input_base_folder = "./Simulated/1000G_Mosaic/TSI5/ch3_6cm",
                     ch=3, prefix_out="bcftools/")

Converting HDF5 into VCF...
Loaded HDF5
Loaded 77652 variants
Loaded 100 individuals
['AD', 'GT']
['ALT', 'MAP', 'POS', 'REF']
Successfully saved VCF to ./Simulated/1000G_Mosaic/TSI5/ch3_6cm/data.vcf
Running bcftools/ROH...
Number of target samples: 100
Number of --estimate-AF samples: 0
Number of sites in the buffer/overlap: unlimited
Number of lines overlapping with --AF-file/processed: 77652/77651
Number of lines filtered/no AF/not biallelic/dup: 0/0/0/1
Creating Map Dict...
Splitting up BCF results and GT...
Finished 100 Individuals!
CPU times: user 36.7 s, sys: 1.52 s, total: 38.2 s
Wall time: 46.3 s


# Area 51