# Python wrapper for bcftools
Contains additional functions to mimic output of HAPSBURG for downstream analysis
@Harald Ringbauer, October 2019

In [1]:
import pandas as pd
import numpy as np
import socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py

# Select the HAPSBURG project root depending on the machine the notebook runs on.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Unknown host: stop here, since no project path is defined for it
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Make the project root the working directory; all relative paths below depend on it

# Make the project-local helper modules importable (paths are relative to the project root)
sys.path.append("./PackagesSupport/h5_python/")
from h5_functions import hdf5_to_vcf, load_h5   # Import Function to convert hdf5 to vcf
sys.path.append("./PackagesSupport/parallel_runs/")
from helper_functions import prepare_path, create_folders, postprocess_iid  # To split up ground truth ROH

print(os.getcwd()) # Show the current working directory. Should be the project root chosen above
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Functions to run VCF Tools for a single sample

In [2]:
def post_process_bcftools(outfile):
    """Split raw bcftools/ROH output into its ST and RG records and load them.

    Reads outfile + ".txt", writes every line starting with "ST" to
    outfile + "ST.txt" and every line starting with "RG" to
    outfile + "RG.txt", then loads both files as DataFrames.
    The filtering is done in plain Python (same selection as `grep ^ST` /
    `grep ^RG`), so the function also runs outside IPython and with paths
    that contain spaces or shell metacharacters.

    outfile: Path prefix of the bcftools output (without the ".txt" suffix).
    Return: Tuple (df_pos, df_rohs)
        df_pos:  ST records, columns iid, ch, pos, state, qual
        df_rohs: RG records, columns iid, ch, Start, End, length, markers, qual
    Raises whatever pandas raises if one of the filtered files is empty."""
    outtxt = outfile + ".txt"
    outST = outfile + "ST.txt"
    outRG = outfile + "RG.txt"

    # Single pass over the raw output; both filtered files are always (re)created
    with open(outtxt) as raw, open(outST, "w") as st_out, open(outRG, "w") as rg_out:
        for line in raw:
            if line.startswith("ST"):
                st_out.write(line)
            elif line.startswith("RG"):
                rg_out.write(line)
    # The raw output file (outtxt) is left in place and not deleted

    # Column 0 holds the record tag (ST / RG) and is skipped when loading
    df_pos = pd.read_csv(outST, sep='\t', header=None, usecols=range(1, 6))
    df_pos.columns = ["iid", "ch", "pos", "state", "qual"]
    df_rohs = pd.read_csv(outRG, sep='\t', header=None, usecols=range(1, 8))
    df_rohs.columns = ["iid", "ch", "Start", "End", "length", "markers", "qual"]
    return df_pos, df_rohs

def run_bcftools_roh(vcf_file, outfile, mp="./Data/1000Genomes/Markers/rec_map_bcf.chr3.txt",
                    af="./Data/1000Genomes/Markers/af_1000G_EUR_bcf.chr3.txt.gz"):
    """Run the bcftools ROH caller on vcf_file and save its output to outfile + ".txt".

    The command is `bcftools roh -G30 --AF-file af -m mp vcf_file`, run without
    a shell so it also works outside IPython and with paths containing spaces.
    Whatever bcftools writes to stderr is printed.

    vcf_file: Path of the input VCF.
    outfile: Path prefix of the output; ".txt" is appended.
    mp: Path of the map file (prepared in prep_map_af_bcftools.ipynb).
        Inside this function the name hides the module alias `mp`.
    af: Path of the allele frequency file (prepared in prep_map_af_bcftools.ipynb).
    Return: None. Use post_process_bcftools(outfile) to load the results.
    Raises: FileNotFoundError if the bcftools executable is not found,
        subprocess.CalledProcessError if bcftools exits with a non-zero status."""
    import subprocess  # Local import: only this function needs it

    outtxt = outfile + ".txt"
    cmd = ["bcftools", "roh", "-G30", "--AF-file", af, "-m", mp, vcf_file]
    # -V 1e-10   ### Command to do Viterbi Training

    with open(outtxt, "w") as out:
        # universal_newlines=True: decode stderr as text (spelling that also works on Python 3.6)
        result = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE,
                                universal_newlines=True)

    if result.stderr:
        print(result.stderr, end="")  # Show the bcftools run summary / error messages
    if result.returncode != 0:
        # Fail here instead of leaving a partial output file for later steps
        raise subprocess.CalledProcessError(result.returncode, cmd, stderr=result.stderr)
    
def create_hapsburg_df(df_t, map_dct):
    """Bring a bcftools block table into the HAPSBURG column layout.

    Adds the columns StartM, EndM (Start / End looked up in map_dct) and
    lengthM (EndM - StartM) to df_t itself, then returns the selection of
    columns used for roh.csv.

    df_t: DataFrame with at least the columns Start, End, length, iid, ch.
    map_dct: Lookup from the values in Start / End to map positions.
    Return: DataFrame with columns
        Start, End, StartM, EndM, length, lengthM, iid, ch"""
    # Translate both block boundaries via the lookup (written into df_t)
    for boundary in ("Start", "End"):
        df_t[boundary + "M"] = df_t[boundary].map(map_dct)
    df_t["lengthM"] = df_t["EndM"] - df_t["StartM"]

    # Fields (and their order) for roh.csv
    roh_columns = ["Start", "End", "StartM", "EndM", "length", "lengthM", "iid", "ch"]
    return df_t[roh_columns]

### Quick Single Example Run on one VCF

In [None]:
%%time
# Input VCF and output path prefix for this single example run
vcf_file = "./Simulated/1000G_Mosaic/TSI5/ch3_6cm/data.vcf"
outfile  = "./Diverse/bcfroh_out"

# Run bcftools roh (output goes to outfile + ".txt"), then load the ST and RG records
run_bcftools_roh(vcf_file, outfile)
df_pos, df_rohs = post_process_bcftools(outfile)
# Optional conversion to the HAPSBURG layout (requires a map_dct to be defined):
#df_rohs = create_hapsburg_df(df_rohs, map_dct)

In [3]:
#############################################
### Combine all subfunctions

def full_bcftools_mosaic(input_base_folder, ch=3, prefix_out="bcftools/", convert_h5=True):
    """Run bcftools/ROH on one Mosaic data set folder in ./Simulated.

    Steps: optionally convert the HDF5 of the folder to VCF, run bcftools roh
    on the VCF, load the output, add map positions to the called blocks and
    pass them together with the sample IDs to postprocess_iid.

    input_base_folder: Folder of the Mosaic data set (passed to create_folders).
    ch: Chromosome; passed to hdf5_to_vcf and postprocess_iid.
    prefix_out: Output prefix passed to postprocess_iid.
    convert_h5: If True, create the VCF from the HDF5 before running bcftools.
    Return: None.
    NOTE(review): run_bcftools_roh is called with its default map and allele
    frequency files, whose paths name chr3 - confirm before using ch != 3."""
    
    print("Converting HDF5 into VCF...")  # Printed even if convert_h5 is False
    input_h5, input_vcf, bcf_folder = create_folders(input_base_folder, outfolder="bcf_out/")
    if convert_h5:
        hdf5_to_vcf(input_h5, input_vcf, chrom=ch) # Convert to VCF
    
    print("Running bcftools/ROH...")
    outfile = bcf_folder + "bcfroh_out"    # Path prefix; the helpers append .txt / ST.txt / RG.txt
    run_bcftools_roh(input_vcf, outfile)   # Run BCF tools on VCF
    df_pos, df_rohs = post_process_bcftools(outfile) # Load the output Data (df_pos is not used further here)
    
    ### Create the Mapping Dictionary (variants/POS -> variants/MAP)
    print("Creating Map Dict...")
    f = load_h5(path=input_h5, output=False)
    map_dct = dict(zip(f["variants/POS"], f["variants/MAP"]))
    iids = f["samples"][:] # Get the IIDs
    
    print("Splitting up BCF results and GT...")
    df_rohs = create_hapsburg_df(df_rohs, map_dct)  # Add StartM/EndM/lengthM, keep the roh.csv columns
    postprocess_iid(df_rohs, input_base_folder, iids, ch, prefix_out)
    print(f"Finished {len(iids)} Individuals!")

### Run bcftools on all Individuals for Mosaic Folder

In [31]:
%%time
# Run the whole pipeline on the ch3_6cm folder; convert_h5=False skips the HDF5 -> VCF conversion
full_bcftools_mosaic(input_base_folder = "./Simulated/1000G_Mosaic/TSI5/ch3_6cm",
                     ch=3, prefix_out="bcftools/", convert_h5=False)

Converting HDF5 into VCF...
Running bcftools/ROH...
Number of target samples: 100
Number of --estimate-AF samples: 0
Number of sites in the buffer/overlap: unlimited
Number of lines total/processed: 77652/70453
Creating Map Dict...
Splitting up BCF results and GT...
Finished 100 Individuals!
CPU times: user 18.2 s, sys: 750 ms, total: 19 s
Wall time: 28.5 s


### Run bcftools for multiple lengths of copied-in chromosome blocks

In [None]:
# Parent folder that holds one ch3_{l}cm sub-folder per value of l
base_path =  "./Simulated/1000G_Mosaic/TSI5/"

# Run the full bcftools pipeline (including HDF5 -> VCF conversion, the default) on every sub-folder
for l in [0, 2, 4, 6, 8, 10]:
    input_base_folder = base_path + f"ch3_{l}cm"
    print(f"\nDoing ROH bcftools on {input_base_folder}...")
    full_bcftools_mosaic(input_base_folder, ch=3, prefix_out="bcftools/")

## Split up the Posterior Output for Mosaic Folders
Run to split map.csv and posterior0.csv into bcftools/ output folder

In [23]:
def post_process_postbcf(basepath, map_dct):
    """Load the ST records of bcftools/ROH and convert their PHRED
    qualities into log probabilities.

    basepath: Folder prefix; the file basepath + "bcf_out/bcfroh_outST.txt" is read.
    map_dct: Lookup from position to map position.
    Return: DataFrame with columns iid, chr, pos, state, post, map, where
        post is log(p*state + (1-p)*(1-state) + 1e-10) with p = 10^(-quality/10).
        If p is the probability of the state that was not called, this is
        the log probability of state 0 (state coding: 1 ROH, 0 HW)."""
    st_file = basepath + "bcf_out/bcfroh_outST.txt"
    df_t = pd.read_csv(st_file, header=None, sep="\t")
    df_t = df_t.drop(columns=0)  # First field is only the record tag
    df_t.columns = ["iid", "chr", "pos", "state", "post"]

    ### Transform from PHRED scale to a probability
    state = df_t["state"]
    p_alt = 10**(-df_t["post"]/10)  # prob for alternative state
    part_state1 = p_alt * state              # contributes where state is 1
    part_state0 = (1-p_alt) * (1 - state)    # contributes where state is 0
    prob = part_state1 + part_state0

    # Small offset keeps the logarithm finite for probability 0
    df_t["post"] = np.log(prob.values + 1e-10)
    df_t["map"] = df_t["pos"].map(map_dct)
    return df_t

def split_up_bcftools_post(basepath, df_bcf, iid, ch, prefix_out="bcftools/"):
    """Save map and posterior of one individual into its Mosaic output folder.

    Selects the rows of iid from the postprocessed bcftools/ROH table and
    writes its "map" column to map.csv and its "post" column to
    posterior0.csv (no header, no index) in the folder given by prepare_path.

    basepath: Folder of the Mosaic data set; "output/" is appended.
    df_bcf: DataFrame with at least the columns iid, map, post.
    iid: Individual whose rows are saved.
    ch: Chromosome, passed on to prepare_path (previously ignored in favour
        of a hard-coded 3).
    prefix_out: Output prefix passed on to prepare_path.
    Return: None."""
    output_base_folder = os.path.join(basepath, "output/")
    pathout = prepare_path(output_base_folder, iid, ch=ch, prefix_out=prefix_out, logfile=False)
    df_t = df_bcf[df_bcf["iid"] == iid]

    # One value per line; False is the documented way to suppress index and header
    mappath = os.path.join(pathout, "map.csv")
    df_t["map"].to_csv(mappath, sep=",", index=False, header=False)

    postpath = os.path.join(pathout, "posterior0.csv")
    df_t["post"].to_csv(postpath, sep=",", index=False, header=False)
    print(f"Saved Posterior to {postpath}")

In [None]:
### Build the position -> map lookup used below (takes about 10s)
### The wrapper function full_bcftools_mosaic builds the same dictionary internally
input_h5 =  "./Simulated/1000G_Mosaic/TSI5/ch3_6cm/data.h5"

print("Creating Map Dict...")
f = load_h5(path = input_h5, output=False)
# Pair every entry of variants/POS with the entry of variants/MAP at the same index
map_dct = dict(zip(f["variants/POS"], f["variants/MAP"]))

In [24]:
# Data set folder and the individual whose posterior is written out
basepath = "./Simulated/1000G_Mosaic/TSI5/ch3_6cm/"
iid="iid1"

# Load and convert the ST records (needs map_dct from the cell above), then save map.csv / posterior0.csv
df_t = post_process_postbcf(basepath, map_dct)
split_up_bcftools_post(basepath, df_t, iid=iid, ch=3, prefix_out="bcftools/")

Saved Posterior to ./Simulated/1000G_Mosaic/TSI5/ch3_6cm/output/iid1/chr3/bcftools/posterior0.csv


# Area 51
Area to test code

### Split up the posterior from bcftools

### Convert to VCF, storing the Genotype Likelihood!
Move this piece of code eventually to packages support

In [25]:
def to_vcf(chrom, pos, ref, alt, gt, iids, vcf_path, header=None, pl=None):
    """Save genotypes as a VCF file.

    chrom, pos, ref, alt: Per-variant values for the #CHROM, POS, REF and ALT columns.
    gt: Genotypes, passed to add_gt_data together with iids.
    iids: Sample IDs; one genotype column is written per ID.
    vcf_path: Path of the VCF file to write (overwritten if it exists).
    header: Text written before the column line. If None or empty, a
        hard-coded default (VCFv4.3, contig 3, FORMAT GT) is used.
    pl: Genotype likelihoods. Accepted for interface compatibility but
        currently NOT written: only the GT field is saved.
    Return: None."""
    ### Hard-Coded Default Header
    if header is None or len(header) == 0:
        header = """##fileformat=VCFv4.3\n##FILTER=<ID=PASS,Description="All filters passed">\n##fileDate=20191010\n##source=1000GenomesPhase3Pipeline\n##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz\n##contig=<ID=3,assembly=b37,length=198022430>\n##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n"""

    df = pd.DataFrame({'#CHROM': chrom, 'POS': pos, 'REF': ref, 'ALT': alt})
    # Constant columns. NOTE(review): the VCF specification uses "." for a
    # missing ID / INFO; the empty strings are kept to leave the output unchanged.
    df['ID'] = ""
    df['QUAL'] = 40
    df['FILTER'] = "PASS"
    df['INFO'] = ""
    df["FORMAT"] = "GT"  # GT:AD if allele depth given

    # Bring the fixed columns into VCF order
    df = df[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', "FORMAT"]]

    ### Add the Genotype Data (one column per iid, added to df in place)
    add_gt_data(df, gt, iids=iids)

    ### Write the Header
    with open(vcf_path, 'w') as vcf:
        vcf.write(header)

    #### Append the tab separated data; its column names form the #CHROM line
    df.to_csv(vcf_path, sep="\t", mode='a', index=False)
    print(f"Successfully saved VCF to {vcf_path}")
    
def add_gt_data(df, gt, ad=None, iids=None, m_sym="."):
    """Add one genotype column per individual to the DataFrame df.

    df: DataFrame with one row per variant; modified in place.
    gt: Integer genotypes [l,n,2] (array-like); negative values mark missing data.
    ad: Allele depths; currently unused.
    iids: Names of the n individuals, used as column names.
    m_sym: Symbol written for missing alleles.
    Return: The modified DataFrame (same object as df). Each new column holds
        strings of the form "a/b", e.g. "0/1" or "./." for missing data."""
    iids = [] if iids is None else iids
    gt = np.asarray(gt)
    assert np.shape(gt)[1] == len(iids), "Number of individuals in gt and iids differs"  # Sanity Check

    ### Replace missing Data with the missing symbol
    missing = gt < 0
    gt = gt.astype("str")  # Makes a copy, the input genotypes stay untouched
    gt[missing] = m_sym

    # Join both alleles with "/" (np.char is the public home of these string functions)
    gt_vcf = np.char.add(np.char.add(gt[:, :, 0], "/"), gt[:, :, 1])

    for i, iid in enumerate(iids):
        df[iid] = gt_vcf[:, i]
    return df
    
def hdf5_to_vcf(path_h5, path_vcf, iids=None, markers=None, chrom=0, pl_field=False):
    """Load HDF5 from path_h5, extract individuals and save them as VCF to path_vcf.

    path_h5: Path of the HDF5 file (loaded with load_h5).
    path_vcf: Path of the VCF file to write.
    iids: Which individuals to save. If None or empty: save all.
        Individuals are written in the order they have in the HDF5 file, so
        every genotype column is labelled with the sample it belongs to.
    markers: Currently unused (no marker subsetting is done).
    chrom: Value for the #CHROM column; the scalar 0 means "take variants/CHROM from the file".
    pl_field: If True, also compute genotype likelihoods from calldata/AD and
        pass them to to_vcf.
    Return: None.
    Raises: ValueError if a requested individual is not among the samples of the file."""
    f = load_h5(path=path_h5)
    samples = f["samples"][:]

    if iids is None or len(iids) == 0:
        idx = np.ones(len(samples), dtype=bool)  # Keep everybody
    else:
        found = np.isin(iids, samples)
        if not np.all(found):
            absent = [iid for iid, ok in zip(iids, found) if not ok]
            raise ValueError(f"Individuals not found in {path_h5}: {absent}")
        idx = np.isin(samples, iids)
    # Labels in file order, i.e. the order of the genotype columns selected by idx
    iids = samples[idx]

    # Scalar check first: comparing an array with == 0 has no single truth value
    if np.isscalar(chrom) and chrom == 0:
        chrom = f["variants/CHROM"][:]

    pos = f["variants/POS"][:]
    ref = f["variants/REF"][:]
    alt = f["variants/ALT"][:]

    gt = f["calldata/GT"][:, idx, :]

    ### Get Genotype Likelihoods from AD field
    pl = []  # Default
    if pl_field:
        ad = f["calldata/AD"][:, idx, :]  # Same individuals as the genotypes
        gl = ad_to_gentoypeL(ad)  # Convert Allele Depths to Genotype Likelihood (name spelled as defined in this file)
        pl = gl_to_pl(gl)         # Convert Genotype Likelihood to PHRED scale

    to_vcf(chrom, pos, ref, alt, gt, iids, path_vcf, pl=pl)

In [92]:
from scipy.stats import binom  # Binomial Likelihood

def ad_to_genotypeL(ad, error=0.001):
    """Convert Allele Depth Fields to Genotype Likelihoods.

    ad: [l,n,2] read counts per allele (integers, array-like); the second
        entry of the last axis is used as the count of derived reads.
    error: Flip Error for Read
    return: [l,n,3] binomial likelihoods Pr(RC|G) of the read counts for the
        genotypes 00/01/11 (not normalized over the genotypes)"""
    ad = np.asarray(ad)
    rc_tot = np.sum(ad, axis=2)
    rc_der = ad[:, :, 1]

    # Probability that a single read shows the derived allele, per genotype 00/01/11
    p_read = np.array([error, 0.5, 1 - error])
    # Broadcast counts [l,n,1] against the three genotype probabilities [1,1,3]
    prob_binom = binom.pmf(rc_der[:, :, None], rc_tot[:, :, None], p_read[None, None, :])
    return prob_binom


# Old (misspelled) name, kept so that existing calls keep working
ad_to_gentoypeL = ad_to_genotypeL

def gl_to_pl(gl, max_pl=99):
    """Convert Genotype Probabilities to normalized PHRED scores.

    gl: [l,n,3] probabilities (not logscale), each in [0, 1]
    max_pl: Largest score returned; bigger values (including the infinite
        score of a probability of exactly 0) are clipped to it. Must fit int16.
    return: [l,n,3] int16 array; per site and individual the most likely
        genotype gets 0. If all three probabilities are 0, all scores are 0."""
    gl = np.asarray(gl, dtype=float)
    with np.errstate(divide="ignore"):  # log10(0) = -inf is handled by the clipping below
        phred = -10 * np.log10(gl)  # Convert to PHRED scale
    assert np.min(phred) >= 0, "Probabilities have to be in [0, 1]"

    best = np.min(phred, axis=2)[:, :, None]
    with np.errstate(invalid="ignore"):  # inf - inf (all probabilities 0) gives nan
        pl = phred - best  # Normalize
    pl = np.where(np.isnan(pl), 0, pl)

    # Clip BEFORE the integer cast: casting inf to int16 is undefined
    pl = np.clip(pl, a_min=0, a_max=max_pl)
    return np.round(pl).astype("int16")  # Round to Integers

In [99]:
##################################################################
### Example Case, test whether ad_to_genotypeL does what it should
# Toy read counts of shape (3, 2, 2); ad_to_gentoypeL documents the axes as [l,n,2]
ad = np.array([[[1,0], [1,1]],  [[3,3], [3,0]], [[10,10], [0,10]]])

gl = ad_to_gentoypeL(ad)  # Binomial likelihoods for 00/01/11
pl = gl_to_pl(gl)         # Normalized PHRED scores of the same shape

# Show read counts, likelihoods and PHRED scores of one entry (here the one with counts [3, 0])
i,j=1,1
print(ad[i, j, :])
print(gl[i, j, :])
print(pl[i, j, :])

[3 0]
[9.97002999e-01 1.25000000e-01 1.00000000e-09]
[ 0  9 90]
