# Python wrapper for PLINK
Contains additional functions to mimic output of HAPSBURG for downstream analysis
@Harald Ringbauer, October 2019

In [2]:
import pandas as pd
import numpy as np
import socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py

# Pick the HAPSBURG root folder that belongs to the machine we are running on
socket_name = socket.gethostname()
print(socket_name)

if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # Root folder on Harald's machine
else:
    # The only other supported hosts are the ones whose name starts with "midway2"
    if not socket_name.startswith("midway2"):
        raise RuntimeWarning("Not compatible machine. Check!!")
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # Root folder on the Midway cluster

os.chdir(path)  # Relative paths used below are resolved against this folder

# Project-local helpers: each folder is put on sys.path right before its import
sys.path.append("./PackagesSupport/h5_python/")
from h5_functions import hdf5_to_vcf, load_h5   # hdf5 -> vcf conversion and hdf5 loading
sys.path.append("./PackagesSupport/parallel_runs/")
from helper_functions import prepare_path, create_folders, postprocess_iid  # To split up ground truth ROH

print(os.getcwd())  # Show the current working directory. Should be the root folder set above
print(f"CPU Count: {mp.cpu_count()}")

VioletQueen
/home/harald/git/HAPSBURG
CPU Count: 4


### The PLINK core function

In [9]:
def run_plink(path_vcf, output_folder, window_snp=50, kb=500, het=1, threshold=0.05, gap=1000, density=50):
    """Run the PLINK ROH caller (plink --homozyg) on path_vcf.

    The command is executed through the system shell via subprocess, so the
    function works from a plain Python interpreter as well as from IPython.
    The combined stdout/stderr of the command is printed once it has finished.
    A failing command does not raise; its exit status is printed instead.

    path_vcf: Path of the input VCF (passed to --vcf)
    output_folder: Passed to --out. NOTE: callers in this notebook pass
        a file prefix ("<folder>/roh") and afterwards read "<folder>/roh.hom"
    window_snp: Value for --homozyg-window-snp
    kb: Value for --homozyg-kb
    het: Value for --homozyg-window-het
    threshold: Value for --homozyg-window-threshold
    gap: Value for --homozyg-gap
    density: Value for --homozyg-density"""
    import shlex
    import subprocess

    plink_args = [
        "plink", "--homozyg",
        "--vcf", path_vcf,
        "--homozyg-window-snp", window_snp,
        "--homozyg-kb", kb,
        "--homozyg-window-het", het,
        "--homozyg-window-threshold", threshold,
        "--homozyg-gap", gap,
        "--homozyg-density", density,
        "--out", output_folder,
    ]
    # Quote every argument so that paths with spaces or shell metacharacters
    # reach PLINK as one single argument
    plink_cmd = " ".join(shlex.quote(str(arg)) for arg in plink_args)

    # "module load plink" has to run in the same shell as plink itself, and the
    # ";" keeps the original semantics: plink is attempted even where no
    # module system is available
    result = subprocess.run(f"module load plink; {plink_cmd}", shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            universal_newlines=True)

    # Output is captured and printed so that it also shows up in a notebook cell
    print(result.stdout, end="")
    if result.returncode != 0:
        print(f"PLINK command exited with status {result.returncode}")

# Functions to modify PLINK output for simulated Mosaic Data
Run PLINK on the simulated Mosaic data for each individual, and save the output into the PLINK output folder.
For post-processing, both roh.csv and roh_gt.csv are needed. The latter has to be copied over.

In [3]:
def post_process_plink(plink_folder, new_dict):
    """Load the PLINK ROH table "roh.hom" from plink_folder and bring it into
    the column layout of the Hapsburg roh.csv output.
    plink_folder: Folder prefix; "roh.hom" is appended to it as is
    new_dict: Lookup used to map the POS1/POS2 values to the StartM/EndM values
    Return the reformatted data frame"""
    renamed_columns = {"POS1": "Start", "POS2": "End", "IID": "iid", "CHR": "ch"}
    # All fields needed for roh.csv, in output order
    output_columns = ["Start", "End", "StartM", "EndM", "length", "lengthM", "iid", "ch"]

    roh = pd.read_csv(plink_folder + "roh.hom", sep=r"\s+", engine="python")
    roh = roh.rename(columns=renamed_columns)

    # Translate both block boundaries via the lookup
    for pos_col, map_col in (("Start", "StartM"), ("End", "EndM")):
        roh[map_col] = roh[pos_col].map(new_dict)

    # Block lengths on both scales
    roh["length"] = roh["End"] - roh["Start"]
    roh["lengthM"] = roh["EndM"] - roh["StartM"]
    return roh[output_columns]
        
#############################################
### Combine all subfunctions

def full_plink_mosaic(input_base_folder, ch=3, prefix_out="plink/"):
    """Run PLINK on the Mosaic data set in input_base_folder and post-process
    the result for every individual.
    input_base_folder: Folder of the data set, handed to create_folders
    ch: Chromosome used for the hdf5 -> VCF conversion and the post-processing
    prefix_out: Output prefix handed on to postprocess_iid"""

    input_h5, input_vcf, plink_folder = create_folders(input_base_folder, outfolder="plink_out/")
    hdf5_to_vcf(input_h5, input_vcf, chrom=ch) # Convert to VCF

    run_plink(input_vcf, plink_folder + "roh") # 1.1: Run PLINK on VCF

    ### Create the Mapping Dictionary
    print("Creating Map Dict...")
    f = load_h5(path=input_h5, output=False)
    try:
        # Pull every data set into memory with a single read. Iterating the
        # data sets directly fetches them element by element, which is slow
        # for h5py data sets with many variants
        pos = f["variants/POS"][:]
        gen_map = f["variants/MAP"][:]
        iids = f["samples"][:] # Get the IIDs
    finally:
        # NOTE(review): load_h5 presumably returns an open h5py.File; release
        # the handle if the returned object supports it - confirm
        close = getattr(f, "close", None)
        if callable(close):
            close()
    map_dct = dict(zip(pos, gen_map))

    print("Splitting up Plink results and GT...")
    df_plink = post_process_plink(plink_folder, map_dct)
    postprocess_iid(df_plink, input_base_folder, iids, ch, prefix_out)
    print(f"Finished {len(iids)} Individuals!")

### Run all Individuals for Mosaic Folder

In [None]:
%%time
full_plink_mosaic(input_base_folder = "./Simulated/1000G_Mosaic/TSI5/ch3_6cm",
                  ch=3, prefix_out="plink/")

### Run all lengths of copied-in chromosome segments

In [None]:
base_path = "./Simulated/1000G_Mosaic/TSI5/"

# One PLINK run per length folder: ch3_0cm, ch3_2cm, ..., ch3_10cm
for l in range(0, 12, 2):
    input_base_folder = base_path + f"ch3_{l}cm"
    print(f"\nDoing {input_base_folder}")
    full_plink_mosaic(input_base_folder, ch=3, prefix_out="plink/")

# Area 51: Test code