# Prepare MEGA markers for hapROH analysis

In [2]:
import allel
import h5py  # Python Package to do the HDF5.
import numpy as np
import pandas as pd
import socket
import os as os
import sys as sys
import multiprocessing as mp

# Pick the project root from the host name, so the same notebook runs on
# both known machines without editing paths.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Unknown host: stop here, because `path` would be undefined for os.chdir below
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)
sys.path.append("./Python3/")  # Since now we are in the Root Directory

# Project import; kept below the chdir / sys.path lines above.
from hapsburg.PackagesSupport.h5_python.h5_functions import merge_in_ld_map
#from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root directory set by os.chdir above
print(f"CPU Count: {mp.cpu_count()}")

midway2-0405.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Prepare downsampled Autosomal MEGA hdf5s
Prepare a 1000 Genomes autosomal HDF5 file, including the recombination map.   
Input: 1000 Genomes VCF file; recombination map taken from a 1240k Eigenstrat file.
## Standalone from here onward.
### Updated for Mega:
The `prepare_paths` function was updated with the MEGA paths, and
`save_megamarkers` was added to read the MEGA-specific marker text file.

In [2]:
### Important Parameters and paths
# Module-level placeholders. prepare_paths() declares every path below as
# `global` and assigns its real value; the empty strings only make sure each
# name exists beforehand.
ch = 3 # Which Chromosome to use:

# Path of the 1000 Genome VCF:
p1, p2 = "", ""
file_vcf100g, path_vcf100g = "", ""
out_vcf_path0, out_vcf_path, out_vcf_path_gz = "", "", ""  # out_vcf_path_gz: gzipped subset VCF
path_hdf5temp, path_hdf5final = "", ""

snp1240k_path, ind_path = "", ""   # Where to find the 1240k SNPs
marker_path = ""   # Per-chromosome marker position file written by save_megamarkers()
snp_filter_path = ""

def prepare_paths(ch = 3):
    """Set all module-level paths needed by the processing steps.

    ch: Which chromosome to use. The value is also stored in the
        module-level `ch`, which save_megamarkers() and merge_positions()
        read, so the markers always belong to the chromosome whose paths
        were prepared.

    Side effects: overwrites the path globals listed below, prints the input
    VCF path, and creates the missing parent directories of the subset VCF,
    the final HDF5 and the marker file."""
    global p1, p2, file_vcf100g, path_vcf100g, out_vcf_path0, out_vcf_path, path_hdf5temp, path_hdf5final
    global snp1240k_path, ind_path, snp_filter_path, out_vcf_path_gz, marker_path
    # `ch` is a parameter of this function, so `global ch` would be a
    # SyntaxError; write the module-level value through globals() instead.
    globals()["ch"] = ch

    # Path of the 1000 Genome VCF:
    p1 = "./Data/1000Genomes/AutosomeVCF/"
    p2 = ".phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
    file_vcf100g = "ALL.chr" + str(ch) + p2
    path_vcf100g = p1 + file_vcf100g
    print(f"Full Input path:\n{path_vcf100g}")
    out_vcf_path0 = "./Data/1000Genomes/AutosomeVCF/Subset/" + "1240all/chr" + str(ch) # needs no .vcf
    out_vcf_path = out_vcf_path0 + ".vcf"
    out_vcf_path_gz = out_vcf_path + ".gz"
    path_hdf5temp = "./Data/1000Genomes/HDF5/FULLHDF5/cr" + str(ch) + ".hdf5"
    path_hdf5final = "./Data/1000Genomes/HDF5/mega/all/chr" + str(ch) + ".hdf5"
    snp1240k_path = "./Data/1000Genomes/Markers/MinMyc.snp"   # Where to find the 1240k SNPs
    ind_path = "./Data/1000Genomes/Individuals/NO_EXIST.csv"  # non-existing place-holder (sanity check)
    marker_path = "./Data/1000Genomes/Markers/mega/chr" + str(ch) + ".csv"

    # Every file written later needs its parent directory; the marker file
    # is included because save_megamarkers() writes it.
    for out_file in [out_vcf_path, path_hdf5final, marker_path]:
        path_dir = os.path.dirname(out_file)

        if not os.path.exists(path_dir):
            os.makedirs(path_dir, exist_ok=True)
            print(f"Created new directory: {path_dir}")

    ### Path of SNP Filter
    # NOTE(review): this is the only path starting with "../" instead of "./";
    # it is only used by merge_positions, plink_new_vcf and bctools_new_vcf0.
    # Left unchanged -- confirm whether "./Data/..." was intended.
    snp_filter_path = "../Data/1000Genomes/Markers/variants1240k" + str(ch) + ".txt"
    
### Step 0: Download the Data
def download_1kg(cluster=False):
    """cluster: Whether program is run on cluster"""
    if cluster==False:
        path_cl = "/project2/jnovembre/data/external_public/1kg_phase3/haps/"
        path_cluster = "hringbauer@midway.rcc.uchicago.edu:" + path_cl + file_vcf100g
    
    elif cluster==True:
        path_cl = "/project2/jnovembre/data/external_public/1kg_phase3/haps/"
        path_cluster = path_cl + file_vcf100g
    
    p_c = path_cluster + ".tbi"
    p_v = path_vcf100g + ".tbi"
    #!scp $p_c $p_v # Download the tbi
    !scp $path_cluster $path_vcf100g # Only Download the .vcf (not the .tbi)
    
### Step 1: Produce hdf5 file for all markers
def vcf_to_hdf5(in_path, out_path):
    """Convert a VCF file to an HDF5 file with scikit-allel.

    in_path: Path of the VCF to read
    out_path: Path of the HDF5 to write (gzip compression)"""
    conversion_kwargs = dict(input=in_path, output=out_path, compression="gzip")
    # Original author's timing note for the full VCF: takes 10 minutes
    allel.vcf_to_hdf5(**conversion_kwargs)
    
### Step 2: Extract Positions. Match with Eigenstrat File Positions
### Load HDF5

def merge_positions():
    """Create the SNP filter file: IDs of HDF5 variants found in the 1240k SNP file.

    Reads the globals path_hdf5temp (full HDF5), snp1240k_path (Eigenstrat
    .snp file), ch (chromosome) and snp_filter_path (output text file).
    Variants whose ID is "." are dropped before saving."""
    # Context manager so the HDF5 handle is closed even if a step fails;
    # everything needed later is read into memory before leaving the block.
    with h5py.File(path_hdf5temp, "r") as f_full: # Load for Sanity Check. See below!
        gt_shape = np.shape(f_full["calldata/GT"])
        print("Loaded %i variants" % gt_shape[0])
        print("Loaded %i individuals" % gt_shape[1])
        print(list(f_full["calldata"].keys()))
        print(list(f_full["variants"].keys()))
        #print(list(f["samples"].keys()))
        pos_h5 = f_full["variants/POS"][:]
        ids_h5 = f_full["variants/ID"][:]

    ### Load Eigenstrat
    df_snp = pd.read_csv(snp1240k_path, header=None, sep=r"\s+", engine="python")
    df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
    df_snp = df_snp[df_snp["chr"] == ch]
    print(f"Loaded {len(df_snp)} Chr.{ch} SNPs.")

    ### Prepare SNP File for Eigenstrat filtering 
    found = np.isin(pos_h5, df_snp["pos"])
    print(f"Intersection: {np.sum(found)} out of {len(found)} SNPS")
    variants = ids_h5[found]

    # NOTE(review): if the IDs are returned as bytes, comparing against the
    # str "." matches nothing -- confirm the dtype of variants/ID.
    dots = np.where(variants == ".")[0]
    print(f"Found {len(dots)} unnamed SNPs")
    variants = np.delete(variants, dots)

    np.savetxt(snp_filter_path, variants, fmt="%s")
    print(f"Successfully saved to {snp_filter_path}. Length: {len(variants)}")
    
def save_megamarkers():
    """Write the MEGA marker positions of one chromosome to a text file.

    Reads the tab-separated table ./Data/mega/MEGAex.txt, keeps the rows
    whose chromosome equals the global `ch`, sorts them by position and
    saves the chromosome/position pairs (tab-separated, no header, no index)
    to the global `marker_path`, for use as a bcftools position file."""
    mega_table = pd.read_csv("./Data/mega/MEGAex.txt", sep="\t", low_memory=False)
    mega_table.columns = ["name", "chr", "pos", "map"]

    # Chromosome labels are compared as strings
    on_chromosome = mega_table["chr"] == str(ch)
    df_chrom = mega_table[on_chromosome].sort_values(by="pos")
    print(f"Loaded {len(df_chrom)} Chr.{ch} SNPs.")

    df_positions = df_chrom[["chr", "pos"]]
    df_positions.to_csv(marker_path, sep="\t", header=None, index=False)
    print(f"Saved {len(df_positions)} Mega Markers on Chr. {ch} to {marker_path}")
    
### Step 3: Create new vcf based on subset of Individuals and Markers
def plink_new_vcf():
    """Run PLINK on the full VCF to write a subset VCF.

    All arguments come from module globals: path_vcf100g is the input VCF,
    snp_filter_path is passed to --extract, ind_path to --keep-fam, and
    out_vcf_path0 is the output prefix (no extension).
    Not called by prep_mega_from1000G_full()."""
    !plink --vcf $path_vcf100g --extract $snp_filter_path --keep-fam $ind_path --recode vcf --out $out_vcf_path0 --biallelic-only strict --keep-allele-order

### Step 3b
def bctools_new_vcf0():
    """Same as PLINK, but with bcftools 
    [small hack with marker strings, so LEGACY code and replaced by bctools_new_vcf]

    Builds the include expression "ID=@<snp_filter_path>" and passes it to
    bcftools via -i. Input (path_vcf100g), sample file (ind_path) and output
    (out_vcf_path_gz) are module globals."""
    # Kept as a local variable: the shell line below expands $str_ex by name
    str_ex = "ID=@" + snp_filter_path
    #!echo bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -i $str_ex -m2 -M2 -v snps $path_vcf100g
    !bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -i $str_ex -m2 -M2 -v snps $path_vcf100g
    print("Finished BCF tools runs.")
    
def bctools_new_vcf(filter_iids=True, cluster=False):
    """Same as PLINK, but with bcftools and directly via Marker Positions.
    filter_iids: Whether to use the .csv with Indivdiduals to extract"""
    if filter_iids==True:
        if cluster==False:
            !bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -T $marker_path -m2 -M2 -v snps $path_vcf100g
        elif cluster==True:
            !module load bcftools; bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -T $marker_path -m2 -M2 -v snps $path_vcf100g     
    elif filter_iids==False:
        if cluster==False:
            !bcftools view -Oz -o $out_vcf_path_gz -T $marker_path -m2 -M2 -v snps $path_vcf100g
        elif cluster==True:
            !module load bcftools; bcftools view -Oz -o $out_vcf_path_gz -T $marker_path -m2 -M2 -v snps $path_vcf100g
    print("Finished BCF tools runs.")

### Step 4: Transfer to hdf5.
#allel.vcf_to_hdf5(input=out_vcf_path, output=path_hdf5final, compression="gzip") # Takes 1s
 
### Step 5: Merge in Linkage Map
    
### Step 6: Delete the Data:
def del_temp_data():
    """Delete the temporary VCF files of the current chromosome.

    Removes the full 1000 Genomes VCF (global path_vcf100g) and the extracted
    subset VCF (global out_vcf_path_gz). Best effort: a missing file or a
    failed removal is reported with a message and does not raise."""
    print("Removing temporary vcf files...")
    # os.remove instead of a shell `rm`, so paths with spaces or shell
    # metacharacters are handled and no shell is needed.
    for temp_file in (path_vcf100g, out_vcf_path_gz):
        try:
            os.remove(temp_file)
        except OSError as err:
            print(f"Could not remove {temp_file}: {err}")
    #!rm $path_hdf5temp # The originally intermediate hdf5 (for 1240k intersection)

# Run the whole Procedure

In [71]:
def prep_mega_from1000G_full(ch, cluster=True, delete=True):
    """Run the full preparation of the MEGA marker HDF5 for one chromosome.

    ch: Which Chromosome to prepare
    cluster: Whether Function is run on Cluster
    delete: Whether to remove the temporary VCF files at the end

    Steps: set the paths, fetch the full VCF, write the marker position file,
    subset the VCF with bcftools, convert it to HDF5, call merge_in_ld_map on
    that HDF5, and optionally clean up.
    NOTE(review): save_megamarkers() filters on the module-level `ch`, not on
    this argument -- verify both agree when calling this function directly."""
    prepare_paths(ch = ch)
    download_1kg(cluster=cluster)  # Since we run it on the cluster
    save_megamarkers()
    # Printed after both the download and the marker file have been written
    print("Download Full Data Complete")

    bctools_new_vcf(filter_iids=False, cluster=cluster)  # Important, turn off filter individuals here!
    vcf_to_hdf5(in_path=out_vcf_path_gz, out_path=path_hdf5final)
    # Project helper; presumably adds the map from the 1240k .snp file to the HDF5 -- see its definition
    merge_in_ld_map(path_h5=path_hdf5final,
                    path_snp1240k=snp1240k_path, chs=[ch])
    
    if delete:
        del_temp_data()
    print("Finished Preparing HDF5. GZ!")

In [4]:
%%time
# Prepare all 22 autosomes in sequence. The loop variable `ch` is assigned at
# notebook top level, the same name that save_megamarkers() reads.
for ch in range(1,23):
    prep_mega_from1000G_full(ch=ch, cluster=True, delete=True)

Full Input path:
./Data/1000Genomes/AutosomeVCF/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
Lifting LD Map from eigenstrat to HDF5...
Loaded 114727 variants.
Loaded 2504 individuals.
Loaded 93166 Chr.1 1240K SNPs.
Intersection 19489 out of 114727 HDF5 SNPs
Interpolating 95238 variants.
Finished Chromosome 1.
Adding map to HDF5...
We did it. Finished.
CPU times: user 7.51 s, sys: 495 ms, total: 8 s
Wall time: 9.53 s


# Area 51

In [18]:
### Test the Final HDF5 just created
# Sanity check: how many MEGA positions of one chromosome are present in the
# HDF5 written by the pipeline above.
ch=3  # NOTE: overwrites the notebook-level `ch`
#h5_path = "./Data/mega/chr1.hdf5"
#h5_path = f"./Data/1000Genomes/HDF5/FULLHDF5/maf02_chr{ch}.hdf5"
h5_path = f"./Data/1000Genomes/HDF5/mega/all/chr{ch}.hdf5"
path_mega = "./Data/mega/MEGAex.txt"

# The context manager closes the file on exit, so no explicit close() follows
with h5py.File(h5_path, "r") as f: # Load for Sanity Check. See below!
    print("Loaded %i variants" % np.shape(f["calldata/GT"])[0])
    print("Loaded %i individuals" % np.shape(f["calldata/GT"])[1])
    print(list(f["calldata"].keys()))
    print(list(f["variants"].keys()))
    pos = f["variants/POS"][:]  # copy positions into memory before the file closes

df = pd.read_csv(path_mega, 
                 low_memory=False, sep="\t")
df1 = df[df["Chr"]==str(ch)]  # chromosome labels are compared as strings
df1 = df1.sort_values(by="MapInfo")

found = np.isin(df1["MapInfo"].values, pos)
print(f"Found {np.sum(found)}/{len(found)} of Meta positions in HDF5 file with {len(pos)} markers")

Loaded 103146 variants
Loaded 2504 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'MAP', 'POS', 'QUAL', 'REF']
Found 103146/120485 of Meta positions in HDF5 file with 103146 markers
