# Prepare the genetic map and the allele frequencies for bcftools

In [1]:
import pandas as pd
import numpy as np
import socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py

# Pick the repository root for the machine this notebook runs on, make it the
# working directory, and import the project's HDF5 loader from there.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: refuse to run on hosts without a known checkout location.
if not (socket_name == "VioletQueen" or socket_name.startswith("midway2")):
    raise RuntimeWarning("Not compatible machine. Check!!")

if socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # checkout on the Midway cluster
else:
    path = "/home/harald/git/HAPSBURG/"  # checkout on Harald's machine

# Every relative path used further down is resolved against this directory.
os.chdir(path)

# The helper package is located relative to the directory set above, so the
# import has to come after the chdir and the sys.path extension.
sys.path.append("./PackagesSupport/h5_python/")
from h5_functions import load_h5   # project helper used below to open the HDF5 file

print(os.getcwd())  # show the working directory that was just set
print(f"CPU Count: {mp.cpu_count()}")

VioletQueen
/home/harald/git/HAPSBURG
CPU Count: 4


### Generate the Map File
position COMBINED_rate(cM/Mb) Genetic_Map(cM)

In [2]:
def prep_map_file(snp1240k_path = "./Data/1000Genomes/Markers/MinMyc.snp", ch=3, save_path=None):
    """Write a genetic map file in bcftools format for one chromosome.

    The input is a whitespace-delimited, header-less table whose six columns
    are taken to be SNP id, chromosome, map position, physical position,
    reference allele and alternative allele (in that order).

    The output is a space-delimited table with the header
    ``position COMBINED_rate(cM/Mb) Genetic_Map(cM)``:
      * ``position`` is the physical position from the input,
      * ``COMBINED_rate(cM/Mb)`` is filled with the constant 1.0 (it is not
        derived from the map),
      * ``Genetic_Map(cM)`` is the input map position multiplied by 100.

    Parameters
    ----------
    snp1240k_path : str
        Path of the input SNP table.
    ch : int
        Chromosome to extract; rows are kept where the chromosome column
        equals this value.
    save_path : str or None
        Where to write the map file. ``None`` (the default) keeps the
        original location
        ``./Data/1000Genomes/Markers/rec_map_bcf.chr{ch}.txt``.

    Returns
    -------
    None
    """
    if save_path is None:
        save_path = f"./Data/1000Genomes/Markers/rec_map_bcf.chr{ch}.txt"

    df_snp = pd.read_csv(snp1240k_path, header=None, sep=r"\s+", engine="python")
    df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
    df_snp = df_snp[df_snp["chr"] == ch]
    print(f"Loaded {len(df_snp)} SNPs from Chr. {ch}")

    # Build the three output columns directly instead of selecting "map"
    # twice and overwriting one of the copies afterwards.
    df_save = pd.DataFrame({
        "position": df_snp["pos"],
        "COMBINED_rate(cM/Mb)": 1.0,              # constant placeholder rate
        "Genetic_Map(cM)": df_snp["map"] * 100,   # switch to cM map position
    })
    df_save.to_csv(save_path, sep=" ", index=False)

In [4]:
# Write the bcftools map file for chromosome 3 (default input SNP table).
prep_map_file(ch=3)

Loaded 81416 SNPs from Chr. 3


### Generate the Allele Frequency File
Tab-delimited file with the columns:
CHROM\tPOS\tREF,ALT\tAF

In [4]:
def prep_af_file(h5_path = "./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5", ch=3):
    """Prepare Allele Frequency File for bcftools from h5"""
    savepath = f"./Data/1000Genomes/Markers/af_1000G_EUR_bcf.chr{ch}.txt"

    f = load_h5(path = h5_path)
    p = np.mean(np.mean(f["calldata/GT"], axis=1), axis=1)   # Calculate Mean Allele Frequencies
    df_t = pd.DataFrame({"CHROM":f["variants/CHROM"][:], "POS":f["variants/POS"][:], 
                         "REF":f["variants/REF"][:], "ALT":f["variants/ALT"][:,0],
                          "AF":p})
    
    #df_t["REF"] = df_t["REF"] +","+ df_t["ALT"]
    #df_t.drop(columns="ALT", inplace=True) # Delete Merged column

    df_t.to_csv(savepath, sep="\t", index=None, header=False) # Write the tab delimited file
    
    savepath_c = savepath + ".gz"
    !bgzip -c $savepath > $savepath_c # Compress
    !tabix -s1 -b2 -e2 $savepath_c   # Tabix
    
    print(f"Saved {len(df_t)} AFs to {savepath_c}")

In [5]:
# Write, compress and index the allele-frequency file for chromosome 3.
prep_af_file(ch=3)

Loaded HDF5
Loaded 77652 variants
Loaded 503 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'MAP', 'POS', 'QUAL', 'REF']
Saved 77652 AFs to ./Data/1000Genomes/Markers/af_1000G_EUR_bcf.chr3.txt.gz


# Area 51

In [12]:
# Scratch: list the top-level entries of the HDF5 handle `f`.
# NOTE(review): `f` is not assigned at notebook scope in the cells shown here;
# presumably it is left over from an earlier interactive session -- confirm.
list(f)

['calldata', 'samples', 'variants']

In [7]:
# Scratch: read the space-delimited map file for chromosome 3 back in.
ch=3
save_path = f"./Data/1000Genomes/Markers/rec_map_bcf.chr{ch}.txt"

df_t = pd.read_csv(save_path, sep=" ")  # first line of the file is used as the header

In [50]:
# Scratch: read the uncompressed, tab-delimited allele-frequency table back in.
savepath = "./Data/1000Genomes/Markers/af_1000G_EUR_bcf.chr3.txt"
df_t = pd.read_csv(savepath, sep="\t", header=None)  # file has no header row

In [51]:
# Display the table loaded in the previous cell.
df_t

Unnamed: 0,0,1,2,3
0,3,63411,"A,C",0.425447
1,3,66894,"G,A",0.284294
2,3,76317,"T,C",0.220676
3,3,82010,"A,G",0.179920
4,3,95973,"C,T",0.682903
5,3,104972,"A,G",0.348907
6,3,105365,"T,C",0.345924
7,3,107626,"A,C",0.072565
8,3,108226,"A,G",0.128231
9,3,108804,"G,A",0.356859
