# Functions to run the Y haplogroup calling. Eventually to be put into a Python package

In [2]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp

# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the default font family
# Make sure to have the Arial font installed (it is on the cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

# Identify the machine the notebook is running on
socket_name = socket.gethostname()
print(socket_name)

# Only the cluster compute nodes are supported; stop early anywhere else
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/ycalling/"  # Project folder on the cluster

# All relative paths used further down are resolved against the project folder
os.chdir(path)
# Report the working directory and the execution environment
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-e-16-233.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/ycalling
CPU Count: 28
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


In [15]:
################################################
# Definitions to load File


def preprocess_oufile(path_cts="./oufile.txt"):
    """Collect the fields of every "count:" line of a file as a numpy array.

    path_cts: Path of the text file to parse.

    Return: Array with one row per line starting with "count:", holding
    the whitespace-separated fields that follow the "count:" label."""
    rows = []
    with open(path_cts, "r") as handle:
        for line in handle:
            if not line.startswith("count:"):
                continue
            # Collapse any run of whitespace to a single blank, so that the
            # label plus its separator ("count: ") is exactly 7 characters
            normalized = ' '.join(line.split())
            rows.append(normalized[7:].split())
    return np.array(rows)

# Helpers to create the dictionary of all haplogroups


def create_haplopath(haplogroup, rest="_data.csv", haplo_folder="./data/isogg_19/"):
    """Build the path of the data file of one haplogroup.

    haplogroup: Label of the haplogroup; must have length 1.
    rest: Suffix appended after the label.
    haplo_folder: Folder prefix put in front of the label.

    Return: haplo_folder + haplogroup + rest."""
    assert len(haplogroup) == 1
    return haplo_folder + haplogroup + rest


def load_haplogroup_data(path, build="Build 37 #"):
    """Load one haplogroup marker table and keep the rows with a numeric position.

    path: Path of the comma-separated marker file to read.
    build: Name of the column holding the position. It is renamed to 'pos'.

    Return: Dataframe restricted to the rows whose 'pos' entry parses as a
    number, with 'pos' converted to a numeric dtype.
    Raises KeyError if the table has neither a `build` nor a 'pos' column."""
    print("Loading: %s" % path)
    df_g = pd.read_csv(path, index_col=False, sep=",")
    df_g = df_g.rename(columns={build: 'pos'})  # For merging
    n_before = len(df_g)

    # Rows whose position cannot be parsed become NaN and are dropped
    is_numeric = pd.to_numeric(df_g['pos'], errors='coerce').notnull()
    # Take an explicit copy: writing a column into a filtered frame is the
    # chained-assignment pattern that pandas flags with SettingWithCopyWarning
    df_g = df_g.loc[is_numeric].copy()
    # Convert only the surviving rows, in one vectorized call
    df_g['pos'] = pd.to_numeric(df_g['pos'])

    print("Drop from %i to %i numerical pos rows" % (n_before, len(df_g)))
    return df_g


def load_outfile(path_in="./input/outfiles/I18189.out"):
    """Read the count lines of an output file into a formatted Dataframe.

    path_in: Path of the output file to parse.

    Return: Dataframe with the columns snp, pos, ref and alt. The latter
    three are converted to numbers (entries that do not parse become NaN).
    The mean coverage is printed as a side effect."""
    counts = preprocess_oufile(path_in)

    df_ofile = pd.DataFrame(counts)
    df_ofile.columns = ["snp", "pos", "ref", "alt"]
    numeric_cols = ("pos", "ref", "alt")
    for name in numeric_cols:
        df_ofile[name] = pd.to_numeric(df_ofile[name], errors="coerce")

    # NOTE(review): the mean number of reads per marker is halved here --
    # confirm that the division by 2 is intended
    reads_per_marker = df_ofile["ref"] + df_ofile["alt"]
    mean_cov = np.mean(reads_per_marker) / 2
    print(f"Mean Coverage: {mean_cov:.4f}")
    return df_ofile

################################################
################################################
# Functions to call Y

def call_ind(df_out, df_isogg, pos = [], full = False, output=True, flip_ref=True, gt_only=False):
    """Call Y haplogroup markers for one individual.

    df_out: Dataframe with count data (columns "pos", "ref", "alt")
    df_isogg: Dataframe with ISOGG markers (columns "pos" and "Haplogroup")
    pos: If non-empty, return exactly the matched markers at these positions
    (`full` is ignored and no sorting by Haplogroup is done in that case)
    full: Whether to output every matched SNP instead of only the derived ones
    output: Whether to print summary statistics
    flip_ref: Not used by this function; kept so existing calls keep working
    gt_only: Whether to drop matched markers without any read (Ref + Alt <= 0)

    Return: Dataframe with the called markers (sorted by Haplogroup unless pos is given).
    Nr Derived Markers
    Nr Markers with >0 calls
    Nr of matched markers (after the gt_only filter, if requested)"""
    # Intersect the positions. The returned indices point at the first
    # occurrence of every shared position in each of the two inputs.
    pos_i = df_isogg["pos"]
    pos_t = df_out["pos"]
    _shared, idx_isogg, idx_out = np.intersect1d(pos_i, pos_t, return_indices=True)
    n_found = len(idx_out)

    # Deep copy, so that adding columns below never touches the caller's frame
    df = df_isogg.iloc[idx_isogg, :].copy(deep=True)

    if output:
        print(f"Found {n_found} out of {len(pos_i)} Y SNPs in HDF5")

    df["Ref"] = df_out["ref"].values[idx_out]
    df["Alt"] = df_out["alt"].values[idx_out]

    ### Drop markers without any read, if requested
    nocalls = (df["Ref"] + df["Alt"]) <= 0
    if gt_only:
        df = df[~nocalls]
        if output:
            print(f"Dropped {np.sum(nocalls)} No Genotype Calls")

    ### Derived: strictly more Alt than Ref reads. Called: at least one read.
    derived = (df["Alt"] > df["Ref"]) & (df["Alt"] > 0)
    tot_calls = (df["Ref"] > 0) | (df["Alt"] > 0)
    n_derived = np.sum(derived)
    n_calls = np.sum(tot_calls)

    if output:
        # Guard the ratio: no position may be shared between the two inputs
        frac_covered = n_calls / n_found if n_found > 0 else float("nan")
        print("Markers witch calls: %i" % n_calls)
        print(f"SNPs covered {frac_covered:.6f}")
        print("Markers derived: %i" % n_derived)

    if pos is not None and len(pos) > 0:
        ### Only return the selected positions (pos itself is never modified)
        selected = np.where(df["pos"].isin(pos))[0]
        df_call = df.iloc[selected]
    else:
        # Plain truthiness: any falsy `full` (False, None, 0) selects the
        # derived markers, so df_call is always bound
        df_call = df if full else df[derived]
        df_call = df_call.sort_values(by="Haplogroup")

    return df_call, n_derived, n_calls, len(df)

### Load ISOGG Reference Dictionary

In [13]:
# Single-letter labels of the haplogroups to load
haplogroups = ["o", "d", "g", "i", "r", "j",
               "t", "e", "h", "c", "l", "n", "p", "q"]
haplo_paths = [create_haplopath(label, haplo_folder="./data/isogg_19/")
               for label in haplogroups]
df_groups = list(map(load_haplogroup_data, haplo_paths))

# Map every label to its marker table
dict_y_dfs = {label: table for label, table in zip(haplogroups, df_groups)}
# dict_y_dfs['all'] = pd.concat(df_groups)   # Add the combined one

Loading: ./data/isogg_19/o_data.csv
Drop from 3459 to 3455 numerical pos rows
Loading: ./data/isogg_19/d_data.csv
Drop from 2595 to 2586 numerical pos rows
Loading: ./data/isogg_19/g_data.csv
Drop from 7503 to 7492 numerical pos rows
Loading: ./data/isogg_19/i_data.csv
Drop from 11246 to 11231 numerical pos rows
Loading: ./data/isogg_19/r_data.csv
Drop from 9763 to 9726 numerical pos rows
Loading: ./data/isogg_19/j_data.csv
Drop from 5947 to 5940 numerical pos rows
Loading: ./data/isogg_19/t_data.csv
Drop from 382 to 381 numerical pos rows
Loading: ./data/isogg_19/e_data.csv
Drop from 10478 to 10459 numerical pos rows
Loading: ./data/isogg_19/h_data.csv
Drop from 3140 to 3127 numerical pos rows
Loading: ./data/isogg_19/c_data.csv
Drop from 6862 to 6855 numerical pos rows
Loading: ./data/isogg_19/l_data.csv
Drop from 1154 to 1151 numerical pos rows
Loading: ./data/isogg_19/n_data.csv
Drop from 2952 to 2948 numerical pos rows
Loading: ./data/isogg_19/p_data.csv
Drop from 300 to 300 numer

### Load Individual Data 
(change the individual's file name here)

In [11]:
# Read the count data of the individual to call
df_ofile = load_outfile("./input/outfiles/I18189.out")

Mean Coverage: 0.1461


# Call Haplogroup

In [18]:
%%time
haplogroup = "e"
print(f"\nCalling {haplogroup}")

# Markers of the chosen haplogroup; keep only derived markers that have reads
df_t = dict_y_dfs[haplogroup]
df_call, _, _, _ = call_ind(df_out=df_ofile, df_isogg=df_t,
                            full=False, output=True, gt_only=True)


Calling e
Found 2731 out of 10459 Y SNPs in HDF5
Dropped 2202 No Genotype Calls
Markers witch calls: 529
SNPs covered 0.193702
Markers derived: 72
CPU times: user 31.7 ms, sys: 4 ms, total: 35.7 ms
Wall time: 31.7 ms


In [30]:
# Restore the position column's original name before exporting
df_call = df_call.rename(columns={"pos": "Build 37 #"})
df_call.to_csv("./output/y_derived_I18189.csv", sep="\t")  # Save (tab-separated)
df_call[20:]

Unnamed: 0,Name,Haplogroup,Other Names in Tree,rs #,Build 37 #,Build 38 #,Mutation info,Ref,Alt
527,Z15513,E1a,,rs772977596,21530910,19369024,C->T,0,1
289,CTS1172,E1a,,rs757375446,7250851,7382810,C->G,0,2
532,Z15518,E1a,,rs761254767,21746728,19584842,T->G,0,1
481,Z950,E1a,,rs762885611,22052797,19890911,G->A,0,1
482,Z951,E1a,,rs772150115,22068019,19906133,T->G,0,1
483,Z953,E1a,,rs749223657,22171814,20009928,G->A,0,1
543,Z15541,E1a,,rs1030478124,22536137,20374251,C->G,0,1
545,Z15543,E1a,,rs766737446,22603185,20441299,T->A,0,1
418,CTS10644,E1a,Z957,rs780328785,22675129,20513243,C->A,0,1
420,CTS10752,E1a,,rs774330572,22758062,20596176,A->G,0,1
