# Functions to run the Y-chromosome haplogroup calling (eventually to be moved into a Python package)

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp

# Use Arial for all matplotlib figures
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the default font family
# Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

# Pick the project root based on the host name of the machine running the kernel
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/y_chrom/"  # Project root on the cluster
else:
    # Any other host has no known project root, so the cell stops here.
    # NOTE(review): RuntimeWarning is raised as an exception, not issued via warnings.warn.
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # All relative paths used below resolve against this directory
# Show the current working directory (should be the y_chrom project root set above)
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-35.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/y_chrom
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


In [2]:
################################################
# Definitions to load Files (ISOGG and Output of Pulldown)

def create_haplopath(haplogroup, rest="_data.csv", haplo_folder="./data/isogg_19/"):
    """Build the file path of the data table for one haplogroup.

    haplogroup: Single-character haplogroup label (e.g. "j").
    rest: File name suffix appended after the label.
    haplo_folder: Folder prefix (including trailing separator).
    Returns the concatenation haplo_folder + haplogroup + rest.
    """
    # Only single-letter haplogroup labels are accepted
    assert len(haplogroup) == 1
    return "".join((haplo_folder, haplogroup, rest))

def load_haplogroup_data(path, build="Build 37 #"):
    """Load one haplogroup marker table and keep only rows with a numeric position.

    path: Comma-separated file to read.
    build: Name of the column holding the marker position. It is renamed
        to 'pos' so the table can be merged with count data on that key.

    Returns a new DataFrame whose 'pos' column is numeric; rows whose
    position cannot be parsed as a number are dropped.
    Raises KeyError if the table has neither a `build` nor a 'pos' column.
    """
    print("Loading: %s" % path)
    df_g = pd.read_csv(path, index_col=False, sep=",")
    df_g = df_g.rename(columns={build: 'pos'})  # For merging
    if 'pos' not in df_g.columns:
        raise KeyError(f"Position column {build!r} not found in {path}")

    n_before = len(df_g)
    is_numeric = pd.to_numeric(df_g['pos'], errors='coerce').notnull()
    # Take an explicit copy: writing into a filtered slice would otherwise
    # trigger pandas' SettingWithCopyWarning.
    df_g = df_g[is_numeric].copy()
    # All remaining entries are parseable, so convert the whole column at once
    df_g['pos'] = pd.to_numeric(df_g['pos'])
    print("Drop from %i to %i numerical pos rows" % (n_before, len(df_g)))
    return df_g

def load_counts(path_counts, coerce=True):
    """Load a whitespace-delimited, header-less count file into a DataFrame.

    path_counts: Path of the count file; it must have exactly nine columns.
    coerce: If True, convert the "pos", "ref" and "alt" columns to numbers;
        entries that cannot be parsed become NaN.

    Returns a DataFrame with the columns
    "snp", "chr", "pos", "ref_all", "alt_all", "iid", "ref", "alt"
    (the sixth file column is discarded).
    """
    col_names = ["snp", "chr", "pos", "ref_all", "alt_all",
                 "drop", "iid", "ref", "alt"]
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True and parses the file identically.
    df_t = pd.read_csv(path_counts, header=None, sep=r"\s+")
    df_t.columns = col_names

    if coerce:
        for col in ["pos", "ref", "alt"]:
            df_t[col] = pd.to_numeric(df_t[col], errors="coerce")

    return df_t.drop(columns="drop")

################################################
################################################
# Functions to call Y

def call_ind(df_counts, df_isogg, individual="", pos=None,
             full=False, output=True):
    """Call the markers of a marker table against allele counts.

    df_counts: Allele counts with the columns "pos", "ref_all", "alt_all",
        "ref" and "alt" (as returned by load_counts), plus "iid" when
        filtering by individual.
    df_isogg: Marker table with the columns "pos", "Haplogroup" and
        "Mutation info".
    individual: Sample ID to restrict df_counts to (empty: use all rows).
    pos: Positions to restrict the returned calls to (None or empty: all).
    full: If False return only the derived markers, otherwise every marker
        found in both tables.
    output: If True print summary statistics.

    Returns a tuple (df_call, n_derived, n_covered, n_markers):
        df_call: Calls sorted by "Haplogroup" (an independent copy).
        n_derived: Number of markers with more alt than ref reads.
        n_covered: Number of markers with at least one read.
        n_markers: Number of markers found in both tables.
    """
    ### Filter rows for the requested individual
    if individual:
        # load_counts names the sample column "iid"; "Individual" is still
        # accepted for count tables that carry that column instead.
        id_col = "iid" if "iid" in df_counts.columns else "Individual"
        df_counts = df_counts[df_counts[id_col] == individual]

    if output:
        print(f"Total rows of calls: {len(df_counts)}")

    ### Merge the two dataframes on identical positions
    df = pd.merge(df_counts, df_isogg, on="pos")
    # df is a fresh local frame from the merge, so dropping in place is safe
    df.drop_duplicates(subset="pos", keep='first', inplace=True)

    ### Where the alt allele equals the first character of "Mutation info",
    ### swap ref and alt (both the allele labels and the read counts).
    flip = df["alt_all"] == df["Mutation info"].str[0]
    # Counts and allele labels are swapped separately so the numeric count
    # columns are not routed through a mixed object array.
    df.loc[flip, ["ref", "alt"]] = df.loc[flip, ["alt", "ref"]].to_numpy()
    df.loc[flip, ["ref_all", "alt_all"]] = df.loc[flip, ["alt_all", "ref_all"]].to_numpy()

    ### Classify markers
    derived = (df["alt"] > df["ref"]) & (df["alt"] > 0)  # Derived Calls
    tot_calls = (df["ref"] > 0) | (df["alt"] > 0)        # At least one Call
    n_markers = len(df)

    if output:
        print(f"Markers found in Database {n_markers}")
        print(f"Flipping {np.sum(flip)} SNPs for Alt Allele")
        print(f"Markers witch calls {np.sum(tot_calls)}")
        # Guard against an empty merge (no shared positions)
        frac = np.sum(tot_calls) / n_markers if n_markers > 0 else float("nan")
        print(f"Frac. of Database SNPs covered: {frac:.6f}")
        print(f"Markers derived: {np.sum(derived)}")

    # Either the whole status or only the derived markers
    df_call = df if full else df[derived]
    df_call = df_call.sort_values(by="Haplogroup")

    ## Only return selected positions if requested
    if pos is not None and len(pos) > 0:
        df_call = df_call[df_call["pos"].isin(pos)]

    return df_call.copy(), np.sum(derived), np.sum(tot_calls), n_markers

################################################
################################################
# Functions to plot Y calls

def pull_clade(df, haplogroup, exact=True):
    """Return the rows of df that belong to a given haplogroup.

    df: DataFrame with a "Haplogroup" column.
    haplogroup: Haplogroup label to look for.
    exact: If True the label has to match exactly. If False every row whose
        label contains `haplogroup` is returned; the pattern is interpreted
        by pandas' str.contains (a regular expression by default).
    """
    labels = df["Haplogroup"]
    if exact:
        mask = labels == haplogroup
    else:
        # na=False: rows without a haplogroup label count as non-matching;
        # otherwise the NA entries in the mask make the row selection fail.
        mask = labels.str.contains(haplogroup, na=False)
    return df[mask]

### Load ISOGG Reference Dictionary

In [3]:
# Haplogroup labels for which a marker table is loaded
haplogroups = [
    "o", "d", "g", "i", "r", "j", "t",
    "e", "h", "c", "l", "n", "p", "q",
]

# One file path per haplogroup, then one marker table per path
haplo_paths = list(map(
    lambda label: create_haplopath(label, haplo_folder="./data/isogg_19/"),
    haplogroups,
))
df_groups = list(map(load_haplogroup_data, haplo_paths))

# Look-up from haplogroup label to its marker table
dict_y_dfs = {label: table for label, table in zip(haplogroups, df_groups)}
# dict_y_dfs['all'] = pd.concat(df_groups)   # Add the combined one

Loading: ./data/isogg_19/o_data.csv
Drop from 3459 to 3455 numerical pos rows
Loading: ./data/isogg_19/d_data.csv
Drop from 2595 to 2586 numerical pos rows
Loading: ./data/isogg_19/g_data.csv
Drop from 7503 to 7492 numerical pos rows
Loading: ./data/isogg_19/i_data.csv
Drop from 11246 to 11231 numerical pos rows
Loading: ./data/isogg_19/r_data.csv
Drop from 9763 to 9726 numerical pos rows
Loading: ./data/isogg_19/j_data.csv
Drop from 5947 to 5940 numerical pos rows
Loading: ./data/isogg_19/t_data.csv
Drop from 382 to 381 numerical pos rows
Loading: ./data/isogg_19/e_data.csv
Drop from 10478 to 10459 numerical pos rows
Loading: ./data/isogg_19/h_data.csv
Drop from 3140 to 3127 numerical pos rows
Loading: ./data/isogg_19/c_data.csv
Drop from 6862 to 6855 numerical pos rows
Loading: ./data/isogg_19/l_data.csv
Drop from 1154 to 1151 numerical pos rows
Loading: ./data/isogg_19/n_data.csv
Drop from 2952 to 2948 numerical pos rows
Loading: ./data/isogg_19/p_data.csv
Drop from 300 to 300 numer

## Call Individual Haplogroup
(Set the individual ID `iid` and the `haplogroup` letter in the cell below.)

In [11]:
# Load the table of male samples (path is relative to the working directory set in the setup cell)
punic_df = pd.read_csv("../punic_aDNA/data/males_feb20.csv")
# Display the first column (sample IDs, judging by the recorded output)
punic_df.iloc[:,0].values

array(['I12517', 'I18193', 'I18199', 'I18202', 'I18194', 'I18201',
       'I18189'], dtype=object)

In [23]:
# Individual and haplogroup letter to call (haplogroup must be a key of dict_y_dfs)
iid="I10266"
haplogroup = "j"

# Load the count file of this individual and the marker table of the haplogroup
path_counts = f"./output/{iid}.txt"
df_counts = load_counts(path_counts)
df_isogg = dict_y_dfs[haplogroup]
# Keep only the table of derived calls (full=False); the three summary counts are discarded
df_call, _, _, _ = call_ind(df_counts, df_isogg, output=True, 
                            full=False)

Total rows of calls: 32670
Markers found in Database 814
Flipping 1 SNPs for Alt Allele
Markers witch calls 70
Frac. of Database SNPs covered: 0.085995
Markers derived: 16


In [24]:
# Show (at most) the last 50 calls
df_call.tail(50)

Unnamed: 0,snp,chr,pos,ref_all,alt_all,iid,ref,alt,Name,Haplogroup,Other Names in Tree,rs #,Build 38 #,Mutation info
106,snp_24_7048870,24,7048870,G,A,I10266,0,2,CTS852,J,PF4504,rs372536048,7180829,G->A
174,snp_24_7759610,24,7759610,C,T,I10266,0,1,PF4513,J,,rs372625055,7891569,C->T
305,snp_24_8669451,24,8669451,C,G,I10266,0,1,PF4519,J,,rs375166209,8801410,C->G
564,snp_24_15602183,24,15602183,G,A,I10266,0,2,CTS4349,J,PF4547,rs373857973,13490303,G->A
693,snp_24_16268345,24,16268345,C,G,I10266,0,1,F2116,J,PF4553; YSC0000785,rs370214135,14156465,C->G
724,snp_24_16427564,24,16427564,A,T,I10266,0,2,CTS5678,J,PF4556,rs372543511,14315684,A->T
1032,snp_24_18773505,24,18773505,C,T,I10266,0,1,F2839,J,PF4580,rs369715205,16661625,C->T
1144,snp_24_19460042,24,19460042,G,C,I10266,0,2,CTS10446,J,PF4586,rs374895223,17348162,G->C
1451,snp_24_23088142,24,23088142,T,C,I10266,0,1,L778,J,PF4616; YSC0000236,rs370646613,20926256,T->C
373,snp_24_13832032,24,13832032,G,T,I10266,0,1,PF4905,J2,,rs376988610,11711326,G->T


In [41]:
# Show all calls assigned exactly to haplogroup E1b1b1a.
# NOTE(review): the recorded output lists individual I15940 including markers without reads,
# so df_call came from a different run (other iid, presumably full=True) than the cell above.
pull_clade(df_call, "E1b1b1a", exact=True)

Unnamed: 0,snp,chr,pos,ref_all,alt_all,iid,ref,alt,Name,Haplogroup,Other Names in Tree,rs #,Build 38 #,Mutation info
3656,snp_24_22080316,24,22080316,G,A,I15940,0,0,PF2188,E1b1b1a,,rs745543869,19918430,G->A
1617,snp_24_15095592,24,15095592,C,T,I15940,0,0,CTS3657,E1b1b1a,PF2134,rs772458390,12983680,C->T
1280,snp_24_14242950,24,14242950,G,A,I15940,0,0,CTS2270,E1b1b1a,PF2127,rs772692516,12122244,G->A
3450,snp_24_21583211,24,21583211,C,A,I15940,0,0,PF2178,E1b1b1a,,rs768083503,19421325,C->A
674,snp_24_8361073,24,8361073,G,T,I15940,0,1,PF2115,E1b1b1a,,rs760552030,8493032,G->T
1776,snp_24_15520497,24,15520497,T,C,I15940,0,0,CTS4256,E1b1b1a,PF2138; V3011,rs529980048,13408617,T->C
3166,snp_24_19396726,24,19396726,T,C,I15940,0,0,CTS10323,E1b1b1a,PF2167; V4083,rs554084026,17284846,T->C
3249,snp_24_21036413,24,21036413,C,T,I15940,0,0,PF2173,E1b1b1a,Z1211,rs765518572,18874527,C->T
465,snp_24_7804308,24,7804308,C,T,I15940,0,0,PF2108,E1b1b1a,,rs767418565,7936267,C->T
3842,snp_24_22898839,24,22898839,T,C,I15940,0,0,CTS11015,E1b1b1a,PF1955,rs754318075,20736953,T->C


# Run all Haplogroups

In [None]:
%%time
iid="I16339"
for h in haplogroups:

#print(f"\nCalling {haplogroup}")
    print(f"\nDoing Haplogroup: {h}")
    path_counts = f"./output/{iid}.txt"
    df_counts = load_counts(path_counts)
    df_isogg = dict_y_dfs[h]
    df_call, _, _, _ = call_ind(df_counts, df_isogg, output=True, 
                                full=False)

# Area 51 (scratch area for ad-hoc checks)

In [8]:
# Scratch: load the count file of a single individual for inspection
df_t = load_counts("./output/I0410.txt")

In [20]:
# Number of rows with at least one reference read
(df_t["ref"] > 0).sum()

0

In [62]:
# Number of markers in the current marker table assigned exactly to R1b1a1b1b
idx = df_isogg["Haplogroup"].eq("R1b1a1b1b")
idx.sum()

15