# Functions to run the Y haplogroup calling. Eventually to be moved into a Python package

In [2]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp

# For Arial Font
from matplotlib import rcParams
# Use a sans-serif family for all figures, with Arial as the preferred face.
# Arial has to be installed on the machine (it is on the cluster for Harald).
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
})

# Identify the machine by its hostname
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: only hosts named "compute-*" are supported.
# NOTE(review): a Warning class is raised as an exception here; confirm
# whether RuntimeError was intended.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/y_chrom/"  # Project root on the cluster

# All relative paths in this notebook are resolved against the project root
os.chdir(path)
# Report working directory, available CPUs and the Python version
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-77.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/y_chrom
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


In [79]:
################################################
# Definitions to load Files (ISOGG and Output of Pulldown)

def create_haplopath(haplogroup, rest="_data.csv", haplo_folder="./data/isogg_19/"):
    """Build the path of the data file of one haplogroup.
    haplogroup: Label of the haplogroup; has to be of length 1 (e.g. "r").
    rest: Suffix appended after the label.
    haplo_folder: Prefix put in front of the label. The pieces are joined by
    plain concatenation, so the folder needs its own trailing separator.
    Return the concatenated path string."""
    assert len(haplogroup) == 1
    return "".join((haplo_folder, haplogroup, rest))

def load_haplogroup_data(path, build="Build 37 #"):
    """Load the marker table of one haplogroup from a comma-separated file.
    path: Path of the csv file to load.
    build: Name of the column holding the marker positions. It is renamed
    to 'pos' so the table can be merged with count data on that column.
    Rows whose position cannot be parsed as a number are dropped, and the
    number of rows before and after is printed.
    Return the filtered dataframe with a numeric 'pos' column.
    Raise KeyError if the position column is not in the file."""
    print("Loading: %s" % path)
    df_g = pd.read_csv(path, index_col=False, sep=",")
    df_g = df_g.rename(columns={build: 'pos'})  # For merging
    if 'pos' not in df_g.columns:
        raise KeyError(f"Position column {build!r} not found in {path}")

    n_raw = len(df_g)
    is_numeric = pd.to_numeric(df_g['pos'], errors='coerce').notnull()
    # Copy the filtered rows so the assignment below acts on an independent
    # frame instead of a slice of the original one
    df_g = df_g[is_numeric].copy()
    # Convert the whole column at once; all remaining entries are parseable
    df_g['pos'] = pd.to_numeric(df_g['pos'])
    print("Drop from %i to %i numerical pos rows" % (n_raw, len(df_g)))
    return df_g

def load_counts(path_counts, coerce=True):
    """Load a whitespace-delimited count file without header line.
    path_counts: Path of the count file; it has to contain nine columns.
    coerce: If True, convert the columns pos, ref and alt to numbers.
    Entries that cannot be parsed become NaN.
    Return dataframe with the columns snp, chr, pos, ref_all, alt_all,
    iid, ref and alt (the sixth file column is discarded)."""
    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True and splits on any run of whitespace
    df_t = pd.read_csv(path_counts, header=None, sep=r"\s+")
    df_t.columns = ["snp", "chr", "pos", "ref_all", "alt_all","drop","iid","ref", "alt"]

    if coerce:
        for col in ["pos", "ref", "alt"]:
            df_t[col] = pd.to_numeric(df_t[col], errors="coerce")

    return df_t.drop(columns="drop")

################################################
################################################
# Functions to call Y

def call_ind(df_counts, df_isogg, individual="", pos=None,
             full=False, output=True):
    """Check the counts of an individual against all markers in df_isogg.
    df_counts: Dataframe of counts with the columns pos, ref, alt,
    ref_all and alt_all (as produced by load_counts).
    df_isogg: Dataframe of markers with the columns pos, Haplogroup and
    Mutation info.
    individual: For which individual to do the call (if none given use all rows).
    Matched against the column iid (or Individual if there is no iid column).
    pos: List of specific positions for which to return calls (if none return all).
    full: If True return all merged markers, otherwise only the derived ones.
    output: Whether to print summary statistics.
    Return dataframe with calls (sorted by Haplogroup) and 3 summary stats:
    Nr of derived markers, nr of markers with at least one call and
    nr of markers found in the database. The stats are always calculated
    on all merged markers, before filtering to pos."""

    ### Filter rows to the individual
    if len(individual) > 0:
        # load_counts names the sample column "iid"
        id_col = "iid" if "iid" in df_counts.columns else "Individual"
        df_counts = df_counts[df_counts[id_col] == individual]

    if output:
        print(f"Total rows of calls: {len(df_counts)}")

    ### Merge the two dataframes on identical positions
    df = pd.merge(df_counts, df_isogg, on="pos")
    # Copy so the swap below writes into an independent frame
    df = df.drop_duplicates(subset="pos", keep="first").copy()

    ### Where the alt allele equals the first character of the mutation
    ### info, swap ref and alt (counts as well as alleles)
    flip = df["alt_all"] == df["Mutation info"].str[0]
    if flip.any():
        # Swap counts and alleles separately, so numbers and strings do
        # not get mixed in one array
        df.loc[flip, ["ref", "alt"]] = df.loc[flip, ["alt", "ref"]].to_numpy()
        df.loc[flip, ["ref_all", "alt_all"]] = df.loc[flip, ["alt_all", "ref_all"]].to_numpy()

    ### Summary statistics
    derived = (df["alt"] > df["ref"]) & (df["alt"] > 0)  # Derived Calls
    tot_calls = (df["ref"] > 0) | (df["alt"] > 0)        # At least one Call
    n_derived, n_calls, n_markers = np.sum(derived), np.sum(tot_calls), len(df)

    if output:
        # No marker merged: report nan instead of dividing by zero
        frac = n_calls / n_markers if n_markers > 0 else float("nan")
        print(f"Markers found in Database {n_markers}")
        print(f"Flipping {np.sum(flip)} SNPs for Alt Allele")
        print(f"Markers witch calls {n_calls}")
        print(f"Frac. of Database SNPs covered: {frac:.6f}")
        print(f"Markers derived: {n_derived}")

    # Report the whole status or only the derived markers
    df_call = df if full else df[derived]
    df_call = df_call.sort_values(by="Haplogroup")

    ## Only return selected Positions if needed
    if pos is not None and len(pos) > 0:
        df_call = df_call[df_call["pos"].isin(pos)]

    return df_call.copy(), n_derived, n_calls, n_markers

################################################
################################################
# Functions to plot Y calls

def pull_clade(df, haplogroup, exact=True):
    """Pull the rows of a specific clade.
    df: Dataframe with a column Haplogroup.
    haplogroup: Haplogroup label to look for.
    exact: If True the label has to match the exact haplogroup. Otherwise
    all rows whose Haplogroup contains the label are returned (the label is
    passed on to str.contains, which treats it as a regular expression
    by default).
    Rows without Haplogroup entry never match.
    Return the matching rows of df."""
    if exact:
        mask = df["Haplogroup"] == haplogroup
    else:
        # na=False: missing entries would otherwise give an NA mask
        # that cannot be used for indexing
        mask = df["Haplogroup"].str.contains(haplogroup, na=False)
    return df[mask]

### Load ISOGG Reference Dictionary

In [4]:
# Labels of the haplogroups for which a table is loaded
haplogroups = [
    "o", "d", "g", "i", "r", "j", "t",
    "e", "h", "c", "l", "n", "p", "q",
]

# One path and one loaded table per haplogroup, in the order given above
haplo_paths = [create_haplopath(hg, haplo_folder="./data/isogg_19/")
               for hg in haplogroups]
df_groups = list(map(load_haplogroup_data, haplo_paths))

# Lookup: haplogroup label -> marker table
dict_y_dfs = {hg: df_hg for hg, df_hg in zip(haplogroups, df_groups)}
# dict_y_dfs['all'] = pd.concat(df_groups)   # Add the combined one

Loading: ./data/isogg_19/o_data.csv
Drop from 3459 to 3455 numerical pos rows
Loading: ./data/isogg_19/d_data.csv
Drop from 2595 to 2586 numerical pos rows
Loading: ./data/isogg_19/g_data.csv
Drop from 7503 to 7492 numerical pos rows
Loading: ./data/isogg_19/i_data.csv
Drop from 11246 to 11231 numerical pos rows
Loading: ./data/isogg_19/r_data.csv
Drop from 9763 to 9726 numerical pos rows
Loading: ./data/isogg_19/j_data.csv
Drop from 5947 to 5940 numerical pos rows
Loading: ./data/isogg_19/t_data.csv
Drop from 382 to 381 numerical pos rows
Loading: ./data/isogg_19/e_data.csv
Drop from 10478 to 10459 numerical pos rows
Loading: ./data/isogg_19/h_data.csv
Drop from 3140 to 3127 numerical pos rows
Loading: ./data/isogg_19/c_data.csv
Drop from 6862 to 6855 numerical pos rows
Loading: ./data/isogg_19/l_data.csv
Drop from 1154 to 1151 numerical pos rows
Loading: ./data/isogg_19/n_data.csv
Drop from 2952 to 2948 numerical pos rows
Loading: ./data/isogg_19/p_data.csv
Drop from 300 to 300 numer

## Call Individual Haplogroup
(set the sample ID `iid` and the `haplogroup` in the cell below)

In [66]:
# Load the table of male samples. The path is relative to the working
# directory set with os.chdir in the first cell.
punic_df = pd.read_csv("../punic_aDNA/data/males_feb20.csv")
# Show the values of the first column (sample IDs, judging by the saved output)
punic_df.iloc[:,0].values

array(['I12517', 'I18193', 'I18199', 'I18202', 'I18194', 'I18201',
       'I18189'], dtype=object)

In [87]:
# Sample and haplogroup table to use for the call
iid, haplogroup = "I7121", "r"

# Counts of the sample and the marker table of the chosen haplogroup
path_counts = f"./output/{iid}.txt"
df_counts = load_counts(path_counts)
df_isogg = dict_y_dfs[haplogroup]

# Keep only the derived markers; the three summary stats are discarded
df_call, _, _, _ = call_ind(df_counts, df_isogg, full=False, output=True)

Total rows of calls: 32670
Markers found in Database 1084
Flipping 204 SNPs for Alt Allele
Markers witch calls 42
Frac. of Database SNPs covered: 0.038745
Markers derived: 5


In [88]:
# Show (at most) the last 50 rows of the call table
df_call[-50:]

Unnamed: 0,snp,chr,pos,ref_all,alt_all,iid,ref,alt,Name,Haplogroup,Other Names,rs #,Build 38 #,Mutation info
271,rs35768594,24,7671535,C,T,I7121,0,1,F93,R1,M621; PF6114,rs35768594,7803494,C->T
555,rs73620075,24,13807475,C,A,I7121,0,1,FGC189,R1,Y305,rs73620075,11686769,C->A
553,rs113598050,24,13657777,T,C,I7121,0,1,L777,R1b1a1b,YSC0000248,rs113598050,11502101,T->C
215,snp_24_7266032,24,7266032,T,G,I7121,0,1,PF6287,R1b1b,,rs752185125,7397991,T->G
235,snp_24_7366284,24,7366284,G,T,I7121,0,1,Y7771,R1b1b2a2a,,,7498243,G->T


In [82]:
# Number of rows in df_call assigned exactly to haplogroup R1b1b
len(pull_clade(df_call, "R1b1b", exact=True))

37

# Run all Haplogroups

In [None]:
%%time
iid = "I16339"

# The counts depend only on the sample, so load them once instead of
# re-reading the same file for every haplogroup.
path_counts = f"./output/{iid}.txt"
df_counts = load_counts(path_counts)

for h in haplogroups:
    print(f"\nDoing Haplogroup: {h}")
    df_isogg = dict_y_dfs[h]
    # Only the table of derived markers is kept; it is overwritten in each
    # round, the printed summary is the actual result of this cell
    df_call, _, _, _ = call_ind(df_counts, df_isogg, output=True,
                                full=False)

# Area 51 (scratch cells for quick checks)

In [8]:
# Load the count file ./output/I0410.txt for a quick check
df_t = load_counts("./output/I0410.txt")

In [20]:
# Number of rows with a positive value in the "ref" count column
np.sum(df_t["ref"]>0)

0

In [62]:
# Number of markers in df_isogg assigned exactly to haplogroup R1b1a1b1b.
# NOTE(review): df_isogg is whatever an earlier cell left behind - confirm it
# holds the intended table when this cell is run.
idx = df_isogg["Haplogroup"]=="R1b1a1b1b"
np.sum(idx)

15