# Functions for Y-chromosome haplogroup calling, plus example runs. To be moved into a Python package eventually.

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp

# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the default font family
# Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

# Pick the project root based on the host this notebook is running on
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/y_chrom/"  # Absolute project root on the cluster
else:
    # No project path is configured for any other host, so stop here
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Relative paths used later (./data, ./output) resolve against this folder
# Show the current working directory. Should be the project root set above
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-e-16-230.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/y_chrom
CPU Count: 28
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


In [2]:
################################################
# Definitions to load Files (ISOGG and Output of Pulldown)

def create_haplopath(haplogroup, rest="_data.csv", haplo_folder="./data/isogg_19/"):
    """Build the path of the data file for one haplogroup.

    haplogroup: Label of length one (anything else fails the assertion).
    rest: Filename suffix appended after the label.
    haplo_folder: Folder prefix; concatenated as-is, so keep the trailing slash.
    Return: haplo_folder, haplogroup and rest joined into one string."""
    assert len(haplogroup) == 1
    pieces = (haplo_folder, haplogroup, rest)
    return "".join(pieces)

def load_haplogroup_data(path, build="Build 37 #"):
    """Load one haplogroup table and keep only rows with a numeric position.

    path: CSV file to read (comma separated, with header row).
    build: Name of the column holding the position; it is renamed to 'pos'
        so the table can be merged with the count data.
    Return: DataFrame restricted to rows whose 'pos' parses as a number,
        with 'pos' converted to a numeric dtype.
    Raise: KeyError if there is no 'pos' column after renaming."""
    print("Loading: %s" % path)
    df_g = pd.read_csv(path, index_col=False, sep=",")
    df_g = df_g.rename(columns={build: 'pos'})  # For merging
    if 'pos' not in df_g.columns:
        raise KeyError("Position column %r not found in %s" % (build, path))

    n_before = len(df_g)
    # Entries that do not parse as numbers become NaN and are dropped
    is_numeric = pd.to_numeric(df_g['pos'], errors='coerce').notnull()
    # Explicit copy: writing a column into a filtered frame otherwise
    # triggers pandas' SettingWithCopyWarning
    df_g = df_g[is_numeric].copy()
    # One vectorized conversion instead of calling to_numeric per element
    df_g['pos'] = pd.to_numeric(df_g['pos'])
    print("Drop from %i to %i numerical pos rows" % (n_before, len(df_g)))
    return df_g

def load_counts(path_counts, coerce=True):
    """Load a whitespace-delimited count file (no header) into a DataFrame.

    path_counts: Path of the count file; it must have exactly nine columns.
    coerce: If True convert "pos", "ref" and "alt" to numbers, turning
        entries that cannot be parsed into NaN.
    Return: DataFrame with columns "snp", "chr", "pos", "ref_all",
        "alt_all", "iid", "ref", "alt" (the sixth input column is discarded)."""
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True and splits on any run of whitespace
    df_t = pd.read_csv(path_counts, header=None, sep=r"\s+")
    df_t.columns = ["snp", "chr", "pos", "ref_all", "alt_all", "drop", "iid", "ref", "alt"]

    if coerce:
        for col in ["pos", "ref", "alt"]:
            df_t[col] = pd.to_numeric(df_t[col], errors="coerce")

    df_t = df_t.drop(columns="drop")
    return df_t

################################################
################################################
# Functions to call Y

def call_ind(df_counts, df_isogg, individual="", pos=None,
             full=False, output=True):
    """Call the markers of df_isogg from the read counts in df_counts.

    df_counts: Count table with columns "pos", "ref", "alt", "ref_all" and
        "alt_all" (as returned by load_counts), plus a sample-ID column
        ("iid" or "Individual") when filtering by individual.
    df_isogg: Marker table with columns "pos", "Haplogroup" and "Mutation info".
    individual: Sample ID to restrict the counts to (if empty use all rows).
    pos: Positions for which to return calls (if None or empty return all).
    full: If True return every matched marker, otherwise only derived ones.
    output: Whether to print the summary statistics.
    Return: Tuple (df_call, number of derived markers, number of markers
        with at least one read, number of markers matched in both tables)."""

    ### Filter rows for the requested individual
    if individual is not None and len(individual) > 0:
        # load_counts() stores the sample ID in "iid"; the previously
        # hard-coded "Individual" column is still honoured when it exists.
        id_col = "Individual" if "Individual" in df_counts.columns else "iid"
        df_counts = df_counts[df_counts[id_col] == individual]

    if output:
        print(f"Total rows of calls: {len(df_counts)}")

    ### Merge the two dataframes on identical positions (one row per position)
    df = pd.merge(df_counts, df_isogg, on="pos")
    df = df.drop_duplicates(subset="pos", keep='first')

    ### Swap ref and alt where the alt allele equals the first character of
    ### "Mutation info" (the allele left of "->", e.g. "G" in "G->A")
    flip = df["alt_all"] == df["Mutation info"].str[0]
    # Swap column by column (all right-hand sides read the unswapped frame),
    # so the count columns keep their numeric dtype.
    df = df.assign(
        ref=df["ref"].where(~flip, df["alt"]),
        alt=df["alt"].where(~flip, df["ref"]),
        ref_all=df["ref_all"].where(~flip, df["alt_all"]),
        alt_all=df["alt_all"].where(~flip, df["ref_all"]),
    )

    ### Markers with more Alts than Refs, and markers with any read at all
    derived = (df["alt"] > df["ref"]) & (df["alt"] > 0)  # Derived Calls
    tot_calls = (df["ref"] > 0) | (df["alt"] > 0)        # At least one Call
    n_derived, n_calls, n_markers = np.sum(derived), np.sum(tot_calls), len(df)

    if output:
        # Guard against an empty merge instead of dividing by zero
        frac = n_calls / n_markers if n_markers > 0 else float("nan")
        print(f"Markers found in Database {n_markers}")
        print(f"Flipping {np.sum(flip)} SNPs for Alt Allele")
        print(f"Markers witch calls {n_calls}")
        print(f"Frac. of Database SNPs covered: {frac:.6f}")
        print(f"Markers derived: {n_derived}")

    # Whole status if requested, otherwise only the derived markers
    df_call = df if full else df[derived]
    df_call = df_call.sort_values(by="Haplogroup")

    ## Only return selected Positions if needed
    if pos is not None and len(pos) > 0:
        df_call = df_call[df_call["pos"].isin(pos)]

    return df_call.copy(), n_derived, n_calls, n_markers

################################################
################################################
# Functions to plot Y calls

### Load ISOGG Reference Dictionary

In [3]:
# One-letter haplogroup labels; each has its own CSV in ./data/isogg_19/
haplogroups = [
    "o", "d", "g", "i", "r", "j", "t",
    "e", "h", "c", "l", "n", "p", "q",
]

# Resolve every path first, then load the tables in the same order
haplo_paths = list(map(
    lambda label: create_haplopath(label, haplo_folder="./data/isogg_19/"),
    haplogroups))
df_groups = list(map(load_haplogroup_data, haplo_paths))

# Lookup: haplogroup label -> loaded marker table
dict_y_dfs = {label: table for label, table in zip(haplogroups, df_groups)}
# dict_y_dfs['all'] = pd.concat(df_groups)   # Add the combined one

Loading: ./data/isogg_19/o_data.csv
Drop from 3459 to 3455 numerical pos rows
Loading: ./data/isogg_19/d_data.csv
Drop from 2595 to 2586 numerical pos rows
Loading: ./data/isogg_19/g_data.csv
Drop from 7503 to 7492 numerical pos rows
Loading: ./data/isogg_19/i_data.csv
Drop from 11246 to 11231 numerical pos rows
Loading: ./data/isogg_19/r_data.csv
Drop from 9763 to 9726 numerical pos rows
Loading: ./data/isogg_19/j_data.csv
Drop from 5947 to 5940 numerical pos rows
Loading: ./data/isogg_19/t_data.csv
Drop from 382 to 381 numerical pos rows
Loading: ./data/isogg_19/e_data.csv
Drop from 10478 to 10459 numerical pos rows
Loading: ./data/isogg_19/h_data.csv
Drop from 3140 to 3127 numerical pos rows
Loading: ./data/isogg_19/c_data.csv
Drop from 6862 to 6855 numerical pos rows
Loading: ./data/isogg_19/l_data.csv
Drop from 1154 to 1151 numerical pos rows
Loading: ./data/isogg_19/n_data.csv
Drop from 2952 to 2948 numerical pos rows
Loading: ./data/isogg_19/p_data.csv
Drop from 300 to 300 numer

## Call Individual Haplogroup
(set the individual ID `iid` and the `haplogroup` letter in the cell below)

In [45]:
# Sample to call and the haplogroup table to call it against
iid = "I11999"
haplogroup = "r"

# Marker table for that haplogroup, then the counts of this sample
df_isogg = dict_y_dfs[haplogroup]
path_counts = f"./output/{iid}.txt"
df_counts = load_counts(path_counts)

# full=True keeps every matched marker instead of only the derived ones
df_call, _, _, _ = call_ind(df_counts, df_isogg, full=True, output=True)

Total rows of calls: 32670
Markers found in Database 1084
Flipping 204 SNPs for Alt Allele
Markers witch calls 170
Frac. of Database SNPs covered: 0.156827
Markers derived: 33


In [46]:
# Last 50 rows of the call table
df_call.tail(50)

Unnamed: 0,snp,chr,pos,ref_all,alt_all,iid,ref,alt,Name,Haplogroup,Other Names,rs #,Build 38 #,Mutation info
1833,snp_24_23128862,24,23128862,T,C,I11999,0,0,FGC12642,R2a2,Y3434,rs371063858,20966976,T->C
1247,PF6109,24,18028669,C,G,I11999,0,0,P267,R2a2,PF6109,rs368057017,15916789,C->G
299,snp_24_7804375,24,7804375,T,G,I11999,0,0,Y12100,R2a2b,FGC12586,rs377726348,7936334,T->G
1747,snp_24_22725944,24,22725944,G,A,I11999,1,0,FGC12582,R2a2b1,Y8764,rs374176506,20564058,G->A
1819,snp_24_23020448,24,23020448,T,G,I11999,0,0,Y8763,R2a2b1,FGC12615,rs372559321,20858562,T->G
404,snp_24_8428326,24,8428326,C,T,I11999,0,0,FGC12643,R2a2b1,V1946; SK2140,rs372031220,8560285,C->T
246,snp_24_7542004,24,7542004,A,C,I11999,0,0,FGC12630,R2a2b1,V1180,rs367549058,7673963,A->C
720,snp_24_14984657,24,14984657,G,A,I11999,0,0,PF7499,R2a2b1,V2738,rs371989279,12872725,G->A
895,snp_24_15826970,24,15826970,G,A,I11999,0,0,SK2148,R2a2b1a,FGC12677; Y3341,rs1035921799,13715090,G->A
266,L1069,24,7593759,A,T,I11999,0,0,L1069,R2a2b1a,,,7725718,A->T


In [58]:
# First 50 rows whose haplogroup label contains the given substring.
# regex=False matches it literally; na=False counts a missing label as no match
# (a NaN in the mask would otherwise make the boolean indexing fail).
df_call[df_call["Haplogroup"].str.contains("R1b1a1b1b", regex=False, na=False)][:50]

Unnamed: 0,snp,chr,pos,ref_all,alt_all,iid,ref,alt,Name,Haplogroup,Other Names,rs #,Build 38 #,Mutation info
119,rs9785971,24,6753511,G,A,I11999,0,0,L23,R1b1a1b1,PF6534; S141,rs9785971,6885470,G->A
437,rs9786140,24,8502236,G,A,I11999,1,0,L51,R1b1a1b1a,M412; PF6536; S167,rs9786140,8634195,G->A
501,rs67952918,24,9084870,G,T,I11999,0,0,PF6540,R1b1a1b1a1,YSC0000082,rs67952918,9247261,G->T
1377,rs9786283,24,18907236,A,C,I11999,0,0,P310,R1b1a1b1a1,PF6546; S129,rs9786283,16795356,A->C
685,rs13304168,24,14641193,C,T,I11999,0,0,L52,R1b1a1b1a1,PF6541,rs13304168,12529262,C->T
1317,rs9785659,24,18248698,A,G,I11999,0,0,P311,R1b1a1b1a1,PF6545; S128,rs9785659,16136818,A->G
975,rs2082033,24,16492547,C,T,I11999,1,0,L151,R1b1a1b1a1a,PF6542,rs2082033,14380667,C->T
1212,rs9786076,24,17844018,T,C,I11999,0,0,L11,R1b1a1b1a1a,,rs9786076,15732138,T->C
1025,rs67643699,24,16751825,G,A,I11999,0,0,PF6543,R1b1a1b1a1a,S1159; YSC0000191,rs67643699,14639945,G->A
491,rs16981293,24,8796078,C,T,I11999,0,0,M405,R1b1a1b1a1a1,U106; S21,rs16981293,8928037,C->T


# Run all Haplogroups

In [None]:
%%time
iid="I16339"
for h in haplogroups:

#print(f"\nCalling {haplogroup}")
    print(f"\nDoing Haplogroup: {h}")
    path_counts = f"./output/{iid}.txt"
    df_counts = load_counts(path_counts)
    df_isogg = dict_y_dfs[h]
    df_call, _, _, _ = call_ind(df_counts, df_isogg, output=True, 
                                full=False)

# Area 51

In [8]:
# Load the count file of sample I0410 for the ad-hoc checks in this section
df_t = load_counts("./output/I0410.txt")

In [20]:
# Number of rows whose "ref" count is positive
(df_t["ref"] > 0).sum()

0

In [62]:
# Mask of markers assigned exactly to this haplogroup, and how many there are
idx = df_isogg["Haplogroup"].eq("R1b1a1b1b")
idx.sum()

15