# Notebook to plot Y chromosome diversity over time

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time
from scipy.spatial.distance import pdist

# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
rcParams['font.sans-serif'] = ['Arial']
from matplotlib import gridspec
#plt.style.use('ggplot') #..../whitegrid.mplstyle  # Nice Plotting Style

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

from python.plot_pca import *  # Import functions needed for the PCA plotting
from hapsburg.PackagesSupport.sqrt_scale import SquareRootScale

compute-e-16-235.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 28
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


### Helper Functions

In [2]:
def filter_df_age(df, age_delta = 300, output=False):
    """Takes Dataframe df as Input, and filters to samples within age_delta of median age.
    Return filtered Dataframe and medium Age"""
    age_med = np.median(df["age"])
    idx = (df["age"]< age_med + age_delta) & (df["age"] > age_med - age_delta)
    df = df[idx].copy().reset_index(drop=False)
    if output:
        print(f"{np.sum(idx)}/{len(idx)} IIDs within {age_delta} y of median age {age_med}")
    return df, age_med

def get_y_counts(df, digits=3, col="Y_haplo"):
    """Get Y Chromosome counts from Dataframe df"""
    ys = df[col].str[:3]
    cts = ys.value_counts().values
    return cts

def simpson_di(x):
    """ Given a count vector, returns the Simpson Diversity Index
    """
    n = np.sum(x) # Sample Size
    h = np.sum(x*(x-1)) / (n*(n-1)) # Fraction of pairs are identiclal
    
    if h==0: ### Set minimimum homo-cutoff (one homo-pair):
        h = 2 / (n*(n-1))
    return 1 / h

def frac_max_haplo(x):
    """Given a count vector, return frequency of non most-common alleles"""
    f = np.max(x) / np.sum(x)
    return f

def create_ydiv_df(df, sites=[], col_loc="loc", method="simpson",
                   age_delta = 300, digits=3, min_m=5):
    """Take Meta Data as input, and for each site calculate
    the Simpson Index of Y chromosomes.
    method: simpson, frac_max_haplo"""
    data = []
    
    for s in sites:
        df_t = df[df[col_loc]==s]
        df_t, age = filter_df_age(df_t, age_delta=age_delta)
        m = len(df_t)
        
        if m >= min_m: # Only run full analysis if enough males
            y = get_y_counts(df_t)
            if method=="simpson":
                D = simpson_di(y)
            elif method=="frac_max_haplo":
                D = frac_max_haplo(y)
            else:
                raise RuntimeWarning("No fitting mode found.")
            data.append([s, age, m, D])
          
        else:
            continue

    df = pd.DataFrame(data)
    df.columns = ["loc", "age", "males", "D"]
    return df

def get_sub_df_region(df, region="", rec_col="region", loc_col="loc", min_n=5):
    """Get a Dataframe of Y haplogroup diversities per sites"""
    df_ib = df[df[rec_col].isin(region)]
    cts = df_ib[loc_col].value_counts()
    sites = cts[cts>=min_n].index.values
    df_y_it = create_ydiv_df(df, sites=sites)
    return df_y_it

def set_age_ydiv_df(df, site="", age=0,
                    site_col="loc", age_col="age"):
    """Set the Age of a Y Diversity Cluster"""
    idx = df[site_col]==site
    df.loc[idx, age_col]= age

def set_legends(ax, plots=[], legs=[], title="", loc="lower right"):
    """Set Legends in Panel Plots"""
    l1 = ax.legend(plots, legs, fontsize=11, loc=loc,
             title=title)
    
    l1.get_title().set_fontsize('13')
    l1.get_title().set_fontweight("bold")
    [lgd.set_color('white') for lgd in l1.legendHandles]
    [lgd.set_edgecolor('k') for lgd in l1.legendHandles]

### Load Meta and Y haplogroup Data

In [3]:
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v56.3.anno.haplogroups.csv") # Load Meta Data

min_snp = 100000 # Min SNP coverage for Y Call
age = [0,12000]
lat = [20,90]
lon = [-28, 180]
flag = ["_contam", "_dup"]

df_meta["study"]=df_meta["study"].fillna("missing") # Add Nan
idx = df_meta["n_cov_snp"]>min_snp
df=df_meta[idx].reset_index(drop=True)
print(f"Filtering to {np.sum(idx)}/{len(idx)} indiviuals with >{min_snp} SNPs.")
df["include"]=df["include_alt"].astype("int")

### Filtering based on Age
min_age=age[0]
idx = df["age"]>min_age
df=df[idx].reset_index(drop=True)
print(f"Filtering to {np.sum(idx)}/{len(idx)} inds >{min_age} BP.")

max_age = age[1]
idx = df["age"]<max_age
df = df[idx].reset_index(drop=True)
print(f"Filtering to {np.sum(idx)}/{len(idx)} inds <{max_age} BP.")

### Geographic Filtering
if (len(lat)>0) | (len(lon)>0):
    idx_lat = (lat[0] < df["lat"]) & (df["lat"] < lat[1])
    idx_lon = (lon[0] < df["lon"]) & (df["lon"] < lon[1])
    idx = (idx_lat & idx_lon)
    df=df[idx].reset_index(drop=True)
    print(f"Kept {np.sum(idx)}/{len(idx)} inds with matching lat/lon.")

### Flag tricky Indivdiuals
idx = df["clst"].str.contains("|".join(flag))
print(f"Kept {np.sum(~idx)}/{len(idx)} inds with good cluster labels.")
df=df[~idx].reset_index(drop=True)
df = df.sort_values(by="avg_cov_snp", ascending=False)
idx = df["Master ID"].duplicated()
print(f"Kept {np.sum(~idx)}/{len(idx)} unique Master IDs.")
df=df[~idx].reset_index(drop=True)

### Extract Males
idx= df["sex"]=="M"
print(f"Kept {np.sum(idx)}/{len(idx)} Males.")
df=df[idx].reset_index(drop=True)

### Flag Punic Individuals
df1 = pd.read_csv("./data/cluster_assignments_punic.v54.1i.tsv", sep="\t")
print(f"Extracted IIDs of {len(df1)} IIDs in Punic Project")
df_punic = pd.merge(df, df1, on="iid")
print(f"Merged to {len(df_punic)} Punic Males")

### Remove Romans
label_inc = ["Punic_Early", "Punic_Late", "Punic_Late2"]
df_punic = df_punic[df_punic["label"].isin(label_inc)]

#df_punic = df_punic[~df_punic["label"].str.contains("Roman")]
print(f"Filtered to {len(df_punic)} Punic Samples based on label")

### Remove Punics from generated Meta
df = df[~df["iid"].isin(df1["iid"])]
print(f"Filtered general Y to {len(df)} ancient, non Punic individuals")

### Go to published indivduals only
df =df[~df["study"].str.contains("Unpublished")]
print(f"Filtered to {len(df)} published ancient males")

Filtering to 29227/35545 indiviuals with >100000 SNPs.
Filtering to 22574/29227 inds >0 BP.
Filtering to 22448/22574 inds <12000 BP.
Kept 17910/22448 inds with matching lat/lon.
Kept 17707/17910 inds with good cluster labels.
Kept 16715/17707 unique Master IDs.
Kept 9262/16715 Males.
Extracted IIDs of 160 IIDs in Punic Project
Merged to 68 Punic Males
Filtered to 58 Punic Samples based on label
Filtered general Y to 9194 ancient, non Punic individuals
Filtered to 4059 published ancient males


### [Browse] See Punic Y Haplogroups

In [4]:
df_punic["Y_haplo"].str[:3].value_counts()

E1b    14
R1b    10
J2a     8
G2a     7
J1a     4
J2b     3
T1a     3
I2a     2
E1      1
I       1
R1a     1
C1a     1
L       1
E1a     1
J1      1
Name: Y_haplo, dtype: int64

In [11]:
dft = df_punic[df_punic["Y_haplo"].str[:2] == "J2"]

In [31]:
dft["Y_haplo"].str[:5].value_counts()

J2a1a    7
J2b2a    3
J2a2a    1
Name: Y_haplo, dtype: int64

## Browse all published Y haplogroups

In [8]:
idx = df["Y_haplo"].str.contains("J2")
print(f"{np.sum(idx)}/{len(idx)} Hits in data. {np.mean(idx)*100:.4f}%")
df[idx].sort_values(by="age", ascending=False)[100:150];

284/4059 Hits in data. 6.9968%


### Highest Frequency of J2

In [9]:
def top_y_haplos(df, y = "J2", m = 5, col_agg="clst"):
    """"Create top Y Haplogroup Hits for Y haplo y. 
    Return sort df (by max. frac)"""
    dft = df.copy()
    dft["match"] = df["Y_haplo"].str.contains(y)

    dft2 = dft.groupby([col_agg]).agg({'iid':'size','match':'mean'}) \
              .rename(columns={'iid':'count'}) \
              .reset_index()
    dft2["sum"] = (dft2["match"] * dft2["count"]).astype("int")
    dft3 = dft2[dft2["count"]>=m]
    dft3 = dft3.sort_values(by="match", ascending=False)
    return dft3

In [11]:
dft3 = top_y_haplos(df, y="J2a", m=4)[:50]
dft3.to_csv("./output/tables/yfreqs/J2a.most.common.tsv", sep="\t", index=False)

In [13]:
dft3 = top_y_haplos(df, y="J2b", m=4)[:50]
dft3.to_csv("./output/tables/yfreqs/J2b.most.common.tsv", sep="\t", index=False)

### Frequency of J2 in group

In [15]:
def calc_freq_pergroup(df, clst = "Israel_MLBA", y="J2"):
    """Calculate Y Haplogroup Frequency per Subgroup"""
    
    idx = df["clst"].str.contains(clst)
    print(f"Found {np.sum(idx)}/{len(idx)} {clst} males")

    dft = df[idx]
    idx = dft["Y_haplo"].str.contains(y)
    print(f"Found {np.sum(idx)}/{len(idx)} {y} : {np.mean(idx):.4f}%")

In [16]:
calc_freq_pergroup(df, clst = "Israel_MLBA", y="J2")

Found 20/4059 Israel_MLBA males
Found 4/20 J2 : 0.2000%


In [117]:
calc_freq_pergroup(df, clst = "Israel_IA", y="J2")

Found 2/4059 Israel_IA males
Found 0/2 J2 : 0.0000%


In [110]:
calc_freq_pergroup(df, clst = "Lebanon_MBA", y="J2")

Found 2/4059 Lebanon_MBA males
Found 1/2 J2 : 0.5000%


In [113]:
calc_freq_pergroup(df, clst = "Lebanon_IA", y="J2")

Found 8/4059 Lebanon_IA males
Found 0/8 J2 : 0.0000%


In [18]:
calc_freq_pergroup(df, clst = "Lebanon_IA|Lebanon_MBA|Israel_MLBA|Israel_IA", y="J2a")

Found 32/4059 Lebanon_IA|Lebanon_MBA|Israel_MLBA|Israel_IA males
Found 3/32 J2a : 0.0938%


In [19]:
calc_freq_pergroup(df, clst = "Lebanon_IA|Lebanon_MBA|Israel_MLBA|Israel_IA", y="J2b")

Found 32/4059 Lebanon_IA|Lebanon_MBA|Israel_MLBA|Israel_IA males
Found 2/32 J2b : 0.0625%


In [122]:
calc_freq_pergroup(df, clst = "Greece_Aidonia_LBA", y="J2a")

Found 4/4059 Greece_Aidonia_LBA males
Found 2/4 J2a : 0.5000%


In [24]:
df[df["clst"].str.contains("Lebanon_IA|Lebanon_MBA|Israel_MLBA|Israel_IA")]["Y_haplo"].str[:3].value_counts()

J1a    16
J2a     3
E1b     3
R1b     2
J2b     2
T1a     2
J       1
H       1
I2a     1
G2a     1
Name: Y_haplo, dtype: int64

In [25]:
len(df[df["clst"].str.contains("Lebanon_IA|Lebanon_MBA|Israel_MLBA|Israel_IA")])

32