# Notebook to plot Y chromosome diversity over time

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time


# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

from python.plot_pca import *  # Import functions needed for the PCA plotting

compute-a-16-108.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


In [14]:
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v53.1.anno.haplogroups.csv") # Load Meta Data
min_snp=600000
age = [0,12000]
lat = [20,90]
lon = [-28, 180]
flag = ["_contam", "_dup"]

In [23]:
df_meta["study"]=df_meta["study"].fillna("missing")
idx = df_meta["n_cov_snp"]>min_snp
df=df_meta[idx].reset_index(drop=True)
print(f"Filtering to {np.sum(idx)}/{len(idx)} indiviuals with >{min_snp} SNPs.")
df["include"]=df["include_alt"].astype("int")

### Filtering based on Age
min_age=age[0]
idx = df["age"]>min_age
df=df[idx].reset_index(drop=True)
print(f"Filtering to {np.sum(idx)}/{len(idx)} inds >{min_age} BP.")

max_age = age[1]
idx = df["age"]<max_age
df = df[idx].reset_index(drop=True)
print(f"Filtering to {np.sum(idx)}/{len(idx)} inds <{max_age} BP.")

### Geographic Filtering
if (len(lat)>0) | (len(lon)>0):
    idx_lat = (lat[0] < df["lat"]) & (df["lat"] < lat[1])
    idx_lon = (lon[0] < df["lon"]) & (df["lon"] < lon[1])
    idx = (idx_lat & idx_lon)
    df=df[idx].reset_index(drop=True)
    print(f"Kept {np.sum(idx)}/{len(idx)} inds with matching lat/lon.")

### Flag tricky Indivdiuals
idx = df["clst"].str.contains("|".join(flag))
print(f"Kept {np.sum(~idx)}/{len(idx)} inds with good cluster labels.")
df=df[~idx].reset_index(drop=True)
df = df.sort_values(by="avg_cov_snp", ascending=False)
idx = df["Master ID"].duplicated()
print(f"Kept {np.sum(~idx)}/{len(idx)} unique Master IDs.")
df=df[~idx].reset_index(drop=True)

idx= df["sex"]=="M"
print(f"Kept {np.sum(idx)}/{len(idx)} Males.")
df=df[idx].reset_index(drop=True)

Filtering to 18885/31997 indiviuals with >600000 SNPs.
Filtering to 12446/18885 inds >0 BP.
Filtering to 12396/12446 inds <12000 BP.
Kept 9540/12396 inds with matching lat/lon.
Kept 9469/9540 inds with good cluster labels.
Kept 8854/9469 unique Master IDs.
Kept 4900/8854 Males.


# Prepare Sardinia

In [33]:
df_sar = df[df["region"].str.contains("Sardinia")]

In [32]:
df_sar

Unnamed: 0,iid,Master ID,loc,lat,lon,age,region,study,clst,mean_cov,n_cov_snp,avg_cov_snp,include_alt,sex,Y_haplo,mtDNA_haplo,include


In [28]:
df_ib = df[df["region"]=="Italy"]
df_ib["loc"].value_counts()[:50]

Montecilfone                                      28
Abruzzo, Peltuinum                                26
Campania, Naples, Herculaneum                     26
Lazio, Rieti, Corvaro                             25
Sicily, Himera                                    22
Marche, Pesaro e Urbino, Urbino Bivio C.M.        21
Castel Sozzio                                     19
Bologna, Via Orfeo Jewish Medieval Cemetery       19
Molise, Isernia, Pozzilli                         16
Campania, San Marzano                             15
Abruzzo, Presciano                                13
Emilia Romagna, Ceretolo, Casalecchio di Reno     13
Piedmont, Collegno                                12
San Marzano                                       12
Tortora                                           12
Campania, Eboli                                   11
Sardinia, Su Crucefissu Mannu                     10
Marche, Pesaro e Urbino, Urbino San Donato        10
Lazio, Roma, Magliana                         