# Infer Ne from Punic ROH

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import socket as socket
import os as os
import sys as sys
import multiprocessing as mp

# Identify the machine; the notebook only knows the project location on "compute-*" hosts.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: stop right away on any host we have no path configured for.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("No compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project root on the cluster

# Work from the project root so that the relative "./..." paths used further down resolve.
os.chdir(path)
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

from hapsburg.figures.plot_bars import plot_panel_row, prepare_dfs_plot
from hapsburg.PackagesSupport.fit_ne import MLE_ROH_Ne, load_roh_vec # For Ne MLE Analysis

compute-e-16-237.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 28


# Load Data and browse Individuals

In [39]:
### Load ROH data
# One row per individual with summary ROH statistics; sorted so the individuals
# with the most ROH >4 cM come first.
df1 = pd.read_csv("/n/groups/reich/hringbauer/hapsburg_runs/output/combined_roh05.v56.1.unique.tsv", sep='\t')
df1 = df1.sort_values(by="sum_roh>4", ascending=False)
print(f"Loaded {len(df1)} Individuals with ROH")

### Load Punic Cluster Assignments
df_clst = pd.read_csv("./data/cluster_assignments_punic.v54.1j.tsv", sep="\t")
print(f"Loaded Cluster Information for {len(df_clst)} Individuals.\n")

# Keep only rows whose PCA label contains "Punic".
# na=False counts a missing label as "not Punic"; without it a NaN label puts a
# NaN into the mask and the boolean indexing below raises.
is_punic = df_clst["labelPCA"].str.contains("Punic", na=False)
df_pun = df_clst[is_punic].copy()
print(f"\nLoaded {len(df_pun)} Punic Indiviudals")

### With ROH info:
# Inner join on "iid": Punic individuals without an ROH row are dropped here.
df_proh = pd.merge(df_pun, df1, on="iid")
print(f"With ROH info: n={len(df_proh)}")

# (A bare `df_proh["labelPCA"].value_counts()` used to sit here. It was not the
#  last expression of the cell, so it displayed nothing; run it in its own cell
#  to browse the label counts.)

### Filter to Individuals to fit Ne
# Remove close-kin consanguineous individuals: 50 cM in total of very long ROH
# (>20 cM) is the cutoff chosen here.
df_proh1 = df_proh[df_proh["sum_roh>20"]<50]
iids = df_proh1["iid"].values # IIDs of all individuals kept for the Ne fit
print(f"Loaded {len(iids)}/{len(df_proh)} IIDs without >50cm ROH20")

Loaded 20398 Individuals with ROH
Loaded Cluster Information for 160 Individuals.


Loaded 140 Punic Indiviudals
With ROH info: n=90
Loaded 79/90 IIDs without >50cm ROH20


### Load the ROH segments of Target Individuals

In [40]:
# Load the ROH data for every IID selected above from the per-individual output files.
roh_vec = load_roh_vec(
    iids=iids,
    base_path="/n/groups/reich/hringbauer/hapsburg_runs/output/v56.1/",
    suffix="_roh_full.csv",
)
print(f"Loaded ROH Vector for {len(roh_vec)} IIDs")

# Sanity check: exactly one entry must come back per requested IID.
n_requested = len(iids)
assert n_requested == len(roh_vec)

Loaded ROH Vector for 79 IIDs


### Run Ne estimates [4-8 cM, IN MANUSCRIPT]

In [41]:
%%time
output = True
min_len = 4 # Min ROH length in cM to fit
max_len = 8 # Max ROH length in cM to fit

mle = MLE_ROH_Ne(start_params=1000, endog=roh_vec,
                 min_len=min_len, max_len=max_len,
                 chr_lgts=[],      # lengths of Chromosomes to fit (in cM). If len 0, use default for 1240K
                 error_model=False, output=False)
fit = mle.fit_ll_profile()
#summary = fit.summary()
mle.summary/2  # to get estimates in terms of Ne

CPU times: user 3.83 s, sys: 2.12 ms, total: 3.83 s
Wall time: 3.83 s


Unnamed: 0,coef,std err,0.025,0.975,n
0,6413.249153,,4931.329231,8515.325146,39.5


### Run Ne estimates [4-20 cM]

In [19]:
%%time
output = True
min_len = 4 # Min ROH length in cM to fit
max_len = 20 # Max ROH length in cM to fit

mle = MLE_ROH_Ne(start_params=1000, endog=roh_vec,
                 min_len=min_len, max_len=max_len,
                 chr_lgts=[],      # lengths of Chromosomes to fit (in cM). If len 0, use default for 1240K
                 error_model=False, output=False)
fit = mle.fit_ll_profile()
#summary = fit.summary()
mle.summary/2  # to get estimates in terms of Ne

CPU times: user 4.46 s, sys: 19.9 ms, total: 4.48 s
Wall time: 4.48 s


Unnamed: 0,coef,std err,0.025,0.975,n
0,4698.324157,,3844.64186,5821.515665,39.5


# 2) Split Ne estimates per macro region

In [10]:
# Number of Punic individuals with ROH info per macro region.
# Counted on df_proh, i.e. before the long-ROH filter that produces df_proh1.
geo_counts = df_proh["cluster_geo"].value_counts()
geo_counts

cluster_geo
NorthAfrica    31
Sicily         27
Sardinia       14
Iberia         14
Levant          4
Name: count, dtype: int64

In [17]:
# Fit a separate Ne for each macro region listed in geo_clusters
# (only regions named in this list are fitted).
geo_clusters = ["NorthAfrica", "Sicily", "Sardinia", "Iberia"]
output = True  # NOTE(review): not read by any cell shown here; the fits below are called with output=False
min_len = 4 # Min ROH length in cM to fit
max_len = 8 # Max ROH length in cM to fit

res =[]
for g in geo_clusters:
    dft = df_proh1[df_proh1["cluster_geo"]==g]
    # Loop-local names on purpose: the pooled Ne cells read the notebook-level
    # `iids` / `roh_vec`, and overwriting them here would make a later re-run of
    # those cells silently fit only the last region.
    iids_g = dft["iid"].values
    roh_vec_g = load_roh_vec(iids=iids_g, base_path = "/n/groups/reich/hringbauer/hapsburg_runs/output/v56.1/", suffix="_roh_full.csv")
    print(f"Loaded ROH Vector for {len(roh_vec_g)} IIDs")
    assert len(roh_vec_g) == len(iids_g)

    mle = MLE_ROH_Ne(start_params=1000, endog=roh_vec_g,
                     min_len=min_len, max_len=max_len,
                     chr_lgts=[],      # lengths of Chromosomes to fit (in cM). If len 0, use default for 1240K
                     error_model=False, output=False)
    fit = mle.fit_ll_profile()
    #summary = fit.summary()
    x = mle.summary/2  # to get estimates in terms of Ne
    res.append(x)

# One row per region, in the order of geo_clusters.
df_res = pd.concat(res)
df_res.index = geo_clusters
df_res["n"] = df_res["n"]*2 # Undo the halving above: n is a sample count, not an estimate

Loaded ROH Vector for 27 IIDs
Loaded ROH Vector for 26 IIDs
Loaded ROH Vector for 12 IIDs
Loaded ROH Vector for 10 IIDs


In [18]:
# Display the per-region Ne table (one row per entry of geo_clusters).
df_res

Unnamed: 0,coef,std err,0.025,0.975,n
NorthAfrica,10192.466991,,5943.538489,19523.676184,27.0
Sicily,8999.642534,,5357.966999,16653.001718,26.0
Sardinia,6195.310785,,3348.080027,13533.26035,12.0
Iberia,2063.410423,,1362.716266,3302.096981,10.0


# 3) Run ROH Ne estimates for context populations

In [29]:
# Context (reference) individuals; only their "iid" and "label" columns are used below.
df_context = pd.read_csv("./output/tables/reference_samples_plot.v54.1.tsv", sep="\t")

# Read the combined ROH table and order it by total ROH >4 cM, largest first.
df1 = pd.read_csv(
    "/n/groups/reich/hringbauer/hapsburg_runs/output/combined_roh05.v56.1.unique.tsv", sep='\t'
).sort_values(by="sum_roh>4", ascending=False)
print(f"Loaded {len(df1)} Individuals with ROH")

### With ROH info:
# Inner join on "iid": context individuals without an ROH row drop out.
context_labels = df_context[["iid","label"]]
df_croh = context_labels.merge(df1, on="iid")
print(f"With ROH info: n={len(df_croh)}")

### Filter to Individuals to fit Ne
# Remove close-kin consanguineous individuals: keep only those with less than
# 50 cM in total of very long ROH (>20 cM).
below_cutoff = df_croh["sum_roh>20"] < 50
df_croh1 = df_croh[below_cutoff]
print(f"Loaded {len(df_croh1)}/{len(df_croh)} IIDs without >50cm ROH20")

Loaded 20398 Individuals with ROH
With ROH info: n=71
Loaded 70/71 IIDs without >50cm ROH20


In [33]:
# Fit a separate Ne for every context-population label, most frequent label first.
# NOTE(review): every label is fitted regardless of group size; consider a
# minimum number of individuals per label before interpreting the estimates.
geo_clusters = df_croh1["label"].value_counts().index.values
output = True  # NOTE(review): not read by any cell shown here; the fits below are called with output=False
min_len = 4 # Min ROH length in cM to fit
max_len = 8 # Max ROH length in cM to fit

res =[]
for g in geo_clusters:
    dft = df_croh1[df_croh1["label"]==g]
    # Loop-local names on purpose: the pooled Ne cells read the notebook-level
    # `iids` / `roh_vec`, and overwriting them here would make a later re-run of
    # those cells silently fit only the last context group.
    iids_g = dft["iid"].values
    roh_vec_g = load_roh_vec(iids=iids_g, base_path = "/n/groups/reich/hringbauer/hapsburg_runs/output/v56.1/", suffix="_roh_full.csv")
    print(f"Loaded ROH Vector for {len(roh_vec_g)} IIDs")
    assert len(roh_vec_g) == len(iids_g)

    mle = MLE_ROH_Ne(start_params=1000, endog=roh_vec_g,
                     min_len=min_len, max_len=max_len,
                     chr_lgts=[],      # lengths of Chromosomes to fit (in cM). If len 0, use default for 1240K
                     error_model=False, output=False)
    fit = mle.fit_ll_profile()
    #summary = fit.summary()
    x = mle.summary/2  # to get estimates in terms of Ne
    res.append(x)

# One row per label, in the order of geo_clusters.
df_resc = pd.concat(res)
df_resc.index = geo_clusters
df_resc["n"] = df_resc["n"]*2 # Undo the halving above: n is a sample count, not an estimate

Loaded ROH Vector for 19 IIDs
Loaded ROH Vector for 17 IIDs
Loaded ROH Vector for 10 IIDs
Loaded ROH Vector for 8 IIDs
Loaded ROH Vector for 7 IIDs
Loaded ROH Vector for 3 IIDs
Loaded ROH Vector for 3 IIDs
Loaded ROH Vector for 2 IIDs
Loaded ROH Vector for 1 IIDs


In [34]:
# Display the per-label Ne table for the context populations (one row per label).
df_resc

Unnamed: 0,coef,std err,0.025,0.975,n
Sicily Polizzello IA,8754.135159,,4863.601596,18093.749062,19.0
Sardinia BA,7016.445424,,4007.503481,14009.332782,17.0
Canaanite MLBA,13816.92645,,5321.04622,50000.0,10.0
Iberia IA,16538.248904,,5357.966999,50000.0,8.0
Tunisia M/N,2226.479255,,1344.000511,4035.310071,7.0
Mycenean BA,4119.892842,,1586.614817,16538.248904,3.0
Menorca LBA,1760.015736,,906.208774,4119.892842,3.0
Sicily Motya MBA,8283.029795,,1873.025016,50000.0,2.0
Algeria IA,50000.0,,2063.410423,50000.0,1.0


In [36]:
# Inspect the retained context individuals labelled "Sardinia BA".
is_sardinia_ba = df_croh1["label"] == "Sardinia BA"
df_croh1[is_sardinia_ba]

Unnamed: 0,iid,label,max_roh,pop,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,...,study,clst,mean_cov,n_cov_snp,avg_cov_snp,include_alt,family,sex,contact,data_type
22,ORC003,Sardinia BA,0.0,Italy_Sardinia_BA_Nuragic,0.0,0,0.0,0,0.0,0,...,MarcusNatureCommunications2020,Italy_Sardinia_BA_Nuragic,0.691849,830219,1.137992,True,n/a (no relatives detected),M,John Novembre,1240k
23,ORC007,Sardinia BA,0.0,Italy_Sardinia_BA_Nuragic,0.0,0,0.0,0,0.0,0,...,MarcusNatureCommunications2020,Italy_Sardinia_BA_Nuragic,0.689399,827279,1.11362,True,n/a (no relatives detected),M,John Novembre,1240k
24,ORC009,Sardinia BA,13.105005,Italy_Sardinia_BA_Nuragic,22.766411,3,13.105005,1,13.105005,1,...,MarcusNatureCommunications2020,Italy_Sardinia_BA_Nuragic,0.685355,822426,1.125571,True,n/a (no relatives detected),F,John Novembre,1240k
25,ORC008,Sardinia BA,0.0,Italy_Sardinia_BA_Nuragic,0.0,0,0.0,0,0.0,0,...,MarcusNatureCommunications2020,Italy_Sardinia_BA_Nuragic,0.6714,805680,1.037642,True,n/a (no relatives detected),M,John Novembre,1240k
26,ORC005,Sardinia BA,4.053897,Italy_Sardinia_BA_Nuragic,4.053897,1,0.0,0,0.0,0,...,MarcusNatureCommunications2020,Italy_Sardinia_BA_Nuragic,0.67116,805392,1.035292,True,n/a (no relatives detected),F,John Novembre,1240k
27,ISB001,Sardinia BA,0.0,Italy_Sardinia_EBA,0.0,0,0.0,0,0.0,0,...,MarcusNatureCommunications2020,Italy_Sardinia_EBA,0.655011,786013,1.511072,True,n/a (no relatives detected),M,John Novembre,1240k
28,ORC001,Sardinia BA,5.293321,Italy_Sardinia_BA_Nuragic,10.364021,2,0.0,0,0.0,0,...,MarcusNatureCommunications2020,Italy_Sardinia_BA_Nuragic,0.652923,783508,0.94156,True,n/a (no relatives detected),F,John Novembre,1240k
29,SUC005,Sardinia BA,4.3717,Italy_Sardinia_EBA,4.3717,1,0.0,0,0.0,0,...,MarcusNatureCommunications2020,Italy_Sardinia_EBA,0.647917,777500,0.93928,True,n/a (no relatives detected),M,John Novembre,1240k
30,SUC004,Sardinia BA,11.820901,Italy_Sardinia_EBA,16.060401,2,11.820901,1,0.0,0,...,MarcusNatureCommunications2020,Italy_Sardinia_EBA,0.645385,774462,0.89811,True,n/a (no relatives detected),F,John Novembre,1240k
31,ORC006,Sardinia BA,0.0,Italy_Sardinia_BA_Nuragic,0.0,0,0.0,0,0.0,0,...,MarcusNatureCommunications2020,Italy_Sardinia_BA_Nuragic,0.643172,771806,1.007005,True,n/a (no relatives detected),M,John Novembre,1240k


### Save results

In [38]:
# Stack the Punic per-region table on top of the context-population table and
# write the result as a TSV; the group names live in the index, so keep it.
dfs = pd.concat([df_res, df_resc], axis=0)
dfs.to_csv("./output/tables/roh_ne.tsv", sep="\t", index=True)