# Infer Ne from Punic ROH

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import socket as socket
import os as os
import sys as sys
import multiprocessing as mp

# Identify the machine we are running on; the hard-coded project path below
# only exists on the cluster file system.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: stop right away on any host that is not a cluster compute node
# (compute nodes have host names starting with "compute-").
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("No compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project checkout on the cluster

# Work from the project root so that the relative "./data/..." paths used by
# later cells resolve correctly.
os.chdir(path)
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

from hapsburg.figures.plot_bars import plot_panel_row, prepare_dfs_plot
from hapsburg.PackagesSupport.fit_ne import MLE_ROH_Ne, load_roh_vec # For Ne MLE Analysis

compute-a-16-58.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32


# Load Data and browse Individuals

In [9]:
### Load ROH data
# Per-individual ROH summary table, ordered by the `sum_roh>4` column (largest first).
df1 = pd.read_csv("/n/groups/reich/hringbauer/hapsburg_runs/output/combined_roh05.v56.1.unique.tsv", sep='\t')
df1 = df1.sort_values(by="sum_roh>4", ascending=False)
print(f"Loaded {len(df1)} Individuals with ROH")

### Load Punic Cluster Assignments
# NOTE(review): the cluster assignments are release v54.1h while the ROH table
# is v56.1 - confirm that IIDs are stable across these releases.
df_clst = pd.read_csv("./data/cluster_assignments_punic.v54.1h.tsv", sep="\t")
print(f"Loaded Cluster Information for {len(df_clst)} Individuals.\n")

# Keep only individuals whose cluster label contains "Punic".
# na=False makes a missing label count as "no match"; without it, str.contains
# yields NaN for such rows and the boolean indexing below raises.
is_punic = df_clst["label"].str.contains("Punic", na=False)
df_pun = df_clst[is_punic].copy()
print(f"\nLoaded {len(df_pun)} Punic Indiviudals")

### With ROH info:
# Inner join on "iid": only individuals present in BOTH tables are retained.
df_proh = pd.merge(df_pun, df1, on="iid")
print(f"With ROH info: n={len(df_proh)}")

# Number of ROH-covered individuals per Punic cluster label.
df_proh["label"].value_counts()

Loaded 20398 Individuals with ROH
Loaded Cluster Information for 153 Individuals.


Loaded 140 Punic Indiviudals
With ROH info: n=90


Punic_Early    44
Punic_Late     25
Punic_NoRC     13
Punic_Late2     8
Name: label, dtype: int64

### Filter to Individuals to fit Ne

In [11]:
# Drop close-kin consanguineous individuals: only rows whose `sum_roh>20` value
# (total of very long ROH, >20 cM) is below the chosen cutoff of 50 cM are kept.
keep_mask = df_proh["sum_roh>20"] < 50
dft = df_proh.loc[keep_mask]
iids = dft["iid"].values  # IIDs of all retained individuals
print(f"Loaded {len(iids)}/{len(df_proh)} IIDs without >50cm ROH20")

Loaded 79/90 IIDs without >50cm ROH20


### Load the ROH segments of Target Individuals

In [17]:
# Load the ROH segments for every retained individual from the v56.1 output folder.
# NOTE(review): presumably one "<iid>_roh_full.csv" file per individual is read
# from `base_path` - confirm against load_roh_vec.
roh_vec = load_roh_vec(
    iids=iids,
    base_path="/n/groups/reich/hringbauer/hapsburg_runs/output/v56.1/",
    suffix="_roh_full.csv",
)
print(f"Loaded ROH Vector for {len(roh_vec)} IIDs")

# Sanity check: exactly one entry per requested individual.
assert len(iids) == len(roh_vec)

Loaded ROH Vector for 79 IIDs


### Run Ne estimates [4-8 cM, IN MANUSCRIPT]

In [18]:
%%time
output = True
min_len = 4 # Min ROH length in cM to fit
max_len = 8 # Max ROH length in cM to fit

mle = MLE_ROH_Ne(start_params=1000, endog=roh_vec,
                 min_len=min_len, max_len=max_len,
                 chr_lgts=[],      # lengths of Chromosomes to fit (in cM). If len 0, use default for 1240K
                 error_model=False, output=False)
fit = mle.fit_ll_profile()
#summary = fit.summary()
mle.summary/2  # to get estimates in terms of Ne

CPU times: user 4.21 s, sys: 7.12 ms, total: 4.22 s
Wall time: 4.22 s


Unnamed: 0,coef,std err,0.025,0.975,n
0,6413.249153,,4931.329231,8515.325146,39.5


### Run Ne estimates [4-20 cM]

In [19]:
%%time
output = True
min_len = 4 # Min ROH length in cM to fit
max_len = 20 # Max ROH length in cM to fit

mle = MLE_ROH_Ne(start_params=1000, endog=roh_vec,
                 min_len=min_len, max_len=max_len,
                 chr_lgts=[],      # lengths of Chromosomes to fit (in cM). If len 0, use default for 1240K
                 error_model=False, output=False)
fit = mle.fit_ll_profile()
#summary = fit.summary()
mle.summary/2  # to get estimates in terms of Ne

CPU times: user 4.46 s, sys: 19.9 ms, total: 4.48 s
Wall time: 4.48 s


Unnamed: 0,coef,std err,0.025,0.975,n
0,4698.324157,,3844.64186,5821.515665,39.5
