# Code to process the Ceballos samples

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colorbar as clb
import matplotlib.colors as cls
from matplotlib import gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
#from mpl_toolkits.basemap import Basemap

import socket
import os as os
import sys as sys
import multiprocessing as mp

### For Arial Font
from matplotlib import rcParams
# Make sure to have the font installed (it is on cluster for Harald)
rcParams.update({
    'font.family': 'sans-serif',    # Set the default font family
    'font.sans-serif': ['Arial'],   # Preferred face within that family
})

### To do lowess smoothing
#import statsmodels.api as sm
#lowess = sm.nonparametric.lowess

# Pick the repository root from the hostname so that the relative paths used
# in the rest of the notebook ("./Data/...", "./Empirical/...") resolve.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
elif socket_name.startswith("Harald-Laptop"):
    # This branch previously announced the Midway partition (copy-paste slip)
    print("Harald's laptop detected.")
    path = "/home/hringbauer/git/HAPSBURG/"  # The Path on Harald's laptop
else: 
    # Unknown host: stop here rather than run with an undefined `path`
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root set above
print(f"CPU Count: {mp.cpu_count()}")

# Relative to the new working directory, so this must come after os.chdir
sys.path.append("./package/hapsburg") # Append Hapsburg Folder
from PackagesSupport.roh_expectations import Expected_Roh

midway2-0402.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [2]:
def load_df_ceballos(path_ceballos="./Data/Ceballos/ROH500_cM_het0_v3.csv"):
    """Read a Ceballos ROH table and bring it into the column layout used here.

    path_ceballos: Path of the csv file. It has to provide the columns
    POS1, POS2, cM.1, cM.2, IID and CHR.

    Prints the number of ROH rows and of distinct IID values, and returns
    a new dataframe with the columns
    Start, End, StartM, EndM, iid, ch, length, lengthM
    where StartM / EndM are the cM.1 / cM.2 values divided by 100 and
    length / lengthM are the End - Start differences."""
    raw = pd.read_csv(path_ceballos)
    n_inds = len(set(raw["IID"]))
    print(f"Loaded {len(raw)} ROH from {n_inds} Individuals")

    source_cols = ["POS1", "POS2", "cM.1", "cM.2", "IID", "CHR"]
    target_cols = ['Start', 'End', 'StartM', 'EndM','iid', 'ch']

    roh = raw.loc[:, source_cols].copy()   # Work on a copy, the loaded table stays untouched
    roh.columns = target_cols

    roh['length'] = roh["End"] - roh["Start"]
    for map_col in ("EndM", "StartM"):
        roh[map_col] = roh[map_col]/100
    roh["lengthM"] = roh['EndM'] - roh['StartM']
    return roh

def save_roh_to_ch(df, path="./Empirical/ceballos/Ust_Ishim/chr", ch=3, file="roh.csv", suffix=""):
    """Write the ROH rows of one chromosome to a csv file.

    df: Dataframe with a column "ch"
    path: The path without chromosomes number
    ch: Chromosome to extract; also appended to path
    file: Name of the csv file to write
    suffix: String appended after the chromosome number
    The output folder is path + ch + suffix and gets created if missing."""
    out_dir = "".join((path, str(ch), suffix))
    if not os.path.exists(out_dir):
        print("Creating Path...")
        os.makedirs(out_dir)

    ### Extract and Save
    on_chrom = df["ch"] == ch
    subset = df.loc[on_chrom].copy()
    target = os.path.join(out_dir, file)
    subset.to_csv(target, sep=",", index=False)
    print(f"Saved {len(subset)} ROH to {target}")

# Extract and save a single individual

In [11]:
### Load and convert dataframe of Ceballos
# Four-sample table: I6671, I2521, R7 and Villabruna
# (alternative input: "./Data/Ceballos/ROH500_cM_het1_v3.csv")
path_4samples = "./Data/Ceballos/ROH500_cM_het1_4samples.csv"
df1 = load_df_ceballos(path_ceballos=path_4samples)

Loaded 830 ROH from 4 Individuals


In [15]:
### Copy over ROH calls from hapROH
# `iid` is reused by the following cells to select rows and to build output paths.
iid="I2521"
folder_from = f"./Empirical/1240k/MarcusAncs/{iid}/"
folder_to = f"./Empirical/ceballos/{iid}/"
# IPython shell escape; $name is substituted with the Python variable of that name.
# NOTE(review): if folder_to already exists, `cp -r` presumably places the copy
# inside it (nested {iid}/{iid}/) instead of merging - confirm before re-running.
!cp -r $folder_from $folder_to

In [16]:
## Copy in ROH calls from Ceballos
# Substring match on purpose: the iid column of df1 holds longer labels that
# contain the short sample name.
has_iid = df1["iid"].str.contains(iid)
df2 = df1[has_iid]

chromosomes = range(1, 23)
for ch in chromosomes:
    # One roh.csv per chromosome, written to .../chr<ch>/e01/
    save_roh_to_ch(
        df2,
        path=f"./Empirical/ceballos/{iid}/chr",
        ch=ch,
        file="roh.csv",
        suffix="/e01/",
    )

Saved 10 ROH to ./Empirical/ceballos/I2521/chr1/e01/roh.csv
Saved 15 ROH to ./Empirical/ceballos/I2521/chr2/e01/roh.csv
Saved 9 ROH to ./Empirical/ceballos/I2521/chr3/e01/roh.csv
Saved 8 ROH to ./Empirical/ceballos/I2521/chr4/e01/roh.csv
Saved 9 ROH to ./Empirical/ceballos/I2521/chr5/e01/roh.csv
Saved 10 ROH to ./Empirical/ceballos/I2521/chr6/e01/roh.csv
Saved 8 ROH to ./Empirical/ceballos/I2521/chr7/e01/roh.csv
Saved 10 ROH to ./Empirical/ceballos/I2521/chr8/e01/roh.csv
Saved 1 ROH to ./Empirical/ceballos/I2521/chr9/e01/roh.csv
Saved 7 ROH to ./Empirical/ceballos/I2521/chr10/e01/roh.csv
Saved 4 ROH to ./Empirical/ceballos/I2521/chr11/e01/roh.csv
Saved 6 ROH to ./Empirical/ceballos/I2521/chr12/e01/roh.csv
Saved 6 ROH to ./Empirical/ceballos/I2521/chr13/e01/roh.csv
Saved 5 ROH to ./Empirical/ceballos/I2521/chr14/e01/roh.csv
Saved 4 ROH to ./Empirical/ceballos/I2521/chr15/e01/roh.csv
Saved 5 ROH to ./Empirical/ceballos/I2521/chr16/e01/roh.csv
Saved 1 ROH to ./Empirical/ceballos/I2521/chr

In [33]:
# Inspect the calls of the selected individual on chromosome 4
on_chr4 = df2["ch"] == 4
df2[on_chr4]

Unnamed: 0,Start,End,StartM,EndM,iid,ch,length,lengthM
34,7300000,49000000,0.185248,0.697731,I2521.mathieson2018.hs37d5.fa.cons.90perc.trim...,4,41700000,0.512483
35,52600000,60100000,0.697889,0.780209,I2521.mathieson2018.hs37d5.fa.cons.90perc.trim...,4,7500000,0.08232
36,77600000,120700000,0.909272,1.286625,I2521.mathieson2018.hs37d5.fa.cons.90perc.trim...,4,43100000,0.377353
37,123600000,125300000,1.310808,1.32186,I2521.mathieson2018.hs37d5.fa.cons.90perc.trim...,4,1700000,0.011052
38,130600000,131700000,1.365177,1.375321,I2521.mathieson2018.hs37d5.fa.cons.90perc.trim...,4,1100000,0.010144
39,151000000,152100000,1.543699,1.546018,I2521.mathieson2018.hs37d5.fa.cons.90perc.trim...,4,1100000,0.002319
40,170900000,172000000,1.760142,1.767402,I2521.mathieson2018.hs37d5.fa.cons.90perc.trim...,4,1100000,0.00726
41,172600000,173700000,1.771628,1.780137,I2521.mathieson2018.hs37d5.fa.cons.90perc.trim...,4,1100000,0.008508


In [32]:
# Total ROH length above each cutoff. lengthM is built from the cM columns
# divided by 100, so the cutoffs are on that same scale.
for c in [0.04,0.08,0.12,0.2]:
    longer = df2["lengthM"] > c
    d = np.sum(df2.loc[longer, "lengthM"])
    print(f"ROH>{c}: {d}")

ROH>0.04: 3.323087629562328
ROH>0.08: 3.140473442863358
ROH>0.12: 3.0581537653606494
ROH>0.2: 2.587069508011721


# Area 51

In [4]:
# Combined per-individual ROH table (tab separated)
path_combined = "./Empirical/1240k/MarcusAncs/combined_roh05.csv"
df_m = pd.read_csv(path_combined, sep="\t")

In [10]:
# Row(s) of the combined table whose iid contains "I2521"
is_i2521 = df_m["iid"].str.contains("I2521")
df_m[is_i2521]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,study,clst_alt,period_alt,include_alt,clst,mean_cov,med_cov,n_cov_snp_read,full_iid,n_cov_snp
1,I2521,Balkans-MNCA,68.356599,328.405093,14,299.530998,9,299.530998,9,265.892696,...,Mathieson et al. 2018,Balkans,N,1,Balkans-MNCA,4.805385,2.0,775852.0,I2521,775852


In [45]:
# Print the precomputed "sum_roh>X" values of I2521 for the same cutoffs as above.
# The row selection does not depend on the cutoff, so it is done once.
row_i2521 = df_m[df_m["iid"].str.contains("I2521")]
for c in [0.04,0.08,0.12,0.2]:
    # Column names carry the cutoff times 100 as an integer ("sum_roh>4", ...).
    # round() instead of int(): int() truncates, and a float product such as
    # 0.29*100 == 28.999999999999996 would silently select the wrong column.
    d = row_i2521[f"sum_roh>{round(c*100)}"].values[0]
    print(f"ROH>{c}: {d}")

ROH>0.04: 328.405093
ROH>0.08: 299.530998
ROH>0.12: 299.530998
ROH>0.2: 265.89269599999994


In [135]:
# iids of the 50 individuals with the largest "sum_roh>20" value
ranked_by_long_roh = df_m.sort_values(by="sum_roh>20", ascending=False)
iids = ranked_by_long_roh.iloc[:50]["iid"]

In [138]:
# Show the 50 rows with the largest "sum_roh>20" value
df_m.sort_values(by="sum_roh>20", ascending=False).iloc[:50]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,study,clst_alt,period_alt,include_alt,clst,mean_cov,med_cov,n_cov_snp_read,full_iid,n_cov_snp
1,I2521,Balkans-MNCA,68.356599,328.405093,14,299.530998,9,299.530998,9,265.892696,...,Mathieson et al. 2018,Balkans,N,1,Balkans-MNCA,4.805385,2.0,775852.0,I2521,775852
0,MA89,Sar-ECA,40.350711,402.742118,25,380.444419,21,307.097828,14,179.34333,...,Marcus et al. 2018,Sar,Fil,1,Sar-ECA,2.391523,2.0,879353.0,MA89,879353
2,I1131,Balkans-MNCA,39.564692,290.128077,19,247.52278,12,229.526279,10,168.055276,...,Mathieson et al. 2018,Balkans,N,1,Balkans-MNCA,2.732549,1.0,741304.0,I1131,741304
6,I9006,Myc-BA,62.245902,215.232998,11,189.613403,7,169.344593,5,133.492599,...,Lazaridis 2017,Mycenaean,BA,1,Myc-BA,1.389786,0.0,357533.0,I9006,357533
4,I2606,GB-LN,35.242498,246.37099,16,218.319189,11,197.068489,9,117.157989,...,Olalde et al. 2018,England,N,1,GB-LN,1.182767,1.0,652314.0,I2606,652314
8,I4068,Netherlands-BA,54.713997,192.621688,12,174.248491,9,136.088499,5,105.571898,...,Olalde et al. 2018,The Netherlands,Bk,1,Netherlands-BA,0.794659,0.0,450148.0,I4068,450148
13,I2597,GB-EBA,36.6283,121.454683,6,112.610594,4,103.9179,3,103.9179,...,Olalde et al. 2018,England,EBA,1,GB-EBA,3.133947,2.0,796671.0,I2597,796671
9,I9005,Minoan-BA,57.713205,149.654708,7,139.057807,5,117.944108,3,101.902304,...,Mathieson et al. 2018,Minoan,BA,1,Minoan-BA,1.341981,0.0,383696.0,I9005,383696
7,I4916,Iron_Gates-HG,45.426095,198.441118,16,164.355296,9,135.331983,6,92.306293,...,Mathieson et al. 2018,Iron_Gates,HG,1,Iron_Gates-HG,2.44595,1.0,774805.0,I4916,774805
25,I7278,CE-EBA,27.917099,89.01299,5,84.25389,4,84.25389,4,71.545094,...,Olalde et al. 2018,Central Europe,Bk,1,CE-EBA,3.39922,2.0,767989.0,I7278,767989


In [None]:
# Load the table of all individuals (tab separated)
df_g = pd.read_csv("./Empirical/roh_all_inds_final_v42.1.csv", sep="\t")
# Only the last expression of a cell is rendered automatically; without
# display() the result of this filter was computed and then thrown away.
display(df_g[df_g["iid"].str.contains("Kosten")])
df_g.head(3)

In [None]:
### Test whether Ceballos Individuals are covered
# For every selected iid, print how many rows of the Ceballos table match it.
# NOTE(review): the loop rebinds the notebook-level names `iid` and `df2`
# that earlier cells set for I2521 - re-run those cells before reusing them.
for iid in iids:
    print(iid)
    has_iid = df1["iid"].str.contains(iid)
    df2 = df1[has_iid]
    print(len(df2))