# Stats for populations
This notebook uses the v42 data release (`./Empirical/roh_all_inds_final_v42.csv`).

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd

import socket
import os as os
import sys as sys
import multiprocessing as mp

### Plot with Arial
from matplotlib import rcParams
rcParams['font.sans-serif'] = ['Arial']  # Arial must be installed on the machine (it is on the cluster for Harald)
rcParams['font.family'] = 'sans-serif'   # Use the sans-serif family as the default font

# Select the HAPSBURG checkout that belongs to the machine running this notebook
socket_name = socket.gethostname()
print(socket_name)
if socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The path on the Midway cluster
elif socket_name.startswith("Harald-Laptop"):
    print("Harald laptop detected.")
    path = "/home/hringbauer/git/HAPSBURG/"  # The path on Harald's laptop
elif socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The path on Harald's machine
else:
    # Unknown host: stop before any relative path below gets resolved
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # All relative paths in this notebook are resolved from the HAPSBURG root
print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG folder chosen above
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Function to print ROH summary stats

In [2]:
def give_roh_stats(df, cm1, cm2=0):
    """Print summary statistics of the summed ROH per individual in a
    length bin and return the per-individual values.

    df:  dataframe with a column named "sum_roh>{cm1}" and, if cm2 > 0,
         a column named "sum_roh>{cm2}".
    cm1: lower edge of the bin (selects the column "sum_roh>{cm1}").
    cm2: upper edge of the bin. If cm2 > 0 the column "sum_roh>{cm2}" is
         subtracted, leaving the ROH between cm1 and cm2. With the
         default 0 nothing is subtracted (no upper edge).

    Prints the number of individuals, the mean, the median and how many
    individuals have a value of exactly 0 in the bin.
    Returns the per-individual vector the statistics were computed on."""
    roh_in_bin = df[f"sum_roh>{cm1}"]
    has_upper_edge = cm2 > 0
    if has_upper_edge:
        # Remove everything above the upper edge to keep only the bin cm1-cm2
        roh_above_upper = df[f"sum_roh>{cm2}"]
        roh_in_bin = roh_in_bin - roh_above_upper

    print(f"# Individuals: {len(roh_in_bin)}")
    print(f"Mean: {np.mean(roh_in_bin):.4f}")
    print(f"Median: {np.median(roh_in_bin):.4f}")

    # Individuals without any ROH in the bin
    is_zero = roh_in_bin == 0
    print(f"No ROH in bin: n={np.sum(is_zero)}/{len(is_zero)}")
    return roh_in_bin

# Populations older than 10k years

In [47]:
# Read the v42 per-individual ROH table (tab-separated)
df1 = pd.read_csv(
    "./Empirical/roh_all_inds_final_v42.csv",
    sep="\t",
)
print(f"Loaded {len(df1)} Individuals")

Loaded 3732 Individuals


In [48]:
### Analyze individuals older than 10000 years (age > 10000)
df_anc = df1.loc[df1["age"] > 10000]
# Summed ROH per individual in the 4-8 bin
cm_vec = give_roh_stats(df=df_anc, cm1=4, cm2=8)
#cm_vec

# Individuals: 43
Mean: 58.5418
Median: 54.6003
No ROH in bin: n=0/43


In [None]:
# Show the individuals older than 10k years, ordered by increasing "sum_roh>4"
df_anc.sort_values("sum_roh>4", ascending=True)

### Populations between 10-8k years

In [27]:
# Reload the table so that this cell does not depend on earlier filtering
df1 = pd.read_csv(
    "./Empirical/roh_all_inds_final_v42.csv",
    sep="\t",
)
print(f"Loaded {len(df1)} Individuals")

# Hunter gatherers with 8000 < age < 10000 (both edges excluded)
df_t = df1[
    (df1["economy"] == "Hunter Gatherer")
    & (df1["age"] > 8000)
    & (df1["age"] < 10000)
]

Loaded 3732 Individuals


In [33]:
# Summed ROH per individual in the 4-8 bin for the current selection in df_t
cm_vec = give_roh_stats(df=df_t, cm1=4, cm2=8)
#df_t.sort_values(by="sum_roh>4")

# Individuals: 39
Mean: 18.5735
Median: 14.2146
No ROH in bin: n=3/39


### Long ROH in San Nicolas

In [58]:
# Reload the table so that this cell does not depend on earlier filtering
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep='\t')
print(f"Loaded {len(df1)} Individuals")

# Keep every individual whose population label contains "SanNicolas".
# na=False: a missing label counts as "no match" instead of putting NaN into
# the mask (boolean indexing raises on a mask that contains NaN).
# regex=False: the label is matched as a literal substring.
df_t = df1[df1["pop"].str.contains("SanNicolas", na=False, regex=False)]
df_t

Loaded 3732 Individuals


Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy
101,SN-44.SG,USA_CA_Early_SanNicolas.SG,65.982402,688.231434,58,525.930033,29,436.336621,20,247.849303,...,-119.539,5337.0,ScheibScience2018,USA_CA_Early_SanNicolas.SG,3.09714,1111229,1,Pacific NW,silver,
236,SN-11.SG,USA_CA_Late_SanNicolas.SG,36.82,325.577445,33,216.50768,13,169.222088,8,87.133599,...,-119.539,1100.0,ScheibScience2018,USA_CA_Late_SanNicolas.SG,1.850441,975879,1,Pacific NW,silver,
3507,SN-13.SG,USA_CA_Late_SanNicolas.SG,59.737399,455.327516,32,387.192213,19,351.594299,15,226.224903,...,-119.539,811.0,ScheibScience2018,USA_CA_Late_SanNicolas.SG,0.580568,509837,1,Pacific NW,silver,
3680,SN-50.SG,USA_CA_Late_SanNicolas.SG,20.095896,256.183607,30,171.953598,13,109.210103,7,20.095896,...,-119.499561,1477.0,ScheibScience2018,USA_CA_Late_SanNicolas.SG,0.44741,420801,1,Pacific NW,silver,


### Anatolian Neolithic Farmer populations

In [23]:
# Reload the table so that this cell does not depend on earlier filtering
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep='\t')
print(f"Loaded {len(df1)} Individuals")

# Agricultural individuals whose population label contains "Anatolia_N".
df_t = df1[df1["economy"] == "Agricultural"]
# na=False: a missing label counts as "no match" instead of putting NaN into
# the mask (boolean indexing raises on a mask that contains NaN).
# regex=False: the label is matched as a literal substring.
df_t = df_t[df_t["pop"].str.contains("Anatolia_N", na=False, regex=False)]

# cm2=0: no upper edge, i.e. the statistics use the full "sum_roh>4" column
cm_vec = give_roh_stats(df_t, cm1=4, cm2=0)

Loaded 3732 Individuals
# Individuals: 23
Mean: 3.2379
Median: 0.0000
No ROH in bin: n=15/23


### Anatolian aceramic Farmer populations

In [50]:
# Reload the table so that this cell does not depend on earlier filtering
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep='\t')
print(f"Loaded {len(df1)} Individuals")

# Aceramic farmers whose population label contains "Anatolia".
df_t = df1[df1["economy"] == "Aceramic Farmer"]
# na=False: a missing label counts as "no match" instead of putting NaN into
# the mask (boolean indexing raises on a mask that contains NaN).
# regex=False: the label is matched as a literal substring.
df_t = df_t[df_t["pop"].str.contains("Anatolia", na=False, regex=False)]

# Summed ROH per individual in the 4-8 bin
cm_vec = give_roh_stats(df_t, cm1=4, cm2=8)

Loaded 3732 Individuals
# Individuals: 6
Mean: 28.8752
Median: 36.7019
No ROH in bin: n=0/6


In [46]:
# Display the current selection held in df_t as this cell's output
df_t

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy
128,Bon002.SG,Anatolia_Boncuklu_N.SG,10.415697,23.633311,4,10.415697,1,0.0,0,0.0,...,32.864901,10078.0,KilincCurrentBiology2016,Anatolia_Boncuklu_N.SG,3.95,1077088,1,Aegan,plum,Aceramic Farmer
843,ZHAG_BON004.A0101_Luk10,Anatolia_N,19.8112,84.981384,10,47.030193,3,38.851399,2,0.0,...,32.866667,9900.0,FeldmanNatureCommunications2019,Anatolia_N,1.243703,772944,1,Aegan,plum,Aceramic Farmer
1387,ZKO_BON001.A0101_Luk7,Anatolia_N,9.699196,46.889787,8,9.699196,1,0.0,0,0.0,...,32.866667,10000.0,FeldmanNatureCommunications2019,Anatolia_N,0.742933,592138,1,Aegan,plum,Aceramic Farmer
3460,ZHJ_BON024.A0101_Luk84,Anatolia_N,12.0234,16.641604,2,12.0234,1,12.0234,1,0.0,...,32.866667,10190.0,FeldmanNatureCommunications2019,Anatolia_N,0.629825,529709,1,Aegan,plum,Aceramic Farmer
3461,ZMOJ_BON014.A0101_Luk21,Anatolia_N,6.821099,36.213295,7,0.0,0,0.0,0,0.0,...,32.866667,9900.0,FeldmanNatureCommunications2019,Anatolia_N,0.646215,529618,1,Aegan,plum,Aceramic Farmer
3556,ZHAJ_BON034.A0101_Luk9,Anatolia_N,9.003902,53.063994,9,9.003902,1,0.0,0,0.0,...,32.866667,9900.0,FeldmanNatureCommunications2019,Anatolia_N,0.567341,486511,1,Aegan,plum,Aceramic Farmer


### Steppe Pastoralists

In [66]:
# Reload the table so that this cell does not depend on earlier filtering
df1 = pd.read_csv(
    "./Empirical/roh_all_inds_final_v42.csv",
    sep="\t",
)
print(f"Loaded {len(df1)} Individuals")

# Pastoralists with 4000 < age < 5000 (both edges excluded)
df_t = df1[
    (df1["economy"] == "Pastoralist")
    & (df1["age"] > 4000)
    & (df1["age"] < 5000)
]

Loaded 3732 Individuals


In [75]:
# Summed ROH per individual in the 4-8 bin for the current selection in df_t
cm_vec = give_roh_stats(df=df_t, cm1=4, cm2=8)

# Individuals: 91
Mean: 14.6942
Median: 15.0520
No ROH in bin: n=11/91


In [71]:
# Individuals of the current selection with "sum_roh>20" above 50
df_t.loc[df_t["sum_roh>20"] > 50]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy


# Order all ROH

In [None]:
# Reload the full table and rank all individuals by "sum_roh>12", largest first
df1 = pd.read_csv(
    "./Empirical/roh_all_inds_final_v42.csv",
    sep="\t",
)
print(f"Loaded {len(df1)} Individuals")
df1.sort_values("sum_roh>12", ascending=False)