# Stats for populations
This notebook uses the v42 data release.

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd

import socket
import os as os
import sys as sys
import multiprocessing as mp

### Plot text in Arial
from matplotlib import rcParams
rcParams['font.sans-serif'] = ['Arial']  # Arial has to be installed locally (it is on cluster for Harald)
rcParams['font.family'] = 'sans-serif'   # Make the sans-serif list above the default family

### Locate the HAPSBURG checkout for the machine this notebook runs on
socket_name = socket.gethostname()
print(socket_name)

# Stop right away on a host for which no repository path is configured
if not (socket_name == "VioletQueen" or socket_name.startswith(("midway2", "Harald-Laptop"))):
    raise RuntimeWarning("Not compatible machine. Check!!")

if socket_name.startswith("Harald-Laptop"):
    print("Harald laptop detected.")
    path = "/home/hringbauer/git/HAPSBURG/"  # The Path on Harald's laptop
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else:
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine (VioletQueen)

os.chdir(path)  # The relative "./Empirical/..." paths used below are resolved from here
print(os.getcwd())  # Show the current working directory (the HAPSBURG folder chosen above)
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Function to print ROH summary stats

In [2]:
def give_roh_stats(df, cm1, cm2=0):
    """Print summary statistics of the summed ROH per individual in one length bin.

    The values are taken from the precomputed columns of df: column
    "sum_roh>{cm1}", from which column "sum_roh>{cm2}" is subtracted
    whenever cm2 > 0. With the default cm2=0 nothing is subtracted,
    i.e. the bin has no upper bound.

    Prints the number of individuals, the mean, the median, and how many
    individuals have a value of exactly 0 in the bin.

    Returns the per-individual values (one entry per row of df)."""
    lower_col = f"sum_roh>{cm1}"
    # Subtracting the longer-block sum leaves only the blocks inside the bin
    roh_in_bin = df[lower_col] - df[f"sum_roh>{cm2}"] if cm2 > 0 else df[lower_col]

    n_inds = len(roh_in_bin)
    print(f"# Individuals: {n_inds}")
    print(f"Mean: {np.mean(roh_in_bin):.4f}")
    print(f"Median: {np.median(roh_in_bin):.4f}")

    # Individuals without any ROH in the requested bin
    is_zero = roh_in_bin == 0
    print(f"No ROH in bin: n={np.sum(is_zero)}/{len(is_zero)}")
    return roh_in_bin

# Populations older than 10k years

In [35]:
# Per-individual ROH table (tab-separated)
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

Loaded 3732 Individuals


In [None]:
### Analyze individuals with an age above 10000 (years BP)
df_anc = df1.loc[df1["age"] > 10000]
cm_vec = give_roh_stats(df_anc, 4, cm2=8)

In [None]:
# Show the ancient individuals ordered by the "sum_roh>4" column, smallest first
df_anc.sort_values("sum_roh>4", ascending=True)

# Hunter Gatherers in General

In [31]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

# Hunter gatherers with an age above 8000
df_t = df1[(df1["economy"] == "Hunter Gatherer") & (df1["age"] > 8000)]
cm_vec = give_roh_stats(df_t, 4, cm2=8)

Loaded 3732 Individuals
# Individuals: 64
Mean: 29.4594
Median: 22.3342
No ROH in bin: n=3/64


### Populations between 10-8k years

In [4]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

# Hunter gatherers with an age strictly between 8000 and 10000
df_t = df1[
    (df1["economy"] == "Hunter Gatherer")
    & (df1["age"] > 8000)
    & (df1["age"] < 10000)
]

Loaded 3732 Individuals


In [5]:
# Summary of the bin bounded by the "sum_roh>4" and "sum_roh>8" columns
cm_vec = give_roh_stats(df_t, 4, 8)
# To inspect the individuals themselves: df_t.sort_values(by="sum_roh>4")

# Individuals: 39
Mean: 18.5735
Median: 14.2146
No ROH in bin: n=3/39


### Long ROH in San Nicolas

In [None]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

# Keep every individual whose population label contains "SanNicolas"
df_t = df1.loc[df1["pop"].str.contains("SanNicolas")]
df_t

### Anatolian Neolithic Farmer populations

In [32]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

# Narrow down in two steps: agricultural economy first, then "Anatolia_N" labels
df_t = df1.loc[df1["economy"] == "Agricultural"]
df_t = df_t.loc[df_t["pop"].str.contains("Anatolia_N")]
cm_vec = give_roh_stats(df_t, 4, cm2=8)

Loaded 3732 Individuals
# Individuals: 23
Mean: 1.9081
Median: 0.0000
No ROH in bin: n=18/23


### Anatolian aceramic Farmer populations

In [50]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

# Narrow down in two steps: aceramic farmers first, then "Anatolia" labels
df_t = df1.loc[df1["economy"] == "Aceramic Farmer"]
df_t = df_t.loc[df_t["pop"].str.contains("Anatolia")]
cm_vec = give_roh_stats(df_t, 4, cm2=8)
df_t

Loaded 3732 Individuals
# Individuals: 6
Mean: 28.8752
Median: 36.7019
No ROH in bin: n=0/6


# Iberia Middle Neolithic

In [18]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

# NOTE(review): the section heading says "Middle Neolithic" while the filter
# below matches "Iberia_EN" labels - confirm which one is intended
df_t = df1.loc[df1["economy"] == "Agricultural"]
df_t = df_t.loc[df_t["pop"].str.contains("Iberia_EN")]
cm_vec = give_roh_stats(df_t, 4, cm2=8)
np.median(cm_vec)  # Displayed at full precision as the cell output

Loaded 3732 Individuals
# Individuals: 7
Mean: 30.4273
Median: 32.8301
No ROH in bin: n=0/7


32.83009500000001

### Steppe Pastoralists

In [66]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

# Pastoralists with an age strictly between 4000 and 5000
df_t = df1[
    (df1["economy"] == "Pastoralist")
    & (df1["age"] > 4000)
    & (df1["age"] < 5000)
]
cm_vec = give_roh_stats(df_t, 4, cm2=8)

Loaded 3732 Individuals


# Pakistan Middle Age

In [29]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
# NOTE(review): the section heading says "Middle Age" but this keeps only rows
# with age == 0, the same filter as the present-day Pakistan cell - confirm
df_t = df1.loc[df1["region"].str.contains("Pakistan") & (df1["age"] == 0)]

In [None]:
#df_t[df_t["pop"].str.contains("Pakistan_IA")][["pop", "age","lat", "lon"]].sort_values(by="lat")

### Present-day Pakistan

In [8]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
# Rows from a "Pakistan" region with age == 0, then only the "Kalash" labels
df_t = df1.loc[df1["region"].str.contains("Pakistan") & (df1["age"] == 0)]
df_t = df_t.loc[df_t["pop"].str.contains("Kalash")]

In [10]:
# cm2 is left at its default of 0, so nothing is subtracted from "sum_roh>20"
cm_vec = give_roh_stats(df_t, cm1=20)

# Individuals: 18
Mean: 9.8760
Median: 0.0000
No ROH in bin: n=13/18


In [12]:
cm_vec.shape[0]  # Number of individuals in the vector returned above

18

# San Nicolas

In [4]:
# Reload the full per-individual table so this section runs on its own
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

Loaded 3732 Individuals


In [None]:
# Display every individual whose population label contains "Nicolas"
df1.loc[df1["pop"].str.contains("Nicolas")]

# Order all ROH

In [None]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

# Whole table, largest "sum_roh>12" value first
df1.sort_values("sum_roh>12", ascending=False)

### Steppe Cultures

# Area 51

In [34]:
# Reload the full per-individual table for the exploratory cells below
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
print(f"Loaded {df1.shape[0]} Individuals")

Loaded 3732 Individuals


In [None]:
# Number of individuals per population label among "Levant" rows with age == 0
df1.loc[(df1["region"].str.contains("Levant")) & (df1["age"] == 0), "pop"].value_counts()

In [None]:
# "Levant" rows with age > 0, sorted by age; show the last 160 rows of that ordering
df1.loc[df1["region"].str.contains("Levant") & (df1["age"] > 0)].sort_values("age").tail(160)