# Stats for populations
Use v42 data release here

In [8]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd

import socket
import os as os
import sys as sys
import multiprocessing as mp

### Use Arial for all figure text
from matplotlib import rcParams
rcParams.update({
    'font.family': 'sans-serif',   # Set the default font family
    'font.sans-serif': ['Arial'],  # Arial must be installed (it is on the cluster for Harald)
})

# Pick the HAPSBURG project folder that belongs to the machine running this kernel.
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The path on the Midway cluster
elif socket_name.startswith("Harald-Laptop"):
    print("Harald laptop detected.")
    path = "/home/hringbauer/git/HAPSBURG/"  # The path on Harald's laptop
elif socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"  # The path on Harald's machine
else:
    # Unknown host: stop here rather than continue without a project path
    raise RuntimeWarning("Not compatible machine. Check!!")

# Change into the project folder; the cells below read files via relative "./Empirical/..." paths
os.chdir(path)
print(os.getcwd())  # Show the current working directory (the folder selected above)
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Function to print ROH summary stats

In [3]:
def give_roh_stats(df, cm1, cm2=0):
    """Print summary stats of the summed ROH per individual and return the values.

    Reads the column ``sum_roh>{cm1}`` of df and, only if cm2 > 0, subtracts
    the column ``sum_roh>{cm2}`` from it, which leaves the sum over ROH blocks
    between cm1 and cm2 cm long. With the default cm2=0 nothing is subtracted
    and the ``sum_roh>{cm1}`` column is used as is.

    Prints the number of rows, the mean, the median and how many rows are
    exactly 0. Returns the per-row vector that the stats were computed on."""
    lower = df[f"sum_roh>{cm1}"]
    # Only subtract the upper column when an upper bound was actually given
    roh_in_bin = lower - df[f"sum_roh>{cm2}"] if cm2 > 0 else lower

    n_rows = len(roh_in_bin)
    print(f"# Individuals: {n_rows}")
    print(f"Mean: {np.mean(roh_in_bin):.4f}")
    print(f"Median: {np.median(roh_in_bin):.4f}")

    # Rows without any ROH in the requested bin
    is_zero = roh_in_bin == 0
    print(f"No ROH in bin: n={np.sum(is_zero)}/{len(is_zero)}")
    return roh_in_bin

# Populations older than 10k years

In [9]:
# Read the v42 per-individual ROH table (tab-separated, path relative to the working directory)
df1 = pd.read_csv(
    "./Empirical/roh_all_inds_final_v42.csv",
    sep='\t',
)
print(f"Loaded {df1.shape[0]} Individuals")

Loaded 3732 Individuals


In [12]:
### Analyze individuals with age > 10000 (section: populations older than 10k years)
df_anc = df1.loc[df1["age"] > 10000]
# Summed ROH in the cm1=4 to cm2=8 bin for each of these individuals
cm_vec = give_roh_stats(df_anc, 4, 8)
cm_vec

# Individuals: 43
Mean: 58.5418
Median: 54.6003
No ROH in bin: n=0/43


1        79.637618
4        85.807974
6        83.518210
17       54.600319
21       29.889899
26       58.481305
33      139.802338
36       90.924997
39       65.861404
40      174.326384
45      150.097824
55       59.542312
75       80.381389
85       39.840312
112      25.771197
128      13.217614
166      30.013402
199      29.058105
219      14.039600
242      29.301394
247      56.603412
285      33.822387
336      79.766186
337      38.737098
420      30.923099
482      63.446514
502      10.646403
505      28.167613
514      50.969411
545      41.023893
573      20.446998
599      95.478908
605      14.769305
624     104.788499
638      17.252407
660      21.525019
661      33.905602
662      75.908400
938      99.006383
956     120.105108
1290     73.191812
1355     68.081001
3460      4.618204
dtype: float64

In [11]:
# Rank the age > 10000 individuals (df_anc) from least to most summed ROH in the sum_roh>4 column
df_anc.sort_values("sum_roh>4", ascending=True)

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy
219,TAF014,Morocco_Iberomaurusian,5.383801,14.0396,3,0.0,0,0.0,0,0.0,...,-2.410889,14500.0,vandeLoosdrechtScience2018,Morocco_Iberomaurusian,5.801154,986561,1,North Africa,purple,Hunter Gatherer
3460,ZHJ_BON024.A0101_Luk84,Anatolia_N,12.0234,16.641604,2,12.0234,1,12.0234,1,0.0,...,32.866667,10190.0,FeldmanNatureCommunications2019,Anatolia_N,0.629825,529709,1,Aegan,plum,Aceramic Farmer
573,I5240,Serbia_Mesolithic_IronGates,5.741501,20.446998,4,0.0,0,0.0,0,0.0,...,22.010568,10805.0,MathiesonNature2018,Serbia_Mesolithic_IronGates,2.675,826331,1,Balkans,purple,Hunter Gatherer
660,I5239,Serbia_Mesolithic_IronGates,6.033015,21.525019,4,0.0,0,0.0,0,0.0,...,22.010568,10333.0,MathiesonNature2018,Serbia_Mesolithic_IronGates,2.708,809534,1,Balkans,purple,Hunter Gatherer
128,Bon002.SG,Anatolia_Boncuklu_N.SG,10.415697,23.633311,4,10.415697,1,0.0,0,0.0,...,32.864901,10078.0,KilincCurrentBiology2016,Anatolia_Boncuklu_N.SG,3.95,1077088,1,Aegan,plum,Aceramic Farmer
605,I5241,Serbia_Mesolithic_IronGates_daughter_I5236,14.649093,29.418398,4,14.649093,1,14.649093,1,0.0,...,22.010568,11196.0,MathiesonNature2018,Serbia_Mesolithic_IronGates_daughter_I5236,2.708,818133,1,Balkans,purple,Hunter Gatherer
199,TAF013,Morocco_Iberomaurusian,8.042896,37.101001,7,8.042896,1,0.0,0,0.0,...,-2.410889,14500.0,vandeLoosdrechtScience2018,Morocco_Iberomaurusian,7.166829,998988,1,North Africa,purple,Hunter Gatherer
420,ZBC_IPB001.B-C0101_Luk2-Pinarbasi,Anatolia_Epipaleolithic,13.088405,44.011504,6,13.088405,1,13.088405,1,0.0,...,33.033333,15405.0,FeldmanNatureCommunications2019,Anatolia_Epipaleolithic,2.350848,863964,1,Aegan,purple,Hunter Gatherer
638,I1819,Ukraine_Mesolithic,12.8437,47.712309,6,30.459902,3,12.8437,1,0.0,...,35.276389,10643.0,MathiesonNature2018,Ukraine_Mesolithic,3.435,813114,1,Black Sea,purple,Hunter Gatherer
112,Kostenki14,Russia_Kostenki14,13.4772,48.336097,7,22.5649,2,13.4772,1,0.0,...,39.3,37470.0,FuNature2016,Russia_Kostenki14,16.136,1095867,1,Steppe,purple,Hunter Gatherer


### Populations between 10-8k years

In [27]:
# Hunter Gatherers with 8000 < age < 10000 (both bounds exclusive)
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep='\t')
print(f"Loaded {len(df1)} Individuals")
df_t = df1.loc[
    (df1["economy"] == "Hunter Gatherer")
    & (df1["age"] > 8000)
    & (df1["age"] < 10000)
]

Loaded 3732 Individuals


In [33]:
# Summed ROH in the cm1=4 to cm2=8 bin for the individuals selected in df_t
cm_vec = give_roh_stats(df_t, 4, 8)
#df_t.sort_values(by="sum_roh>4")

# Individuals: 39
Mean: 18.5735
Median: 14.2146
No ROH in bin: n=3/39


### Long ROH in San Nicolas

In [None]:
# Keep every individual whose population label contains "SanNicolas"
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep='\t')
print(f"Loaded {len(df1)} Individuals")
df_t = df1.loc[df1["pop"].str.contains("SanNicolas")]
df_t

### Anatolian Neolithic Farmer populations

In [23]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep='\t')
print(f"Loaded {len(df1)} Individuals")

# Step 1: rows whose economy is "Agricultural"
df_t = df1.loc[df1["economy"] == "Agricultural"]
# Step 2: of those, population labels containing "Anatolia_N"
df_t = df_t.loc[df_t["pop"].str.contains("Anatolia_N")]

# cm2 is left at its default of 0, so the whole sum_roh>4 column is summarised
cm_vec = give_roh_stats(df_t, 4)

Loaded 3732 Individuals
# Individuals: 23
Mean: 3.2379
Median: 0.0000
No ROH in bin: n=15/23


### Anatolian aceramic Farmer populations

In [50]:
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep='\t')
print(f"Loaded {len(df1)} Individuals")

# Step 1: rows whose economy is "Aceramic Farmer"
df_t = df1.loc[df1["economy"] == "Aceramic Farmer"]
# Step 2: of those, population labels containing "Anatolia"
df_t = df_t.loc[df_t["pop"].str.contains("Anatolia")]

# Summed ROH in the cm1=4 to cm2=8 bin, then show the selected rows
cm_vec = give_roh_stats(df_t, 4, 8)
df_t

Loaded 3732 Individuals
# Individuals: 6
Mean: 28.8752
Median: 36.7019
No ROH in bin: n=0/6


### Steppe Pastoralists

In [66]:
# Pastoralists with 4000 < age < 5000 (both bounds exclusive)
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep='\t')
print(f"Loaded {len(df1)} Individuals")
df_t = df1.loc[
    (df1["economy"] == "Pastoralist")
    & (df1["age"] > 4000)
    & (df1["age"] < 5000)
]
# Summed ROH in the cm1=4 to cm2=8 bin for these individuals
cm_vec = give_roh_stats(df_t, 4, 8)

Loaded 3732 Individuals


# Pakistan Middle Age

In [29]:
# Rows whose region contains "Pakistan" and whose age is exactly 0
# NOTE(review): the section header says "Middle Age" but this keeps only age == 0 - confirm intent
df1 = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep='\t')
df_t = df1.loc[df1["region"].str.contains("Pakistan") & (df1["age"] == 0)]

In [None]:
#df_t[df_t["pop"].str.contains("Pakistan_IA")][["pop", "age","lat", "lon"]].sort_values(by="lat")

# San Nicolas

In [4]:
# Read the v42 per-individual ROH table (tab-separated, path relative to the working directory)
df1 = pd.read_csv(
    "./Empirical/roh_all_inds_final_v42.csv",
    sep='\t',
)
print(f"Loaded {df1.shape[0]} Individuals")

Loaded 3732 Individuals


In [7]:
# Show every individual whose population label contains "Nicolas"
df1.loc[df1["pop"].str.contains("Nicolas")]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy
101,SN-44.SG,USA_CA_Early_SanNicolas.SG,65.982402,688.231434,58,525.930033,29,436.336621,20,247.849303,...,-119.539,5337.0,ScheibScience2018,USA_CA_Early_SanNicolas.SG,3.09714,1111229,1,Pacific NW,silver,
236,SN-11.SG,USA_CA_Late_SanNicolas.SG,36.82,325.577445,33,216.50768,13,169.222088,8,87.133599,...,-119.539,1100.0,ScheibScience2018,USA_CA_Late_SanNicolas.SG,1.850441,975879,1,Pacific NW,silver,
3507,SN-13.SG,USA_CA_Late_SanNicolas.SG,59.737399,455.327516,32,387.192213,19,351.594299,15,226.224903,...,-119.539,811.0,ScheibScience2018,USA_CA_Late_SanNicolas.SG,0.580568,509837,1,Pacific NW,silver,
3680,SN-50.SG,USA_CA_Late_SanNicolas.SG,20.095896,256.183607,30,171.953598,13,109.210103,7,20.095896,...,-119.499561,1477.0,ScheibScience2018,USA_CA_Late_SanNicolas.SG,0.44741,420801,1,Pacific NW,silver,


# Order all ROH

In [None]:
df1 = pd.read_csv(
    "./Empirical/roh_all_inds_final_v42.csv",
    sep='\t',
)
print(f"Loaded {df1.shape[0]} Individuals")
# Largest values of the sum_roh>12 column first
df1.sort_values("sum_roh>12", ascending=False)

# Area 51

In [34]:
# Read the v42 per-individual ROH table (tab-separated, path relative to the working directory)
df1 = pd.read_csv(
    "./Empirical/roh_all_inds_final_v42.csv",
    sep='\t',
)
print(f"Loaded {df1.shape[0]} Individuals")

Loaded 3732 Individuals


In [None]:
# Count individuals per population label among rows with region containing "Levant" and age == 0
df1.loc[(df1["region"].str.contains("Levant")) & (df1["age"] == 0), "pop"].value_counts()

In [None]:
# Rows with region containing "Levant" and age > 0, sorted by age; show the last 160 (largest ages)
df1.loc[df1["region"].str.contains("Levant") & (df1["age"] > 0)].sort_values("age").tail(160)