# Run qpAdm
Goal:
Run qpAdm to address questions about Sardinian demography.
Use a wrapper to run it with various parameters, for various populations.

Use two different sets of outgroups: m13 and anc_euro. Save them in different subfolders.

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

# Plot fonts: sans-serif family, resolved to Arial
# (the font has to be installed on the machine running the notebook).
from matplotlib import rcParams
rcParams["font.family"] = "sans-serif"
rcParams["font.sans-serif"] = ["Arial"]

# Identify the host; only machines whose name starts with "compute-" are accepted.
socket_name = socket.gethostname()
print(socket_name)

if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")
print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project root on the cluster

# Relative paths used further down are resolved against the project root.
os.chdir(path)
print(os.getcwd())  # Shows the working directory after the change
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

# Make the o2bin folder and the project's ./python folder importable,
# then pull in the qpAdm wrapper and the data-frame helpers.
for extra_path in ("/n/groups/reich/hringbauer/o2bin", "./python"):
    sys.path.append(extra_path)
from qpAdm.run_qpadm import qpAdm_run, remove_individuals, get_sub_pops_exact

compute-a-17-76.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Define Some Standard Outgroups

In [2]:
### Population lists reused by the qpAdm_run calls in this notebook
# Reference set passed as right populations (outgroups)
a12 = [
    "Mota", "Ust_Ishim", "Kostenki14", "GoyetQ116-1", "Vestonice16", "MA1",
    "ElMiron", "Villabruna", "EHG", "CHG", "Natufian", "Levant_N",
]
# Distal source populations
distal = ["Anatolia_N", "WHG", "Steppe_EMBA", "Iran_N", "Morocco_EN"]

print(f"# Anc_Euro: {len(a12)} populations")
print(f"# Distal sources: {len(distal)} populations")

# Anc_Euro: 12 populations
# Distal sources: 5 populations


### Get Dataframe with .anno clusters, merged with qpAdm eigenstrat IIDs

In [3]:
# Build `df`: one row per individual of the eigenstrat .ind file, annotated with
# metadata, filtered by SNP coverage and de-duplicated per "Master ID".
min_snp = 30000   # Minimum number of covered SNPs for an individual to be kept
ind_merged="./eigenstrat/combined/punic.v46.3.ind"          # What .ind to load
path_anno = "/n/groups/reich/hringbauer/Data/v46.3.anno.csv"

# The .ind file is whitespace-separated without a header row.
# `sep=r"\s+"` is the supported spelling of the deprecated `delim_whitespace=True`.
df = pd.read_csv(ind_merged, sep=r"\s+", header=None)
df.columns=["iid", "sex", "clst"]
# presumably drops individuals matching "_d" - confirm in qpAdm.run_qpadm
df = remove_individuals(df, remove_list=["_d"])
print(f"Loaded {len(df)} Individuals")

# Left merge keeps every .ind individual, also those without an annotation row.
# Both tables carry a "sex" column, so the merge result holds sex_x and sex_y.
df_meta = pd.read_csv(path_anno, sep=",")
df2 = df_meta[["iid", "Master ID", "loc", "n_cov_snp", "mean_cov", "sex"]]
df = pd.merge(df, df2, on="iid", how="left")

# Drop low-coverage individuals. Rows with missing n_cov_snp are kept,
# because NaN < min_snp evaluates to False.
idx = (df["n_cov_snp"]<min_snp)
# Explicit copy: the fill below must write to an independent frame, not to a
# slice of the merged one (avoids SettingWithCopyWarning).
df = df[~idx].copy()
df["loc"] = df["loc"].fillna("not assigned")
print(f"Filtered to {len(df)} Individuals based on Min SNP: {min_snp}")

# Per "Master ID" keep only the best-covered entry; rows without a
# "Master ID" are never treated as duplicates.
df = df.sort_values(by="n_cov_snp", ascending=False)
dup = (df["Master ID"].duplicated() & ~df["Master ID"].isnull())
df = df[~dup].copy().reset_index(drop=True)
print(f"Filtered to {len(df)} based on duplicates")

Filtering to 1215/1240
Loaded 1215 Individuals
Filtered to 1170 Individuals based on Min SNP: 30000
Filtered to 1095 based on duplicates


## Test qpAdm Modelling Run for single Individual

In [None]:
%%time
iid = "I21197"  # The Sicilian "inbred individual"

# Source sets tried earlier for this individual:
#sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "Israel_MLBA_Canaanite", "Morocco_EN"]
#sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "Israel_MLBA_Canaanite", "I12433"] # Greece_BA_Mycenaean
sources = ["Greece_BA_Mycenaean"]
#"Steppe_EMBA"  "WHG" "Anatolia_N", "Steppe_EMBA", 

# Populations appended to the a12 set on the right side
add_out = ["Israel_Phoenician", "Italy_Sicily_IA_Polizzello", "Tunisia_N"]
leftpops = [iid] + sources   # Target first, then its sources

# Collect all settings of the run in one place, then launch it.
# The log file is named after the dot-joined left populations.
qpadm_kwargs = dict(
    leftpops=leftpops,
    rightpops=a12 + add_out,
    output_file=".".join(leftpops) + ".log",
    input_folder="/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined",
    input_file="punic.v46.3",
    par_file_folder="/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/",
    input_ind_suff="_ind",
    output_folder="/n/groups/reich/hringbauer/git/punic_aDNA/output/qpAdm/v46.3/ind/",
    path_bin_qpAdm="/n/groups/reich/hringbauer/git/AdmixTools/bin/qpAdm",
    all_snps=False,
)
qpAdm_run(**qpadm_kwargs)

print("Finished run!")

Running command: 
/n/groups/reich/hringbauer/git/AdmixTools/bin/qpAdm -p /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/parfile_qpAdm > /n/groups/reich/hringbauer/git/punic_aDNA/output/qpAdm/v46.3/ind/I21197.Greece_BA_Mycenaean.log


# Run Individual 3-Way Sicily IA, Greece BA, NW Africa N

In [27]:
# Wider list of target clusters (currently not used, see the assignment below):
#pops = ["Italy_Sardinia_Punic", "Italy_Sardinia_Punic_Roman", "Italy_Sardinia_BA_Nuragic",
#        "Spain_IA", "Ibiza_Punic.SG", "Italy_Phoenician_Sicily", "Italy_Sicily_Punic",
#        "Italy_Sicily_IA_Polizzello"] # "Greece_BA_Mycenaean"
pops = ["Italy_Sicily_Punic"]

# Subset the metadata table to the chosen clusters
df1 = get_sub_pops_exact(df, pop_list=pops)

# Collect the IIDs to run; further sub-frames can be added to the list
df_all = pd.concat([df1])
all_iids = df_all["iid"].values
print(f"Will run {len(all_iids)} Indvidiuals")

Found: 47/1095
Will run 47 Indvidiuals


In [29]:
# Right-population base set (same twelve populations as in the outgroup cell near the top)
a12 = [
    "Mota", "Ust_Ishim", "Kostenki14", "GoyetQ116-1", "Vestonice16", "MA1",
    "ElMiron", "Villabruna", "EHG", "CHG", "Natufian", "Levant_N",
]
sources = ["Italy_Sicily_IA_Polizzello", "Greece_BA_Mycenaean", "Tunisia_N"]
add_out = ["Anatolia_N", "Steppe_EMBA", "Israel_Phoenician"]

# Settings shared by every individual of this model
fixed_kwargs = dict(
    input_folder="/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined",
    input_file="punic.v46.3",
    par_file_folder="/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/",
    input_ind_suff="_ind",
    output_folder="/n/groups/reich/hringbauer/git/punic_aDNA/output/qpAdm/v46.3/3waySicGreAfr/",
    path_bin_qpAdm="/n/groups/reich/hringbauer/git/AdmixTools/bin/qpAdm",
    all_snps=True,
)

# One qpAdm run per individual: the individual is the target, followed by the three sources
for iid in all_iids[:]:
    print(f"Running Individual {iid}")
    leftpops = [iid] + sources
    qpAdm_run(leftpops=leftpops,
              rightpops=a12 + add_out,
              output_file=".".join(leftpops) + ".log",
              **fixed_kwargs)
    print("Finished run!")

Running Individual I24553
/n/groups/reich/hringbauer/git/punic_aDNA
Running command: 
/n/groups/reich/hringbauer/git/AdmixTools/bin/qpAdm -p /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/parfile_qpAdm > /n/groups/reich/hringbauer/git/punic_aDNA/output/qpAdm/v46.3/3waySicGreAfr/I24553.Italy_Sicily_IA_Polizzello.Greece_BA_Mycenaean.Tunisia_N.log
Runtime: 65.866350
Finished run!
Running Individual I12844
/n/groups/reich/hringbauer/git/punic_aDNA
Running command: 
/n/groups/reich/hringbauer/git/AdmixTools/bin/qpAdm -p /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/parfile_qpAdm > /n/groups/reich/hringbauer/git/punic_aDNA/output/qpAdm/v46.3/3waySicGreAfr/I12844.Italy_Sicily_IA_Polizzello.Greece_BA_Mycenaean.Tunisia_N.log
Runtime: 65.624856
Finished run!
Running Individual I24555
/n/groups/reich/hringbauer/git/punic_aDNA
Running command: 
/n/groups/reich/hringbauer/git/AdmixTools/bin/qpAdm -p /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/parfile_qpAdm > /n/groups/reich/hringbau

In [None]:
%%time
# Stub cell: only sets the target individual; no qpAdm run is started here.
# NOTE(review): the trailing comment below is identical to the one attached to
# I21197 in the single-individual test cell - confirm which individual it describes.
iid = "I4799"  # The Sicilian "inbred individual"

# Candidate source sets (disabled):
#sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "Israel_MLBA_Canaanite", "Morocco_EN"]
#sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "Israel_MLBA_Canaanite", "I12433"] # Greece_BA_Mycenaean


# Run 1-Way Model Sicily IA

# Run distal Models

In [4]:
### Load the individual IIDs
# NOTE(review): `load_iids_from_indfile` is not among the names imported in the
# setup cell of this notebook - confirm where it is defined before re-running.
# NOTE(review): IIDs are looked up in the v43 .ind file while qpAdm is run on
# "punic.v44" - confirm this is intended.
path_ind = "./eigenstrat/combined/punic1.v43.ind"
outfolder="./output/qpAdm/5wayAll/"

pops = ["Iberia_Tartessian", "Iberia_Iberian", "Iberia_Celtiberian", "Nuragic",
        "Sicily_Phoenician", "Sicily_IA_Polizzello"]
pops = ["Sicani"]   # Replaces the list above
all_iids = np.concatenate([load_iids_from_indfile(path_ind=path_ind, string=pop)
                           for pop in pops])
all_iids = ["I4246"] ### For testing

pops_tar, pops_ib, pops_clt = [], [], []

sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "Iran_N", "Morocco_EN"]
add_out = []

# Settings shared by every individual of this model
fixed_kwargs = dict(
    input_folder="./eigenstrat/combined/",
    input_file="punic.v44",
    par_file_folder="./parfiles/",
    input_ind_suff="_mod_ib",
    output_folder=outfolder,
    path_bin_qpAdm="/n/groups/reich/hringbauer/o2bin/qpAdm",
    all_snps=True,
)

print(f"Will run {len(all_iids)} Individuals")
for iid in all_iids[:]:
    print(f"Running Individual {iid}")
    leftpops = [iid] + sources
    # Log name here is "<iid>.5way.log" rather than the joined left populations
    qpAdm_run(leftpops=leftpops,
              rightpops=a12 + add_out,
              output_file=str(iid) + ".5way.log",
              **fixed_kwargs)

Will run 4 Individuals
Running Individual I13140
Runtime: 75.193629
Running Individual I13125
Runtime: 73.936484
Running Individual I13128
Runtime: 73.976746
Running Individual I13142
Runtime: 73.878589


## Run models with joined output name

In [8]:
### Load the individual IIDs
# NOTE(review): `load_iids_from_indfile` is not among the names imported in the
# setup cell of this notebook - confirm where it is defined before re-running.
path_ind = "./eigenstrat/combined/punic1.v43.ind"

pops = ["Sardinia", "Algeria_N", "Punic"]
all_iids = np.concatenate([load_iids_from_indfile(path_ind=path_ind, string=pop)
                           for pop in pops])

# Explicit list of individuals; replaces the IIDs loaded above
all_iids = ["I18195", "MSR002", "I4798", "I12666"]

sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "I12433", "Israel_MLBA_Canaanite"]
add_out = ["Iran_N"]

# Settings shared by every individual of this model
fixed_kwargs = dict(
    input_folder="./eigenstrat/combined/",
    input_file="punic.v44",
    par_file_folder="./parfiles/",
    input_ind_suff="_mod_ib",
    output_folder="./output/qpAdm/5way_Canaanite/",
    path_bin_qpAdm="/n/groups/reich/hringbauer/o2bin/qpAdm",
    all_snps=True,
)

for iid in all_iids:
    leftpops = [iid] + sources
    # Log file is named after the dot-joined left populations
    qpAdm_run(leftpops=leftpops,
              rightpops=a12 + add_out,
              output_file=".".join(leftpops) + ".log",
              **fixed_kwargs)

Runtime: 52.080487
Runtime: 52.311518
Runtime: 52.035265
Runtime: 52.738554


## Run 2/3-way Models for African - European Admixture

In [6]:
### Load the individual IIDs
# NOTE(review): `load_iids_from_indfile` is not among the names imported in the
# setup cell of this notebook - confirm where it is defined before re-running.
pops = ["Iberia_North_BA_Africa_all", "Algeria_N",
        "Morocco_LN", "Punic_oAfrican", "Sardinia"]

path_ind = "./eigenstrat/combined/punic1.v43.ind"

all_iids = np.concatenate([load_iids_from_indfile(path_ind=path_ind, string=pop)
                           for pop in pops])
#all_iids = ["I4246"] #The Iberian Outlier

# Settings shared by every individual of this model
fixed_kwargs = dict(
    input_folder="./eigenstrat/combined/",
    input_file="punic0.v43",
    par_file_folder="./parfiles/",
    input_ind_suff="_mod1",
    output_folder="./output/qpAdm/3wayNAf/",
    path_bin_qpAdm="/n/groups/reich/hringbauer/o2bin/qpAdm",
    all_snps=True,
)

for iid in all_iids:
    print(f"Doing Individual: {iid}")
    sources = ["Anatolia_N", "WHG", "Morocco_EN"]
    add_out = ["Iran_N"]
    leftpops = [iid] + sources

    qpAdm_run(leftpops=leftpops,
              rightpops=a12 + add_out,
              output_file=".".join(leftpops) + ".log",
              **fixed_kwargs)

Doing Individual: MSR002
Runtime: 32.329889
Doing Individual: MSR003
Runtime: 32.197911
Doing Individual: ORC002
Runtime: 32.199739
Doing Individual: VIL004
Runtime: 32.198779
Doing Individual: VIL006
Runtime: 32.501195
Doing Individual: VIL007
Runtime: 32.166153
Doing Individual: VIL009
Runtime: 32.191764
Doing Individual: VIL010
Runtime: 32.105941
Doing Individual: VIL011
Runtime: 32.232729


# Run 4-way distal models
- with Israel_MLBA_Canaanite as Outgroup

Add as proxy for North African ancestry:

Morocco_EN, Algeria_N (I12433) or Iberian Outlier (I4246)

In [None]:
# Display the table `df` for inspection
df

In [None]:
### Load the individual IIDs
pops = ["Italy_Sardinia_Punic", "Italy_Sardinia_Punic_Roman", "Italy_Sardinia_BA_Nuragic",
        "Spain_IA", "Ibiza_Punic.SG", "Italy_Phoenician_Sicily", "Italy_Sicily_Punic",
        "Italy_Sicily_IA_Polizzello"] # "Greece_BA_Mycenaean"

# Only referenced by the disabled selections below
pops1 = ["Villaricos", "Menorca"] # "Himera",
iids = ["VIL", "MSR", "I12517|I15434|I8135"]

# Alternative selections (disabled):
#pops = ["Morocco_LN"]
#df1 = get_sub_pops(df, pop_list=pops)
#df2 = get_sub_pops(df, pops1, pop_col="loc")
#df3 = get_sub_iid(df, iids)
# df_meno = get_sub_pops(df, ["Menorca"])
# "Italy_Sardinia_EBA"
# df_cad = get_sub_iid(df, ["I12517|I15434|I8135"])
df1 = get_sub_pops_exact(df, pop_list=pops)

df_all = pd.concat([df1])
all_iids = df_all["iid"].values
# The count is printed before the test override that follows
print(f"Will run {len(all_iids)} Indvidiuals")
all_iids = ["I22112"] # For Testing

# Settings shared by every individual of this model
fixed_kwargs = dict(
    input_folder="./eigenstrat/combined/",
    input_file="punic.v44",
    par_file_folder="./parfiles/",
    input_ind_suff="_ind",   # _mod1
    output_folder="./output/qpAdm/v44/4way_r_canaanite/",
    path_bin_qpAdm="/n/groups/reich/hringbauer/o2bin/qpAdm",
    all_snps=True,
)

for iid in all_iids:
    print(f"Doing Individual: {iid}")
    sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "I12433"]
    add_out = ["Israel_MLBA_Canaanite"]
    leftpops = [iid] + sources

    qpAdm_run(leftpops=leftpops,
              rightpops=a12 + add_out,
              output_file=".".join(leftpops) + ".log",
              **fixed_kwargs)

In [23]:
# Rows of `df` whose cluster label contains "Morocco_LN"
df[df["clst"].str.contains("Morocco_LN")]

Unnamed: 0,iid,sex_x,clst,Master ID,loc,n_cov_snp,mean_cov,sex_y
680,KEB.4.SG,F,Morocco_LN.SG,KEB.4,Kelif el Boroud,156837.0,0.130697,F
711,KEB.1_KEB.8.SG,F,Morocco_LN.SG,KEB.1.plus.8,Kelif el Boroud,135590.0,0.112992,F
794,KEB.6.SG,M,Morocco_LN.SG,KEB.6,Kelif el Boroud,75303.0,0.062753,M


# Run 4-way Source Models including Myc 
Add as proxy for North African ancestry:
Algeria_N (I12433)

In [None]:
### Load the individual IIDs
# NOTE(review): `load_iids_from_indfile` is not among the names imported in the
# setup cell of this notebook - confirm where it is defined before re-running.
pops = ["Iberia_Punic", "Sardinia", "Iberia_North_BA_Africa_all",
        "Morocco_LN", "Sicily_Phoenician", "Ibiza"]

path_ind = "./eigenstrat/combined/punic1.v43.ind"

all_iids = np.concatenate([load_iids_from_indfile(path_ind=path_ind, string=pop)
                           for pop in pops])
#all_iids = ["I7253"] # Do a single Individual

# Settings shared by every individual of this model
fixed_kwargs = dict(
    input_folder="./eigenstrat/combined/",
    input_file="punic1.v43",
    par_file_folder="./parfiles/",
    input_ind_suff="_mod1",
    output_folder="./output/qpAdm/4way_p_Canaanite/",
    path_bin_qpAdm="/n/groups/reich/hringbauer/o2bin/qpAdm",
    all_snps=True,
)

for iid in all_iids:
    print(f"Doing Individual: {iid}")
    sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "I12433"]
    add_out = ["Israel_MLBA_Canaanite"]
    leftpops = [iid] + sources

    qpAdm_run(leftpops=leftpops,
              rightpops=a12 + add_out,
              output_file=".".join(leftpops) + ".log",
              **fixed_kwargs)

# Test Continuity to Mycenaean Phoenician Group

In [12]:
### Load the individual IIDs

# Individuals used in earlier runs (disabled):
#all_iids = ["I18199", "I18201", "I18203", "I18202", "I18189", "I18187"]
#all_iids = ["MSR002", "MSR003"]
all_iids = ["MS10614.SG"]

# Settings shared by every individual of this model
fixed_kwargs = dict(
    input_folder="./eigenstrat/combined/",
    input_file="punic1.v43",
    par_file_folder="./parfiles/",
    input_ind_suff="_mod1",
    output_folder="./output/qpAdm/t774_cont/",
    path_bin_qpAdm="/n/groups/reich/hringbauer/o2bin/qpAdm",
    all_snps=True,
)

for iid in all_iids:
    print(f"Doing Individual: {iid}")
    sources = ["Mycenaean"]   # Single source
    add_out = ["Israel_MLBA_Canaanite", "I12433"]
    leftpops = [iid] + sources

    qpAdm_run(leftpops=leftpops,
              rightpops=a12 + add_out,
              output_file=".".join(leftpops) + ".log",
              **fixed_kwargs)

Doing Individual: MS10614.SG
Runtime: 33.257306


# Run three-way models for Empuries individuals

In [None]:
# NOTE(review): `load_iids_from_indfile` is not among the names imported in the
# setup cell of this notebook - confirm where it is defined before re-running.
pops = ["Iberia_Greek", "Iberia_Hellenistic"]

path_ind = "./eigenstrat/combined/punic1.v43.ind"

all_iids = np.concatenate([load_iids_from_indfile(path_ind=path_ind, string=pop)
                           for pop in pops])

print(f"Will run {len(all_iids)} Individuals")

# Settings shared by every individual of this model
fixed_kwargs = dict(
    input_folder="./eigenstrat/combined/",
    input_file="punic1.v43",
    par_file_folder="./parfiles/",
    input_ind_suff="_mod1",
    output_folder="./output/qpAdm/Empuries_3way/",
    path_bin_qpAdm="/n/groups/reich/hringbauer/o2bin/qpAdm",
    all_snps=True,
)

for iid in all_iids:
    print(f"Doing Individual: {iid}")
    sources = ["Iberia_BA", "Greece_BA_Mycenaean", "I12433"]
    add_out = ["Israel_MLBA_Canaanite"]
    leftpops = [iid] + sources

    qpAdm_run(leftpops=leftpops,
              rightpops=a12 + add_out,
              output_file=".".join(leftpops) + ".log",
              **fixed_kwargs)

In [24]:
# Scratch cell: prints a fixed string and has no effect on the analysis
print("test")

test


# Do 3-way distal models for Iberian Individuals (EEF, Steppe, WHG)

In [18]:
# NOTE(review): `load_iids_from_indfile` is not among the names imported in the
# setup cell of this notebook - confirm where it is defined before re-running.
pops = ["Iberia_BA", "Iberia_IA", "Iberia_Tartessian",
        "Iberia_Iberian", "Iberia_Celtiberian"]
path_ind = "./eigenstrat/combined/punic1.v43.ind"

all_iids = np.concatenate([load_iids_from_indfile(path_ind=path_ind, string=pop)
                           for pop in pops])
#all_iids = ["I18187"] # Do Villaricos Local individual
# Explicit list; replaces the IIDs loaded above
all_iids = ["I8339", "I18203", "I8206", "I8207"] # Empuries Individuals

print(f"Will run {len(all_iids)} Individuals")

# Settings shared by every individual of this model
fixed_kwargs = dict(
    input_folder="./eigenstrat/combined/",
    input_file="punic1.v43",
    par_file_folder="./parfiles/",
    input_ind_suff="_mod_ib",
    output_folder="./output/qpAdm/3wayIberia/",
    path_bin_qpAdm="/n/groups/reich/hringbauer/o2bin/qpAdm",
    all_snps=True,
)

for iid in all_iids[:]:
    print(f"Doing Individual: {iid}")
    sources = ["Anatolia_N", "WHG", "Steppe_EMBA"]
    add_out = []   # No populations beyond a12 on the right side
    leftpops = [iid] + sources

    qpAdm_run(leftpops=leftpops,
              rightpops=a12 + add_out,
              output_file=".".join(leftpops) + ".log",
              **fixed_kwargs)

Will run 1 Individuals
Doing Individual: I8339
Runtime: 51.068431


# Explore the Ind File (semi Area 51)

In [10]:
# Load the v46.3 .ind file (whitespace-separated, no header row) for exploration.
# `sep=r"\s+"` is the supported spelling of the deprecated `delim_whitespace=True`.
df_ind = pd.read_csv("./eigenstrat/combined/punic.v46.3.ind", sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]

In [11]:
# Number of individuals per cluster label in the loaded .ind table
df_ind["clst"].value_counts()

Europe_LNBA                    78
Italy_Sicily_Punic             53
Israel_MLBA                    35
Europe_EN                      29
Europe_MNChL                   28
                               ..
Egypt_Dynastic_Nubian           1
Lebanon_IA3_o2.SG               1
Italy_Sicily_MBA_o3.SG          1
Spain_Punic_Roman_oAfrican3     1
Israel_MLBA_son.I8187           1
Name: clst, Length: 346, dtype: int64

In [17]:
# Individuals per cluster label, restricted to labels containing "Israel_MLBA"
df_ind[df_ind["clst"].str.contains("Israel_MLBA")]['clst'].value_counts()

Israel_MLBA                                 35
Israel_MLBA_in.preparation                   2
Israel_MLBA_o                                2
Israel_MLBA_brother.I10359                   1
Israel_MLBA_sister.I10101                    1
Israel_MLBA_published                        1
Israel_MLBA_brother.I10770                   1
Israel_MLBA_o_brother.I2200                  1
Israel_MLBA_sister.I10101_in.preparation     1
Israel_MLBA_son.I8187                        1
Name: clst, dtype: int64

In [16]:
# Load the v43 .ind file (whitespace-separated, no header row); overwrites `df_ind`.
# `sep=r"\s+"` is the supported spelling of the deprecated `delim_whitespace=True`.
df_ind = pd.read_csv("./eigenstrat/combined/punic1.v43.ind", sep=r"\s+", header=None)
df_ind.columns=["iid", "sex","clst"]

In [17]:
# Rows of the .ind table whose cluster label contains "Poliz"
df_ind[df_ind["clst"].str.contains("Poliz")]

Unnamed: 0,iid,sex,clst
156,I13390,F,Italy_Sicily_IA_Polizzello
157,I13381,F,Italy_Sicily_IA_Polizzello
158,I13376,F,Italy_Sicily_IA_Polizzello
159,I13395,F,Italy_Sicily_IA_Polizzello
160,I13382,M,Italy_Sicily_IA_Polizzello
161,I13377,F,Italy_Sicily_IA_Polizzello
162,I13389,M,Italy_Sicily_IA_Polizzello
163,I13383,M,Italy_Sicily_IA_Polizzello
164,I13393,F,Italy_Sicily_IA_Polizzello
165,I13384,F,Italy_Sicily_IA_Polizzello


# Area 51