# Run apAdm
Goal:
Run qp-Admix for identifying question about Sardinian Demography.
Use a wrapper to run it with various parameters, for various populations.

Use two different set of outgroups: m13 and anc_euro. Save them in different subfolders.

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
# Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-166.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Define Some Standard Outgroups

In [2]:
### Parameters for qpAdm_run are set there!
a12 = ["Mota", "Ust_Ishim", "Kostenki14", "GoyetQ116-1", "Vestonice16", "MA1",
           "ElMiron", "Villabruna", "EHG", "CHG", "Natufian",
           "Levant_N"] # List of ancient European populations for right populations
distal = ["Anatolia_N", "WHG", "Steppe_EMBA", 
          "Iran_N", "Morocco_EN"]
print("# Anc_Euro: %i populations" % len(a12))
print("# Distal sources: %i populations" % len(distal))

# Anc_Euro: 12 populations
# Distal sources: 5 populations


In [3]:
def qpAdm_run(leftpops, rightpops, output_file, 
              input_folder = "./eigenstrat/combined/", 
              input_file="punic0.v43.geno",
              par_file_folder = "./parfiles/", 
              input_ind_suff="_mod1", 
              output_folder ="./output/qpAdm/", 
              path_bin_qpAdm = "/n/groups/reich/hringbauer/o2bin/qpAdm",
              all_snps=False):
    """Run qpAdm. Write temporary parfile and run the analysis
    leftpops: List of left populations
    rightpops: List of right populations
    input_folder: Where to find the input files
    par_file_folder: Folder of the parameterfile
    input_file: The name of the input file
    input_ind_suff: Suffix of .ind file. To allow modified populations
    input_ind: name of the input .ind file. If given - write over default"""
    
    parfile_path = par_file_folder + "parfile_qpAdm"
    left_path, right_path = par_file_folder + "left", par_file_folder + "right"
    
    ### Create the parfile:
    with open(parfile_path, 'w') as f:
        f.write("%s\n" % ("DIR: " + input_folder))
        f.write("%s\n" % ("S1: " + input_file))
        
        indline = "indivname: DIR/S1" + input_ind_suff + ".ind"
        f.write("%s\n" % indline)
        f.write("%s\n" % "snpname: DIR/S1.snp")
        f.write("%s\n" % "genotypename: DIR/S1.geno")
        f.write("%s\n" % ("popleft: " + left_path))
        f.write("%s\n" % ("popright: " + right_path))
        f.write("%s\n" % "details: YES")   
        if all_snps==True:
            f.write("%s\n" % "allsnps: YES")
    
    ### Write leftpops rightpops:       
    with open(left_path, 'w') as f:
        f.write("\n".join(leftpops))
        
    with open(right_path, 'w') as f:
        f.write("\n".join(rightpops))
      
    ### Run qpAdm
    start = time()
    output_path = output_folder + output_file
    !$path_bin_qpAdm -p $parfile_path > $output_path
    end = time()
    print("Runtime: %2f" % (end - start))
    return 0


def load_iids_from_indfile(path_ind, string, 
                           col="clst", col_iid="iid"):
    """Load IIDs from Ind File
    Return List of IIDs"""
    df_ind = pd.read_csv(path_ind, delim_whitespace=True, header=None)
    df_ind.columns=["iid", "sex","clst"]
    idx = df_ind[col].str.contains(string)
    ls = df_ind[idx][col_iid].values
    return ls

### Test distal Modelling Run for single Individual

In [5]:
%%time
#iid = "I12433"  # The Iron Age African
#iid = "I11896"  # The old African one
#iid = "I18400"   # Steppe 1  # The Sardinian Outlier
# Iron Age Nigeria iid = "I15940"
#iid = "I18399"
iid = "I12844"  # The Sicilian "inbred individual"

#sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "Israel_MLBA_Canaanite", "Morocco_EN"]
sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "Israel_MLBA_Canaanite", "I12433"]
sources = ["Israel_MLBA_Canaanite", "I12433"]
#"Steppe_EMBA"  "WHG" "Anatolia_N", "Steppe_EMBA", 

add_out = ["Mycenaean", "Anatolia_N", "Steppe_EMBA"]
leftpops = [iid] + sources

qpAdm_run(leftpops = leftpops, 
          rightpops = a12+add_out, 
          output_file = ".".join(leftpops) + ".log", 
          input_folder = "./eigenstrat/combined/", 
          input_file = "punic1.v43",
          par_file_folder = "./parfiles/", 
          input_ind_suff = "_mod1", 
          output_folder = "./output/qpAdm/diverse/", 
          path_bin_qpAdm = "/n/groups/reich/hringbauer/o2bin/qpAdm",
          all_snps=True)

^C
Runtime: 4.866229
CPU times: user 99.4 ms, sys: 20.4 ms, total: 120 ms
Wall time: 4.95 s


0

In [27]:
print("Finished run!")

Finished run!


### Run several Individuals

# Run distal Models

In [4]:
### Load the individual IIDs
path_ind = "./eigenstrat/combined/punic1.v43.ind"
outfolder="./output/qpAdm/5wayAll/"

pops = ["Iberia_Tartessian", "Iberia_Iberian", "Iberia_Celtiberian", "Nuragic", 
        "Sicily_Phoenician", "Sicily_IA_Polizzello"]
pops = ["Sicani"]
all_iids = [load_iids_from_indfile(path_ind=path_ind, 
                        string=pop) for pop in pops]
all_iids = np.concatenate(all_iids)
#all_iids = ["I4246"]

pops_tar = []
pops_ib = []
pops_clt =  []

sources = ["Anatolia_N", "WHG", "Steppe_EMBA",
           "Iran_N", "Morocco_EN"]
add_out = []

print(f"Will run {len(all_iids)} Individuals")
for iid in all_iids[:]:
    print(f"Running Individual {iid}")
    leftpops = [iid] + sources
    qpAdm_run(leftpops = leftpops, 
          rightpops = a12 + add_out, 
          output_file =  str(iid) + ".5way.log", 
          input_folder = "./eigenstrat/combined/", 
          input_file = "punic1.v43",
          par_file_folder = "./parfiles/", 
          input_ind_suff = "_mod_ib", 
          output_folder = outfolder, 
          path_bin_qpAdm = "/n/groups/reich/hringbauer/o2bin/qpAdm",
          all_snps = True)

Will run 4 Individuals
Running Individual I13140
Runtime: 75.193629
Running Individual I13125
Runtime: 73.936484
Running Individual I13128
Runtime: 73.976746
Running Individual I13142
Runtime: 73.878589


## Run models with joined output name

In [8]:
### Load the individual IIDs
path_ind = "./eigenstrat/combined/punic1.v43.ind"

pops = ["Sardinia", "Algeria_N", "Punic"]
all_iids = [load_iids_from_indfile(path_ind=path_ind, 
                        string=pop) for pop in pops]
all_iids = np.concatenate(all_iids)

all_iids = ["I18195", "MSR002", "I4798", "I12666"]

sources = ["Anatolia_N", "WHG", "Steppe_EMBA", 
           "I12433", "Israel_MLBA_Canaanite"]
add_out = ["Iran_N"]

for iid in all_iids:
    leftpops = [iid] + sources
    qpAdm_run(leftpops = leftpops, 
          rightpops = a12 + add_out, 
          output_file =  ".".join(leftpops) + ".log", 
          input_folder = "./eigenstrat/combined/", 
          input_file = "punic1.v43",
          par_file_folder = "./parfiles/", 
          input_ind_suff = "_mod_ib", 
          output_folder = "./output/qpAdm/5way_Canaanite/", 
          path_bin_qpAdm = "/n/groups/reich/hringbauer/o2bin/qpAdm",
          all_snps = True)

Runtime: 52.080487
Runtime: 52.311518
Runtime: 52.035265
Runtime: 52.738554


## Run 2/3-way Models for African - European Admixture

In [6]:
### Load the individual IIDs
pops = ["Iberia_North_BA_Africa_all", "Algeria_N", 
        "Morocco_LN", "Punic_oAfrican", "Sardinia"]

path_ind = "./eigenstrat/combined/punic1.v43.ind"

all_iids = [load_iids_from_indfile(path_ind=path_ind, 
                        string=pop) for pop in pops]
all_iids = np.concatenate(all_iids)
#all_iids = ["I4246"] #The Iberian Outlier

for iid in all_iids:
    print(f"Doing Individual: {iid}")
    sources = ["Anatolia_N", "WHG", "Morocco_EN"]
    add_out = ["Iran_N"]
    leftpops = [iid] + sources

    qpAdm_run(leftpops = leftpops, 
              rightpops = a12+add_out, 
              output_file = ".".join(leftpops) + ".log", 
              input_folder = "./eigenstrat/combined/", 
              input_file = "punic0.v43",
              par_file_folder = "./parfiles/", 
              input_ind_suff = "_mod1", 
              output_folder = "./output/qpAdm/3wayNAf/", 
              path_bin_qpAdm = "/n/groups/reich/hringbauer/o2bin/qpAdm",
              all_snps=True)

Doing Individual: MSR002
Runtime: 32.329889
Doing Individual: MSR003
Runtime: 32.197911
Doing Individual: ORC002
Runtime: 32.199739
Doing Individual: VIL004
Runtime: 32.198779
Doing Individual: VIL006
Runtime: 32.501195
Doing Individual: VIL007
Runtime: 32.166153
Doing Individual: VIL009
Runtime: 32.191764
Doing Individual: VIL010
Runtime: 32.105941
Doing Individual: VIL011
Runtime: 32.232729


# Run Models with Israle_MLBA_Canaaninte as Outgroup
Add as proxy for North African ancestry:
Morocco_EN, Algeria_N (I12433) or Iberian Outlier (I4246)

In [None]:
### Load the individual IIDs
pops = ["Iberia_Punic", "Sardinia", "Iberia_North_BA_Africa_all",
        "Morocco_LN", "Sicily_Phoenician", "Ibiza",
        "Celtiberian", "Iberia_Iberian", 
        "Iberia_Tartessian", "Nuragic"]

pops = ["Sicily_IA_Polizzello"]

path_ind = "./eigenstrat/combined/punic1.v43.ind"

all_iids = [load_iids_from_indfile(path_ind=path_ind, 
                        string=pop) for pop in pops]

all_iids = np.concatenate(all_iids)[:1]

for iid in all_iids:
    print(f"Doing Individual: {iid}")
    sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "I12433"]
    add_out = ["Israel_MLBA_Canaanite"]
    leftpops = [iid] + sources

    qpAdm_run(leftpops = leftpops, 
              rightpops = a12 + add_out, 
              output_file = ".".join(leftpops) + ".log", 
              input_folder = "./eigenstrat/combined/", 
              input_file = "punic1.v43",
              par_file_folder = "./parfiles/", 
              input_ind_suff = "_mod_ib",   # _mod1
              output_folder = "./output/qpAdm/4way_p_Canaanite/", 
              path_bin_qpAdm = "/n/groups/reich/hringbauer/o2bin/qpAdm",
              all_snps=True)

Doing Individual: I13390


# Run 4-way Source Models including Myc - with Israle_MLBA_Canaaninte as Outgroup
Add as proxy for North African ancestry:
Algeria_N (I12433)

In [None]:
### Load the individual IIDs
pops = ["Iberia_Punic", "Sardinia", "Iberia_North_BA_Africa_all",
        "Morocco_LN", "Sicily_Phoenician", "Ibiza"]

path_ind = "./eigenstrat/combined/punic1.v43.ind"

all_iids = [load_iids_from_indfile(path_ind=path_ind, 
                        string=pop) for pop in pops]
all_iids = np.concatenate(all_iids)
#all_iids = ["I7253"] # Do a single Individual

for iid in all_iids:
    print(f"Doing Individual: {iid}")
    sources = ["Anatolia_N", "WHG", "Steppe_EMBA", "I12433"]
    add_out = ["Israel_MLBA_Canaanite"]
    leftpops = [iid] + sources

    qpAdm_run(leftpops = leftpops, 
              rightpops = a12 + add_out, 
              output_file = ".".join(leftpops) + ".log", 
              input_folder = "./eigenstrat/combined/", 
              input_file = "punic1.v43",
              par_file_folder = "./parfiles/", 
              input_ind_suff = "_mod1", 
              output_folder = "./output/qpAdm/4way_p_Canaanite/", 
              path_bin_qpAdm = "/n/groups/reich/hringbauer/o2bin/qpAdm",
              all_snps=True)

# Test Continutity to Mycenean Phoenician Group

In [12]:
### Load the individual IIDs

#all_iids = ["I18199", "I18201", "I18203", "I18202", "I18189", "I18187"]
#all_iids = ["MSR002", "MSR003"]
all_iids = ["MS10614.SG"]

for iid in all_iids:
    print(f"Doing Individual: {iid}")
    sources = ["Mycenaean"]
    add_out = ["Israel_MLBA_Canaanite", "I12433"]
    leftpops = [iid] + sources

    qpAdm_run(leftpops = leftpops, 
              rightpops = a12 + add_out, 
              output_file = ".".join(leftpops) + ".log", 
              input_folder = "./eigenstrat/combined/", 
              input_file = "punic1.v43",
              par_file_folder = "./parfiles/", 
              input_ind_suff = "_mod1", 
              output_folder = "./output/qpAdm/t774_cont/", 
              path_bin_qpAdm = "/n/groups/reich/hringbauer/o2bin/qpAdm",
              all_snps=True)

Doing Individual: MS10614.SG
Runtime: 33.257306


# Run three-way models for Empuries individuals

In [None]:
pops = ["Iberia_Greek", "Iberia_Hellenistic"]

path_ind = "./eigenstrat/combined/punic1.v43.ind"

all_iids = [load_iids_from_indfile(path_ind=path_ind, 
                        string=pop) for pop in pops]
all_iids = np.concatenate(all_iids)

print(f"Will run {len(all_iids)} Individuals")

for iid in all_iids:
    print(f"Doing Individual: {iid}")
    sources = ["Iberia_BA", "Greece_BA_Mycenaean", "I12433"]
    add_out = ["Israel_MLBA_Canaanite"]
    leftpops = [iid] + sources

    qpAdm_run(leftpops = leftpops, 
              rightpops = a12 + add_out, 
              output_file = ".".join(leftpops) + ".log", 
              input_folder = "./eigenstrat/combined/", 
              input_file = "punic1.v43",
              par_file_folder = "./parfiles/", 
              input_ind_suff = "_mod1", 
              output_folder = "./output/qpAdm/Empuries_3way/", 
              path_bin_qpAdm = "/n/groups/reich/hringbauer/o2bin/qpAdm",
              all_snps=True)

In [24]:
print("test")

test


# Do 3-way distal models for Iberan Individuals (EEF, Steppe, WHG)

In [18]:
pops = ["Iberia_BA", "Iberia_IA", "Iberia_Tartessian", 
        "Iberia_Iberian", "Iberia_Celtiberian"]
path_ind = "./eigenstrat/combined/punic1.v43.ind"


all_iids = [load_iids_from_indfile(path_ind=path_ind, 
                        string=pop) for pop in pops]
all_iids = np.concatenate(all_iids)
#all_iids = ["I18187"] # Do Villaricos Local individual
all_iids = ["I8339", "I18203", "I8206", "I8207"] # Empuries Individuals

print(f"Will run {len(all_iids)} Individuals")

for iid in all_iids[:]:
    print(f"Doing Individual: {iid}")
    sources = ["Anatolia_N", "WHG", "Steppe_EMBA"]
    add_out = []
    leftpops = [iid] + sources

    qpAdm_run(leftpops = leftpops, 
              rightpops = a12 + add_out, 
              output_file = ".".join(leftpops) + ".log", 
              input_folder = "./eigenstrat/combined/", 
              input_file = "punic1.v43",
              par_file_folder = "./parfiles/", 
              input_ind_suff = "_mod_ib", 
              output_folder = "./output/qpAdm/3wayIberia/", 
              path_bin_qpAdm = "/n/groups/reich/hringbauer/o2bin/qpAdm",
              all_snps=True)

Will run 1 Individuals
Doing Individual: I8339
Runtime: 51.068431


# Explore the Ind File (semi Area 51)

In [15]:
df_ind = pd.read_csv("./eigenstrat/combined/punic1.v43_mod_ib.ind", delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]

In [13]:
df_ind["clst"].value_counts()

Europe_LNBA                              78
Iberia_BA                                42
Europe_EN                                29
Europe_MNChL                             28
Israel_MLBA_Canaanite                    26
                                         ..
Iran_IA                                   1
Israel_MLBA_Canaanite_o_brother.I2200     1
Israel_IA_IIA_Tevet                       1
Crete_Armenoi                             1
Villabruna                                1
Name: clst, Length: 132, dtype: int64

In [14]:
df_ind[df_ind["clst"].str.contains("Polizello")]['clst'].value_counts()

Series([], Name: clst, dtype: int64)

In [16]:
df_ind = pd.read_csv("./eigenstrat/combined/punic1.v43.ind", delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex","clst"]

In [17]:
df_ind[df_ind["clst"].str.contains("Poliz")]

Unnamed: 0,iid,sex,clst
156,I13390,F,Italy_Sicily_IA_Polizzello
157,I13381,F,Italy_Sicily_IA_Polizzello
158,I13376,F,Italy_Sicily_IA_Polizzello
159,I13395,F,Italy_Sicily_IA_Polizzello
160,I13382,M,Italy_Sicily_IA_Polizzello
161,I13377,F,Italy_Sicily_IA_Polizzello
162,I13389,M,Italy_Sicily_IA_Polizzello
163,I13383,M,Italy_Sicily_IA_Polizzello
164,I13393,F,Italy_Sicily_IA_Polizzello
165,I13384,F,Italy_Sicily_IA_Polizzello


# Area 51