# Prepare Files for qpAdm

In [64]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

# Plot with Arial: select the sans-serif family and put Arial first in its list.
# The font has to be installed on the machine that runs the notebook.
from matplotlib import rcParams
rcParams.update({'font.family': 'sans-serif',
                 'font.sans-serif': ['Arial']})

# Identify the host. Only machines whose name starts with "compute-" are supported.
socket_name = socket.gethostname()
print(socket_name)

if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Repository directory on the cluster

# Relative paths used in the cells below are resolved against this directory
os.chdir(path)
print(os.getcwd())  # Should print the directory set above
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-166.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Key Definitions

In [65]:
# Common prefix of the source dataset files; the individual file is <prefix>.ind
base_path = "/n/groups/reich/DAVID/V42/V42.3/v42.3"
ind_path = f"{base_path}.ind"

# Save Population File (what to pull)

In [66]:
def return_pops(df, string, col="clst",
                output=False):
    """Return the distinct labels in df[col] that contain string.

    df: DataFrame holding the labels (e.g. a loaded .ind file).
    string: Pattern to search for. It is passed to pandas' str.contains
        with its default settings, so it is interpreted as a regular
        expression (a "." matches any character).
    col: Name of the column that holds the labels.
    output: If True, print how often each matching label occurs.

    Returns a sorted list of the distinct matching labels (empty list if
    nothing matches). Rows with a missing label never match."""
    # na=False keeps NaN out of the boolean mask; a mask containing NaN
    # cannot be used for row selection
    is_match = df[col].str.contains(string, na=False)
    matched = df.loc[is_match, col]
    if output:
        print(matched.value_counts())

    # Sorted, so the result does not depend on set iteration order
    return sorted(set(matched))

def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./explore_ntbk/parfiles/convertf.keep.par"):
    """Run the convertf executable with a parameter file.
    path_convertf: Path to the convertf executable.
    parfile: Path to the parameter file; handed to convertf via -p.
    Nothing is returned. The command is issued through the IPython shell
    escape, so this function only works inside IPython/Jupyter."""
    # IPython substitutes $path_convertf and $parfile with the Python values
    ! $path_convertf -p $parfile

In [67]:
# The .ind file is whitespace-separated and has no header row.
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
df_ind = pd.read_csv(ind_path, sep=r"\s+", header=None)
df_ind.columns = ["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 16667 Individuals


In [68]:
# Check a single search string: print the per-label counts of all labels
# containing "Iberia". The trailing ";" hides the returned list.
return_pops(df_ind, string="Iberia", output=True);

Iberia_C                                  94
Iberia_LN                                 51
Iberia_BellBeaker                         44
Iberia_BA                                 42
Iberia_LN_C                               25
                                          ..
Iberia_Islamic_lc                          1
Iberia_Tartessian_o_published_lc           1
Iberia_NazariPeriod_Muslim_o_published     1
Iberia_Muslim_o1_lc                        1
Iberia_BA_1d.rel.I4561                     1
Name: clst, Length: 195, dtype: int64


In [69]:
# Search strings; every cluster label containing one of them is selected
pops = [
    "Algeria",
    "Morocco",
    "Punic",
    "Phoenician",
    "Canaanite",
    "Ashkelon",
    "Greek_Sicily",
    "Sicily_IA",
    "Israel_IA",
    "Israel_EIA",
    "Israel_Persian",
    "Gibraltar",
    "Iberia_North_BA_Africa_all",
    "Iberia_BellBeaker_o",
    "Iberia_Greek",
    "Iberia_Hellenistic",
    "Iberia_Iberian",
    "Iberia_Celtiberian",
    "Iberia_Tartessian",
    "Nigeria_IA",
    "Nigeria_Medieval",
    "Italy_Sardinia_C_o",
    "Italy_Sardinia_BA_Nuragic",
    "Iberia_IA",
    "Iberia_BA",
    "Greece_BA_Mycenaean",
]

# Labels containing any of these substrings are dropped again
exclude_strings = ["_lc", "contam"]

In [70]:
# Collect every cluster label that matches any of the search strings.
# The set removes labels found by more than one search string; sorting makes
# the order (and with it the saved population list) reproducible between runs.
clsts = sorted({clst for pop in pops
                for clst in return_pops(df_ind, string=pop, output=False)})
print(f"Loaded {len(clsts)} Populations")

### Exclude Strings
clsts = [c for c in clsts
         if not any(ex in c for ex in exclude_strings)]
print(f"After Exclusion {len(clsts)}")

Loaded 110 Populations
After Exclusion 80


# Save List of populations to keep

In [62]:
# Write the selected cluster labels to a text file, one label per line.
# (A manual selection works as well, e.g. np.array(["Anatolia_N", "Iberia_HG"]).)
path_keep = "./parfiles/keep_pops"
keep = np.array(clsts)
np.savetxt(path_keep, keep, fmt="%s")

### Run convertf (with population list to keep)
Additional parameters (such as the location of the output files) are set in the parameter file.

In [None]:
%%time
# Call convertf with the project's parameter file; %%time reports how long the run takes
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = "./parfiles/convertf.keep.par")

parameter file: ./parfiles/convertf.keep.par
BASE: /n/groups/reich/hringbauer
DIR: explore_ntbk/v42/V42.3/v42.3
OUT: git/punic_aDNA/eigenstrat/punic1.v43
genotypename: /n/groups/reich/hringbauer/explore_ntbk/v42/V42.3/v42.3.geno
snpname: /n/groups/reich/hringbauer/explore_ntbk/v42/V42.3/v42.3.snp
indivname: /n/groups/reich/hringbauer/explore_ntbk/v42/V42.3/v42.3.ind
genooutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v43.geno
snpoutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v43.snp
indoutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v43.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/keep_pops
## /n/groups/reich/hringbauer/o2bin/convertf version: 5711
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5137965171 bytes
packed geno read OK


## Merge in Lazaridis Ancients with mergeit (with population list to keep)

In [71]:
# Path to the mergeit executable and to its parameter file (both used by the next cell)
bin_merge_it = "/n/groups/reich/hringbauer/o2bin/mergeit"
parfile_path = "./parfiles/parMergeAddAnc"

In [72]:
%%time
# Shell escape: IPython substitutes the Python variables bin_merge_it and parfile_path
! $bin_merge_it -p $parfile_path

parameter file: ./parfiles/parMergeAddAnc
BASE: /n/groups/reich/hringbauer/git/punic_aDNA
S1: eigenstrat/punic1.v43
S2: eigenstrat/additional/MinMyc
OUT: eigenstrat/combined/punic1.v43
geno1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v43.geno
snp1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v43.snp
ind1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v43.ind
geno2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.geno
snp2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.snp
ind2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.ind
genooutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic1.v43.geno
snpoutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic1.v43.snp
indoutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic1.v43.ind
docheck: YES
hashcheck: NO
allowdups: YES
packed geno read OK
end

# Prepare Individual File [Stand Alone from here]

In [74]:
def overwrite_ind_df(df, string, col="clst",
                     output=False, overwrite="", iids=False):
    """Relabel, in place, the rows of df whose df[col] contains string.

    df: Individual DataFrame. It is modified directly; no copy is made.
    string: Pattern to look for. It is passed to pandas' str.contains with
        its default settings, so it is interpreted as a regular expression.
    col: Column that is searched and overwritten.
    output: If True, print the number of matches and their label counts.
    overwrite: New label written to all matching rows. The empty string
        (default) leaves the labels untouched.
    iids: If True, every matching row gets its own "iid" value as label.
        This is applied last and therefore wins over overwrite.

    Returns None. With overwrite="" and iids=False nothing is changed; the
    call then only reports the matches (if output=True)."""
    # na=False: rows with a missing label count as non-matching. Without it
    # the mask would contain NaN and the .loc selections below would fail.
    idx = df[col].str.contains(string, na=False)
    n_found = int(idx.sum())

    if n_found == 0:
        if output:
            print("No Indivdiuals found")
        return

    if output:
        print(f"Found {n_found} Matches")
        print(df.loc[idx, col].value_counts())

    ### Actually  overwrite the Column
    if len(overwrite) > 0:
        df.loc[idx, col] = overwrite
        if output:
            print(f"{n_found} Overwritten!")

    if iids:
        df.loc[idx, col] = df.loc[idx, "iid"]
        
        
### Overwrite Individual IIDs
def modifiy_iid_files(df_ind, pops_overwrite,
                      pops_overwrite12=None, ind_modified=""):
    """Relabel clusters in an individual DataFrame and save it as .ind file.

    df_ind: DataFrame of individuals. It is relabelled in place; pass
        df_ind.copy() if the original labels are still needed afterwards.
    pops_overwrite: List of search strings. Every individual whose label
        contains one of them gets its own individual ID as label.
    pops_overwrite12: [[pop1, pop2], ...] list (n x 2). Every label that
        contains pop1 is replaced by pop2. Applied before pops_overwrite.
        None (default) means no such replacements.
    ind_modified: Path of the output file. Must not be empty.

    The result is written space-separated, without header and index.
    Raises ValueError if ind_modified is empty."""
    # Check the output path first, so nothing is relabelled if saving cannot work
    if not ind_modified:
        raise ValueError("ind_modified must be the path of the output .ind file")

    # None instead of a mutable list as default argument
    if pops_overwrite12 is None:
        pops_overwrite12 = []

    ### Overwrite with other Label
    for pop1, pop2 in pops_overwrite12:
        overwrite_ind_df(df_ind, pop1, overwrite=pop2)

    ### Overwrite with individual IIds
    for pop in pops_overwrite:
        overwrite_ind_df(df_ind, pop,
                         iids=True, output=True)

    ### Save here
    df_ind.to_csv(ind_modified, sep=" ", index=False, header=False)
    print(f"Saved {len(df_ind)} Individuals to {ind_modified}")

### Overwrite Individuals
(For testing please see below)

In [78]:
# Load the merged .ind file (whitespace-separated, no header row).
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
ind_merged = "./eigenstrat/combined/punic1.v43.ind"          # What .ind to load
df_ind = pd.read_csv(ind_merged, sep=r"\s+", header=None)
df_ind.columns = ["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 655 Individuals


In [24]:
### Overwrite Group Names DELETE NEXT TIME
#overwrite_ind_df(df_ind, "Morocco_LN", overwrite="Morocco_LN")
#overwrite_ind_df(df_ind, "Morocco_EN", overwrite="Morocco_EN")
#overwrite_ind_df(df_ind, "Morocco_Iberomaurusian", overwrite="Morocco_Iberomaurusian")
#overwrite_ind_df(df_ind, "YRI", overwrite="YRI")

In [76]:
# Step 1 - merge label variants: every label containing the first entry of a
# pair is replaced by the second entry.
pops_overwrite12 = [["Morocco_LN", "Morocco_LN"], ["Morocco_EN", "Morocco_EN"],
                   ["Morocco_Iberomaurusian","Morocco_Iberomaurusian"],["YRI","YRI"]]

# Step 2 - individuals whose label contains one of these strings get their
# own individual ID as label.
pops_overwrite = ["Algeria_N", "Punic", "Italy_Sicily_Phoenician",
                 "Morocco_LN", "Punic_oAfrican", 
                  "Iberia_North_BA_Africa_all", "Iberia_Greek", "Iberia_Hellenistic",
                  "Iberia_BellBeaker_o", "Gibraltar_N", "Italy_Sardinia_C_o",
                 "Nigeria_IA", "Nigeria_Medieval"]

ind_modified="./eigenstrat/combined/punic1.v43_mod1.ind"    # Where to save the modified version to

# Pass a copy: modifiy_iid_files relabels its input in place. df_ind has to stay
# as loaded, so that this cell can be re-run and later cells still start from
# the unmodified labels.
modifiy_iid_files(df_ind.copy(), pops_overwrite=pops_overwrite, 
                  pops_overwrite12=pops_overwrite12,
                  ind_modified = ind_modified)

Found 2 Matches
Algeria_N    2
Name: clst, dtype: int64
Found 27 Matches
Italy_Sardinia_IA_Punic                       9
Iberia_Punic_oAfrican2                        4
Iberia_Punic_o.3rd.degree.relative.cluster    4
Iberia_Punic_oAfrican1                        3
Iberia_Punic_o3                               2
Iberia_Punic                                  2
Iberia_Punic_oEuropean2                       1
Ibiza_Punic.SG                                1
Iberia_Punic_oEuropean1                       1
Name: clst, dtype: int64
Found 22 Matches
Italy_Sicily_Phoenician      19
Italy_Sicily_Phoenician_o     3
Name: clst, dtype: int64
Found 3 Matches
Morocco_LN    3
Name: clst, dtype: int64
No Indivdiuals found
Found 1 Matches
Iberia_North_BA_Africa_all    1
Name: clst, dtype: int64
Found 4 Matches
Iberia_Greek     2
Iberia_Greek1    1
Iberia_Greek2    1
Name: clst, dtype: int64
Found 7 Matches
Iberia_Hellenistic      5
Iberia_Hellenistic_B    2
Name: clst, dtype: int64
Found 4 Matches
Iberia

### Split up Iberian Bronze Age too

In [79]:
# Individuals whose label contains one of these strings get their own
# individual ID as label (here including the Iberia_BA and Iberia_IA labels).
pops_overwrite = ["Algeria_N", "Punic", "Italy_Sicily_Phoenician",
                  "Morocco_LN", "Punic_oAfrican", 
                  "Iberia_North_BA_Africa_all", "Iberia_BA", "Iberia_IA",
                  "Iberia_Greek", "Iberia_Hellenistic",
                  "Iberia_BellBeaker_o", "Gibraltar_N", 
                  "Iberia_Iberian", "Iberia_Celtiberian", "Iberia_Tartessian",
                  "Italy_Sardinia_C_o",
                  "Nigeria_IA", "Nigeria_Medieval"
                  ]

# Merge label variants first: every label containing the first entry of a pair
# is replaced by the second entry.
pops_overwrite12 = [["Morocco_LN", "Morocco_LN"], ["Morocco_EN", "Morocco_EN"],
                   ["Morocco_Iberomaurusian","Morocco_Iberomaurusian"], ["YRI","YRI"]]

ind_modified="./eigenstrat/combined/punic1.v43_mod_ib.ind"    # Where to save the modified version to

# Pass a copy: modifiy_iid_files relabels its input in place. Working on a copy
# keeps df_ind as loaded, so this cell gives the same result no matter which
# relabelling cells were run before it.
modifiy_iid_files(df_ind.copy(), pops_overwrite=pops_overwrite, 
                  pops_overwrite12=pops_overwrite12,
                  ind_modified = ind_modified)

Found 2 Matches
Algeria_N    2
Name: clst, dtype: int64
Found 27 Matches
Italy_Sardinia_IA_Punic                       9
Iberia_Punic_oAfrican2                        4
Iberia_Punic_o.3rd.degree.relative.cluster    4
Iberia_Punic_oAfrican1                        3
Iberia_Punic_o3                               2
Iberia_Punic                                  2
Iberia_Punic_oEuropean2                       1
Ibiza_Punic.SG                                1
Iberia_Punic_oEuropean1                       1
Name: clst, dtype: int64
Found 22 Matches
Italy_Sicily_Phoenician      19
Italy_Sicily_Phoenician_o     3
Name: clst, dtype: int64
Found 3 Matches
Morocco_LN    3
Name: clst, dtype: int64
No Indivdiuals found
Found 1 Matches
Iberia_North_BA_Africa_all    1
Name: clst, dtype: int64
Found 56 Matches
Iberia_BA                 42
Iberia_BA.SG               5
Iberia_BA_Cogotas          4
Iberia_BA_all              2
Iberia_BA_1d.rel.I4560     1
Iberia_BA_1d.rel.I4561     1
Iberia_BA_published   

### Test individual overwrites

In [None]:
# Lookup only: with overwrite="" and iids left at False no label is changed;
# the matching labels and their counts are just printed.
overwrite_ind_df(df_ind, string="Canaanite", overwrite="", output=True)
# Other search strings to try:
#YRI.SG
#Gibraltar_N
#Italy_Sardinia_C_o
#Nigeria_IA

# Area 51

In [17]:
# Reload the merged .ind file (whitespace-separated, no header row).
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
ind_merged = "./eigenstrat/combined/punic1.v43.ind"          # What .ind to load
df_ind = pd.read_csv(ind_merged, sep=r"\s+", header=None)
df_ind.columns = ["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 635 Individuals


In [21]:
# Print all labels containing "Iberia" with their counts (no label is changed)
overwrite_ind_df(df_ind, string="Iberia", output=True)

Found 98 Matches
Iberia_BA                                     42
Iberia_IA                                      6
Iberia_BA.SG                                   5
Iberia_Hellenistic                             5
Iberia_Punic_o.3rd.degree.relative.cluster     4
Iberia_Punic_oAfrican2                         4
Iberia_BA_Cogotas                              4
Iberia_Punic_oAfrican1                         3
Iberia_Punic_o3                                2
Iberia_Greek                                   2
Iberia_BellBeaker_o                            2
Iberia_BA_all                                  2
Iberia_Punic                                   2
Iberia_Hellenistic_B                           2
Iberia_BellBeaker_o_published                  1
Iberia_IA_PreIberian_published                 1
Iberia_IA_PreIberian_all                       1
Iberia_BA_published                            1
Iberia_IA_PreIberian                           1
Iberia_North_BA_Africa_all                     1
Ibe

In [None]:
# Show the rows whose cluster label contains "Iberia_Punic"
df_ind.loc[df_ind["clst"].str.contains("Iberia_Punic")]