# Prepare Files for qpAdm

In [1]:
import itertools as it
import multiprocessing as mp
import os  # For Saving to Folder
import os as os
import re
import socket
import sys as sys
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# For Arial Font
from matplotlib import rcParams
# Use Arial as the default sans-serif font for all figures.
# The font has to be installed on the machine (it is on the cluster for Harald).
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
})

socket_name = socket.gethostname()
print(socket_name)

# Guard clause: only hosts whose name starts with "compute-" are supported
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Root folder of the project on the cluster

# All relative paths in this notebook are relative to the project root
os.chdir(path)
print(os.getcwd())  # Show the current working directory as a check
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-64.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Key Definitions

In [2]:
# Common path prefix of the v46.3 dataset (.geno, .snp and .ind share it)
base_path = "/n/groups/reich/DAVID/V46/V46.3/v46.3"
ind_path = f"{base_path}.ind"  # The individual file of that dataset

# Save Population File (what to pull)

In [3]:
def return_pops(df, string, col="clst", 
                output=False):
    """Return the cluster labels in df[col] that contain string.

    df: Dataframe of individuals (needs the column col)
    string: Pattern to look for. It is passed to pandas str.contains,
    so it is interpreted as a regular expression.
    col: Name of the column with the cluster labels
    output: If True, print the number of individuals per matching cluster
    Return: Sorted list of the unique matching cluster labels."""
    # na=False: rows with a missing label count as "no match" instead of
    # yielding a NaN entry in the mask, which can not be used for indexing
    idx = df[col].str.contains(string, na=False)
    df1 = df[idx]
    if output:
        print(df1[col].value_counts())
    # Sorted, so that the result does not depend on the set iteration order
    clsts = sorted(set(df1[col].values))

    return clsts

def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./explore_ntbk/parfiles/convertf.keep.par"):
    """Run convertf via the IPython shell escape (only works inside IPython/Jupyter).
    path_convertf: Path of the convertf binary
    parfile: Path of the parameter file, handed over with -p"""
    ! $path_convertf -p $parfile

In [4]:
# The .ind file is whitespace separated and has no header line.
# sep=r"\s+" is the equivalent of delim_whitespace=True, which is deprecated in recent pandas
df_ind = pd.read_csv(ind_path, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]  # Individual ID, sex, cluster label
print(f"Loaded {len(df_ind)} Individuals")

Loaded 22628 Individuals


In [8]:
##############################################
### Testing for single Populations
# Prints the number of individuals per cluster whose label contains "Spain".
# The trailing semicolon suppresses the display of the returned list.
return_pops(df_ind, string="Spain", 
            output=True);

Spain_Islamic                     131
Spain_C                           121
Spain_Visigoth                     57
Spain_MLN                          48
Spain_LateMedieval_Jewish          39
                                 ... 
Spain_EN_father.or.son.I0410        1
Spain_Visigoth_contam_lc            1
Spain_Punic_Roman_oAfrican2_lc      1
Spain_MLN_lc                        1
Spain_Almoloya_Argar_1d.ALM081      1
Name: clst, Length: 267, dtype: int64


In [5]:
# Search strings: every cluster whose label contains one of them is pulled.
# NOTE: This first list is overwritten by the v46.3 list right below,
# so it has no effect on what is extracted.
pops = ["Algeria", "Morocco", "Tunisia", "Punic", "Phoenician", 
        "Sardinia", "Ibiza", "Canaanite", "Ashkelon", "Sicily", "Hellenistic",
        "Israel_IA", "Israel_EIA", "Israel_Persian", "Gibraltar",
        "Spain_EBA_Afric", "Spain_BellBeaker_o", "Spain_Greek",
        "Spain_Hellenistic", "Spain_IA",
        "Nigeria_IA", "Nigeria_Medieval",
        "Spain_LBA", "Greece_"]

# New Ones v46.3 (this is the list that is actually used)
pops = ["Algeria", "Morocco", "Tunisia", "Punic", "Phoenician", "Spain_Vandal", "Spain_LBA",
        "Sardinia", "Ibiza", "Israel_MLBA", "Israel_LBA", "Ashkelon", "Sicily", "Hellenistic",
        "Israel_IA", "Israel_EIA", "Israel_Persian", "Gibraltar", "Lebanon",
        "Spain_EBA_Afric", "Spain_BellBeaker_oAfrican", "Spain_Greek",
        "Spain_Hellenistic", "Spain_IA", "Italy_Sardinia_C_oAfrican", 
        "Nigeria_IA", "Nigeria_Medieval", "Mallorca", "Menorca", 
        "Egypt_Hellenistic", "Egypt_Roman", "Egypt_Dynastic",
        "Greece_", "Russia_Greek"]

# Clusters whose label contains one of these substrings are dropped again
exclude_strings = ["_lc", "contam"]

In [6]:
# Collect every cluster label that matches at least one of the search strings.
# The set removes duplicates; sorting makes the order (and with it the
# population list saved later) the same in every run.
clsts = sorted({c for pop in pops
                for c in return_pops(df_ind, string=pop, output=False)})
print(f"Loaded {len(clsts)} Populations")

### Exclude Strings
# Drop every label that contains at least one of the exclusion substrings
clsts = [c for c in clsts if not any(ex in c for ex in exclude_strings)]
print(f"After Exclusion {len(clsts)}")

Loaded 363 Populations
After Exclusion 297


In [7]:
### Leave out Kerkouane (labels containing "Tunisia_Punic") in external sharing
excluded_label = "Tunisia_Punic"
clsts = list(filter(lambda label: excluded_label not in label, clsts))
len(clsts)

290

# Save List of populations to keep

In [8]:
# Alternative small selection for testing:
#keep = np.array(["Anatolia_N", "Iberia_HG"])
keep = np.array(clsts)
path_keep = "./parfiles/keep_pops1"
# Writes one cluster label per line
np.savetxt(path_keep, keep, fmt="%s")

### Run convertf (with population list to keep)
Additional parameters (such as position of output file) are coded into the parameter file

Takes about 20 minutes

In [9]:
%%time
# Input, output and the population list are all defined in the parfile
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = "./parfiles/convertf.anc_only.par")

parameter file: ./parfiles/convertf.anc_only.par
BASE: /n/groups/reich/
DIR: DAVID/V46/V46.3/v46.3
OUT: hringbauer/git/punic_aDNA/eigenstrat/anc_only.v46.3.share
genotypename: /n/groups/reich//DAVID/V46/V46.3/v46.3.geno
snpname: /n/groups/reich//DAVID/V46/V46.3/v46.3.snp
indivname: /n/groups/reich//DAVID/V46/V46.3/v46.3.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/anc_only.v46.3.share.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/anc_only.v46.3.share.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/anc_only.v46.3.share.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/keep_pops1
## /n/groups/reich/hringbauer/o2bin/convertf version: 5722
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5368709120 bytes
read 6442450944 bytes
read 6975154541 bytes
packed geno read OK
end of inpack
before compress: 

# Merge in Lazaridis Ancients with mergeit (with population list to keep)
Takes about 4 min for v46.3
Again: Some definitions are in the parfile. Please check/modify there!

In [11]:
bin_merge_it = "/n/groups/reich/hringbauer/o2bin/mergeit"  # Path of the mergeit binary
parfile_path = "./parfiles/parMerge.v46.3"  # Parameter file, relative to the project root

In [12]:
%%time
# IPython shell escape: run mergeit with the parameter file handed over via -p
! $bin_merge_it -p $parfile_path

parameter file: ./parfiles/parMerge.v46.3
BASE: /n/groups/reich/hringbauer/git/punic_aDNA
S1: eigenstrat/anc_only.v46.3.share
S2: eigenstrat/additional/MinMyc
OUT: eigenstrat/combined/punic.v46.3.share
geno1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/anc_only.v46.3.share.geno
snp1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/anc_only.v46.3.share.snp
ind1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/anc_only.v46.3.share.ind
geno2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.geno
snp2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.snp
ind2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.ind
genooutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic.v46.3.share.geno
snpoutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic.v46.3.share.snp
indoutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic.v46.3.share.i

# Prepare Individual File [Stand Alone from here]
Overwrite Individuals with their individual labels

In [3]:
def overwrite_ind_df(df, string, col="clst", 
                     output=False, overwrite="", iids=False):
    """Overwrite the labels in df[col] for all rows where they contain string.
    The dataframe df is modified IN PLACE, nothing is returned.
    df: Dataframe of individuals (needs the column col and, if iids, "iid")
    string: Pattern to search for. Passed to pandas str.contains,
    so it is interpreted as a regular expression.
    col: Name of the column with the cluster labels
    output: If True, print what was matched and overwritten
    overwrite: New cluster label for all matches. Empty string: No relabeling
    iids: Overwrite with IIDs if True! Applied after overwrite,
    so the IIDs win if both are given."""
    # na=False: a missing label counts as "no match". Otherwise the mask
    # would contain NaN and could not be used for indexing
    idx = df[col].str.contains(string, na=False)
    n_found = int(idx.sum())
    
    if n_found==0:
        if output: 
            print("No Indivdiuals found")
        return
    
    if output:
        print(f"Found {n_found} Matches")
        print(df[idx][col].value_counts())
    
    ### Actually  overwrite the Column
    if len(overwrite)>0:
        df.loc[idx, col] = overwrite
        if output: 
            print(f"{n_found} Overwritten!")
            
    if iids:
        df.loc[idx, col] = df.loc[idx, "iid"] 
        
        
### Overwrite Individual IIDs
def modifiy_iid_files(df_ind, pops_overwrite, 
                      pops_overwrite12=[], ind_modified=""):
    """Relabel clusters in df_ind and save the result as .ind file.
    df_ind: Dataframe of individuals (needs columns iid and clst).
    It is modified IN PLACE!
    pops_overwrite: List of strings. Every individual whose cluster label
    contains one of them gets its own IID as new label.
    pops_overwrite12: [[pop1,pop2]] list (nx2). Overwrites ALL string matches
    for pop1 (contain) with pop2. Applied before pops_overwrite.
    ind_modified: Path where to save the modified .ind file
    (space separated, no header, no index).
    Raises ValueError if ind_modified is empty."""
    # Check the output path first, so that a forgotten path does not
    # leave df_ind relabeled with nothing saved
    if not ind_modified:
        raise ValueError("ind_modified: Path of the output .ind file is needed")
    
    ### Overwrite with other Label
    for pop1,pop2 in pops_overwrite12:
        overwrite_ind_df(df_ind, pop1, overwrite=pop2)
        
    ### Overwrite with individual IIds
    for pop in pops_overwrite:
        overwrite_ind_df(df_ind, pop, 
                     iids=True, output=True)
    
    ### Save here
    df_ind.to_csv(ind_modified, sep=" ", index=False, header=False)
    print(f"Saved {len(df_ind)} Individuals to {ind_modified}")

### Version 1: Overwrite narrow target Individuals
(For testing please see below)

In [4]:
ind_merged="./eigenstrat/combined/punic.v46.3.ind"          # What .ind to load
# NOTE(review): the mergeit log further up writes punic.v46.3.share.ind - confirm
# that the file loaded here is the intended one.
# sep=r"\s+" is the equivalent of delim_whitespace=True, which is deprecated in recent pandas
df_ind = pd.read_csv(ind_merged, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]  # Individual ID, sex, cluster label
print(f"Loaded {len(df_ind)} Individuals")

Loaded 1240 Individuals


### Play around with Individual df to identify clusters of interest

In [None]:
# Show all individuals whose cluster label contains "Israel"
is_israel = df_ind["clst"].str.contains("Israel")
df_ind[is_israel]

In [24]:
### Overwrite Group Names DELETE NEXT TIME
#overwrite_ind_df(df_ind, "Morocco_LN", overwrite="Morocco_LN")
#overwrite_ind_df(df_ind, "Morocco_EN", overwrite="Morocco_EN")
#overwrite_ind_df(df_ind, "Morocco_Iberomaurusian", overwrite="Morocco_Iberomaurusian")
#overwrite_ind_df(df_ind, "YRI", overwrite="YRI")

# 1) Only overwrite the Punics

In [5]:
### Populations to overwrite. Typically because they have the ".SG" label
# Each pair is [search string, new label]: every cluster label that contains
# the search string is replaced by the new label.
pops_overwrite12 = [["Morocco_LN", "Morocco_LN"], ["Morocco_EN", "Morocco_EN"],
                   ["Morocco_Iberomaurusian","Morocco_Iberomaurusian"], ["YRI", "YRI"]]

### Population to overwrite because they are the target
# These individuals get their own IID as cluster label.
# Labels containing "Sicily_Punic" also contain "Punic", so the first entry already covers them.
pops_overwrite = ["Punic", "Sicily_Punic"] # "Sardinia", "Spain" "Algeria", 

ind_modified="./eigenstrat/combined/punic.v46.3_ind.ind"    # Where to save the modified version to

# Relabels df_ind in place and writes it to ind_modified
modifiy_iid_files(df_ind, pops_overwrite=pops_overwrite, 
                  pops_overwrite12=pops_overwrite12,
                  ind_modified = ind_modified)

Found 134 Matches
Italy_Sicily_Punic                                             53
Tunisia_Punic                                                  23
Italy_Sardinia_Punic                                           12
Italy_Sardinia_IA_Punic_1                                       5
Italy_Sardinia_Punic_oNAfrica                                   4
Italy_Sardinia_IA_Punic_2                                       4
Spain_Punic_o.3rd.degree.relative.cluster                       3
Italy_Sicily_Punic_oLevant                                      3
Italy_Sicily_Punic_oEuropean                                    2
Spain_Punic_oAfrican2                                           2
Italy_Sardinia_Punic_oEurope                                    2
Spain_Punic                                                     2
Italy_Sicily_Punic_oEurope                                      2
Italy_Sicily_Punic_oNearEast                                    1
Spain_Punic_Roman_o3                                      

### 2) Split up Iberian Bronze Age and Sardinian Nuragic too

In [None]:
# Individuals whose cluster label contains one of these strings get their own IID as label.
# NOTE(review): the v46.3 cluster labels printed earlier in this notebook use the
# "Spain_" prefix - confirm that the "Iberia_" strings still match anything.
pops_overwrite = ["Algeria_N", "Punic", "Italy_Sicily_Phoenician",
                  "Sicily_IA_Polizzello", "Sicani",
                  "Morocco_LN", "Punic_oAfrican", 
                  "Iberia_North_BA_Africa_all", "Iberia_BA", "Iberia_IA",
                  "Iberia_Greek", "Iberia_Hellenistic",
                  "Iberia_BellBeaker_o", "Gibraltar_N", 
                  "Iberia_Iberian", "Iberia_Celtiberian", "Iberia_Tartessian",
                  "Italy_Sardinia_C_o", "Nuragic",
                  "Nigeria_IA", "Nigeria_Medieval"
                  ]

# Each pair is [search string, new label]
pops_overwrite12 = [["Morocco_LN", "Morocco_LN"], ["Morocco_EN", "Morocco_EN"],
                   ["Morocco_Iberomaurusian","Morocco_Iberomaurusian"], ["YRI","YRI"]]

ind_modified="./eigenstrat/combined/punic.v46.3_ind1.ind"    # Where to save the modified version to

# modifiy_iid_files changes df_ind in place: if an earlier overwrite cell was run on
# the same df_ind, its relabeling is still present here. Reload df_ind for a clean start.
modifiy_iid_files(df_ind, pops_overwrite=pops_overwrite, 
                  pops_overwrite12=pops_overwrite12,
                  ind_modified = ind_modified)

### Test individual overwrites

In [None]:
# Dry run: with overwrite="" and iids left at False nothing is relabeled,
# only the matching clusters are printed.
overwrite_ind_df(df_ind, "Canaanite", 
                 output=True, overwrite="")
# Other search strings to try:
#YRI.SG
#Gibraltar_N
#Italy_Sardinia_C_o
#Nigeria_IA

# Area 51

### Compare to David's Assignments

In [54]:
# Tab separated table of the unpublished data
df_t = pd.read_csv("/n/groups/reich/hringbauer/Data/Unpublished_data.tsv", sep="\t")
# Full name of the age column (the two trailing spaces are part of the name)
age_col = "Average of 95.4% date range in calBP (defined as 1950 CE)  "

In [None]:
# FIXME(review): unfinished scratch line. A DataFrame has no .str accessor
# (only a Series has one) and contains() needs a pattern, so this raises when run.
df_t.str.contains()

In [35]:
# Full name of the group label column in df_t (the header doubles as format description)
group_col = 'Group_ID (format convention which we try to adhere to is "Country_<Geographic.Region_<Geographic.Subregion_>><Archaeological.Period.Or.DateBP_<Alternative.Archaeological.Period_>><Archaeological.Culture_<Alternative.Archaeological.Culture>><genetic.subgrouping.index.if.necessary_><"o_"sometimes.with.additional.detail.if.an.outlier><additional.suffix.especially.relative.status.if.we.recommend.removing.from.main.analysis.grouping><"contam_".if.contaminated><"lc_".if.<15000.SNPs.on.autosomal.targets><".SG".or.".DG".if.shotgun.data>; HG=hunter-gatherer, N=Neolithic, C=Chalcolithic/CopperAge, BA=BronzeAge, IA=IronAge, E=Early, M=Middle, L=Late, A=Antiquity)'
groups = df_t[group_col]  # Series with the group label of every row

In [36]:
# Build one regular expression that matches any of the cluster labels.
# The labels are escaped because they contain regex characters such as "."
# which would otherwise match any character.
clsts_list = "|".join(re.escape(c) for c in clsts)
# na=False: rows without a group label count as not found
idx = groups.str.contains(clsts_list, na=False)
print(f"Found {np.sum(idx)} Individuals")
df_found = df_t[idx]

Found 174 Individuals


In [None]:
# Look at the rows 150-173 of the found individuals, key columns only
cols_show = ["Master ID", group_col, "Publication", "Locality", age_col]
df_found[150:174][cols_show]

In [59]:
# Save the found rows as tab separated table (the index is written too)
path_claim = "./output/tables/samples_claim.tsv"
df_found.to_csv(path_claim, sep="\t")

In [17]:
ind_merged="./eigenstrat/combined/punic1.v43.ind"          # What .ind to load
# sep=r"\s+" is the equivalent of delim_whitespace=True, which is deprecated in recent pandas
df_ind = pd.read_csv(ind_merged, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]  # Individual ID, sex, cluster label
print(f"Loaded {len(df_ind)} Individuals")

Loaded 635 Individuals


In [None]:
# Only prints the clusters containing "Iberia": no overwrite label and no iids given
overwrite_ind_df(df_ind, "Iberia", output=True)

In [None]:
# Show all individuals whose cluster label contains "Iberia_Punic"
is_iberia_punic = df_ind["clst"].str.contains("Iberia_Punic")
df_ind[is_iberia_punic]