# Prepare Files for qpAdm

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-e-16-235.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 28
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Key Definitions

In [52]:
base_path = "/n/groups/reich/DAVID/V44/V44.0/v44.0"
ind_path = base_path + ".ind"

# Save Population File (what to pull)

In [53]:
def return_pops(df, string, col="clst", 
                output=False):
    """Return list of clusters that contain string."""
    df1 = df[df[col].str.contains(string)]
    if output:
        print(df1[col].value_counts())
    clsts = list(set(df1[col].values))

    return clsts

def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./parfiles/convertf.keep.par"):
    """Runs the Downsampling"""
    ! $path_convertf -p $parfile

In [54]:
df_ind = pd.read_csv(ind_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 20692 Individuals


In [55]:
##############################################
### Testing for single Populations Populations
return_pops(df_ind, string="Sardinia", 
            output=True);

Italy_Sardinia_Punic                37
Sardinian.SDG                       25
Italy_Sardinia_EBA                  18
Italy_Sardinia_C_BA                 15
Italy_Sardinia_N                    12
                                    ..
Italy_Sardinia_C_lc                  1
Italy_Sardinia_EarlyC_o              1
Italy_Sardinia_EBA_dup.S1250         1
Italy_Sardinia_EBA_dup.SUC004.SG     1
Italy_Sardinia_EBA_dup.S1252         1
Name: clst, Length: 61, dtype: int64


In [56]:
pops = ["Algeria", "Morocco", "Tunisia", "Punic", "Phoenician", 
        "Sardinia", "Ibiza",
        "Canaanite", "Ashkelon", "Sicily", "Hellenistic",
        "Israel_IA", "Israel_EIA", "Israel_Persian", "Gibraltar",
        "Spain_EBA_Afric", "Spain_BellBeaker_o", "Spain_Greek",
        "Spain_Hellenistic", "Spain_IA",
        "Nigeria_IA", "Nigeria_Medieval",
        "Spain_LBA", "Greece_"]

exclude_strings = ["_lc", "contam"]

In [57]:
clsts = [return_pops(df_ind, string=pop, 
                     output=False) for pop in pops]

clsts = [inner for ls in clsts for inner in ls]
clsts = list(set(clsts)) # Filter to unique Elements
print(f"Loaded {len(clsts)} Populations")

### Exclude Strings
for ex in exclude_strings:
    clsts = [c for c in clsts if ex not in c]
print(f"After Exclusion {len(clsts)}")

Loaded 282 Populations
After Exclusion 228


# Save List of populations to keep

In [47]:
#keep = np.array(["Anatolia_N", "Iberia_HG"])
keep = np.array(clsts)
path_keep = "./parfiles/keep_pops"
np.savetxt(path_keep, keep, fmt="%s")

### Run convertf (with population list to keep)
Additional parameters (such as position of output file) are coded into the parameter file

In [None]:
%%time
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = "./parfiles/convertf.keep.par")

## Merge in Lazaridis Ancients with mergeit (with population list to keep)

In [62]:
bin_merge_it = "/n/groups/reich/hringbauer/o2bin/mergeit"
parfile_path = "./parfiles/parMergeAddAnc"

In [63]:
%%time
! $bin_merge_it -p $parfile_path

parameter file: ./parfiles/parMergeAddAnc
BASE: /n/groups/reich/hringbauer/git/punic_aDNA
S1: eigenstrat/punic1.v44
S2: eigenstrat/additional/MinMyc
OUT: eigenstrat/combined/punic1.v44
geno1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v44.geno
snp1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v44.snp
ind1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v44.ind
geno2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.geno
snp2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.snp
ind2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.ind
genooutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic1.v44.geno
snpoutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic1.v44.snp
indoutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic1.v44.ind
docheck: YES
hashcheck: NO
allowdups: YES
packed geno read OK
end

# Prepare Individual File [Stand Alone from here]
Overwrite Individuals with their individual labels

In [20]:
def overwrite_ind_df(df, string, col="clst", 
                     output=False, overwrite="", iids=False):
    """Overwrite Individual Dataframe where col
    contains string. Return modified dataframe (Copy)
    where overwrite is the new Cluster ID
    iids: Overwrite with IIDs if given!"""
    idx = df[col].str.contains(string)
    
    if np.sum(idx)==0:
        if output: 
            print("No Indivdiuals found")
        return
    
    if output:
        print(f"Found {np.sum(idx)} Matches")
        print(df[idx][col].value_counts())
    
    ### Actually  overwrite the Column
    if len(overwrite)>0:
        df.loc[idx, col] = overwrite
        if output: 
            print(f"{np.sum(idx)} Overwritten!")
            
    if iids:
        df.loc[idx, col] = df.loc[idx, "iid"] 
        
        
### Overwrite Individual IIDs
def modifiy_iid_files(df_ind, pops_overwrite, 
                      pops_overwrite12=[], ind_modified=""):
    """Modify .ind file. Overwrite individuals from pops_overwrite (list)
    with their individuals labels. 
    df_int: Dataframe from Individuals.
    pops_overwrite12: [[pop1,pop2]] list (nx2). Overwrites ALL string matches
    for pop1 (contain) with pop2"""
    
    ### Overwrite with other Label
    for pop1,pop2 in pops_overwrite12:
        overwrite_ind_df(df_ind, pop1, overwrite=pop2)
        
    ### Overwrite with individual IIds
    for pop in pops_overwrite:
        overwrite_ind_df(df_ind, pop, 
                     iids=True, output=True)
    
    ### Save here
    df_ind.to_csv(ind_modified, sep=" ", index=None, header=False)
    print(f"Saved {len(df_ind)} Individuals to {ind_modified}")

### Version 1: Overwrite narrow target Individuals
(For testing please see below)

In [21]:
ind_merged="./eigenstrat/combined/punic1.v43.ind"          # What .ind to load
df_ind = pd.read_csv(ind_merged, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 655 Individuals


In [24]:
### Overwrite Group Names DELETE NEXT TIME
#overwrite_ind_df(df_ind, "Morocco_LN", overwrite="Morocco_LN")
#overwrite_ind_df(df_ind, "Morocco_EN", overwrite="Morocco_EN")
#overwrite_ind_df(df_ind, "Morocco_Iberomaurusian", overwrite="Morocco_Iberomaurusian")
#overwrite_ind_df(df_ind, "YRI", overwrite="YRI")

In [22]:
pops_overwrite12 = [["Morocco_LN", "Morocco_LN"], ["Morocco_EN", "Morocco_EN"],
                   ["Morocco_Iberomaurusian","Morocco_Iberomaurusian"], ["YRI", "YRI"]]

pops_overwrite = ["Algeria_N", "Punic", "Italy_Sicily_Phoenician", 
                  "Sicily_IA_Polizzello", "Sicani", "Morocco_LN", "Punic_oAfrican", 
                  "Iberia_North_BA_Africa_all", "Iberia_Greek", "Iberia_Hellenistic",
                  "Iberia_BellBeaker_o", "Gibraltar_N", "Italy_Sardinia_C_o",
                  "Nigeria_IA", "Nigeria_Medieval"]

ind_modified="./eigenstrat/combined/punic1.v43_mod1.ind"    # Where to save the modified version to

modifiy_iid_files(df_ind, pops_overwrite=pops_overwrite, 
                  pops_overwrite12=pops_overwrite12,
                  ind_modified = ind_modified)

Found 2 Matches
Algeria_N    2
Name: clst, dtype: int64
Found 27 Matches
Italy_Sardinia_IA_Punic                       9
Iberia_Punic_oAfrican2                        4
Iberia_Punic_o.3rd.degree.relative.cluster    4
Iberia_Punic_oAfrican1                        3
Iberia_Punic_o3                               2
Iberia_Punic                                  2
Iberia_Punic_oEuropean1                       1
Iberia_Punic_oEuropean2                       1
Ibiza_Punic.SG                                1
Name: clst, dtype: int64
Found 22 Matches
Italy_Sicily_Phoenician      19
Italy_Sicily_Phoenician_o     3
Name: clst, dtype: int64
Found 19 Matches
Italy_Sicily_IA_Polizzello    19
Name: clst, dtype: int64
Found 4 Matches
Italy_Sicily_IA_Sicani    4
Name: clst, dtype: int64
Found 3 Matches
Morocco_LN    3
Name: clst, dtype: int64
No Indivdiuals found
Found 1 Matches
Iberia_North_BA_Africa_all    1
Name: clst, dtype: int64
Found 4 Matches
Iberia_Greek     2
Iberia_Greek1    1
Iberia_Greek2  

### Split up Iberian Bronze Age and Sardinian Nuragic too

In [23]:
pops_overwrite = ["Algeria_N", "Punic", "Italy_Sicily_Phoenician",
                  "Sicily_IA_Polizzello", "Sicani",
                  "Morocco_LN", "Punic_oAfrican", 
                  "Iberia_North_BA_Africa_all", "Iberia_BA", "Iberia_IA",
                  "Iberia_Greek", "Iberia_Hellenistic",
                  "Iberia_BellBeaker_o", "Gibraltar_N", 
                  "Iberia_Iberian", "Iberia_Celtiberian", "Iberia_Tartessian",
                  "Italy_Sardinia_C_o", "Nuragic",
                  "Nigeria_IA", "Nigeria_Medieval"
                  ]

pops_overwrite12 = [["Morocco_LN", "Morocco_LN"], ["Morocco_EN", "Morocco_EN"],
                   ["Morocco_Iberomaurusian","Morocco_Iberomaurusian"], ["YRI","YRI"]]

ind_modified="./eigenstrat/combined/punic1.v43_mod_ib.ind"    # Where to save the modified version to

modifiy_iid_files(df_ind, pops_overwrite=pops_overwrite, 
                  pops_overwrite12=pops_overwrite12,
                  ind_modified = ind_modified)

No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
Found 56 Matches
Iberia_BA                 42
Iberia_BA.SG               5
Iberia_BA_Cogotas          4
Iberia_BA_all              2
Iberia_BA_published        1
Iberia_BA_1d.rel.I4560     1
Iberia_BA_1d.rel.I4561     1
Name: clst, dtype: int64
Found 9 Matches
Iberia_IA                         6
Iberia_IA_PreIberian_published    1
Iberia_IA_PreIberian_all          1
Iberia_IA_PreIberian              1
Name: clst, dtype: int64
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
Found 11 Matches
Iberia_Iberian              7
Iberia_Iberian_all          2
Iberia_Iberian_published    2
Name: clst, dtype: int64
Found 5 Matches
Iberia_Celtiberian       3
Iberia_Celtiberian.SG    1
Iberia_Celtiberian.DG    1
Name: clst, dtype: int64
Found 4 Matches
Iberia_Tartessian              2
Iberia_Tartessi

### Test indivdiual overwrites

In [None]:
overwrite_ind_df(df_ind, "Canaanite", 
                 output=True, overwrite="")
#YRI.SG
#Gibraltar_N
#Italy_Sardinia_C_o
#Nigeria_IA

# Area 51

### Compare to David's Assignments

In [54]:
df_t = pd.read_csv("/n/groups/reich/hringbauer/Data/Unpublished_data.tsv", sep="\t")
age_col = "Average of 95.4% date range in calBP (defined as 1950 CE)  "

In [None]:
df_t.str.contains()

In [35]:
group_col = 'Group_ID (format convention which we try to adhere to is "Country_<Geographic.Region_<Geographic.Subregion_>><Archaeological.Period.Or.DateBP_<Alternative.Archaeological.Period_>><Archaeological.Culture_<Alternative.Archaeological.Culture>><genetic.subgrouping.index.if.necessary_><"o_"sometimes.with.additional.detail.if.an.outlier><additional.suffix.especially.relative.status.if.we.recommend.removing.from.main.analysis.grouping><"contam_".if.contaminated><"lc_".if.<15000.SNPs.on.autosomal.targets><".SG".or.".DG".if.shotgun.data>; HG=hunter-gatherer, N=Neolithic, C=Chalcolithic/CopperAge, BA=BronzeAge, IA=IronAge, E=Early, M=Middle, L=Late, A=Antiquity)'
groups = df_t[group_col]

In [36]:
clsts_list = "|".join(clsts)
idx = groups.str.contains(clsts_list)
print(f"Found {np.sum(idx)} Individuals")
df_found = df_t[idx]

Found 174 Individuals


In [56]:
df_found[150:174][["Master ID", group_col, "Publication", "Locality", age_col]]

Unnamed: 0,Master ID,"Group_ID (format convention which we try to adhere to is ""Country_<Geographic.Region_<Geographic.Subregion_>><Archaeological.Period.Or.DateBP_<Alternative.Archaeological.Period_>><Archaeological.Culture_<Alternative.Archaeological.Culture>><genetic.subgrouping.index.if.necessary_><""o_""sometimes.with.additional.detail.if.an.outlier><additional.suffix.especially.relative.status.if.we.recommend.removing.from.main.analysis.grouping><""contam_"".if.contaminated><""lc_"".if.<15000.SNPs.on.autosomal.targets><"".SG"".or."".DG"".if.shotgun.data>; HG=hunter-gatherer, N=Neolithic, C=Chalcolithic/CopperAge, BA=BronzeAge, IA=IronAge, E=Early, M=Middle, L=Late, A=Antiquity)",Publication,Locality,Average of 95.4% date range in calBP (defined as 1950 CE)
7257,I7117,Nigeria_Medieval_lc,Unpublished / Unclaimed,Ngala,1050.0
8498,I3997,Iberia_BA_all,Unpublished / Unclaimed (new version of previo...,"Valencia, Paterna, Lloma de Betxí",3635.0
8499,I8142,Iberia_BA_lc,Unpublished / Unclaimed (new version of previo...,"Granada, Arenas del Rey",3850.0
8500,I8142,Iberia_BA_lc_all,Unpublished / Unclaimed (new version of previo...,"Granada, Arenas del Rey",3850.0
8503,I4246,Iberia_BellBeaker_oAfrican,Unpublished / Unclaimed (new version of previo...,"Madrid, San Fernando de Henares, Camino de las...",4202.0
8504,I6626,Iberia_BellBeaker_oSteppe,Unpublished / Unclaimed (new version of previo...,"Madrid, San Fernando de Henares, Camino de las...",4100.0
8505,I6471,Iberia_BellBeaker_oSteppe,Unpublished / Unclaimed (new version of previo...,La Magdalena,4250.0
8506,I6589,Iberia_BellBeaker_oSteppe,Unpublished / Unclaimed (new version of previo...,"Madrid, Humanejos",4200.0
8507,I6471,Iberia_BellBeaker_oSteppe_all,Unpublished / Unclaimed (new version of previo...,La Magdalena,4250.0
8508,I6589,Iberia_BellBeaker_oSteppe_all,Unpublished / Unclaimed (new version of previo...,"Madrid, Humanejos",4200.0


In [59]:
df_found.to_csv("./output/tables/samples_claim.tsv", sep="\t")

In [17]:
ind_merged="./eigenstrat/combined/punic1.v43.ind"          # What .ind to load
df_ind = pd.read_csv(ind_merged, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 635 Individuals


In [None]:
overwrite_ind_df(df_ind, "Iberia", output=True)

In [None]:
df_ind[df_ind["clst"].str.contains("Iberia_Punic")]