# Visualize and produce Tabular output for competitive qpAdm modelling

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time
from matplotlib import gridspec

# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
# Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on HMS Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-162.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


In [107]:
def load_qp_adm(path):
    """Load, parse qpAdm .log-file.
    Return: res, p_vals, pops, stds
    path: Path of Input File
    res: mxnx2 arryay: m: Nr subsets admixed pops, n: Nr 
    p_vals: List of m p-Values
    pops: List of Strings of analyzed populations [target, s1, s2.., sn]"""
    pop_line, pop_line_end = -1, -1 # Where the populations are found   
    res_begin, res_end = -1, -1
    std_line = -1  # Where the standard Deviation will land
    
    ### Parse the log file
    # Iterate over everything and use the signals for start/stop
    with open(path, "r") as f:
        for i, line in enumerate(f):
            if line =="left pops:\n":
                pop_line = i+1 
            if line=="right pops:\n":
                pop_line_end = i-1 # There is an empty line before
                
            # Parse off everything to first space
            s0 = line.split()
            if len(s0)>=2:
                if s0[0] == "fixed" and s0[1]=="pat":
                    res_begin = i+1
                    
                if s0[0] == "best" and s0[1]=="pat:":
                    if res_end < 0: # Only take the first occurence
                        res_end = i
                        
                elif s0[0] == "std." and s0[1]=="errors:":
                    std_line = i
    
    ### Read out the results:
    with open(path, "r") as f:
        lines = f.readlines()
        pops = lines[pop_line:pop_line_end]
        pops = [p.rstrip() for p in pops]  # Chews off new line symbol

        res = lines[res_begin:res_end]
        stds = lines[std_line]
    
    ### Post-process the important Lines:
    res_t = np.array([s.split()[:len(pops)+4] for s in res]) # 8 is valid for 3 pops!
    res = res_t[:,5:].astype("float")
    p_vals = res_t[:,4].astype("float")
    stds = np.array(stds.split())[2:].astype("float")  # Extract the Standard Deviations for the first line
            
    #### Return estimates and p-Value
    assert(len(p_vals)==len(res)) # Sanity Check
    return res, p_vals, pops, stds

def load_qp_adm_1way(path):
    """Load, parse qpAdm .log-file for 1 way
    path: Where to find the 
    Return p_value and pops:
    p_value: List of m p-Values
    pops: List of Strings of analyzed populations [target, s1]"""
    pop_line, pop_line_end = -1, -1 # Where the populations are found   
    res_line = -1
    
    ### Parse the log file
    # Iterate over everything and use the signals for start/stop
    with open(path, "r") as f:
        for i, line in enumerate(f):
            if line =="left pops:\n":
                pop_line = i+1 
            if line=="right pops:\n":
                pop_line_end = i-1 # There is an empty line before
                
            # Parse off everything to first space
            s0 = line.split()
            if len(s0)>=2:
                if s0[0] == "f4rank:" and s0[1] == "1":
                    res_line = i
    
    ### Read out the results:
    with open(path, "r") as f:
        lines = f.readlines()
        pops = lines[pop_line:pop_line_end]
        pops = [p.rstrip() for p in pops]  # Chews off new line symbol
        res = lines[res_line]
    
    ### Post-process the important Lines:
    p_val = float(res.split()[-1])
    #### Return estimates and p-Value
    return p_val, pops


def create_admix_df(source_pops, admix_coeffs, stds, p_vals):
    """Create and return dataframe with Admixture Proportions and Standard Errors.
    Assume all input is as Numpy Array (otherwise indexing errors!!)"""
    n = len(source_pops[0,:])  # Get Number Sources +1
    df = pd.DataFrame({"target":source_pops[:,0], "p-Value":p_vals})
    
    for i in range(1,n):
        df[f"Source_{i}"] = source_pops[:,i]
        
    for i in range(1,n):
        df[f"Fraction_{i}"] = admix_coeffs[:,i-1]
        
    for i in range(1,n):
        df[f"STD_{i}"] = stds[:,i-1]
    return df
    
def give_admix0(res, minval=-1e-4):
    """Return the Admixture Coefficients of the first Model.
    Only return 0 if feasible, else np.nan"""
    
    feasible = np.min(res[0])>minval  # Check if feasible
    if not feasible:
        return np.nan  # No Feasible Model
    return 0
    
def give_admix_index(res):
    """Return index of first viable admixture result.
    res: nxk array. n...Nr of all subsets, k...Nr of Source Pops"""    
    for i, ls in enumerate(res):
        if np.min(ls)>-1e-4:
            return i
    print("Warning: No valid admixture found!!")       
    return np.nan # Default Return

def give_admix_index_best(res, pvals, minval=-1e-4):
    """Return index of best viable admixture result.
    res: nxk array. n...Nr of all subsets, k...Nr of Source Pops"""  
    feasible = np.min(res, axis=1)>minval  # Extract all feasible results
    pvals_okay = feasible * pvals  # Set bad ones to 0.
    
    if np.max(pvals_okay)==0:
        print("Warning: No valid admixture found!!")
        return np.nan
    
    i = np.argmax(pvals_okay) # The Index with the Maximum Value
    return i
    
def sci_notation(num, decimal_digits=1, precision=None, exponent=None):
    """
    Returns a string representation of the scientific
    notation of the given number formatted for use with
    LaTeX or Mathtext, with specified number of significant
    decimal digits and precision (number of decimal digits
    to show). The exponent to be used can also be specified
    explicitly.
    """
    if not exponent:
        exponent = int(np.floor(np.log10(abs(num))))
    coeff = round(num / float(10**exponent), decimal_digits)
    if not precision:
        precision = decimal_digits

    return "${0:.{2}f}\cdot10^{{{1:d}}}$".format(coeff, exponent, precision)


def create_latex_lines(source_pops, admix_coeffs, stds, p_vals, na = "-", rp = "A12"):
    """Print a Line for Latex Table in the main Text. 
    Script to speed things up.
    source_pops, nx(j+1) Array, j...NR of Sources (<=3)
    admix_coeffs, stds: nxj Array
    p_vals: nx1 Array
    na: Character for Missing Data
    rp: String for right population"""
    out =""
    
    for i in range(len(source_pops)): # Iterate over every line
        ls = [na for _ in range(12)] # Create empty vector with na Symbol
        
        assert(len(source_pops[i])<=4) # At most 4 sources
        
        for j, s in enumerate(source_pops[i]): # Fill in the Source Populations
            ls[j] = s   
            
        ls[4] = rp        # Right Pop
        
        p = p_vals[i]
        
        if p==0.0:
            ls[5] = "0" 
        elif p>=0.05:
            ls[5] = "\\textbf{" +  str(np.around(p, 3)) + "}" # Make bold
            
        elif p>= 0.01:
            ls[5] = np.around(p, 3) # Round to three Digits            
        elif p<0.01: 
            ls[5] = sci_notation(p, decimal_digits=1)  # Do proper formatting      
        
        for j, x in enumerate(admix_coeffs[i]): # Fill in the admixture fractions
            ls[6+j] = "{:.3f}".format(x) 
            
        for j, x in enumerate(stds[i]): # Fill in the uncertainty of admixture fractions
            ls[9+j] = "{:.3f}".format(x)        
        
        s = " & ".join(str(x) for x in ls) # Convert to Strings and Join
        s=s.replace("_", "-") # Replace tricky underscore symbols (for Latex tables)
        print(s + "\\\\") # Add two backslashes
        out += (s + "\n") # Do next Line
        
    return out # Return the full text

def load_iids_from_indfile(path_ind, string, 
                           col="clst", col_iid="iid",
                           iids_okay=[]):
    """Load IIDs from Ind File
    Return List of IIDs"""
    df_ind = pd.read_csv(path_ind, delim_whitespace=True, header=None)
    df_ind.columns=["iid", "sex","clst"]
    idx = df_ind[col].str.contains(string)
    ls = df_ind[idx][col_iid].values
    
    ### If needed filter out okay Individuals
    if len(iids_okay)>0:
        ls = np.intersect1d(ls, iids_okay)
    return ls

###################################################
### Load okay Individual IIDs

def load_individuals_filetered(
                            path_anno = "/n/groups/reich/hringbauer/Data/v42.3.anno.csv",
                            col="clst", col_iid="iid",
                            min_snps_cov=50000,
                            snp_cov_col="n_cov_snp",
                            master_id_col="Master ID",
                            ):
    """Filter List of Individuals against meta,
    using minimal Nr of SNPs and unique IDs"""

    df_all = pd.read_csv(path_anno)

    ### Keep only the best coerage Indivdual
    df_all = df_all.sort_values(by=snp_cov_col, ascending=False)
    df_all = df_all.drop_duplicates(subset=master_id_col)
    
    ### Filter to min Nr of SNPs
    df_all = df_all[df_all[snp_cov_col]>=min_snps_cov]
    
    ### 
    df_all = df_all
    return df_all["iid"].values

######################################
### Helper Functions

def create_empty_qpAdm_df(k, n_raws=0):
    """Return qpAdm dataframe
    for k sources.
    n_entries: How many empty Entries """
    df = pd.DataFrame({})
    df["t"] = np.nan
    df["p"] = np.nan
    df["n"] = np.nan

    ### Create an empty data frame
    for i in range(1, k+1):
        df["s" + str(i)] = np.nan
        
    for i in range(1, k+1):
        df["f" + str(i)] = np.float(np.nan)

    for i in range(1, k+1):
        df["std" + str(i)] = np.float(np.nan)
    
    ### Add empty rows
    if n_raws>0:
        df = df.reindex(df.index.tolist() + list(range(n_raws)))
        #df["p"] = df["p"].astype("float")
        #df["n"] = df["n"].astype("float")
        #for i in range(1, k+1):
        #    df["f" + str(i)] = df["f" + str(i)].astype("float")
        #    df["std" + str(i)] = df["std" + str(i)].astype("float")
    return df

def load_first_qpAdm_model(path):
    """Parse and return the first values of qpAdm model
    from log-file at path. 
    Return mixture coefficients, palue, left pops
    and standard errors"""
    res, p_vals, pops, stds =  load_qp_adm(path)

    ### Only Keep the first results
    res = res[0]
    p_val = p_vals[0] 
    return res, p_val, pops, stds

def set_feasible_df(df, sources=5):
    """Set Feasible Models"""
    
    fs = [f"f{i}" for i in range(1,sources+1)]

    idx_if = np.nanmin(df[fs], axis=1) < 0
    df["fs"] = True
    df.loc[idx_if, "fs"] = False
    
    print(f"Set {np.sum(idx_if)}/{len(idx_if)} infeasible models")
    return df

def get_individual_models(df, min_p=0.01):
    """Get Dataframe with rows of best Indivdidual Model
    (lowest number of sources, highest p Value)"""
    #df = df.sort_values(by=["p", "n"], ascending=[False, True])
    dft = df[df["p"]>=min_p]
    dft = dft.sort_values(by=["n", "p"], ascending=[True, False])
    
    idx_dup = dft["t"].duplicated()
    dft = dft[~idx_dup]
    return dft.reset_index(drop=True).copy()

def reorder_qpadm_df(df, sources=[]):
    """Reorder qpAdm Dataframe, so that sources 
    are in consistent order. Only changes columns.
    Return updated dataframe"""
    df1 = df.copy() # Create Copy
    df1.iloc[:,:] = 0  # set everything to 0
    df1["p"] = df["p"] # Fill inmutables
    df1["t"] = df["t"]
    df1["n"] = df["n"]

    for i,s in enumerate(sources):
        i1=i+1 # Move Index one up to start filling at 1
        #df1[f"s{i1}"] = s  # Dont fill in if not in model
        for j in range(1,len(sources)+1):
            idx = df[f"s{j}"] == s
            df1.loc[idx, f"s{i1}"] = df.loc[idx, f"s{j}"] # Only fill in where data
            df1.loc[idx, f"f{i1}"] = df.loc[idx, f"f{j}"]
            df1.loc[idx, f"std{i1}"] = df.loc[idx, f"std{j}"]
            
    assert(~np.sum(df1[df1["n"]>1].isnull()).all())
    return df1.copy()

def get_df_n_way_model(ns=[2,], comp_groups=[], targets=[], 
                       base_folder="", reorder_sources=[]):
    """Get Dataframe of all competative n_way models of iids.
    ns: List of n i n-way models.
    comp_groups: The competivie right groups.
    targets: List of target iids
    base_folder: where to find the qpadm output .log files.
    reorder_sources: If given reorder sources
    Return qpAdm dataframe"""
    k = len(comp_groups) # For verall size of results dataframe
    k1 = len(targets)
    
    dfs = []
    for n in ns: # Iterate over number of sources
        source_list = np.array(list(it.combinations(comp_groups, r=n)))
        l = len(source_list)
        
        df = create_empty_qpAdm_df(k, n_raws=l*k1)
        df["n"] = n
        
        i = 0
        for iid in targets:  # iterate over targets
            for s in source_list: # iterate over number of sources
                path_log = ".".join([iid] + list(s)) + ".log"
                path_load = f"{base_folder}{n}way/{path_log}"
                
                if n==1:
                    p_val, pops = load_qp_adm_1way(path_load)
                    res, stds = [1], [0] # No uncertainty about 1way model
                    
                elif n>1:
                    res, p_val, pops, stds = load_first_qpAdm_model(path_load)
                    
                ### Fill dataframe row
                df.loc[i,"t"] = pops[0]
                df.loc[i,"p"] = p_val

                for l in range(n):
                    l1 = l + 1  
                    df.loc[i,f"s{l1}"] = pops[l1]
                    df.loc[i,f"f{l1}"] = res[l]
                    df.loc[i,f"std{l1}"] = stds[l]
                i+= 1 # Jump to the row
                
        dfs.append(df)
    df = pd.concat(dfs)
    
    if len(reorder_sources)>0:
        df = reorder_qpadm_df(df, order_new = reorder_sources)
        
    df = set_feasible_df(df, sources=len(comp_groups))
    
    return df

def reorder_qpAdm_df2(df, order_new=[]):
    """Reorder qpAdm df so that 
    s1,s2,..sn are in order of order
    order: List of populations [s1,s2,...,sn]"""
    
    df1 = df.copy() # Placeholder for the new Dataframe
    
    for n, s in enumerate(order_new):    
        n1 = n + 1  # Index of pop starts at 1
        df1[f"s{n1}"]=np.nan
        df1[f"f{n1}"] = 0
        df1[f"std{n1}"] = 0
        
        for c1 in range(1,len(order_new)+1): # Go over all columns
            idx = df[f"s{c1}"] == s
        
            df1.loc[idx, f"s{n1}"] = df.loc[idx, f"s{c1}"]
            df1.loc[idx, f"f{n1}"] = df.loc[idx, f"f{c1}"]
            df1.loc[idx, f"std{n1}"] = df.loc[idx, f"std{c1}"]
    return df1
        
def get_spec_model(df, sources=[], tot_sources=5, 
                   drop=True, output=True):
    """Get specific model for dataframe df.
    Return dataframe with this model."""
    k = len(sources)
    found = np.ones(len(df), dtype="bool")
    for i in range(1,k+1):
        idx = df[f"s{i}"].isin(sources)
        found = found & idx
    for i in range(k+1, tot_sources+1): 
        idx = df[f"s{i}"].isnull()
        found = found & idx
        
    if output: 
        print(f"Found {np.sum(found)} fitting rows.")
    dft = df[found]
    
    ### Drop Unnecessary Labels
    if drop:
        for i in range(k+1, tot_sources+1):
            dft = dft.drop(columns=[f"s{i}", f"f{i}", f"std{i}"])
    
    return dft.reset_index(drop=True).copy()

# 1) Process the Data

In [109]:
%%time
comp_groups = ["Italy_Sicily_IA_Polizzello", "Algeria_IA", "Greece_BA_Mycenaean", 
               "Israel_Phoenician", "Italy_Sardinia_BA_Nuragic", "Spain_IA"]

reorder_sources = ["Algeria_IA", "Italy_Sicily_IA_Polizzello", "Italy_Sardinia_BA_Nuragic", 
              "Spain_IA", "Greece_BA_Mycenaean",  "Israel_Phoenician"]

base_folder = "./output/qpAdm/comp2.v46.3/"

### Get the IIDs
df = pd.read_csv("./output/tables/qpAdm30Kpunic.v46.3.tsv", sep="\t")
iids = df["iid"].values[:2]
d = np.array(["S18200.Y1.E2.L1", 'MS10614.SG']) # Bad qpAdm runs?!
iids= np.setdiff1d(iids, d)
print(f"Loaded {len(iids)} Punic IIDs")

df = get_df_n_way_model(ns=[1,2,3,4], comp_groups=comp_groups, 
                        targets=iids[:], base_folder=base_folder, 
                        reorder_sources=reorder_sources)

### Save the Data to tabular Format
df.to_csv("./output/qpAdm/v46.3/ind_model_comp2r.tsv", sep="\t", index=False)

Loaded 2 Punic IIDs


TypeError: reorder_qpadm_df() got an unexpected keyword argument 'order_new'

# 1b) Process the proximal model with reordering
This is needed for plotting - as it produces a standardized dataframe with all sources ordered and also single models filled in correctly

In [103]:
df = pd.read_csv("./output/qpAdm/v46.3/ind_model_comp2.tsv", sep="\t")

new_groups = ["Algeria_IA", "Italy_Sicily_IA_Polizzello", "Italy_Sardinia_BA_Nuragic", 
              "Spain_IA", "Greece_BA_Mycenaean",  "Israel_Phoenician"]

df.to_csv("./output/qpAdm/v46.3/ind_model_comp2r.tsv", sep="\t", index=False)

# Code to play with: Analyze the best Individual Models
Not needed downstream, only gives out data in notebook

In [4]:
%%capture --no-stdout

df = pd.read_csv("./output/qpAdm/v46.3/ind_model_comp.tsv", sep="\t")
print(f"Loaded {len(df)} qpAdm models.")

df = set_feasible_df(df, sources=5)
df1 = df[df["fs"]==1].copy()
df2 = df1.copy()
df_ind = get_individual_models(df1, min_p=0.01)
print(f"Got {len(df_ind)} best Indivdiuals models")

Loaded 3875 qpAdm models.
Set 2006/3875 infeasible models
Got 76 best Indivdiuals models


In [28]:
%%capture --no-stdout
### Load the wave 2 model
df = pd.read_csv("./output/qpAdm/v46.3/ind_model_comp2.tsv", sep="\t")
print(f"Loaded {len(df)} qpAdm models.")

df = set_feasible_df(df, sources=6)
df1 = df[df["fs"]==1].copy() ### Only feasible
df2 = df1.copy()
df_ind2 = get_individual_models(df1, min_p=0.01)
print(f"Got {len(df_ind2)} best Indivdiuals models")

Loaded 7000 qpAdm models.
Set 3643/7000 infeasible models
Got 115 best Indivdiuals models


### Get all Results for one individual

In [64]:
df = pd.read_csv("./output/qpAdm/v46.3/ind_model_comp2.tsv", sep="\t")
print(f"Loaded {len(df)} qpAdm models.")

Loaded 7000 qpAdm models.


In [None]:
df1[df1["t"]=="I22113"].sort_values(by="p", ascending=False)#[["f1", "f2", "f3"]]

### Get all Results for one specific Model

In [21]:
df2 = get_spec_model(df1, sources=["Greece_BA_Mycenaean", "Spain_IA", "Algeria_IA"])   # "Israel_Phoenician"
s = np.sum(df2["p"]>0.01)
print(f"Fitting Models at p>0.01: {s}")
#Italy_Sicily_IA_Polizzello  Italy_Sardinia_BA_Nuragic	Tunisia_N Greece_BA_Mycenaean Israel_Phoenician

Found 94 fitting rows.
Fitting Models at p>0.01: 72


In [26]:
comp_groups = ["Italy_Sicily_IA_Polizzello", "Algeria_IA", "Greece_BA_Mycenaean", 
               "Israel_Phoenician", "Italy_Sardinia_BA_Nuragic", "Spain_IA"]
p=0.01

print(f"Fitting at p>{p}")
for r in range(1,5):
    print(f"\nSources: {r}")
    for ls in it.combinations(comp_groups, r=r):
        df2 = get_spec_model(df1, sources=ls, output=False)   # "Israel_Phoenician"
        s = np.sum(df2["p"]>p)
        print(f"{ls}: {s}")

Fitting at p>0.01

Sources: 1
('Italy_Sicily_IA_Polizzello',): 4
('Algeria_IA',): 3
('Greece_BA_Mycenaean',): 9
('Israel_Phoenician',): 6
('Italy_Sardinia_BA_Nuragic',): 0
('Spain_IA',): 4

Sources: 2
('Italy_Sicily_IA_Polizzello', 'Algeria_IA'): 34
('Italy_Sicily_IA_Polizzello', 'Greece_BA_Mycenaean'): 8
('Italy_Sicily_IA_Polizzello', 'Israel_Phoenician'): 28
('Italy_Sicily_IA_Polizzello', 'Italy_Sardinia_BA_Nuragic'): 0
('Italy_Sicily_IA_Polizzello', 'Spain_IA'): 7
('Algeria_IA', 'Greece_BA_Mycenaean'): 51
('Algeria_IA', 'Israel_Phoenician'): 14
('Algeria_IA', 'Italy_Sardinia_BA_Nuragic'): 6
('Algeria_IA', 'Spain_IA'): 9
('Greece_BA_Mycenaean', 'Israel_Phoenician'): 11
('Greece_BA_Mycenaean', 'Italy_Sardinia_BA_Nuragic'): 8
('Greece_BA_Mycenaean', 'Spain_IA'): 26
('Israel_Phoenician', 'Italy_Sardinia_BA_Nuragic'): 15
('Israel_Phoenician', 'Spain_IA'): 45
('Italy_Sardinia_BA_Nuragic', 'Spain_IA'): 2

Sources: 3
('Italy_Sicily_IA_Polizzello', 'Algeria_IA', 'Greece_BA_Mycenaean'): 44
('

In [36]:
df2 = get_spec_model(df1, sources=['Italy_Sicily_IA_Polizzello', 'Algeria_IA', 'Israel_Phoenician', 'Spain_IA']) 

Found 65 fitting rows.


In [None]:
df2.sort_values(by="p",ascending=False)[:50]

In [None]:
df[df["t"]=="I21984"]

## Get all Results for one Site

In [8]:
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v46.3.anno.csv")
dft = pd.merge(df1, df_meta[["iid", "loc", "clst", "n_cov_snp"]], left_on="t", right_on="iid", how="left")
dft = get_individual_models(dft, min_p=0.01)

In [None]:
dft[dft["loc"].str.contains("Tharros")].sort_values(by="p", ascending=False)

In [None]:
df2.sort_values(by="p", ascending=False)[:50]

In [None]:
plt.figure(figsize=(6,6))
ax = plt.gca()
ax.hist(df2["p"], ec="k", bins=21)
plt.show()

# Process the Distal Model with Tunisia N
Takes about 6 min for 173 Indivdiuals

In [3]:
%%time
comp_groups = ["Anatolia_N", "Steppe_MLBA", "WHG", 
               "Tunisia_N", "Levant_N", "Iran_N"]

base_folder = "./output/qpAdm/dist.v46.3/"

### Get the IIDs
df = pd.read_csv("./output/tables/qpadm.targets.distal.v46.3.tsv", sep="\t")
iids = df["iid"].values
d = np.array(["S18200.Y1.E2.L1", 'MS10614.SG']) # Bad qpAdm runs?!
iids= np.setdiff1d(iids, d)
print(f"Loaded {len(iids)} IIDs for distal modelling")

df = get_df_n_way_model(ns=[1,2,3,4,5,6], comp_groups=comp_groups, 
                        targets=iids[:], 
                        base_folder=base_folder, reorder_sources=comp_groups)

### Save the Data to tabular Format
df.to_csv("./output/qpAdm/v46.3/ind_model_dist.tsv", sep="\t", index=False)

Loaded 193 IIDs for distal modelling
Set 6942/12159 infeasible models
CPU times: user 1min 51s, sys: 4.02 s, total: 1min 55s
Wall time: 8min 27s


# Process the distal model with Algeria IA
Same as above but folders different
Per 20 individuals: ~ XX min 
(in total 193)

In [69]:
%%time
comp_groups = ["Anatolia_N", "Steppe_MLBA", "WHG", 
               "I12433", "Levant_N", "Iran_N"]

base_folder = "./output/qpAdm/distAlgIA.v46.3/"

### Get the IIDs
df = pd.read_csv("./output/tables/qpadm.targets.distal.v46.3.tsv", sep="\t") # CHANGE BACK TO ORIGNAL FILES
iids0 = df["iid"].values
d = np.array(["S18200.Y1.E2.L1", 'MS10614.SG', "I12433"]) # Bad qpAdm runs?!
iids= np.setdiff1d(iids0, d)
print(f"Loaded {len(iids)} IIDs for distal modelling")

df = get_df_n_way_model(ns=[1,2,3,4,5,6], comp_groups=comp_groups, targets=iids[:], 
                        base_folder=base_folder, reorder_sources=comp_groups) # 2,3,4,5,6

### Save the Data to tabular Format
df.to_csv("./output/qpAdm/v46.3/ind_model_distAlgIA.tsv", sep="\t", index=False)

Loaded 192 IIDs for distal modelling
Set 6897/12096 infeasible models
CPU times: user 59.1 s, sys: 1.93 s, total: 1min
Wall time: 1min 37s


# Load the processed Data and play with some analysis

In [23]:
df = pd.read_csv("./output/qpAdm/v46.3/ind_model_distAlgIA.tsv", sep="\t")
#df = pd.read_csv("./output/qpAdm/v46.3/ind_model_dist.tsv", sep="\t")
print(f"Loaded {len(df)} qpAdm models.")
df1 = df[df["fs"]==1].copy()
print(f"Feasable: {len(df1)} models.")

df_ind = get_individual_models(df1, min_p=0.01)
print(f"Got {len(df_ind)} best Indivdiuals models")

Loaded 12096 qpAdm models.
Feasable: 5199 models.
Got 182 best Indivdiuals models


In [None]:
df[df["t"]=="VIL009"].sort_values(by="p", ascending=False)[:30]

In [71]:
comp_groups = ["Anatolia_N", "Steppe_MLBA", "WHG", 
               "I12433", "Iran_N", "Levant_N"]
p=0.01

print(f"Fitting at p>{p}")
for r in range(1,7):
    print(f"\nSources: {r}")
    for ls in it.combinations(comp_groups, r=r):
        df2 = get_spec_model(df1, sources=ls, output=False)   # "Israel_Phoenician"
        s = np.sum(df2["p"]>p)
        print(f"{ls}: {s}")

Fitting at p>0.01

Sources: 1
('Anatolia_N',): 0
('Steppe_MLBA',): 0
('WHG',): 0
('I12433',): 0
('Iran_N',): 0
('Levant_N',): 0

Sources: 2
('Anatolia_N', 'Steppe_MLBA'): 0
('Anatolia_N', 'WHG'): 0
('Anatolia_N', 'I12433'): 0
('Anatolia_N', 'Iran_N'): 0
('Anatolia_N', 'Levant_N'): 0
('Steppe_MLBA', 'WHG'): 0
('Steppe_MLBA', 'I12433'): 0
('Steppe_MLBA', 'Iran_N'): 0
('Steppe_MLBA', 'Levant_N'): 0
('WHG', 'I12433'): 0
('WHG', 'Iran_N'): 0
('WHG', 'Levant_N'): 0
('I12433', 'Iran_N'): 0
('I12433', 'Levant_N'): 0
('Iran_N', 'Levant_N'): 0

Sources: 3
('Anatolia_N', 'Steppe_MLBA', 'WHG'): 0
('Anatolia_N', 'Steppe_MLBA', 'I12433'): 0
('Anatolia_N', 'Steppe_MLBA', 'Iran_N'): 0
('Anatolia_N', 'Steppe_MLBA', 'Levant_N'): 0
('Anatolia_N', 'WHG', 'I12433'): 0
('Anatolia_N', 'WHG', 'Iran_N'): 0
('Anatolia_N', 'WHG', 'Levant_N'): 0
('Anatolia_N', 'I12433', 'Iran_N'): 0
('Anatolia_N', 'I12433', 'Levant_N'): 0
('Anatolia_N', 'Iran_N', 'Levant_N'): 0
('Steppe_MLBA', 'WHG', 'I12433'): 0
('Steppe_MLBA', 

In [13]:
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v46.3.anno.csv")
dft = pd.merge(df1, df_meta[["iid", "loc", "clst", "n_cov_snp"]], left_on="t", right_on="iid", how="left")
dft1 = get_individual_models(dft, min_p=0.05)

# Area 51

In [158]:
df2 = get_spec_model(dft, sources=['Anatolia_N', 'Steppe_MLBA', 'WHG'], tot_sources=6)   # "Israel_Phoenician"
s = np.sum(df2["p"]>0.01)
print(f"Fitting Models at p>0.01: {s}")

Found 70 fitting rows.
Fitting Models at p>0.01: 46


In [None]:
dft1[dft1["loc"].str.contains("Poli")][["p", "t", "clst",  "loc", "s1", "s2", "s3", "s4", "s5", "f1", "f2", "f3", "f4",
                                                   "std1", "std2", "std3", "std4"]]

In [89]:
df2[df2["clst"].str.contains("Iberia")]

Unnamed: 0,t,p,n,s1,s2,s3,f1,f2,f3,std1,std2,std3,fs,iid,loc,clst,n_cov_snp
