# Calculate F_ST Matrix for Punic project
This is run on v46.3 data

In [None]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

socket_name = socket.gethostname()
print(socket_name)

# Only hosts whose name starts with "compute-" are supported; stop early
# everywhere else instead of running with an undefined project path.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project root on the cluster

os.chdir(path)  # Relative paths used later (e.g. "./data/...") resolve from here
# Report the working directory, the available CPUs and the Python version
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

In [2]:
def create_meta_df(f, path_meta="/n/groups/reich/hringbauer/Data/v43.4.anno.csv"):
    """Build the meta dataframe that lines up with the samples of an hdf5.
    f: opened hdf5 (anything where f["samples"][:] yields the sample iids)
    path_meta: csv with at least the columns "iid" and "clst"
    Return a dataframe in hdf5 sample order (left merge on "iid"). Samples
    without a meta entry get clst "Not Available".
    NOTE(review): rows only match the hdf5 1:1 if iids are unique in the csv."""
    iids = f["samples"][:].astype("str")
    meta = pd.read_csv(path_meta)
    print(f"Loaded {len(meta)} Samples")

    # Left merge: keep every hdf5 sample, in hdf5 order
    merged = pd.DataFrame({"iid": iids}).merge(meta, on="iid", how="left")
    print(f"Created matching Meta Dataframe for h5: {len(merged)}")

    merged = merged.reset_index(drop=True)
    merged["clst"] = merged["clst"].fillna("Not Available")
    return merged

def get_cluster_idx(df, clst="", col_clst="clst",
                    age_range=(), exact=False,
                    include_col="include"):
    """Get idcs (row positions in df) of all samples within Cluster.
    df: meta dataframe
    clst: cluster label to look for in column col_clst
    col_clst: name of the column holding the cluster labels
    age_range: accepted for future use; age filtering is NOT implemented
      yet, so the value currently has no effect
    exact: if True the label has to equal clst, otherwise it only has to
      contain clst (clst is interpreted by pandas str.contains, i.e. as a
      regular expression)
    include_col: only keep rows where this column is True. Pass an empty
      string to skip this filter
    Return integer array of matching row positions."""
    if exact:
        is_member = df[col_clst] == clst
    else:
        # na=False: samples without a label never match
        is_member = df[col_clst].str.contains(clst, na=False)

    if len(include_col) > 0:
        # Explicit comparison on purpose: the column may hold 1/0 or NaN
        is_member = is_member & (df[include_col] == True)  # noqa: E712

    idcs = np.where(is_member)[0]

    ### Do additional Filtering
    if len(age_range) > 0:
        pass  # Placeholder: no age filter is applied
    return idcs

###############################
### Calculate the Allele Counts
def get_ph(f, idcs):
    """Draw pseudohaploid calls from the read counts of an hdf5.
    f: hdf5 with field "calldata/AD", indexed as [locus, individual, allele]
    idcs: indices of the individuals to use
    Per locus and individual one allele is drawn at random, with the
    probability of drawing 1 equal to the fraction of reads in the second
    AD column. Return integer array [l,n]: 0 or 1 where there is
    coverage, -1 where there is none."""
    reads = f["calldata/AD"][:, idcs, :2]
    reads[reads < 0] = 0  # Negative entries (missing data) count as no reads
    depth = reads.sum(axis=2)  # Coverage per locus/individual
    covered = depth > 0

    # Fraction of reads in column 1. Uncovered entries are not computed by
    # the division and get a valid dummy probability right afterwards
    p_alt = np.divide(reads[:, :, 1], depth, where=covered)
    p_alt[~covered] = 1
    p_alt = np.clip(p_alt, a_min=0, a_max=1)  # Sanity check to deal with numerics

    draws = np.random.binomial(1, p_alt)
    return np.where(covered, draws, -1)  # Flag uncovered entries as missing

def get_gt(f, idcs):
    """Get diploid genotype counts.
    f: hdf5 with field "calldata/GT", indexed as [locus, individual, allele]
    idcs: indices of the individuals to use
    Return array [l,n] with the sum over the last axis of GT (the number
    of derived variants per locus and individual).
    Fails with an AssertionError if any GT entry is negative."""
    calls = f["calldata/GT"][:, idcs, :]
    assert calls.min() >= 0
    return calls.sum(axis=2)

def calc_ac_from_ph(ph):
    """Calculate allele counts from pseudo-haploid calls.
    ph: Array of pseudo-haploid calls [l,n]. Entries equal to 0 count as
      reference, entries equal to 1 as alternative, anything else
      (e.g. -1) is not counted
    Return [l,2] array with columns (ref count, alt count)."""
    # A row of 0,0 is no problem: goes to NaN and is then caught by allel
    return np.column_stack(
        [(ph == allele).sum(axis=1) for allele in (0, 1)]
    )

def calc_ac_from_gt(gt):
    """Calculate allele counts from diploid genotypes.
    gt: Array of genotypes [l,n]. 0 contributes two reference alleles,
      2 two alternative alleles and 1 one of each. Other values are
      not counted
    Return [l,2] array with columns (ref count, alt count)."""
    n_het = (gt == 1).sum(axis=1)  # Heterozygotes add one allele to each column
    n_ref = 2 * (gt == 0).sum(axis=1) + n_het
    n_alt = 2 * (gt == 2).sum(axis=1) + n_het
    return np.column_stack((n_ref, n_alt))

def get_ac_from_f(f, idcs, ph=True):
    """Get allele counts [l,2] from hdf5, pooled over all individuals
    in idcs.
    f: hdf5
    idcs: indices of the individuals to pool
    ph: If True use randomly drawn pseudo-haploid calls, otherwise the
      diploid genotypes"""
    if not ph:
        print("Using diploid mode...")
        return calc_ac_from_gt(get_gt(f, idcs))

    calls = get_ph(f, idcs)
    return calc_ac_from_ph(ph=calls)

def calculate_ac_pop(clst, f, df, col="clst", exact=False,
                     ph=True, include_col="include"):
    """Return allele counts [l,2] for a single population.
    clst: cluster label of the population
    f: hdf5
    df: metafile matching f
    col: column of df holding the cluster labels
    exact: whether the label has to match exactly (otherwise: contains)
    ph: whether to use pseudo-haploid (True) or diploid genotypes
    include_col: column of df flagging samples to use ("" for no filter)"""
    members = get_cluster_idx(
        df,
        clst=clst,
        exact=exact,
        col_clst=col,
        include_col=include_col,
    )
    return get_ac_from_f(f, members, ph=ph)

def calculate_ac_pops(pops, f, df, col="clst", ph=True,
                      exact=False, out=True):
    """Calculate list of allele counts [l,2] for pops
    pops: List of populations to extract ACs for
    f: hdf5
    df: metafile matching f
    col: column of df holding the population labels
    ph: whether to use pseudo-haploid (True) or diploid genotypes
    exact: whether the label has to match exactly (otherwise: contains)
    out: whether to print progress per population
    Return list with one [l,2] allele count array per entry of pops
    (empty list for empty pops).
    Raise RuntimeError naming every population without matching samples.
    This happens before any genotype data is read."""
    ### Check whether all pops have matches first. The indices are kept
    ### and reused below, so every population is only looked up once.
    idcss = [get_cluster_idx(df=df, clst=pop, exact=exact, col_clst=col)
             for pop in pops]
    missing = [pop for pop, idcs in zip(pops, idcss) if len(idcs) == 0]
    if missing:
        raise RuntimeError(f"Pops {missing} not found!")

    acs = []
    for pop, idcs in zip(pops, idcss):
        if out:
            print(f"Calculating counts pop: {pop}, n={len(idcs)}...")
        acs.append(get_ac_from_f(f, idcs, ph=ph))
    return acs

###########################################
###########################################
### Calculate the actual f statistics

def f3_ac(pt, p1, p2, snps_okay=None, blen=1000):
    """Calculate f3 for Allele Counts (lx2 arrays)
    pt, p1, p2: allele counts, handed to allel.average_patterson_f3 in
      this order (un-normed version, normed=False)
    snps_okay: Which SNPs to actually use. Currently ignored: all SNPs
      are used
    blen: passed on as allel's blen (block size for its standard error)
    Return list [f3, se, z]
    """
    f3, se, z = allel.average_patterson_f3(
        pt, p1, p2, blen=blen, normed=False
    )[:3]
    return [f3, se, z]  # f3, se, z

def f4_ac(p1, p2, p3, p4, snps_okay=None, blen=1000):
    """Calculate f4-type statistic for Allele Counts (lx2 arrays)
    p1, p2, p3, p4: allele counts, handed to allel.average_patterson_d
      in this order
    snps_okay: Which SNPs to actually use. Currently ignored: all SNPs
      are used
    blen: passed on as allel's blen (block size for its standard error)
    Return list [estimate, se, z]
    NOTE(review): the value comes from allel.average_patterson_d, which
    according to the scikit-allel docs estimates Patterson's D rather
    than an un-normalised f4 - confirm this is intended.
    """
    stat, se, z = allel.average_patterson_d(p1, p2, p3, p4, blen=blen)[:3]
    return [stat, se, z]  # estimate, se, z

def fst_ac(p1, p2, blen=1000):
    """Calculate fst for Allele Counts (lx2 arrays)
    p1, p2: allele counts of the two populations
    blen: passed on as allel's blen (block size for its standard error)
    A simple wrapper, so later on different methods can be implemented.
    Return list [fst, se, z]. The first two entries are taken from
    allel.average_patterson_fst, z is computed here as fst / se.
    """
    fst, se = allel.average_patterson_fst(p1, p2, blen=blen)[:2]
    z = fst / se  # z-value of the estimate
    return [fst, se, z]  # fst, se, z

# Load the Data

In [None]:
%%time
# Input files. Relative paths are resolved from the current working directory
path_anno = "./data/v46.4.anno.csv"
path_h5 = "/n/groups/reich/hringbauer/git/hapBLOCK/data/hdf5/1240k_v46.2/all_ch.h5"
path_meta = "./data/meta_v1.tsv"

# Open the hdf5 read-only and build the meta table matching its samples.
# The file handle f is left open here.
f = h5py.File(path_h5, "r")
df = create_meta_df(f, path_meta=path_anno)

### Merge in the cluster labels
df1 = pd.read_csv(path_meta, sep="\t")
### Only rows flagged include==1 are merged in; all other samples end up
### with missing include and the label "not assigned"
df1 = df1.loc[df1["include"] == 1].copy().reset_index(drop=True)
df1 = df.merge(df1[["iid", "label_region", "include"]], on="iid", how="left")
df1["label_region"] = df1["label_region"].fillna("not assigned")
# The left merge must not add rows, otherwise df1 no longer matches the hdf5
assert len(df1) == len(df)