# Prepare the Readcount Data I got from David for the South Americans into a HDF5 compatible with HAPSBURG
Additionally: also prepare the downsampled RC/PH HDF5 versions of high-coverage SA individuals
@ Author: Harald Ringbauer, 2019

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os  # For Saving to Folder
import pandas as pd
import h5py  # Python Package to do the HDF5.

import socket
import os as os
import sys as sys
import multiprocessing as mp

# Select the HAPSBURG root folder from the host name, then make it the working directory
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Any other host has no known data location, so stop here
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # All relative paths used below are resolved from this folder
sys.path.append("./package/hapsburg/")  # Relative to the root folder set by os.chdir above
#from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..


print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root folder chosen above
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Load the Data

In [8]:
def get_df_from_rc(path_ind="../fromDavid/MA577_1240k_all.cnts"):
    """Read a headerless, space-separated read-count table into a DataFrame.

    path_ind: Path (or file-like object) of the read-count file of one individual
    Return: DataFrame with the columns chr, pos, ref, alt, A, G, C, T"""
    column_names = ["chr", "pos", "ref", "alt", "A", "G", "C", "T"]

    counts = pd.read_csv(path_ind, sep=" ", header=None)
    counts.columns = column_names  # Raises if the file does not have exactly 8 columns

    print(f"Loaded {len(counts)} Markers")
    return counts

def creat_count_col(df_rc):
    """Add the columns ref_count and alt_count to a read-count table.

    For every marker the count of the base named in "ref" (resp. "alt") is
    copied from the matching per-base column A/G/C/T. Markers whose allele
    is none of these four bases keep a count of 0.

    df_rc: DataFrame with columns ref, alt, A, G, C, T. It is modified in place.
    Return: The same DataFrame, now with ref_count and alt_count"""
    bases = ("A", "G", "C", "T")

    for allele_col, count_col in (("ref", "ref_count"), ("alt", "alt_count")):
        df_rc[count_col] = 0
        for base in bases:
            is_base = df_rc[allele_col] == base
            df_rc.loc[is_base, count_col] = df_rc[base]

    total_depth = df_rc["alt_count"] + df_rc["ref_count"]
    print(f"Mean Depth: {np.mean(total_depth):.4f}")
    return df_rc

def save_hdf5(gt, ad, ref, alt, pos, ch, samples, path):
    """Create a new HDF5 File with Input Data.

    A file that already exists at `path` is deleted first.

    gt: Genotype data [l,k,2]
    ad: Allele depth [l,k,2]
    ref: Reference Allele [l]
    alt: Alternate Allele [l]
    pos: Position  [l]
    ch: Which chromosome [l]
    samples: Sample IDs [k]
    path: Path of the HDF5 file to write

    Datasets written: variants/CHROM, variants/POS, variants/REF,
    variants/ALT, calldata/AD, calldata/GT and samples.
    No map positions are stored (variants/MAP is not created)."""

    l, k, _ = np.shape(gt)  # Nr loci and Nr of Individuals

    if os.path.exists(path):  ### Do a Deletion of existing File there
        os.remove(path)

    dt = h5py.special_dtype(vlen=str)  # To have no problem with saving

    with h5py.File(path, 'w') as f0:
        ### Create all the Groups
        f_ch = f0.create_dataset("variants/CHROM", (l,), dtype='i')
        f_ad = f0.create_dataset("calldata/AD", (l, k, 2), dtype='i')
        f_ref = f0.create_dataset("variants/REF", (l,), dtype=dt)
        f_alt = f0.create_dataset("variants/ALT", (l,), dtype=dt)
        f_pos = f0.create_dataset("variants/POS", (l,), dtype='i')
        f_gt = f0.create_dataset("calldata/GT", (l, k, 2), dtype='i')
        f_samples = f0.create_dataset("samples", (k,), dtype=dt)

        ### Save the Data
        f_ch[:] = ch
        f_ad[:] = ad
        f_ref[:] = np.asarray(ref).astype("S1")  # Alleles are single bases
        f_alt[:] = np.asarray(alt).astype("S1")
        f_pos[:] = pos
        f_gt[:] = gt
        # "S" without a width sizes the byte strings to the longest ID.
        # A fixed "S10" silently cut IDs longer than 10 characters.
        f_samples[:] = np.array(samples).astype("S")

    print(f"Successfully saved {k} individuals to: {path}")

### Do single Individual

In [3]:
def rc_to_hdf_1ind(path_ind, path_h5="./Data/SA_1240kHDF5/MA577_1240k.h5", iid="MA577_1240k"):
    """Convert the read-count file of one individual into a single-sample HDF5.

    path_ind: Path of the read-count file to load
    path_h5: Path of the HDF5 file to write
    iid: Sample ID stored in the HDF5"""
    counts = creat_count_col(get_df_from_rc(path_ind))
    n_loci = len(counts)

    # Genotypes are not called here: every entry is the missing value -1
    genotypes = np.full((n_loci, 1, 2), -1, dtype="int8")
    # Insert a length-1 individual axis: [l,2] -> [l,1,2]
    depths = counts[["ref_count", "alt_count"]].values[:, np.newaxis, :]

    save_hdf5(
        genotypes,
        depths,
        counts["ref"].values,
        counts["alt"].values,
        counts["pos"].values,
        counts["chr"].values,
        [iid],
        path_h5,
    )
    print(f"Successfully saved to {path_h5}")

In [4]:
base_folder = "../fromDavid/"
out_folder = "./Data/SA_1240kHDF5/"

# Alternative batch: iids=["IPY10", "IPK12", "MA577", "894", "895"]
iids = ["Loschbour", "Stuttgart", "Ust_Ishim"]

# Input read-count file and output HDF5 file for every individual
paths = [os.path.join(base_folder, iid + "_1240k_all.cnts") for iid in iids]
paths_h5 = [os.path.join(out_folder, iid + ".h5") for iid in iids]

for cnts_path, h5_path, sample_id in zip(paths, paths_h5, iids):
    rc_to_hdf_1ind(cnts_path, path_h5=h5_path, iid=sample_id)

Loaded 1115283 Markers
Mean Depth: 19.5486
Successfully saved 1 individuals to: ./Data/SA_1240kHDF5/Loschbour.h5
Successfully saved to ./Data/SA_1240kHDF5/Loschbour.h5
Loaded 1115215 Markers
Mean Depth: 18.2127
Successfully saved 1 individuals to: ./Data/SA_1240kHDF5/Stuttgart.h5
Successfully saved to ./Data/SA_1240kHDF5/Stuttgart.h5
Loaded 1115315 Markers
Mean Depth: 40.7515
Successfully saved 1 individuals to: ./Data/SA_1240kHDF5/Ust_Ishim.h5
Successfully saved to ./Data/SA_1240kHDF5/Ust_Ishim.h5


### Do Individuals from Moreno-Mayar
Process the individuals for which David W. downloaded and processed the RC data. Only the iids and the file suffix differ from the cell above.

In [19]:
base_folder = "../fromDavid/"
out_folder = "./Data/SA_1240kHDF5/"

iids = [
    "A460", "Andaman", "5832", "Lovelock4", "Sumidouro4", "Sumidouro5",
    "Sumidouro6", "Sumidouro7", "19651", "AHUR_2064", "Lovelock1",
    "Lovelock2", "Lovelock3", "Aconcagua",
]

# These read-count files carry the mpileup suffix instead of "_1240k_all.cnts"
paths = [os.path.join(base_folder, iid + "_final.bam.mpileup.cnts") for iid in iids]
paths_h5 = [os.path.join(out_folder, iid + ".h5") for iid in iids]

for cnts_path, h5_path, sample_id in zip(paths, paths_h5, iids):
    rc_to_hdf_1ind(cnts_path, path_h5=h5_path, iid=sample_id)

Loaded 1115101 Markers
Mean Depth: 12.9466
Successfully saved 1 individuals to: ./Data/SA_1240kHDF5/A460.h5
Successfully saved to ./Data/SA_1240kHDF5/A460.h5
Loaded 1102888 Markers
Mean Depth: 22.4121
Successfully saved 1 individuals to: ./Data/SA_1240kHDF5/Andaman.h5
Successfully saved to ./Data/SA_1240kHDF5/Andaman.h5
Loaded 912911 Markers
Mean Depth: 2.2065
Successfully saved 1 individuals to: ./Data/SA_1240kHDF5/5832.h5
Successfully saved to ./Data/SA_1240kHDF5/5832.h5
Loaded 543174 Markers
Mean Depth: 1.3750
Successfully saved 1 individuals to: ./Data/SA_1240kHDF5/Lovelock4.h5
Successfully saved to ./Data/SA_1240kHDF5/Lovelock4.h5
Loaded 696732 Markers
Mean Depth: 1.6262
Successfully saved 1 individuals to: ./Data/SA_1240kHDF5/Sumidouro4.h5
Successfully saved to ./Data/SA_1240kHDF5/Sumidouro4.h5
Loaded 1114912 Markers
Mean Depth: 18.8313
Successfully saved 1 individuals to: ./Data/SA_1240kHDF5/Sumidouro5.h5
Successfully saved to ./Data/SA_1240kHDF5/Sumidouro5.h5
Loaded 1008433 Mar

# Do multiple individuals
Did not work because the individuals have different numbers of SNPs! Left here as a reminder!

# Prepare Downsampled HDF5
Idea: Have HDF5s with several individuals that are downsampled from one master Individual

In [4]:
def downsample_ad(ad, cov_frac):
    """Thin allele depths by binomial sampling.

    Every count in ad is replaced by a Binomial(count, cov_frac) draw,
    i.e. each read is kept independently with probability cov_frac.

    ad: [l,2] Array of Allele Depths (must not contain negative values)
    cov_frac: Fraction of reads to keep
    Return: Array of downsampled Allele Depths, same shape as ad"""
    assert np.min(ad) >= 0  # Sanity Check: counts cannot be negative
    return np.random.binomial(n=ad, p=cov_frac)
    
def rc_to_hdf_cov_ind(base_folder, iid, path_hdf5, down_sampling_covs):
    """Write one HDF5 whose samples are downsampled copies of one individual.

    The read counts of <iid>_1240k_all.cnts are thinned once per entry of
    down_sampling_covs. Each thinned copy becomes one sample, named after
    its fraction with four decimals. Genotypes are left missing (-1).

    base_folder: Folder containing the read-count file
    iid: Individual whose read counts are loaded
    path_hdf5: Path of the HDF5 file to write
    down_sampling_covs: Vector of fractions to downsample to"""
    ### Produce the Sample Names
    samples = [f"{c:.4f}" for c in down_sampling_covs]

    ### Load the full read counts of the master individual
    counts = creat_count_col(get_df_from_rc(os.path.join(base_folder, iid + "_1240k_all.cnts")))
    n_loci = len(counts)
    n_levels = len(down_sampling_covs)

    ### One thinned copy per level, stacked along the individual axis -> [l,k,2]
    full_depth = counts[["ref_count", "alt_count"]].values
    ad = np.stack([downsample_ad(full_depth, frac) for frac in down_sampling_covs], axis=1)

    ### No genotypes filled in!
    gt = np.full((n_loci, n_levels, 2), -1, dtype="int8")

    assert ad.shape == gt.shape  # Sanity Check
    save_hdf5(
        gt,
        ad,
        counts["ref"].values,
        counts["alt"].values,
        counts["pos"].values,
        counts["chr"].values,
        samples,
        path_hdf5,
    )

### Downsample IPK12/Stuttgart/Loschbour/Ust_Ishim Readcounts
Uncomment the call that is needed (and comment out the others).

In [12]:
### IPK12
#down_sampling_covs = np.geomspace(0.04, 1.0, 10)
#rc_to_hdf_cov_ind("../fromDavid/", "IPK12", path_hdf5="./Data/SA_1240kHDF5/IPK12_downsample.h5", 
#                  down_sampling_covs=down_sampling_covs)

#down_sampling_covs=np.geomspace(0.01, 1.0, 10)
#rc_to_hdf_cov_ind("../fromDavid/", "Stuttgart", path_hdf5="./Data/SA_1240kHDF5/Stuttgart_downsample.h5", 
#                  down_sampling_covs=down_sampling_covs)

# Loschbour: ten log-spaced fractions between 0.01 and 1.0
down_sampling_covs = np.geomspace(0.01, 1.0, num=10)
rc_to_hdf_cov_ind(
    "../fromDavid/",
    "Loschbour",
    path_hdf5="./Data/SA_1240kHDF5/Loschbour_downsample.h5",
    down_sampling_covs=down_sampling_covs,
)

#down_sampling_covs=np.geomspace(0.005, 1.0, 10)
#rc_to_hdf_cov_ind("../fromDavid/", "Ust_Ishim", path_hdf5="./Data/SA_1240kHDF5/Ust_Ishim_downsample.h5", 
#                  down_sampling_covs=down_sampling_covs)

Loaded 1115283 Markers
Mean Depth: 19.5486
Successfully saved 10 individuals to: ./Data/SA_1240kHDF5/Loschbour_downsample.h5


### Prepare Downsampled HDF5 with Pseudohaploid Genotypes

In [21]:
def downsample_ph(ad, cov_frac=1.0, mis_val=-1):
    """Draw pseudohaploid genotypes from allele depths and thin them out.

    Per marker one allele is drawn with probability (alt reads / all reads)
    of being the alt allele (1), otherwise ref (0). A marker is then kept
    with probability cov_frac / (fraction of markers with any read), so that
    about cov_frac of all markers end up non-missing. Markers without reads
    are always missing.

    ad: Allele Depths [l,2], column 0 = ref count, column 1 = alt count
    cov_frac: Target fraction of markers with a genotype
    mis_val: Value written for missing genotypes
    Return: int8 Array [l,2], both columns identical"""
    depth = np.sum(ad, axis=1)
    uncovered = depth == 0
    frac_covered = np.mean(depth > 0)  # How many markers are covered at all

    # Use 1 as denominator where there are no reads; those markers are set missing below
    denominator = np.where(uncovered, 1, depth)
    p_alt = ad[:, 1] / denominator  # Fraction of alt reads per marker

    gt = np.random.binomial(n=1, p=p_alt).astype("int8")  # The Pseudohaploid Genotypes

    ### Thin out, so that covered markers times keep probability gives cov_frac
    dropped = np.random.random(size=len(gt)) > (cov_frac / frac_covered)
    gt[dropped | uncovered] = mis_val

    ### Duplicate the single allele into both genotype slots -> (l,2)
    return np.repeat(gt[:, np.newaxis], 2, axis=1)

def rc_to_hdf_cov_ind_ph(base_folder, iid, path_hdf5,
                         down_sampling_covs, suffix=""):
    """Write one HDF5 with pseudohaploid genotypes at several downsampling levels.

    The read counts of <iid>_1240k_all.cnts are loaded once. For every entry
    of down_sampling_covs a pseudohaploid genotype vector is drawn and stored
    as one sample. The allele depths are NOT downsampled: every sample gets
    a copy of the full read counts.

    base_folder: Folder containing the read-count file
    iid: Individual whose read counts are loaded
    path_hdf5: Path of the HDF5 file to write
    down_sampling_covs: Vector of target fractions to downsample to
    suffix: Suffix to append to the sample names"""
    ### Produce the Sample Names
    samples = [f"{c:.4f}{suffix}" for c in down_sampling_covs]

    ### Load the full read counts of the master individual
    counts = creat_count_col(get_df_from_rc(os.path.join(base_folder, iid + "_1240k_all.cnts")))
    n_levels = len(down_sampling_covs)
    full_depth = counts[["ref_count", "alt_count"]].values

    ### One genotype draw per level, stacked along the individual axis -> [l,k,2]
    gts = np.stack([downsample_ph(full_depth, frac) for frac in down_sampling_covs], axis=1)
    ### Same full allele depths for every level -> [l,k,2]
    ads = np.repeat(full_depth[:, np.newaxis, :], n_levels, axis=1)

    assert ads.shape == gts.shape  # Sanity Check
    save_hdf5(
        gts,
        ads,
        counts["ref"].values,
        counts["alt"].values,
        counts["pos"].values,
        counts["chr"].values,
        samples,
        path_hdf5,
    )

In [10]:
# Loschbour: eight evenly spaced target fractions, 0.3 to 1.0
down_sampling_covs = np.linspace(0.3, 1.0, num=8)
rc_to_hdf_cov_ind_ph(
    "../fromDavid/",
    "Loschbour",
    path_hdf5="./Data/SA_1240kHDF5/Loschbour_downsample_ph.h5",
    down_sampling_covs=down_sampling_covs,
)

Loaded 1115283 Markers
Mean Depth: 19.5486
Successfully saved 8 individuals to: ./Data/SA_1240kHDF5/Loschbour_downsample_ph.h5


In [13]:
# Stuttgart: eight evenly spaced target fractions, 0.3 to 1.0
down_sampling_covs = np.linspace(0.3, 1.0, num=8)
rc_to_hdf_cov_ind_ph(
    "../fromDavid/",
    "Stuttgart",
    path_hdf5="./Data/SA_1240kHDF5/Stuttgart_downsample_ph.h5",
    down_sampling_covs=down_sampling_covs,
)

Loaded 1115215 Markers
Mean Depth: 18.2127
Successfully saved 8 individuals to: ./Data/SA_1240kHDF5/Stuttgart_downsample_ph.h5


In [8]:
# Ust_Ishim: nine evenly spaced target fractions, 0.2 to 1.0
down_sampling_covs = np.linspace(0.2, 1.0, num=9)
rc_to_hdf_cov_ind_ph(
    "../fromDavid/",
    "Ust_Ishim",
    path_hdf5="./Data/SA_1240kHDF5/Ust_Ishim_downsample_ph.h5",
    down_sampling_covs=down_sampling_covs,
)

Loaded 1115315 Markers
Mean Depth: 40.7515
Successfully saved 8 individuals to: ./Data/SA_1240kHDF5/Ust_Ishim_downsample_ph.h5


In [5]:
# IPK12: eight evenly spaced target fractions, 0.3 to 1.0
down_sampling_covs = np.linspace(0.3, 1.0, num=8)
rc_to_hdf_cov_ind_ph(
    "../fromDavid/",
    "IPK12",
    path_hdf5="./Data/SA_1240kHDF5/IPK12_downsample_ph0.h5",
    down_sampling_covs=down_sampling_covs,
)

Loaded 1114250 Markers
Mean Depth: 10.4390
Successfully saved 8 individuals to: ./Data/SA_1240kHDF5/IPK12_downsample_ph0.h5


# Do 20 Replicates of Downsampling Ust Ishim

In [None]:
reps = 20
down_sampling_covs = np.linspace(0.2, 1.0, 9)

# Make sure the output folder exists before any replicate is written.
# NOTE(review): h5py is not expected to create missing parent folders - confirm.
rep_folder = "./Data/SA_1240kHDF5/Ust_Ishim_rep/"
os.makedirs(rep_folder, exist_ok=True)

for r in range(reps):
    path_save = os.path.join(rep_folder, f"downsample_ph_r{r}.h5")

    # Each replicate is an independent random draw; the suffix marks the
    # replicate number in the sample names
    rc_to_hdf_cov_ind_ph("../fromDavid/", "Ust_Ishim", path_hdf5=path_save,
                         down_sampling_covs=down_sampling_covs, suffix=f"_r{r}")

# Area 51

In [53]:
down_sampling_covs = np.linspace(0.3, 1.0, 8)
down_sampling_covs

array([0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [79]:
ad = np.array([[0,2],[0,2],[2,2]])
ad

array([[0, 2],
       [0, 2],
       [2, 2]])

In [83]:
np.shape(ad)

(3, 2)

In [85]:
b = np.tile(ad[:,None,:], (1,5,1))
np.shape(b)

(3, 5, 2)

In [87]:
b[:,2,:]

array([[0, 2],
       [0, 2],
       [2, 2]])

### Test whether Loading works properly

In [4]:
# Open one of the written HDF5 files read-only and print its dimensions and dataset names.
# The handle f is left open on purpose: the cells below keep reading from it.
path_load = "./Data/SA_1240kHDF5/Loschbour_downsample.h5"
#path_load = "./Data/SA_1240kHDF5/IPK12_downsample_ph.h5"
f = h5py.File(path_load, "r") # Load for Sanity Check. See below!
        
print("Loaded HDF5")
print("Loaded %i variants" % np.shape(f["calldata/GT"])[0])     # Axis 0 of GT: markers
print("Loaded %i individuals" % np.shape(f["calldata/GT"])[1])  # Axis 1 of GT: samples
print(list(f["calldata"].keys()))
print(list(f["variants"].keys()))

Loaded HDF5
Loaded 1115283 variants
Loaded 10 individuals
['AD', 'GT']
['ALT', 'CHROM', 'POS', 'REF']


In [5]:
f["samples"][:]

array(['0.0100', '0.0167', '0.0278', '0.0464', '0.0774', '0.1292',
       '0.2154', '0.3594', '0.5995', '1.0000'], dtype=object)

In [8]:
np.shape(f["calldata/GT"])

(1114250, 10, 2)

In [10]:
# Fraction of genotype entries per sample that are not missing (value above -1)
covs_gt = [(f["calldata/GT"][:, i, :] > -1).mean() for i in range(len(f["samples"]))]
covs_gt

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [None]:
# Number of markers per chromosome. Series.value_counts replaces the
# deprecated top-level pd.value_counts.
pd.Series(f["variants/CHROM"][:]).value_counts()

In [10]:
f["variants/CHROM"][:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [None]:
# Mean total depth per marker and sample: the mean runs over both allele
# columns, so it is doubled to get ref + alt
covs_ad = [f["calldata/AD"][:, i, :].mean() * 2 for i in range(len(f["samples"]))]
covs_ad

In [13]:
# Full mean depth times the smallest downsampling fraction gives the mean depth expected after thinning.
# NOTE(review): 10.4390 matches the mean depth printed for IPK12 and 0.04 the
# lowest IPK12 fraction in the commented-out geomspace call - confirm.
10.438989454790217 * 0.04

0.4175595781916087

In [None]:
f["calldata/AD"][:10,1,:]

In [None]:
f["calldata/GT"][:10,5,:]

In [None]:
# Number of markers per chromosome. Series.value_counts replaces the
# deprecated top-level pd.value_counts.
pd.Series(f["variants"]["CHROM"][:]).value_counts()

In [34]:
np.where(f["samples"][:]=="MA577_1240")[0]

array([], dtype=int64)

In [74]:
# Files written with a 10-byte sample field hold IDs cut to 10 characters,
# hence the lookup without the trailing "k"
iid = "MA577_1240"

samples = f["samples"][:]
assert len(samples) == f["calldata/GT"].shape[1]  # Sanity Check: one ID per genotype column

# Positions in the sample list whose ID equals iid
id_obs = np.flatnonzero(samples == iid)
if id_obs.size == 0:
    raise RuntimeError(f"Individual {iid} not found in Samples Field")

In [76]:
id_obs[0]

0

In [41]:
f["samples"][:]

array(['IPY10', 'IPK12', 'MA577', '894', '895'], dtype=object)

In [None]:
'IPY10', 'IPK12', 'MA577', '894', '895'

In [80]:
# Load the IPY10 read counts and add the ref/alt count columns in one step
df_rc = creat_count_col(get_df_from_rc(path_ind="../fromDavid/IPY10_1240k_all.cnts"))

Loaded 1114799 Markers


In [None]:
df_rc[df_rc["chr"]==10]

In [66]:
# Load the IPK12 read counts and add the ref/alt count columns in one step
df_t = creat_count_col(get_df_from_rc(path_ind='../fromDavid/IPK12_1240k_all.cnts'))

Loaded 1114250 Markers
Mean Depth: 10.4390


In [68]:
# Total read depth (ref + alt) per marker
sum_cov = df_t["ref_count"].add(df_t["alt_count"])

In [None]:
sum_cov.value_counts()

In [33]:
df_t=get_df_from_rc("../fromDavid/Ust_Ishim_1240k_all.cnts")

Loaded 1115315 Markers


In [34]:
df_t["chr"].value_counts()

2     95427
1     90211
3     78609
6     76778
5     70297
4     69671
8     62024
7     60650
10    59583
11    55558
12    54584
9     51354
13    39532
14    36785
16    34996
15    34886
18    34327
17    29665
20    29439
19    18697
21    16250
22    15992
Name: chr, dtype: int64

In [35]:
df_t.head(10)

Unnamed: 0,chr,pos,ref,alt,A,G,C,T
0,1,752566,G,A,36,23,0,0
1,1,776546,A,G,43,0,0,0
2,1,832918,T,C,0,0,0,54
3,1,842013,T,G,0,0,0,32
4,1,846864,G,C,0,26,0,0
5,1,869303,C,T,0,0,20,3
6,1,891021,G,A,39,0,0,0
7,1,896271,C,T,0,0,5,3
8,1,903426,C,T,0,0,32,0
9,1,914852,G,C,0,8,17,0
