# Prepare the Readcount Data I got from David for the South Americans
@ Author: Harald Ringbauer, 2019

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import os  # For Saving to Folder
import pandas as pd
import h5py  # Python Package to do the HDF5.

import socket
import os as os
import sys as sys
import multiprocessing as mp

socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)
sys.path.append("./Python3/")  # Since now we are in the Root Directory
from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..


print(os.getcwd()) # Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Load the Data

In [7]:
def get_df_from_rc(path_ind = "../fromDavid/MA577_1240k_all.cnts"):
    path_ind = "../fromDavid/MA577_1240k_all.cnts"
    df_rc = pd.read_csv(path_ind, header=None, sep=" ")
    df_rc.columns=["chr","pos", "ref", "alt", "A", "G", "C", "T"]
    print(f"Loaded {len(df_rc)} Markers")
    return df_rc

def creat_count_col(df_rc):
    """Add Fields for ref and alt Count"""
    df_rc["ref_count"]=0
    df_rc["alt_count"]=0

    for a in ["A", "G", "C", "T"]:
        idcs = df_rc["ref"]==a
        df_rc.loc[idcs,"ref_count"]=df_rc[a]

        idcs = df_rc["alt"]==a
        df_rc.loc[idcs,"alt_count"]=df_rc[a]
    return df_rc

def save_hdf5(gt, ad, ref, alt, pos, ch, samples, path):
    """Create a new HDF5 File with Input Data.
    gt: Genotype data [l,k,2]
    ad: Allele depth [l,k,2]
    ref: Reference Allele [l]
    alt: Alternate Allele [l]
    pos: Position  [l]
    m: Map position [l]
    ch: Which chromosome [l]
    samples: Sample IDs [k]"""

    l, k, _ = np.shape(gt)  # Nr loci and Nr of Individuals

    if os.path.exists(path):  ### Do a Deletion of existing File there
        os.remove(path)

    dt = h5py.special_dtype(vlen=str)  # To have no problem with saving

    with h5py.File(path, 'w') as f0:
        ### Create all the Groups
        #f_map = f0.create_dataset("variants/MAP", (l,), dtype='f')
        f_ch = f0.create_dataset("variants/CHROM", (l,), dtype='i')
        f_ad = f0.create_dataset("calldata/AD", (l, k, 2), dtype='i')
        f_ref = f0.create_dataset("variants/REF", (l,), dtype=dt)
        f_alt = f0.create_dataset("variants/ALT", (l,), dtype=dt)
        f_pos = f0.create_dataset("variants/POS", (l,), dtype='i')
        f_gt = f0.create_dataset("calldata/GT", (l, k, 2), dtype='i')
        f_samples = f0.create_dataset("samples", (k,), dtype=dt)

        ### Save the Data
        #f_map[:] = rec
        f_ch[:] = ch
        f_ad[:] = ad
        f_ref[:] = ref.astype("S1")
        f_alt[:] = alt.astype("S1")
        f_pos[:] = pos
        f_gt[:] = gt
        f_samples[:] = np.array(samples).astype("S10")

    print(f"Successfully saved {k} individuals to: {path}")

### Do single Indivudal

In [21]:
def rc_to_hdf_1ind(path_ind, path_h5="./Data/SA_1240kHDF5/MA577_1240k.h5", iid="MA577_1240k"):
    """Produce HDF5 File from Readcount Data"""
    df_rc= get_df_from_rc(path_ind="../fromDavid/MA577_1240k_all.cnts")
    df_rc = creat_count_col(df_rc)
    l = len(df_rc)
    k = 1

    ###
    gt = -np.ones((l,k,2), dtype="int8") # No genotypes
    ad = df_rc[["ref_count", "alt_count"]].values[:,None,:] # None for n=1 axis
    ref = df_rc["ref"].values
    alt = df_rc["alt"].values
    pos = df_rc["pos"].values
    ch = df_rc["chr"].values
    samples=[iid,]

    save_hdf5(gt, ad, ref, alt, pos, ch, samples, path_h5)
    print(f"Successfully saved to {path_h5}")

# Do multiple individuals

In [27]:
def rc_to_hdf_mul_ind(base_folder, iids, path_hdf5):
    """Create HDF5s with Readcounts from multiple Individuals"""
    ### Same as for 1 Individual, but as a Loop
    paths = [os.path.join(base_folder, iid + "_1240k_all.cnts") for iid in iids]
    df_rcs = [get_df_from_rc(path_ind=p) for p in paths]
    df_rcs = [creat_count_col(df_rc) for df_rc in df_rcs]

    ### Sanity Check if all Datasets are identical
    lens = [len(df_rc) for df_rc in df_rcs]
    assert(len(set(lens))==1) 

    ### Get Fields that are same for all Individuals (i.e. Array)
    df_rc = df_rcs[0]
    ref = df_rc["ref"].values
    alt = df_rc["alt"].values
    pos = df_rc["pos"].values
    ch = df_rc["chr"].values

    # Get Fields that are a matrix
    l = len(df_rc)
    k = len(df_rcs)
    gt = -np.ones((l, k, 2), dtype="int8") # No genotypes filled in!

    ad = [df_rc[["ref_count", "alt_count"]].values for df_rc in df_rcs]   # None for n=1 axis
    ad = np.stack(ad, axis=1)          # Combine the allele Depths (along axis 1 for individuals)
    assert(np.shape(ad)==np.shape(gt)) # Sanity Check
    save_hdf5(gt, ad, ref, alt, pos, ch, iids, path_hdf5)

In [28]:
### Run for the first 5 Individuals
rc_to_hdf_mul_ind(base_folder="../fromDavid/", iids=["IPY10", "IPK12", "MA577", "894", "895"], 
                  path_hdf5="./Data/SA_1240kHDF5/SA_inds_5.h5")

Loaded 907683 Markers
Loaded 907683 Markers
Loaded 907683 Markers
Loaded 907683 Markers
Loaded 907683 Markers
Successfully saved 5 individuals to: ./Data/SA_1240kHDF5/SA_inds_5.h5


# Area 51

In [29]:
path_load = "./Data/SA_1240kHDF5/SA_inds_5.h5"
f = h5py.File(path_load, "r") # Load for Sanity Check. See below!
        
print("Loaded HDF5")
print("Loaded %i variants" % np.shape(f["calldata/GT"])[0])
print("Loaded %i individuals" % np.shape(f["calldata/GT"])[1])
print(list(f["calldata"].keys()))
print(list(f["variants"].keys()))

Loaded HDF5
Loaded 907683 variants
Loaded 5 individuals
['AD', 'GT']
['ALT', 'CHROM', 'POS', 'REF']


In [30]:
np.shape(f["calldata/GT"])

(907683, 5, 2)

In [31]:
f["calldata/AD"][:10,0,:]

array([[ 0,  3],
       [ 1,  0],
       [ 1,  0],
       [ 4,  0],
       [ 6,  0],
       [ 0,  2],
       [ 0,  6],
       [ 3,  0],
       [ 0,  8],
       [ 0, 14]], dtype=int32)

In [32]:
pd.value_counts(f["variants"]["CHROM"][:])

2     77218
1     74716
3     63082
6     61135
5     56089
4     53931
8     49916
10    49208
7     48464
11    45757
12    44218
9     41707
13    30798
16    29942
14    29933
15    29024
18    27515
17    25846
20    25154
19    16650
22    14323
21    13057
dtype: int64

In [34]:
np.where(f["samples"][:]=="MA577_1240")[0]

array([], dtype=int64)

In [74]:
iid="MA577_1240"

samples = f["samples"][:]
assert(len(samples) == np.shape(f["calldata/GT"])[1])  # Sanity Check

id_obs = np.where(samples == iid)[0]
if len(id_obs) == 0:
    raise RuntimeError(f"Individual {iid} not found in Samples Field")

In [76]:
id_obs[0]

0

In [41]:
f["samples"][:]

array(['IPY10', 'IPK12', 'MA577', '894', '895'], dtype=object)

In [None]:
'IPY10', 'IPK12', 'MA577', '894', '895'