### Create test data to check implementation of HMM
Simulates data under the model itself, so that the HMM can be checked on haplotypes whose shared (IBD) block is known.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import socket as socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py
import allel
import itertools as it

# Identify the host; the repository path below is only valid on the cluster
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: stop right away on any host that is not a cluster compute node
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM O2 Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # The Path on Harvard Cluster

# Work from the repository root, so that the relative "./output/..." paths
# used further down resolve (in line with Atom default)
os.chdir(path)

print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

compute-a-16-54.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.
/n/groups/reich/hringbauer/git/hapBLOCK
CPU Count: 32


### Create Array of Means

In [2]:
def draw_normal_all_freqs(l=5000, mean=0.5, std=0.1):
    """Draw allele frequencies from a Normal distribution.
    l: Number of loci, i.e. length of the returned vector
    mean: Mean of the Normal distribution
    std: Standard deviation of the Normal distribution
    Return vector [l] of frequencies. Values are NOT clipped,
    so draws outside of [0,1] are possible."""
    return np.random.normal(mean, std, l)

def draw_haplotype(p):
    """Draw one random haplotype from allele frequencies p.
    Each locus is drawn independently and is True with
    the probability given by p at that locus
    (assumes Hardy-Weinberg).
    p: Allele frequencies [l]
    Return boolean vector [l]"""
    n_loci = len(p)
    uniform = np.random.random(n_loci)  # One uniform draw per locus
    return uniform < p

def create_hap_ll(h):
    """Turn a haplotype into error-free haplotype likelihoods.
    h: Haplotype vector [l] consisting of 0/1
    Return boolean array [l,2]: Column 1 is set where h equals 1,
    column 0 is set at all other loci."""
    is_derived = (h == 1)
    hap_ll = np.zeros((len(h), 2), dtype="bool")
    hap_ll[~is_derived, 0] = True  # Loci not carrying a 1
    hap_ll[is_derived, 1] = True   # Loci carrying a 1
    return hap_ll
    
def copy_in_block(h_s, h_t, loc=[10,20]):
    """Copy one segment of the source haplotype into the target.
    h_s: The source haplotype
    h_t: The target haplotype. It is modified in place!
    loc: [start, end] of the segment to copy (end exclusive)
    Return the modified target h_t."""
    start, end = loc[0], loc[1]
    h_t[start:end] = h_s[start:end]
    return h_t

def create_ibd_haplos(l=10000, loc=[2000,4000], mean=0.5, std=0.1):
    """Simulate four haplotypes of which two share one copied block.
    Steps: Draw allele frequencies, draw four haplotypes from them,
    overwrite the segment loc of haplotype 0 with the same segment
    of haplotype 2, and convert every haplotype to likelihoods.
    l: Number of loci
    loc: [start, end] of the copied segment (end exclusive)
    mean, std: Parameters of the Normal allele frequency distribution
    Return p [l] and haplotype likelihoods [4,l,2]"""
    p = draw_normal_all_freqs(l=l, mean=mean, std=std)

    hts = []
    for _ in range(4):
        hts.append(draw_haplotype(p))

    # Haplotype 2 is the source, haplotype 0 the target of the copied block
    hts[0] = copy_in_block(h_s=hts[2], h_t=hts[0], loc=loc)

    htsl = np.array(list(map(create_hap_ll, hts)))
    return p, htsl

def save_haplo_ll(hts, p=None, folder="./output/simulated/undermodel/",
                  delimiter='\t'):
    """Save Haplotype Likelihoods in standardized format.
    hts: 1D or 2D array, written to <folder>/haplo_ll.tsv.
    np.savetxt only accepts 1D/2D input, an array with more
    dimensions has to be reshaped by the caller first.
    p: Optional allele frequencies, written to <folder>/p.tsv.
    Nothing is written for p if it is None or empty.
    folder: Output folder. Created (with parents) if missing.
    delimiter: Column delimiter of the output files."""
    # exist_ok avoids the race between checking for and creating the folder
    os.makedirs(folder, exist_ok=True)

    savepath = os.path.join(folder,"haplo_ll.tsv")
    np.savetxt(savepath, hts, delimiter=delimiter)
    # Last axis = loci. Identical to axis 1 for 2D input, and
    # also valid for 1D input (where axis 1 does not exist).
    n_loci = np.shape(hts)[-1]
    print(f"Saved {n_loci} loci likelihoods to {savepath}")

    if p is not None and len(p)>0:
        savepath = os.path.join(folder,"p.tsv")
        np.savetxt(savepath, p, delimiter=delimiter)
        print(f"Saved {len(p)} allele frequencies to {savepath}")

### Test single Haplotype

In [50]:
# Allele frequencies for 10000 loci
p = draw_normal_all_freqs(l=10000, mean=0.5, std=0.1)
# Sanity Check: every frequency has to lie strictly between 0 and 1
assert (p.min() > 0) and (p.max() < 1)

### Draw random Haplotype
h0 = draw_haplotype(p)
print(f"Number of derived variants: {np.sum(h0)} / {len(h0)}")

# Column 1 of the likelihoods is set exactly where the haplotype is 1,
# so the count below has to match the count above
l0 = create_hap_ll(h0)
print(f"Number of derived variants haplotype ll: {np.sum(l0[:,1])} / {len(l0)}")

Number of derived variants: 4995 / 10000
Number of derived variants haplotype ll: 4995 / 10000


### Create IBD sharing

In [88]:
%%time
# Simulate four haplotypes over 10000 loci. The segment [2000, 4000) of
# haplotype 0 is copied from haplotype 2. Yields the allele frequencies p
# and the haplotype likelihoods htsl.
p, htsl = create_ibd_haplos(l=10000, loc=[2000,4000], mean=0.5, std=0.1)

CPU times: user 5.28 ms, sys: 616 µs, total: 5.9 ms
Wall time: 4.29 ms


In [89]:
# NOTE(review): `hts` is not assigned in any cell shown here -- the preceding
# cell binds the simulated likelihoods to `htsl`. Unless `hts` is defined
# elsewhere, this call relies on leftover kernel state and would raise a
# NameError after Restart & Run All. `htsl` is documented as [4,l,2], while
# np.savetxt only accepts 1D/2D arrays, so simply renaming the argument is
# not enough -- TODO confirm which 2D array is meant to be saved.
save_haplo_ll(hts=hts, p=p, folder="./output/simulated/undermodel/sim1/")

Saved 10000 loci likelihoods to ./output/simulated/undermodel/sim1/haplo_ll.tsv
Saved 10000 allele frequencies to ./output/simulated/undermodel/sim1/p.tsv


# Area 51

### Test loading the saved data

In [90]:
# Read both saved files back in as tab-separated float arrays
htsl1 = np.loadtxt("./output/simulated/undermodel/sim1/haplo_ll.tsv", dtype=float, delimiter="\t")
p1 = np.loadtxt("./output/simulated/undermodel/sim1/p.tsv", dtype=float, delimiter="\t")