# Class that implements HMM methods:
In particular the 

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import socket as socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py
import allel
import itertools as it
import psutil
from scipy.special import logsumexp

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM O2 Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # The Path on Harvard Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)

print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

sys.path.append("./python3/") 
from emission import load_emission_model
from transition import load_transition_model

compute-a-17-93.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.
/n/groups/reich/hringbauer/git/hapBLOCK
CPU Count: 32


In [14]:
from python3.cfunc import fwd_bkwd_fast

In [3]:
def fwd_bkwd(e_mat0, t_mat, in_val = 1e-4, full=False):
    """Takes emission and transition probabilities, and calculates posteriors.
    Uses speed-up specific for symmetric states 1...n (pooling same transition rates)
    Low-Mem: Do no save the full FWD BWD and Posterior. Use temporary
    Arrays for saving.
    Input:
    e_mat0: Emission probabilities [k x l] (log space)       (log space)
    t_mat: Transition Matrix: [l x 3 x 3]                     (normal space)
    in_val: Initial Probability of being in IBD state
    full: Boolean whether to return everything"""
    n_states = e_mat0.shape[0]
    n_loci = e_mat0.shape[1]
    stay, tot_ll = 0.,0.  #e Probablility of Staying

    # Initialize Posterior and Transition Probabilities
    post = np.empty((n_loci,n_states), dtype=np.float) # Array of 0 State Posterior
    trans_ll = np.empty(n_states-1, dtype=np.float) # Array for pre-calculations

    three_v = np.empty(3, dtype=np.float)     # Array of size three
    two_v = np.empty(2, dtype=np.float)       # Array of size two

    ### Initialize FWD BWD Arrays
    fwd0 = np.zeros(n_states, dtype=np.float)
    fwd0[:] = np.log(in_val)  # Initial Probabilities
    fwd0[0] = np.log(1 - (n_states - 1) * in_val)
    #cdef double[:] fwd = fwd0

    bwd0 = np.zeros(n_states, dtype=np.float)
    bwd0[:] = np.log(in_val)
    bwd0[0] = np.log(1 - (n_states - 1) * in_val)
    #cdef double[:] bwd = bwd0

    tmp = np.zeros(n_states, dtype=np.float)
    #cdef double[:] tmp = tmp0
    
    # Do transform to Log Space:
    t0 = np.log(t_mat)      

    #############################
    ### Do the Forward Algorithm

    post[0,:] = fwd0 # Add to first locus 0 Posterior
    for i in range(1, n_loci):  # Run forward recursion
        stay = np.log(t_mat[i, 1, 1] - t_mat[i, 1, 2])  # Do the log of the Stay term

        for k in range(1, n_states): # Calculate logsum of ROH states:
            trans_ll[k-1] = fwd0[k]
        f_l = logsumexp(trans_ll) # Logsum of ROH States

        # Do the 0 State:
        two_v[0] = fwd0[0] + t0[i, 0, 0]   # Staying in 0 State
        two_v[1] = f_l + t0[i, 1, 0]             # Going into 0 State
        tmp[0] = e_mat0[0, i] + logsumexp(two_v)

        ### Do the other states
        # Preprocessing:
        three_v[0] = fwd0[0] + t0[i, 0, 1]   # Coming from 0 State
        three_v[1] = f_l + t0[i, 1, 2]             # Coming from other ROH State

        for j in range(1, n_states):  # Do the final run over all states
            three_v[2] = fwd0[j] + stay
            tmp[j] = e_mat0[j, i] + logsumexp(three_v)

        ### Make tmp new fwd vec:
        fwd0 = tmp
        post[i,:] = fwd0  # Add to 0-State Posterior

    ### Get total log likelihood
    tot_ll = logsumexp(fwd0+bwd0)

    #############################
    ### Do the Backward Algorithm
    ## last0-State Posterior
    post[n_loci-1,:] = post[n_loci-1,:] + bwd0[:] - tot_ll

    for i in range(n_loci-1, 0, -1):  # Run backward recursion
        stay = np.log(t_mat[i, 1, 1] - t_mat[i, 1, 2])

        for k in range(1, n_states): # Calculate logsum of ROH states:
            trans_ll[k-1] = bwd0[k] + e_mat0[k, i]
        f_l = logsumexp(trans_ll) # Logsum of ROH States

        # Do the 0 State:
        two_v[0] = bwd0[0] + t0[i, 0, 0] + e_mat0[0, i]   # Staying in 0 State
        two_v[1] = f_l + t0[i, 0, 1]                         # Going into 0 State
        tmp[0] = logsumexp(two_v)

        ### Do the other states
        # Preprocessing:
        three_v[0] = e_mat0[0, i] + bwd0[0] + t0[i, 1, 0]
        three_v[1] = f_l + t0[i, 1, 2]    # Coming from other ROH State

        for j in range(1, n_states):  # Do the final run over all states
            three_v[2] = e_mat0[j, i] + bwd0[j] +  stay
            tmp[j] = logsumexp(three_v)  # Fill in the backward Probability

        ### Make tmp new bwd vec:
        bwd0 = tmp

        ### Finalize the 0 Posterior
        post[i-1,:] = post[i-1,:] + bwd0[:] - tot_ll

    print(f"Total Log likelihood: {tot_ll: .3f}")
    #print_memory_usage()   ## For MEMORY_BENCH

    if full==False:
        return post[:,:]  # For "fake" axis

    elif full==True:   # Return everything
        return post[:,:], fwd0, bwd0, tot_ll

####################################################
####################################################
### Additional Helper Functions

def print_memory_usage():
    """Print the current Memory Usage in mB"""
    process = psutil.Process(os.getpid())
    mb_usage = process.memory_info().rss / 1e6
    print(f"Memory Usage: {mb_usage} mB")

### Test HMM Module
First load emission and transition Matrix

In [4]:
ht1 = np.array([[0.01,0.99],[1,0],[0.01,0.99],[0.1,0.9]])
ht2 = np.array([[0.01,0.99],[1,0],[1,0],[0.5,0.5]])
ht3 = np.array([[0.01,0.99],[1,0],[1,0],[0.5,0.5]])
ht4 = np.array([[0.01,0.99],[1,0],[1,0],[0.5,0.5]])
hts = np.stack((ht1, ht2))
hts2 = np.stack((ht1,ht2,ht3,ht4))
p = np.array([0.9,0.8,0.7,0.5])
r_vec = np.ones(len(p))

In [5]:
e = load_emission_model(e_model="haploid_gl")
e_mat =e.give_emission_matrix(hts2, p)
np.shape(e_mat)

(5, 4)

In [6]:
t = load_transition_model(t_model="standard")
t.set_params(ibd_in = 0.0005, ibd_out = 0.001, ibd_jump = 0.05, recalculate=False)
t_mat = t.full_transition_matrix(r_vec,n=4)
np.shape(t_mat)

Reference Number: 4


(4, 3, 3)

In [9]:
post0 = fwd_bkwd(np.log(e_mat), t_mat, in_val = 1e-4, full=False)

Total Log likelihood: -13.187


In [12]:
np.sum(np.exp(post0), axis=1)

array([0.99999614, 0.99999654, 1.00000005, 1.        ])

In [10]:
np.shape(post0)

(4, 5)