# Class that implements Emission Matrix for IBD detection

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import socket as socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py
import allel
import itertools as it

# Identify the host so the project directory can be picked accordingly.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: only the cluster's compute nodes are supported.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM O2 Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # The Path on Harvard Cluster

# Work from the project root (in line with Atom default), then report
# where we ended up and how many CPUs are visible.
os.chdir(path)
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

compute-a-17-102.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.
/n/groups/reich/hringbauer/git/hapBLOCK
CPU Count: 32


In [2]:
"""
Class for calculating Emission Probabilities.
Contains Sub-Classes, as well as factory Method.
@ Author: Harald Ringbauer, 2019, All rights reserved
"""

###############################
###############################

class Emissions(object):
    """Base class for emission probabilities.

    The give_emission_* methods here only raise NotImplementedError;
    concrete subclasses are expected to override them."""

    def give_emission_matrix(self, remember=False):
        """Return Emission Matrix - for every possible set of states"""
        raise NotImplementedError("Implement This in specific subclass.")

    def give_emission_state(self, ob_stat):
        """Gives the emission matrix of path of states"""
        raise NotImplementedError("Implement This in specific subclass.")

    # The default used to be ``np.float``; that alias of the builtin ``float``
    # was removed from NumPy, which made the class definition itself fail.
    # The builtin is the exact same type, so callers see no difference.
    def give_emission_log(self, ob_stat, dtype=float):
        """Return the full emission Probability directly in Log Space.
        ob_stat: Observed Readcounts [2,l] array of 0/1
        dtype: Float type of the returned values (default: builtin float)"""
        raise NotImplementedError("Implement This in specific subclass.")

    def set_params(self, **kwargs):
        """Set the Parameters.
        Takes keyworded arguments; each key becomes an attribute of the
        instance holding the given value."""
        for key, value in kwargs.items():
            setattr(self, key, value)

In [5]:
### Input: Two genotype probability arrays: in format [lx3x2]
### Output: Emission matrix of form [lx5] 1 Background state. 4 copying states

In [10]:
### Hardy-Weinberg probabilities of the three genotype states per locus [3xl]
t0 = np.stack([(1 - p) ** 2, 2 * (1 - p) * p, p ** 2], axis=0)
#m0 = t0 . gt1

In [28]:
### Emission probability of each of the two individuals:
### genotype probabilities weighted by t0 and summed over the genotype axis
b1, b2 = (np.sum(gts[ind, :, :] * t0, axis=0) for ind in (0, 1))
### Joint emission probability: product of the two
b = b1 * b2

In [30]:
np.shape(t0)

(3, 4)

### Background State

In [5]:
def hw_background_state_2gls(gls, p):
    """Emission probability of the background (non-sharing) state.
    gls: [2,3,l] genotype probabilities. 2: two individuals, 3: three genotypes, l: loci
    p: [l] derived allele frequency
    returns [l] vector: product of both individuals' Hardy-Weinberg weighted probabilities"""
    q = 1 - p
    # Hardy-Weinberg probabilities of the three hidden genotypes [3xl]
    hw = np.stack((q**2, 2*q*p, p**2))
    # Per individual: weight each genotype probability by its HW prior, sum over genotypes
    per_ind = [np.sum(gls[ind, :, :] * hw, axis=0) for ind in (0, 1)]
    # The two individuals are treated as independent, so the probabilities multiply
    return per_ind[0] * per_ind[1]

In [103]:
### Calculate the Hardy-Weinberg probabilities

def hw_prob_haplo_pp(ht_p, p):
    """Hardy-Weinberg probability of a single haplotype.
    ht_p: [l,2] Array of haplotype likelihoods, l loci, 2 allelic states.
    Column 0 is the ancestral, column 1 the derived probability.
    p: [l] Array of (derived) allele frequencies
    returns [l] vector of HW probabilities"""
    anc, der = ht_p[:,0], ht_p[:,1]
    # Derived state weighted by p, ancestral state by (1-p)
    return der * p + anc * (1-p)

def hw_prob_haplo_share_pp(ht_p1, ht_p2, p):
    """Probability of two haplotypes under the shared-haplotype state.
    ht_p1, ht_p2: [l,2] Arrays of haplotype likelihoods, l loci, 2 allelic states.
    p: [l] Array of (derived) allele frequencies
    returns [l] vector of probabilities that the haplotype is shared"""
    # Per-locus frequency of each allelic state, columns: (ancestral, derived) [lx2]
    allele_freq = np.stack((1-p, p), axis=1)
    # Both haplotypes carry the same state, drawn once from the allele frequency
    joint = ht_p1 * ht_p2 * allele_freq
    return np.sum(joint, axis=1)

def hw_prob_haplos_pp(hts_p, p):
    """Joint Hardy-Weinberg probability of several haplotypes.
    hts_p: [k,l,2] Array of haplotype likelihoods, k haplotypes, l loci, 2 allelic states.
    Last axis: index 0 is the ancestral, index 1 the derived probability.
    p: [l] Array of (derived) allele frequencies
    returns [l] vector: per-locus product of the k haplotypes' HW probabilities"""
    # HW probability of every haplotype at every locus [kxl]
    per_haplo = hts_p[:,:,1] * p + hts_p[:,:,0] * (1-p)
    # Multiply across the haplotype axis
    return np.prod(per_haplo, axis=0)
    
def hw_probs_shared(hts_p, p, shared=(0,2), dtype="float"):
    """Give emission probabilities for one shared state.
    Assume 0/1 2/3 are diploid haplotypes, and (by default) 0/2 are shared
    hts_p: [4,l,2] Array of four haplotype probabilities
    p: [l] array of derived allele frequencies
    shared: sequence of length 2 giving the indices of the shared haplotypes
    dtype: accepted for signature compatibility; not used by this function
    returns: product of the HW probabilities of the two non-shared haplotypes
    (hw_prob_haplo_pp) and the sharing probability of the two shared ones
    (hw_prob_haplo_share_pp)"""
    not_shared = [i for i in range(4) if i not in shared]
    # Must be a logical `and`: bitwise `&` binds tighter than `==`, so the
    # former `a==2 & b==2` was parsed as the chain `a == (2 & b) == 2` and
    # let through e.g. a `shared` of length 3.
    assert len(not_shared) == 2 and len(shared) == 2
    p_hw1 = hw_prob_haplo_pp(hts_p[not_shared[0],:,:], p=p)
    p_hw2 = hw_prob_haplo_pp(hts_p[not_shared[1],:,:], p=p)
    p_shared = hw_prob_haplo_share_pp(hts_p[shared[0],:,:], hts_p[shared[1],:,:], p=p)
    # Returned directly instead of rebinding the allele-frequency argument `p`
    return p_hw1 * p_hw2 * p_shared
    
def give_emission_matrix(hts_p, p, dtype="float"):
    """Build the emission matrix of the 5-state HMM.
    Row 0: Hardy-Weinberg (background) state. Rows 1-4: haplotype copying states,
    pairing haplotype j in (0,1) with haplotype k+2 in (2,3) in the order
    (0,2), (0,3), (1,2), (1,3).
    Input: p: [l] Array of (derived) allele frequencies
    hts_p: [4,l,2] Array of four haplotype probabilities.
    dtype: dtype of the returned matrix
    Return: emission matrix [5,l]."""
    n_loci = np.shape(hts_p)[1]
    e_mat = np.zeros((5, n_loci), dtype=dtype)

    # Background state
    e_mat[0,:] = hw_prob_haplos_pp(hts_p, p=p)

    # Sharing states: state index 1..4 decodes into the pair (j, k)
    for state in range(1, 5):
        j, k = divmod(state - 1, 2)
        e_mat[state,:] = hw_probs_shared(hts_p, p=p, shared=[j, k+2])

    return e_mat

In [94]:
# Show the order in which the (j,k) haplotype index pairs are enumerated
for i, (j, k) in enumerate(it.product((0, 1), repeat=2)):
    print(i, (j, k), sep="\n")

0
(0, 0)
1
(0, 1)
2
(1, 0)
3
(1, 1)


In [95]:
hw_prob_haplos_pp(hts, p)

array([0.81, 0.04, 0.21, 0.25])

In [97]:
hw_prob_haplo_share_pp(hts[0,:,:], hts[1,:,:],p=p)

array([0.9 , 0.2 , 0.  , 0.25])

In [98]:
hw_probs_shared(hts2,p=p)

array([0.729 , 0.008 , 0.    , 0.0625])

In [100]:
hw_probs_shared(hts2, p=p, shared=(1,2))

array([0.729 , 0.008 , 0.063 , 0.0625])

In [104]:
# NOTE(review): ht2 is a single [l,2] haplotype array, while give_emission_matrix
# documents hts_p as a [4,l,2] stack (e.g. hts2) - presumably the cause of the
# IndexError recorded below; confirm and pass the stacked array instead.
give_emission_matrix(ht2, p)

IndexError: too many indices for array

### IBD State
w.l.o.g. assume first allele shared, second allele not shared

In [11]:
### Input:
# gt1, gt2: Genotype vectors: [l,2] 0: anc, 1: derived allele
# pl1, pl2: genotype probabilities: [l]
# p: allele frequencies: [l]

In [13]:
### Probs for alternative (non-shared) alleles: second allele of each individual,
### weighted by p where the allele value is 1 and by (1-p) where it is 0
b1 = gt1[:,1] * p + (1-gt1[:,1]) * (1-p)
b2 = gt2[:,1] * p + (1-gt2[:,1]) * (1-p)

### Prob identical (first allele of both individuals)
# Parenthesised instead of a backslash continuation: a `\` followed by
# spaces and a comment is a SyntaxError.
# NOTE(review): b1/b2 weight an allele value of 1 by p, whereas the first term
# below weights 1*1 by (1-p) - confirm the allele coding is as intended.
b3 = (gt1[:,0] * gt2[:,0] * (1-p)           # prob homo anc
      + (1-gt1[:,0]) * (1-gt2[:,0]) * p)    # prob homo derived

b = b1 * b2 * b3

### Do the four sharing states

In [46]:
(2*0.9*0.1)**2

0.032400000000000005

### Test data

In [3]:
# Toy genotype indicators for two individuals; one row per locus before the
# transpose, so each array ends up as [3 genotypes x 4 loci]
gt1 = np.transpose(np.array([
    [0, 1, 0],
    [0, 0, 1],
    [1, 0, 0],
    [0, 1, 0],
]))
gt2 = np.transpose(np.array([
    [0, 1, 0],
    [0, 0, 1],
    [0, 0, 1],
    [0, 0, 1],
]))
gts = np.stack([gt1, gt2])  # [2 x 3 x 4]
p = np.array([0.9, 0.8, 0.7, 0.5])  # one allele frequency per locus

In [52]:
### Test data for biallelic case
# Each haplotype is [4 loci x 2], columns: (ancestral, derived) probability
ht1 = np.array([
    [0, 1],
    [1, 0],
    [0, 1],
    [0.1, 0.9],
])
ht2 = np.array([
    [0, 1],
    [1, 0],
    [1, 0],
    [0.5, 0.5],
])
# ht3 and ht4 hold the same values as ht2, as independent arrays
ht3 = ht2.copy()
ht4 = ht2.copy()
hts = np.stack([ht1, ht2])               # two haplotypes  [2 x 4 x 2]
hts2 = np.stack([ht1, ht2, ht3, ht4])    # four haplotypes [4 x 4 x 2]
p = np.array([0.9, 0.8, 0.7, 0.5])

In [53]:
hts2

array([[[0. , 1. ],
        [1. , 0. ],
        [0. , 1. ],
        [0.1, 0.9]],

       [[0. , 1. ],
        [1. , 0. ],
        [1. , 0. ],
        [0.5, 0.5]],

       [[0. , 1. ],
        [1. , 0. ],
        [1. , 0. ],
        [0.5, 0.5]],

       [[0. , 1. ],
        [1. , 0. ],
        [1. , 0. ],
        [0.5, 0.5]]])

In [34]:
a=np.stack((1-p,p), axis=1)

In [35]:
np.shape(a)

(4, 2)

In [42]:
gt1.T

array([[0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0]])

In [24]:
gts[1,:]

array([[0, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 1, 1]])