# Class that implements Emission Matrix for IBD detection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import socket as socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py
import allel

# Identify the machine we are running on; the project path is only known
# for the cluster's compute nodes.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: bail out early on any host that is not a compute node.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM O2 Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # The Path on Harvard Cluster

# Work from the repository root so that relative paths resolve
# (in line with Atom default).
os.chdir(path)

print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

compute-e-16-229.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.
/n/groups/reich/hringbauer/git/hapBLOCK
CPU Count: 28


In [2]:
"""
Class for calculating Emission Probabilities.
Contains Sub-Classes, as well as factory Method.
@ Author: Harald Ringbauer, 2019, All rights reserved
"""

###############################
###############################

class Emissions(object):
    """Base class for emission probabilities.

    Defines the interface for returning emission probabilities. The three
    ``give_*`` methods raise NotImplementedError here and have to be
    implemented by a specific subclass."""

    def give_emission_matrix(self, remember=False):
        """Return Emission Matrix - for every possible set of states.

        remember: flag that is not used by this base class; its meaning is
        defined by the implementing subclass.
        Raises NotImplementedError unless overridden."""
        raise NotImplementedError("Implement This in specific subclass.")

    def give_emission_state(self, ob_stat):
        """Give the emission matrix for a path of states.

        ob_stat: observations passed on to the subclass implementation.
        Raises NotImplementedError unless overridden."""
        raise NotImplementedError("Implement This in specific subclass.")

    # NOTE: the default used to be `np.float`. That name was only an alias of
    # the builtin `float` and no longer exists in recent NumPy releases, where
    # merely evaluating it (i.e. defining this class) raises AttributeError.
    def give_emission_log(self, ob_stat, dtype=float):
        """Return the full emission Probability directly in Log Space.

        ob_stat: Observed Readcounts [2,l] array of 0/1
        dtype: floating point type requested for the result (default: float).
        Raises NotImplementedError unless overridden."""
        raise NotImplementedError("Implement This in specific subclass.")

    def set_params(self, **kwargs):
        """Set the Parameters.

        Takes keyworded arguments; every key becomes an attribute of this
        object holding the given value (existing attributes are overwritten)."""
        for key, value in kwargs.items():
            setattr(self, key, value)

In [5]:
### Input: Two genotype probability arrays: in format [lx3x2]
### Output: Emission matrix of form [lx5] 1 Background state. 4 copying states

In [10]:
### Hardy-Weinberg genotype probabilities per locus [3xl]
### (rows built from allele frequency p: (1-p)^2, 2(1-p)p, p^2)
t0 = np.stack([
    (1 - p)**2,
    2 * (1 - p) * p,
    p**2,
])
#m0 = t0 . gt1

In [28]:
### Emission probability of individual 1: genotype probabilities weighted
### by t0 and summed over the genotype axis
b1 = (gts[0] * t0).sum(axis=0)
### Emission probability of individual 2 (same computation)
b2 = (gts[1] * t0).sum(axis=0)
### Joint emission: product of the two per-individual probabilities
b = b1 * b2

In [30]:
t0.shape  # dimensions of the Hardy-Weinberg table

(3, 4)

### Background State

In [34]:
def hw_background_state(gls, p):
    """Emission probability for the background state.

    gls: 2x3xl probabilities of genotypes. 2: two inds, 3: three genotypes, l: loci
    p: Derived allele frequency [l]
    Returns the per-locus emission probability [l]."""
    # Hardy-Weinberg probabilities of the hidden genotypes [3xl]
    hw = np.stack([(1 - p)**2, 2 * (1 - p) * p, p**2])

    # Per individual: weight the genotype probabilities with the
    # Hardy-Weinberg prior and sum over the three genotypes.
    per_ind = [(gls[i, :, :] * hw).sum(axis=0) for i in (0, 1)]

    # The two individuals are treated as independent, so probabilities multiply.
    return per_ind[0] * per_ind[1]

In [45]:
hw_background_state(gls=gts, p=p)  # background emissions for the test data

array([0.0324, 0.4096, 0.0441, 0.125 ])

### IBD State
w.l.o.g. assume first allele shared, second allele not shared

In [11]:
### Input:
# gt1, gt2: Genotype vectors: [l,2] 0: anc, 1: derived allele
# pl1, pl2: genotype probabilities: [l]
# p: allele frequencies: [l]

In [13]:
### Probs for alternative (non-shared, second) alleles:
### a derived allele (1) has probability p, an ancestral allele (0) has 1-p
b1 = gt1[:, 1] * p + (1 - gt1[:, 1]) * (1 - p)
b2 = gt2[:, 1] * p + (1 - gt2[:, 1]) * (1 - p)

### Prob identical: the first allele is the same in both individuals.
### Parentheses are used for the line continuation, because a comment after a
### backslash continuation is a syntax error. The weights follow the same
### encoding as b1/b2 above: derived alleles carry p, ancestral ones 1-p.
b3 = (
    gt1[:, 0] * gt2[:, 0] * p  # prob both derived
    + (1 - gt1[:, 0]) * (1 - gt2[:, 0]) * (1 - p)  # prob both ancestral
)

b = b1 * b2 * b3

### Do the four sharing states

In [46]:
pow(2 * 0.9 * 0.1, 2)  # hand check: squared value of 2*p*(1-p) at p = 0.9

0.032400000000000005

### Test data

In [3]:
# Genotype probabilities of individual 1; written one locus per row and
# transposed to [3 genotypes x 4 loci]
gt1 = np.transpose(np.array([
    [0, 1, 0],
    [0, 0, 1],
    [1, 0, 0],
    [0, 1, 0],
]))
# Genotype probabilities of individual 2, same layout
gt2 = np.transpose(np.array([
    [0, 1, 0],
    [0, 0, 1],
    [0, 0, 1],
    [0, 0, 1],
]))
# Both individuals stacked along a new leading axis: [2 x 3 x 4]
gts = np.stack([gt1, gt2], axis=0)
# One allele frequency per locus
p = np.array([0.9, 0.8, 0.7, 0.5])

In [9]:
# Genotype vectors of individual 1: one row per locus, one column per allele
gt1 = np.array([
    [0, 1],
    [0, 1],
    [1, 0],
    [0, 1],
])
# Genotype vectors of individual 2, same layout
gt2 = np.array([
    [0, 1],
    [0, 1],
    [0, 0],
    [0, 0],
])
# Both individuals stacked along a new leading axis: [2 x 4 x 2]
gts_h = np.stack([gt1, gt2], axis=0)
# One allele frequency per locus
p = np.array([0.9, 0.8, 0.7, 0.5])

In [10]:
gt1.shape  # dimensions of the first individual's genotype array

(4, 2)

In [40]:
pow(0.3, 2) * pow(0.7, 2)  # hand check: (1-p)^2 * p^2 at p = 0.7

0.04409999999999999

In [41]:
gts.shape  # dimensions of the stacked genotype array

(2, 3, 4)

In [42]:
np.transpose(gt1)  # first individual's array with its axes swapped

array([[0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0]])

In [24]:
gts[1]  # entries of the second individual

array([[0, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 1, 1]])