# This is a first test Notebook to calculate Probabilities for Read Count Data
@ Author: Harald Ringbauer, July 2019

In [78]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket
from scipy.stats import binom

### Select the repository root for the machine this runs on (cluster node or home machine)
hostname = socket.gethostname()
if hostname in ("midway2-0401.rcc.local", "midway2-0402.rcc.local"):
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # Repository root on the Midway cluster
elif hostname == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # Repository root on Harald's machine
else:
    # Unknown host: stop here rather than continue with an undefined path
    raise RuntimeWarning("Not compatible machine. Check!!")

# Work from the repository root so that relative paths resolve the same way everywhere
os.chdir(path)

# Project modules live in ./Python3/ relative to the repository root,
# so this import has to stay below the sys.path change.
sys.path.append("./Python3/")
from hmm_inference import HMM_Analyze
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd())  # Current working directory; after the chdir above this is the repository root
print(f"CPU Count: {mp.cpu_count()}")

Midway jnovmbre partition detected.
/project/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


## Description of task:
Need to implement two methods of an object.

The object has as fields:
- ref_mat [n_ref, n_loc]
- p [Allele Frequency in the Ref]
- e_rate [Error Rate]

def give_emission_matrix(self, remember=True):
    return emission_matrix (n_ref+1, n_loc, 2)    Which is the general Emission matrix for all possible observed states
    
def give_emission_state(self, ob_stat, e_mat):  Which is Emission Matrix of the OBSERVED states
    return e_prob (n_ref+1, n_loc)

Both are returned NOT in log space (i.e. as plain probabilities).

# Solution Here:
- I: give_emission_matrix: Calculate the emission probabilities for all 3 possible Genotypes 
- II: give_emission_state: Do the Binomial Step

### Calculate Emission Matrix
(for read of 0 OR 1, i.e. Ref OR ALT)

In [183]:
### Step 1: Parameters for the genotype probabilities
n_ref, n_loci = 3, 5   # Number of reference haplotypes and number of loci
e_rate = 0.01          # Error rate of the reads
e_rate_ref = 0.001     # Error rate of the reference genome (avoids trouble at high-coverage SNPs)

In [184]:
#p = []   # Vector of mean allele frequencies in Reference
#ref_haps = []

In [191]:
# Probabilities of the three hidden genotypes (00, 01, 11) for every state and every locus.
# Axis 0: state (0 = Hardy-Weinberg state, 1.. = copying one reference haplotype),
# axis 1: locus, axis 2: genotype.
# Initialised to -1 so that any entry left unset trips the sanity checks below.
p_hgeno = np.full((n_ref + 1, n_loci, 3), -1.0)

### State 0: Hardy-Weinberg proportions from the reference allele frequency p
p_hgeno[0] = np.stack([(1 - p) ** 2, 2 * p * (1 - p), p ** 2], axis=-1)

### States 1..n_ref: copy the reference haplotype, allowing a small error in the reference
half_err = e_rate_ref / 2        # Probability of each of the two non-matching genotypes
is_anc = ref_haps == 0           # Reference haplotype carries allele 0
is_der = ref_haps == 1           # Reference haplotype carries allele 1
p_hgeno[1:, :, 0] = is_anc * (1 - e_rate_ref) + is_der * half_err
p_hgeno[1:, :, 1] = half_err
p_hgeno[1:, :, 2] = is_der * (1 - e_rate_ref) + is_anc * half_err


### Alternative way to allow a small error in the reference genome (not used):
#p_hgeno[p_hgeno==1] = 1 - e_rate_ref
#p_hgeno[p_hgeno==0] = e_rate_ref

# Sanity checks: genotype probabilities sum to (approx.) 1 and all lie within [0, 1]
assert np.allclose(p_hgeno.sum(axis=2), 1)
assert p_hgeno.min() >= 0 and p_hgeno.max() <= 1

p_hgeno[1, 4]

array([5.00e-04, 5.00e-04, 9.99e-01])

### Give Emission Matrix Given State
(Binomial Calculation from p per locus)

In [192]:
### Probability that a single read shows the derived allele,
### for each hidden genotype in the order 00, 01, 11
p_read = np.array([e_rate, 0.5, 1 - e_rate])

### Binomial likelihood of the read count data at every locus under each genotype
rc_der = ob_stat[1]             # Derived read count per locus
rc_tot = ob_stat.sum(axis=0)    # Total read count per locus

# Loci run along the rows and genotypes along the columns of the result
prob_binom = binom.pmf(rc_der[:, np.newaxis], rc_tot[:, np.newaxis], p_read[np.newaxis, :])
print(prob_binom.shape)

(5, 3)


In [193]:
### Sum over the hidden genotypes to get the full emission probabilities
p_full = (p_hgeno * prob_binom[np.newaxis]).sum(axis=-1)

In [194]:
# Show the full emission probabilities (one row per state, one column per locus)
p_full

array([[6.70264014e-02, 1.38899670e-01, 5.46722222e-01, 6.63333333e-01,
        2.33222222e-01],
       [5.70492186e-02, 1.75646277e-04, 9.79244950e-01, 1.07350000e-02,
        2.00401000e-02],
       [7.54052949e-05, 1.75646277e-04, 7.14950000e-04, 9.89265000e-01,
        2.00401000e-02],
       [5.70492186e-02, 1.75646277e-04, 9.79244950e-01, 9.89265000e-01,
        2.00401000e-02]])

# Area 51

### Test Case
Test the code on mini example and verify calculations

### Create test data:
- Read counts (RC) for l=5 loci, as an array of shape (2, l)
- 3 reference haplotypes, as an array of shape
(n, l) = (3, 5)

In [188]:
# Read counts for the test case: one pair per locus, transposed so that
# each column is a locus and row 1 holds the derived read counts.
ob_stat = np.transpose(np.array([(1, 5), (3, 3), (0, 2), (1, 0), (1, 1)]))
ob_stat

array([[1, 3, 0, 1, 1],
       [5, 3, 2, 0, 1]])

In [189]:
# Reference haplotypes for the test case: three haplotypes (rows) over five loci (columns)
ref_haps = np.array(
    [
        [1] * 5,
        [0] * 5,
        [1, 1, 1, 0, 0],
    ]
)
ref_haps

array([[1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0]])

In [190]:
# Allele frequency per locus: mean over the reference haplotypes
p = ref_haps.mean(axis=0)
p

array([0.66666667, 0.66666667, 0.66666667, 0.33333333, 0.33333333])

In [90]:
# Scratch check that binom.pmf works element-wise: three counts, each out of
# 3 trials, paired with three success probabilities.
# NOTE: this rebinds `p`, the name that also holds the reference allele frequencies.
x = np.array((0, 0, 3))
p = np.array((0.1, 0.5, 0.9))
binom.pmf(x, 3, p)

array([0.729, 0.125, 0.729])

In [96]:
# Show the per-genotype probabilities of a derived read.
# NOTE(review): the recorded output below corresponds to e_rate = 1e-8, not the
# e_rate = 1e-2 assigned in this notebook; presumably it stems from an earlier run.
p_read

array([1.0000000e-08, 5.0000000e-01, 9.9999999e-01])

In [109]:
# Probability of 1 derived read out of 6 under the heterozygous genotype (p_read[1] is 0.5)
binom.pmf(1, 6, p_read[1])

0.09375000000000003

In [101]:
# Show the total read count per locus (sum over both rows of ob_stat)
rc_tot

array([6, 6, 2, 1, 2])

In [158]:
# Manual check of the Hardy-Weinberg state at the first locus: 5 derived reads
# out of 6, allele frequency 2/3. Each binomial likelihood is weighted by the
# Hardy-Weinberg probability of its genotype (00, 01, 11).
# Exact fractions replace the earlier hand-truncated decimals, so that with
# e_rate = 1e-2 the result reproduces p_full[0, 0] (about 0.0670264).
# NOTE(review): the recorded output below (about 0.0416665) presumably stems
# from a run with a much smaller e_rate.
(
    binom.pmf(5, 6, e_rate) * (1 / 3) ** 2
    + binom.pmf(5, 6, 0.5) * 2 * (1 / 3) * (2 / 3)
    + binom.pmf(5, 6, 1 - e_rate) * (2 / 3) ** 2
)

0.04166648499791815

In [165]:
# Manual check of one copying-state entry: 2 derived reads out of 2, with
# genotype weights e_rate_ref/2 (00), e_rate_ref/2 (01) and 1 - e_rate_ref (11),
# i.e. the copied reference haplotype carries allele 1.
# NOTE(review): with e_rate = 1e-2 and e_rate_ref = 1e-3 this evaluates to about
# 0.979245, matching p_full[1, 2]; the recorded output below presumably stems
# from a run with different error rates.
binom.pmf(2, 2, e_rate) * e_rate_ref/2 + binom.pmf(2, 2, 0.5) * e_rate_ref/2 + binom.pmf(2, 2, 1 - e_rate) * (1 - e_rate_ref)

0.999912480002