In [1]:
import pandas as pd
import pystan
import numpy as np

In [2]:
data = pd.read_table("/Users/inti.pedroso/DATA/ASE/phaser/ERR883767/ERR883767.phaser_ase.gene.txt")
data.head()
data2 = data[data["totalCount"] > 20]

In [3]:
!cat /Users/inti.pedroso/DATA/ASE/mixture_BB_dirPrior.stan

data {
  int<lower=1> N; // total number of observations
  int<lower=1> K; // total number of mixture distributions
  real<lower=0,upper=1> mu[K]; // means of mixture distributions
  vector<lower=0>[K] lambda; // prior counts for mixture distributions
  int<lower=0> x[N]; // counts for one allele
  int<lower=0> n[N]; // total counts for unit
}
parameters {
  simplex[K] theta; // mixture proportions
  vector<lower=0.1>[K] M; // sum of all priors of individual components
} 
model {
  real alpha[K];
  real beta[K];
  vector[K] log_theta = log(theta); // cache log calculation
  // alpha and beta parameters for each distribution
  for (k in 1:K) {
    alpha[k] = mu[k]*M[k];
    beta[k] = (1-mu[k])*M[k];
  }
  // priors for allocations proportions. This are fixed
  theta ~ dirichlet(lambda);
  // priors for each component 
  M ~ pareto(0.1, 1.5);
  // likelihood 
  for (i in 1:N) {
    vector[K] lps = log_theta;
    for (k in 1:K)
      lps[k] += beta_binomial_lp

In [4]:

sm_BB = pystan.StanModel(file="/Users/inti.pedroso/DATA/ASE/mixture_BB.stan")
sm_BB_dirPrior = pystan.StanModel(file="/Users/inti.pedroso/DATA/ASE/mixture_BB_dirPrior.stan")



INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ca28b97c632e4821dd758ef58b56d581 NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_8e974f6889c1edbaaaea95457fcd5445 NOW.


In [6]:
def get_mus(K=3, center=0.5, denom_val = 2):
    mus = np.zeros(K)
    center_idx = int(0.5*(K-1))
    mus[ center_idx ] = 0.5
    denominator = np.sqrt(denom_val)
    for i in range(int(K - 0.5*(K-1)),K):
      mus[i] = mus[i-1]/denominator;
      mus[K - i - 1] = 1 - mus[i-1]/denominator;
        
    print(mus)

def get_mus_log(K=3, center=0.5):
    start = np.log(1)
    end=np.log(0.5)
    step= (end - start)/((K-1)/2)
    mus = np.sort(
        np.hstack(
            [np.exp(np.arange(start,end,step)),
            0.5,
            1 - np.exp(np.arange(start,end,step))]
        )
    )

    mus[mus == 0] = 0.01
    mus[mus == 1] = 0.99
    return mus

def get_prior_counts(K=3):
    pc = np.ones(K)
    pc[int(0.5*(K-1))] = 10
    return pc

get_mus_log(K=7)


array([0.01      , 0.20629947, 0.37003948, 0.5       , 0.62996052,
       0.79370053, 0.99      ])

In [7]:
K=7
dat2stan = {'N': data2.shape[0],
            'K':K,
            'mu': get_mus_log(K=K),
            'lambda': get_prior_counts(K=K),
            'x': data2.aCount.values,
            'n': data2.totalCount.values
}
dat2stan


{'N': 2423,
 'K': 7,
 'mu': array([0.01      , 0.20629947, 0.37003948, 0.5       , 0.62996052,
        0.79370053, 0.99      ]),
 'lambda': array([ 1.,  1.,  1., 10.,  1.,  1.,  1.]),
 'x': array([ 42,  11,  23, ..., 197, 327, 114]),
 'n': array([ 88,  22,  38, ..., 236, 455, 133])}

In [None]:
time fit = sm_BB.sampling(data=dat2stan, iter=2000, chains=2, n_jobs=2)

  elif np.issubdtype(np.asarray(v).dtype, float):


In [None]:
print(fit.stansummary())

In [9]:
from scipy.special import gammaln

def log_dbetabinomial(x,n,a,b):
    return gammaln(n+1) + gammaln(x+a) + gammaln(n-x+b) + gammaln(a+b) - \
        (gammaln(x+1) + gammaln(n-x+1) + gammaln(a) + gammaln(b) + gammaln(n+a+b))

log_dbetabinomial(10,50,25,25)

-7.336178530676079

In [18]:
data2 = data[data["totalCount"] > 100]
data2.shape

(489, 12)

In [19]:
data2.head()

Unnamed: 0,contig,start,stop,name,aCount,bCount,totalCount,log2_aFC,n_variants,variants,gw_phased,bam
47,B07,895453,900798,LOC100651524,90,73,163,0.302029,1,B07_896066_T_C,0,ERR883767
52,B07,936109,938120,LOC100652075,80,57,137,0.489038,1,B07_938003_T_A,0,ERR883767
61,B07,1008279,1010117,LOC100644439,68,68,136,0.0,1,B07_1008987_C_T,0,ERR883767
116,B07,1627689,1645135,LOC100650116,97,88,185,0.140481,1,B07_1642199_C_T,0,ERR883767
152,B07,2104707,2106305,LOC100644293,224,218,442,0.039171,1,B07_2105536_A_G,0,ERR883767


In [22]:
mult = get_mus_log(K=5)
pr_ct = get_prior_counts(K=5)
pr_ct


array([ 1.,  1., 10.,  1.,  1.])

In [39]:
class DirMult:
    def __init__(self,K=3,prior_counts = None):
        if prior_counts is not None:
            self.prior_counts = prior_counts
        else: 
            self.prior_counts = np.ones(K)
        
        self.alpha = self.prior_counts

    
    def update(self,counts = None):
        self.alpha += counts
            
d = dirich(prior_counts=pr_ct)
print(d.prior_counts)
d.update(pr_ct)
print(d.alpha)



[ 2.  2. 20.  2.  2.]
[ 4.  4. 40.  4.  4.]


In [None]:
for i in xr

In [42]:
def prob_1_beats_2(alpha_1, beta_1, alpha_2, beta_2):
    total = 0.0
    for k in range(0,(alpha_1-1)):
        total += exp(k * log(beta_1) + alpha_2 * logbeta_2) \
                 - (k+alpha_2) * log(beta_1 + beta_2) \
                 - log(k + alpha_2) - lbeta(k + 1, alpha_2)

    return total

prob_1_beats_2(10,50,25,25)

NameError: name 'exp' is not defined