In [4]:
import os, sys
sys.path.insert(0, 'structural_variants/lib/')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import sympy

# MATLAB
import matlab.engine
import matlab
print(matlab.__file__)
eng = matlab.engine.start_matlab()

/opt/anaconda3/lib/python3.9/site-packages/matlab/__init__.py


In [16]:
def generate_data(params):
    '''
    Generate simulated data for a one parent, one child Structural Variant analysis
    Args:
        n: size of data vectors (signals)
        k: total number of structural variants
        pctNovel: percent of novel structural variants in [0,1] (biological reality- very small %)
        lambda_p, lambda_c: sequence coverage of child and parent, respectively
        erreps: error (>0) incurred by sequencing and mapping process
        r: dispersion parameter for Negative Binomial distribution
    
    Output:
        A_c: (lambda_c - erreps) I_n, sparse diagonal nxn matrix. I_n is nxn identity matrix
        A_p: (lambda_p - erreps) I_n, sparse diagonal nxn matrix. I_n is nxn identity matrix
        s_p: nx1 random vector drawn from Negative binomial distribution (for parent)
        s_c: nx1 random vector drawn from Negative binomial distribution (for child)
        for i in {P (parent), H (inherited), N (novel)}:
        z_i: nx1 indicator vector of homogeneous structural variants
        y_i: nx1 indicator vector of heterogeneous structural variants
        
    '''

    # params = {
    #     'r': r,
    #     'n': n,
    #     'k': k,
    #     'lambda_c': lambda_c,
    #     'lambda_p': lambda_p,
    #     'pctNovel': pctNovel,
    #     'erreps'  : erreps,
    #     'suffix'  : ['p','c'], }
    q = np.random.permutation(params['n'])
    print(q)
    startVal = int(params['k']*params['pctNovel'])
    endVal = int(startVal +params['k'])

    f_p = f_c = f_h = f_n = np.zeros((params['n'],1), dtype=np.int8) 
    f_p[q[: params['k']]], f_c[q[startVal:endVal]] = 1,1
    print(q[: params['k']])
    f_h[q[startVal:params['k']]], f_n[q[params['n']+1:endVal]] = 1,1
    
    d = {}
    for i, letter in enumerate(params['suffix']):
        d['A_%s'%letter]   = (params["lambda_%s"%letter] - params['erreps'])*sparse.eye(params['n'])
        d['mu_%s'%letter]  = np.matmul(d['A_%s'%letter].toarray(), eval('f_%s'%letter)) + params['erreps']
        d['var_%s'%letter] = d['mu_%s'%letter] +(1/params['r'])*(d['mu_%s'%letter]**2)
        d['y_%s'%letter]   = np.random.negative_binomial(d['mu_%s'%letter]/(d['var_%s'%letter]-d['mu_%s'%letter]),d['mu_%s'%letter]/d['var_%s'%letter])
    
    d['f_p'] = f_p; d['f_h'] = f_h; d['f_n'] = f_n; 
    return d

In [6]:
params = {
    'r': 1,
    'n': 10**2,
    'k': 17,
    'lambda_c': 4,
    'lambda_p': 8,
    'pctNovel': 0.15,
    'erreps'  : 1e-2,
    'suffix'  : ['p','c'] }

In [17]:
data = generate_data(params)

[53 26 44 40 25 41  7 42 39 24 98  1 64 33 80 58 61 70 37 14 85 22 56 23
 28  9  2 68 90 21 48 69 38 84 27 79 20 67 97 10 46 72  8 13 65 45 86 31
 83 19 89 34 17  0 93 81 60 29 30 36 77 16 15 49 87 52 82 62 92 35  5  4
 78 71 96 99 66 51  6 12  3 43 95 32 18 54 11 91 55 76 59 50 47 63 57 88
 94 75 73 74]
[53 26 44 40 25 41  7 42 39 24 98  1 64 33 80 58 61]


In [8]:
data.keys()

dict_keys(['A_p', 'mu_p', 'var_p', 'y_p', 'A_c', 'mu_c', 'var_c', 'y_c', 'f_p', 'f_h', 'f_n'])

In [9]:
data['A_p'].toarray()

array([[7.99, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 7.99, 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 7.99, ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 7.99, 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 7.99, 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 7.99]])

In [18]:
data['f_p']

array([[0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
    

In [None]:
np.matmul(data['A_p'].toarray(), data['f_p']) + params['erreps']

In [None]:
q = np.random.permutation(n)
startVal = int(k*pctNovel)
endVal = int(startVal +k)

f_p = f_c = f_h = f_n = np.zeros((n,1), dtype=np.int8) 
f_p[q[:k]], f_c[q[startVal:endVal]] = 1,1
f_h[q[startVal:k]], f_n[q[k+1:endVal]] = 1,1

In [None]:
q = np.random.permutation(n); q

In [None]:
f_p = np.zeros((n,1))
f_p[q[:k]] = 1

In [None]:
q[:k]

In [None]:
np.transpose(f_p)
#print(np.sum(f_p))

In [None]:
startVal = int(k*pctNovel); print(startVal)
endVal = int(startVal +k); print(endVal) 

In [None]:
q[:k]

In [None]:
# if we do startVal +1, the child only gets k-1 SVs, and if we do startVal the child gets k SVs
q[startVal:endVal]

In [None]:
f_c = np.zeros((n,1))
f_c[q[startVal:endVal]] = 1

In [None]:
np.transpose(f_c)

In [None]:
# f_p and f_c have same number of nonzero entries
np.sum(f_p != 0) == np.sum(f_c != 0)

In [None]:
f_p = f_c = f_h = f_n = np.zeros((n,1), dtype=np.int8) #, np.zeros((n,1)), np.zeros((n,1)), np.zeros((n,1))
f_p[q[:k]], f_c[q[startVal:endVal]] = 1,1
np.transpose(f_c)

In [None]:
set(q[startVal:k]).issubset(q[:k])

In [None]:
f_h[q[startVal:k]], f_n[q[k+1:endVal]] = 1,1

In [None]:
np.transpose(f_n)

In [None]:
scipy.sparse.identity(n)

In [None]:
A_p, A_c = (lambda_p - erreps)*scipy.sparse.eye(n), (lambda_c - erreps)*scipy.sparse.eye(n)

In [None]:
mu_p = np.matmul(A_p.toarray(), f_p) + erreps

In [None]:
var_p = mu_p + (1/r)*(mu_p**2)

In [None]:
np.transpose(var_p)

In [None]:
np.random.negative_binomial(mu_p/(var_p-mu_p),mu_p/var_p)

In [None]:
params = {
    'r': 1,
    'n': 10**2,
    'k': 17,
    'lambda_c': 4,
    'lambda_p': 8,
    'pctNovel': 0.15,
    'erreps'  : 1e-2,
    'suffix'  : ['p','c'] }

In [None]:
d = {}
for i, letter in enumerate(params['suffix']):
    d['A_%s'%letter]   = (params["lambda_%s"%letter] - params['erreps'])*sparse.eye(params['n'])
    d['mu_%s'%letter]  = np.matmul(d['A_%s'%letter].toarray(), eval('f_%s'%letter)) + params['erreps']
    d['var_%s'%letter] = d['mu_%s'%letter] +(1/params['r'])*(d['mu_%s'%letter]**2)
    d['y_%s'%letter]   = np.random.negative_binomial(d['mu_%s'%letter]/(d['var_%s'%letter]-d['mu_%s'%letter]),d['mu_%s'%letter]/d['var_%s'%letter])

In [None]:
d.keys()