In [1]:
import os, sys
sys.path.insert(0, 'structural_variants/lib/')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import sympy
import random
# MATLAB
# import matlab.engine
# import matlab
# print(matlab.__file__)
# eng = matlab.engine.start_matlab()

In [2]:
def generate_haploid_data(params):
    '''
    Generate simulated data for a one parent, one child Structural Variant analysis
    Args: A dictionary containing the following parameters as keys
        n: size of data vectors (signals)
        k: total number of structural variants
        pctNovel: percent of novel structural variants in [0,1] (biological reality- very small %)
        lambda_p, lambda_c: sequence coverage of child and parent, respectively
        erreps: error (>0) incurred by sequencing and mapping process
        r: dispersion parameter for Negative Binomial distribution
    
    Output: A dictionary containing the following data elements as keys
        A_c: (lambda_c - erreps) I_n, sparse diagonal nxn matrix. I_n is nxn identity matrix
        A_p: (lambda_p - erreps) I_n, sparse diagonal nxn matrix. I_n is nxn identity matrix
        s_p: nx1 random vector drawn from Negative binomial distribution (for parent)
        s_c: nx1 random vector drawn from Negative binomial distribution (for child)
        TODO: add mu and var
        for i in {P (parent), H (inherited), N (novel)}:
        z_i: nx1 indicator vector of homogeneous structural variants
        y_i: nx1 indicator vector of heterogeneous structural variants
        
    '''
    q = np.random.permutation(params['n'])
    #print(q)
    startVal = int(params['k']*params['pctNovel']); print(startVal)
    endVal = int(startVal +params['k']) ; print(endVal)

    f_p, f_c, f_h, f_n = np.zeros((params['n'],1), dtype=np.int8),np.zeros((params['n'],1), dtype=np.int8), np.zeros((params['n'],1), dtype=np.int8), np.zeros((params['n'],1), dtype=np.int8)
    f_p[q[: params['k']]], f_c[q[startVal:endVal]] = 1,1
    f_h[q[startVal:params['k']+1]], f_n[q[params['k']+1:endVal]] = 1,1
    
    
    d = {}
    for i, letter in enumerate(params['suffix']):
        d['A_%s'%letter]   = (params["lambda_%s"%letter] - params['erreps'])*sparse.eye(params['n'])
        d['mu_%s'%letter]  = np.matmul(d['A_%s'%letter].toarray(), eval('f_%s'%letter)) + params['erreps']
        d['var_%s'%letter] = d['mu_%s'%letter] +(1/params['r'])*(d['mu_%s'%letter]**2)
        d['s_%s'%letter]   = np.random.negative_binomial(d['mu_%s'%letter]/(d['var_%s'%letter]-d['mu_%s'%letter]),d['mu_%s'%letter]/d['var_%s'%letter])
    
    d['f_p'] = f_p; d['f_h'] = f_h; d['f_n'] = f_n; 
    return d

In [44]:
params = {
    'r': 1,
    'n': 10**2,
    'k': 10,
    'lambda_c': 4,
    'lambda_p': 8,
    'pctNovel': 0.15,
    'erreps'  : 1e-2,
    'suffix'  : ['p','c'],
    'pct_similarity': 0.6}

In [45]:
q = np.random.permutation(params['n'])
print(q)
startVal = int(params['k']*params['pctNovel']); print(startVal)
endVal = int(startVal +params['k']) ; print(endVal)
similarity = int(params['pct_similarity']* params['k']) # pct_similarity * number of SVs

f_p, f_p2, f_c, f_h, f_n = np.zeros((params['n'],1), dtype=np.int8), np.zeros((params['n'],1), dtype=np.int8),np.zeros((params['n'],1), dtype=np.int8), np.zeros((params['n'],1), dtype=np.int8), np.zeros((params['n'],1), dtype=np.int8)
f_p[q[:params['k']]] =1
f_p2[np.append(q[0:similarity], random.choices(q, k=params['k'] -similarity))]=1

[84 51 80  3  4 62  0 60  1 77 96 17 46  7 76  6 23 34 29 90 25 52  9 97
 83 15 22 11 38 19 73 69 43 10 71 66 44 92 20 87 70 37 54  5 13 18 48 61
 24 72 98 28 85 27 14 30 39 82 50 94 74  2 59 49 88 91 95 53 65 78 67 64
 12 45 35 81 32 86 56 89 75 99 40 63 58 16 55 33 68 31 26 42 93  8 47 79
 21 41 36 57]
1
11


In [46]:
q[:params['k']] #parent 1

array([84, 51, 80,  3,  4, 62,  0, 60,  1, 77])

In [47]:
np.append(q[0:similarity], random.choices(q, k=params['k'] -similarity)) # parent 2

array([84, 51, 80,  3,  4, 62, 98, 88, 84, 34])

In [48]:
f_p[q[:params['k']]] =1
f_p2[np.append(q[0:similarity], random.choices(q, k=params['k'] -similarity))]=1

In [49]:
np.transpose(f_p)

array([[1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int8)

In [50]:
np.transpose(f_p2)

array([[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int8)

In [60]:
for i in np.arange(f_p.shape[0]):
    if f_p[i]==1 and f_p2[i]==1: f_c[i]=1
    if (f_p[i]==1 or f_p2[i]==1) and np.random.random_sample() > 0.5: 
print(np.sum(f_c))

[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
[ True]
6
