In [1]:
import os, sys
sys.path.insert(0, 'structural_variants/lib/')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import sympy
import random

# MATLAB
# import matlab.engine
# import matlab
# print(matlab.__file__)
# eng = matlab.engine.start_matlab()

In [6]:
def generate_haploid_data(params):
    '''
    Generate simulated data for a one parent, one child Structural Variant analysis
    Args: A dictionary containing the following parameters as keys
        n: size of data vectors (signals)
        k: total number of structural variants carried by each individual
        pctNovel: percent of novel structural variants in [0,1] (biological reality- very small %)
        lambda_p, lambda_c: sequence coverage of parent and child, respectively
        erreps: error (>0) incurred by sequencing and mapping process
        r: dispersion parameter for Negative Binomial distribution
        suffix: iterable of individual labels to simulate (e.g. ['p', 'c']); every
                label x needs a matching 'lambda_x' entry and an f_x signal below

    Output: A dictionary containing the following data elements as keys
        f_p: nx1 int8 indicator vector of the parent's structural variants
        f_h: nx1 int8 indicator vector of child variants inherited from the parent
        f_n: nx1 int8 indicator vector of novel child variants (absent in the parent)
        f_c: nx1 int8 indicator vector of all child variants, f_h + f_n
        and, for each label x in params['suffix']:
        A_x:   (lambda_x - erreps) I_n, sparse diagonal nxn matrix. I_n is nxn identity matrix
        mu_x:  nx1 mean vector, A_x f_x + erreps
        var_x: nx1 variance vector, mu_x + mu_x^2 / r
        s_x:   nx1 random vector drawn from a Negative binomial distribution with
               mean mu_x and variance var_x
        TODO: the homogeneous / heterogeneous indicators z_i, y_i are not generated yet

    Note: the two window offsets are printed, and all randomness comes from
    NumPy's global RNG (call np.random.seed beforehand for reproducible draws).
    '''
    n, k = params['n'], params['k']
    q = np.random.permutation(n)

    # The parent owns q[:k]; the child owns the window q[startVal:endVal], i.e. the
    # parent's window shifted right by the number of novel variants.
    startVal = int(k * params['pctNovel'])
    print(startVal)
    endVal = int(startVal + k)
    print(endVal)

    f_p = np.zeros((n, 1), dtype=np.int8)
    f_h = np.zeros((n, 1), dtype=np.int8)
    f_n = np.zeros((n, 1), dtype=np.int8)
    f_p[q[:k]] = 1
    # Python slices are 0-based and end-exclusive: the overlap of the two windows
    # is q[startVal:k] and the child-only part is q[k:endVal]. (Splitting at k+1
    # labelled a non-parent position as inherited and lost one novel variant.)
    f_h[q[startVal:k]] = 1
    f_n[q[k:endVal]] = 1

    d = {}
    d['f_p'] = f_p; d['f_h'] = f_h; d['f_n'] = f_n; d['f_c'] = f_h + f_n;
    for letter in params['suffix']:
        d['A_%s'%letter]   = (params["lambda_%s"%letter] - params['erreps'])*sparse.eye(n)
        # Sparse product: avoids materialising a dense n x n matrix.
        d['mu_%s'%letter]  = d['A_%s'%letter].dot(d['f_%s'%letter].astype(np.float64)) + params['erreps']
        d['var_%s'%letter] = d['mu_%s'%letter] +(1/params['r'])*(d['mu_%s'%letter]**2)
        # numpy's negative_binomial(n, p) has mean n(1-p)/p and variance n(1-p)/p^2.
        # Matching mean mu and variance mu + mu^2/r gives n = r and p = mu/var.
        d['s_%s'%letter]   = np.random.negative_binomial(params['r'], d['mu_%s'%letter]/d['var_%s'%letter])

    return d

In [7]:
# Simulation settings shared by generate_haploid_data and the exploratory cells.
params = dict(
    r=1,                 # Negative Binomial dispersion
    n=10**2,             # length of each signal vector
    k=10,                # number of structural variants
    lambda_c=4,          # child sequence coverage
    lambda_p=8,          # parent sequence coverage
    pctNovel=0.15,       # fraction of novel variants
    erreps=1e-2,         # sequencing / mapping error
    suffix=['p', 'c'],   # individuals to simulate
    pct_similarity=0.6,  # scales k to get `similarity` in the scratch cell
)

In [8]:
# Simulate one parent/child data set; the draws use NumPy's global RNG, so the
# result differs between runs unless a seed is set beforehand.
data = generate_haploid_data(params=params)

1
11


In [32]:
# Random ordering of the n candidate positions.
q = np.random.permutation(params['n'])
print(q)

# Window offsets into the permutation.
startVal = int(params['k'] * params['pctNovel'])
print(startVal)
endVal = int(startVal + params['k'])
print(endVal)
similarity = int(params['pct_similarity'] * params['k'])  # pct_similarity * number of SVs

# Signal names: the two extra signals plus f_/z_/y_ for p, h and n, in reverse
# alphabetical order.
signals = ['f_p2', 'f_c']
for letter in ('p', 'h', 'n'):
    signals.extend('%s_%s' % (prefix, letter) for prefix in ('f', 'z', 'y'))
signals = sorted(signals, reverse=True)
print(signals)

# Initialize signals: every name maps to an n x 1 int32 column of zeros.
d = {name: np.zeros((params['n'], 1), dtype=np.int32) for name in signals}

# # parent signals: 
# #        f_p  - k elements will be 1s and 2s randomly selected
# #        f_p2 - floor of %similarity*k elements will be the same as f_p and the rest will be random 1s and 2s
# for i in q[:params['k']]: f_p[i] = np.random.randint(1,3) 
# f_p2[q[0:similarity]] = f_p[q[0:similarity]]
# f_p2[random.choices(q, k=params['k'] -similarity)]= np.random.randint(1,3) 

# # child signal
# #     inherited
# for i in np.arange(f_p.shape[0]):
#     if f_p[i]==2 and f_p2[i]==2: f_c[i]=2
#     if (f_p[i]>=1 and f_p2[i]>=1):
#         if np.random.random_sample() >= 0.5: f_c[i]=2
#         else: f_c[i]=1
# #     novel        
# f_c[random.choices(q, k=params['k'] -similarity)]= np.random.randint(1,3) 

# d = {}
# d['f_p'] = f_p; d['f_h'] = f_h; d['f_n'] = f_n; d['f_c'] = f_h + f_n; 

[33 25 36  1 55 98 10  5 42 77  2 34 39 75 58 49 28 26 18 73 80 82  4 15
 29 95 40 17  0 99 47 67 86 69 27 70 88 79 53 48 60 37 13 19 66 96 61 46
 85 93 20  7 31 64 16 71 32 14 45  3  6 24 91 87  8 44 41 52 97 94 78 74
 90 72 23 35 51 54 81 43 76 63 83  9 68 30 89 84 21 92 62 38 57 65 22 11
 12 59 56 50]
1
11
['z_p', 'z_n', 'z_h', 'y_p', 'y_n', 'y_h', 'f_p2', 'f_p', 'f_n', 'f_h', 'f_c']


In [30]:
# Names of the z_/y_ indicator signals for p, h and n, in alphabetical order.
signals = sorted(
    '%s_%s' % (prefix, letter)
    for letter in ('p', 'h', 'n')
    for prefix in ('z', 'y')
)
print(signals)

['y_h', 'y_n', 'y_p', 'z_h', 'z_n', 'z_p']


In [None]:
# Number of variants left over after the `similarity` shared ones.
params['k'] - similarity

In [None]:
# Display the parent signal f_p transposed.
# NOTE(review): no active top-level cell visible here assigns f_p (its
# assignment in the In [32] cell is commented out) — presumably this relies on
# earlier kernel state; confirm it survives Restart & Run All.
np.transpose(f_p)

In [None]:
# Display the second parent signal f_p2 transposed.
# NOTE(review): no active top-level cell visible here assigns f_p2 (its
# assignment in the In [32] cell is commented out) — presumably this relies on
# earlier kernel state; confirm it survives Restart & Run All.
np.transpose(f_p2)

In [None]:
# Derive the child's inherited signal from the two parent signals.
#   - both parents carry value 2            -> child gets 2
#   - otherwise both parents carry >= 1     -> child gets 2 or 1 with equal probability
# The second test must be `elif`: as a separate `if` it also matched the
# "both == 2" case and overwrote the 2 just assigned with a coin flip.
# NOTE(review): f_p, f_p2 and f_c are not assigned by any active cell visible
# here — presumably array-like signals left over from earlier kernel state; confirm.
for i in np.arange(f_p.shape[0]):
    if f_p[i] == 2 and f_p2[i] == 2:
        f_c[i] = 2
    elif f_p[i] >= 1 and f_p2[i] >= 1:
        f_c[i] = 2 if np.random.random_sample() >= 0.5 else 1
# Show only the non-zero child entries.
print(f_c[np.nonzero(f_c > 0)])

In [None]:
# Print the (f_p, f_p2) pair at every position where both parent signals are >= 1.
for idx in range(f_p.shape[0]):
    first, second = f_p[idx], f_p2[idx]
    if first >= 1 and second >= 1:
        print(first, second)

In [None]:
# Novel child variants: set k - similarity distinct positions to independent
# random values in {1, 2}.
# Previously a single randint() scalar was broadcast to every chosen position,
# and random.choices sampled with replacement, so positions could repeat.
# NOTE(review): assumes f_c is a NumPy array indexed along its first axis — confirm.
n_novel = params['k'] - similarity
novel_idx = np.random.choice(q, size=n_novel, replace=False)
f_c[novel_idx] = np.random.randint(1, 3, size=(n_novel,) + f_c.shape[1:])

In [None]:
# Display the strictly positive entries of f_p.
# NOTE(review): f_p is not assigned by any active cell visible here —
# presumably left over from earlier kernel state; confirm.
f_p[np.nonzero(f_p > 0)]

In [None]:
# Display the strictly positive entries of f_p2.
# NOTE(review): f_p2 is not assigned by any active cell visible here —
# presumably left over from earlier kernel state; confirm.
f_p2[np.nonzero(f_p2 > 0)]

In [None]:
# Display the strictly positive entries of f_c.
# NOTE(review): f_c is not assigned by any active cell visible here —
# presumably left over from earlier kernel state; confirm.
f_c[np.nonzero(f_c > 0)]

In [None]:
# Display the child signal f_c transposed.
# NOTE(review): f_c is not assigned by any active cell visible here —
# presumably left over from earlier kernel state; confirm.
np.transpose(f_c)

In [None]:
# Zero-initialised n x 1 int32 indicator columns for the child: z_c marks
# entries where f_c == 2, y_c marks entries where f_c == 1 (filled in below).
z_c = np.zeros((params['n'], 1), dtype=np.int32)
y_c = np.zeros_like(z_c)

In [None]:
# Split the child signal into two indicators: value 2 sets z_c, value 1 sets y_c.
for idx in range(f_c.shape[0]):
    genotype = f_c[idx]
    if genotype == 2:
        z_c[idx] = 1
    elif genotype == 1:
        y_c[idx] = 1
# Display z_c transposed.
z_c.T

In [None]:
# Display the y_c indicator transposed.
y_c.T