In [8]:
import numpy as np
import pandas as pd
import scipy

######################################
#                                    #
# Generate matrix,thresholds & probs #
#                                    #
######################################


from methods import K_Means, BASC_A, onestep, shmulevich

#algos -> list of algorithms
#n -> size of gene
def PDF(algos, n):

    #generate random matrix
    m = np.random.rand(1000,n)
    #list of algorithms
    methods = []
    #list of lists of thresholds
    thresholds = []
    #list of col names
    cols = []
    for a in algos:
        if a == "K-Means":
            methods.append(K_Means)
            thresholds.append([])
            cols.append('k-means')
        if a == "Onestep":
            methods.append(onestep)
            thresholds.append([])
            cols.append('onestep')
        if a == "BASC A":
            methods.append(BASC_A)
            thresholds.append([])
            cols.append('BASC_A')
        if a == "Shmulevich":
            methods.append(shmulevich)
            thresholds.append([])
            cols.append('shmulevich')

    #Generate thresholds and store
    for i in range(1000):
    
        x = m[i]
        for a in range(len(methods)):
    
            thr = methods[a](x)
            thresholds[a].append(thr)

    #dataframe with probabilities
    prob = []

    #create PDF
    for i in range(len(cols)):
        col = thresholds[i]
        r = scipy.stats.rv_histogram(np.histogram(col, bins=100))
        probs = r.cdf(np.linspace(0.1,1,100))
        prob.append(probs) 

    pdf_df = pd.DataFrame(prob).transpose()
    pdf_df.columns = cols
    return pdf_df


import pandas as pd
import numpy as np
import math

#calculate prob
#vec is threshold
#d is displacement
#alg is algorithm id
def prob(vec, d, alg, probs,n):

    t = math.floor((vec[-n]-d) * 100)
    p1 = probs[alg].iloc[t]
    t = math.floor((vec[-n]+d) * 100)
    p0 = 1 - probs[alg].iloc[t]
    pq = 1 - (p1 + p0)


    return [p0,p1,pq]

#vec is threshold
#d is displacement
#n is number of elements in gene vector
#a is algorithm to calculate prob for
#x is empty list, left default for function call
#xx is list of tuples of list and prob, left default for function call
#pp is running probability, left default for function call
def probPerm(vec, d, n, alg, x, p, xx, probs):
    #base case
    if n == 0:
        return xx.append((x, p))
    else:
        #calculate probs
        prb = prob(vec,d,alg,probs,n)
        
        #append values
        x0 = x + '0'
        p0 = p * prb[0]
        probPerm(vec,d,n-1,alg,x0,p0,xx,probs)
        
        x1 = x + '1'
        p1 = p * prb[1]
        probPerm(vec,d,n-1,alg,x1,p1,xx,probs)
        
        xq = x + '?'
        pq = p * prb[2]
        probPerm(vec,d,n-1,alg,xq,pq,xx,probs)

    return xx

#PDF is a dataframe with the PDF of the selected algorithms
#vec is vector of gene
#alg is algorithm (string)
#d is displacement
#n is size of gene
def probBin(vec,d,n,alg,PDF):
    
    probs = PDF
    res = probPerm(vec,d,n,alg,"",1,[],probs)
    dct = {'string':[],'prob':[]}
    for t in res:
        dct['string'].append(t[0])
        dct['prob'].append(t[1])
    
    df = pd.DataFrame.from_dict(dct)
    return df

In [9]:
dataset = [0.12,0.1,0.65,0.7,0.305]
disp = 0.0096
pdf = PDF(['K-Means'],5)
binnies = probBin(dataset,disp,5,'k-means',pdf)

In [10]:
binnies.sort_values(by=['prob'])

Unnamed: 0,string,prob
242,?????,8.343952e-12
161,1????,2.225975e-11
215,?1???,2.635273e-11
241,????1,3.189644e-11
236,???0?,3.526976e-11
...,...,...
9,00100,2.754863e-02
14,0011?,3.149196e-02
3,00010,5.404112e-02
13,00111,1.203844e-01
