In [1]:
# stdlib
import itertools
import multiprocessing
import os  # os.path.basename is used when loading the pileup / copy-number files
from glob import glob

# third-party
import numpy as np
import pandas as pd

# star imports: kept in their original relative order, because a later
# star import shadows any same-named symbol from an earlier one
from tools.procOps import *
from tools.fileOps import *
from scipy.special import *
from tools.bio import *

In [2]:
# Load the tab-separated feature table and name its five columns.
sun_df = pd.read_csv('copy_number/hg38_features.txt', sep='\t')
sun_df.columns = ['loc', 'NAB', 'NC', 'ND', 'NN']
# Collapse every paratype column to a 0/1 indicator (1 when the value is > 0).
for paratype in ['NAB', 'NC', 'ND', 'NN']:
    sun_df[paratype] = (sun_df[paratype] > 0).astype(int)
sun_df.head()

Unnamed: 0,loc,NAB,NC,ND,NN
0,56,0,0,0,1
1,178,0,0,1,0
2,240,0,0,0,1
3,386,1,0,0,0
4,393,1,0,0,0


In [3]:
# remove bad positions
# Hard-coded 'loc' values that are excluded from the feature table.
bad_positions = {48646, 75180, 17731, 43539, 2967, 54815, 64033, 89900, 66088, 68137, 90156, 85939, 87309, 70198, 54452, 1594, 77455, 8253, 73792, 7776, 60995, 44613, 96353, 24138, 48718, 13391, 47185, 24659, 29269, 13913, 23133, 68191, 29280, 69729, 1635, 88677, 10513, 74860, 54893, 90226, 57973, 74358, 58487, 82556, 10517, 76419, 59524, 74373, 41094, 8843, 63630, 74383, 5267, 57493, 12439, 35992, 85862, 3235, 48297, 68722, 24754, 74420, 95925, 7356, 7871, 38598, 18209, 5841, 39122, 67796, 68309, 94430, 6370, 93478, 49894, 37095, 52457, 16620, 92397, 60142, 76015, 41712, 93480, 41716, 54006, 22990, 48376, 16597, 92418, 40711, 93484, 63242, 26380, 898, 92431, 29144, 49941, 43799, 48409, 91419, 90398, 92448, 25889, 64803, 10534, 24359, 39720, 46889, 93482, 25900, 93486, 76080, 20276, 78646, 53056, 86339, 76104, 41186, 14670, 67410, 23382, 83803, 17245, 17760, 63844, 88806, 74086, 26983, 46953, 88039, 878, 12655, 16345, 7957, 38774, 50409, 386, 1496, 41878, 66952, 393, 53655, 60815, 15250, 58691, 41454, 64406, 70039, 2459, 11679, 86432, 34203, 94628, 2469, 72614, 84903, 74664, 42921, 29100, 40370, 37299, 78266, 9149, 61385, 65996, 29133, 85454, 89553, 45522, 21459, 27096, 47065, 51164, 76769, 8674, 85806, 56292, 88037, 89169, 47080, 54766, 30196, 70649, 70651, 31230, 92159}
is_bad = sun_df['loc'].isin(bad_positions)
sun_df = sun_df[~is_bad]

In [4]:
# load parsed pileups
# One table per file, keyed by the file name up to its first '.'.
files = glob('/hive/users/ifiddes/simons_normals/96kb_consensus/*.parsed_pileup.txt')
dfs = {
    os.path.basename(path).split('.')[0]: pd.read_csv(path, sep='\t', index_col=0)
    for path in files
}

In [5]:
# load C/D copy number estimates

# Paths of every *.filtered.txt file in the normals directory.
files = glob('/hive/users/ifiddes/simons_normals/*.filtered.txt')

# NOTE(review): Counter is not referenced anywhere in the visible notebook.
from collections import Counter
def convert(x):
    """Split a 'name:value' token and return ``(name, int(value))``.

    Raises ValueError when the token does not contain exactly one ':' or
    when the value part is not an integer literal.
    """
    name, value = x.split(':')
    return name, int(value)

# Map sample name -> two integer copy-number values taken from the 3rd and
# 4th whitespace-separated 'name:value' tokens on the first line of each file
# (presumably the C and D estimates, per the cell heading -- confirm).
copy_number = {}
for path in files:
    n = os.path.basename(path).split('.')[0]
    # read only the first line, and make sure the handle is closed afterwards
    with open(path) as handle:
        fields = next(handle).rstrip().split()
    # convert() returns (name, value); only the integer value is kept
    copy_number[n] = [convert(token)[1] for token in fields[2:4]]

In [6]:
# filter dataframes for C = 2 and D = 2
# also filter for informative positions
filtered_dfs = {}
for n, df in dfs.items():
    c = copy_number[n]
    # Require both estimates to be exactly 2. Testing only sum(c) == 4 would
    # also let through 1/3, 3/1, 0/4 and 4/0 samples, which breaks the
    # "all samples are 4-2-2-2" assumption used when counting paratypes.
    if list(c) != [2, 2]:
        continue
    # The default (inner) merge already keeps only positions present in
    # sun_df, so no additional isin() filter on 'loc' is needed.
    df_m = df.merge(sun_df, on='loc')
    filtered_dfs[n] = df_m

In [7]:
# Positions present in every filtered sample, and the matching feature rows.
position_sets = [set(sample_df['loc']) for sample_df in filtered_dfs.values()]
useful_positions = set.intersection(*position_sets)
in_every_sample = sun_df['loc'].isin(useful_positions)
filtered_sun_df = sun_df[in_every_sample]

In [257]:
# Flat, sample-by-sample lists of expected paratype counts (k = alt, l = ref)
# and observed read counts; a later cell reshapes them into matrices.
k, l = [], []
actual_alt, actual_ref = [], []
stores = (k, l, actual_alt, actual_ref)

for n, df_m in filtered_dfs.items():
    for _, s in df_m.iterrows():
        if s.coverage >= 10 and s.ratio >= 0.01:
            num_alt = 4 * s.NAB + 2 * s.NC + 2 * s.ND + 2 * s.NN  # number of alt paratypes
            # valid because these are all 4-2-2-2 (10 paratypes in total)
            entry = (num_alt, 10 - num_alt, s.alt_count, s.ref_count)
        else:
            # rows failing the coverage / ratio thresholds contribute zeros
            entry = (0, 0, 0, 0)
        for store, value in zip(stores, entry):
            store.append(value)

In [262]:
# Turn the flat per-sample lists into matrices with one row per position and
# one column per sample. The per-sample row count is inferred with -1 rather
# than read from `df_m`, a loop variable leaked from an earlier cell.
# NOTE: this cell rebinds its own inputs, so it must not be run twice in a row.
n_samples = len(filtered_dfs)
k = np.array(k).reshape(n_samples, -1).T
l = np.array(l).reshape(n_samples, -1).T
actual_alt = np.array(actual_alt).reshape(n_samples, -1).T
actual_ref = np.array(actual_ref).reshape(n_samples, -1).T

In [263]:
# all proposed genotypes
def construct_C(max_n=12, min_n=8, k=4, max_copies=5):
    """Enumerate every candidate genotype vector.

    Each candidate assigns between 0 and ``max_copies`` copies to each of the
    ``k`` paratypes. A candidate is kept when its total copy number lies in
    the inclusive range ``[min_n, max_n]``.

    Args:
        max_n: largest allowed total copy number.
        min_n: smallest allowed total copy number.
        k: number of paratypes (columns of the result).
        max_copies: largest copy number allowed for a single paratype
            (defaults to 5, the previously hard-coded limit).

    Returns:
        Integer array of shape ``(num_candidates, k)`` in
        ``itertools.product`` order. When nothing qualifies the result still
        has ``k`` columns, i.e. shape ``(0, k)``.
    """
    copies = np.arange(0, max_copies + 1)
    kept = [combo for combo in itertools.product(copies, repeat=k)
            if min_n <= np.sum(combo) <= max_n]
    # explicit dtype/shape so that an empty result stays a 2-D integer array
    return np.array(kept, dtype=copies.dtype).reshape(len(kept), k)

C = construct_C()

In [264]:
# number of haplotypes that are expected to be alt in each C
ft = filtered_sun_df.set_index('loc').T
alt_options = np.dot(C, ft)

# ref options -- the total number of paratypes in each row of C, repeated
# once per position so it lines up with alt_options
copies_per_genotype = C.sum(axis=1)
num_positions = alt_options.shape[1]
tot = np.array([copies_per_genotype] * num_positions).T  # total matrix
ref_options = tot - alt_options  # whatever is not alt is ref

In [265]:
#precompute tot
# Per-position read depth summed over samples (NaN entries count as 0).
alt_depth = np.nansum(actual_alt, axis=1)
ref_depth = np.nansum(actual_ref, axis=1)
tot = alt_depth + ref_depth

In [266]:
# For every proposed genotype, derive one (a, b) pair per position from the
# pooled read counts.
a_matrix, b_matrix = [], []
for alt_copies, ref_copies in zip(alt_options, ref_options):
    # the divisor is offset by 0.1, so a proposed count of 0 never divides by zero
    alt = (np.divide(k, np.vstack(alt_copies + 0.1)) * actual_alt).sum(axis=1)
    ref = (np.divide(l, np.vstack(ref_copies + 0.1)) * actual_ref).sum(axis=1)
    # a NaN on one side is replaced by 1% of the other side; alt takes priority
    alt_missing = np.isnan(alt)
    ref_only_missing = np.isnan(ref) & ~alt_missing
    alt[alt_missing] = 0.01 * ref[alt_missing]
    ref[ref_only_missing] = 0.01 * alt[ref_only_missing]
    assert not np.isnan(alt).any()
    u = alt / (alt + ref)
    a_matrix.append(u * (tot - 1))
    b_matrix.append((1 - u) * (tot - 1))

In [267]:
# Stack the per-genotype rows into 2-D arrays.
a_matrix, b_matrix = np.array(a_matrix), np.array(b_matrix)

In [268]:
# Expected alt (m) and ref (n) read counts per position at target_coverage
# for a 4-2-2-2 genotype (weights 2.0/1/1/1 out of 10).
target_coverage = 200
alt_fraction = (2.0 * filtered_sun_df['NAB'] + filtered_sun_df['NC']
                + filtered_sun_df['ND'] + filtered_sun_df['NN']) / 10
m = list(alt_fraction * target_coverage)
n = [target_coverage - expected_alt for expected_alt in m]

In [269]:
# test
# One score per proposed genotype: summed difference of log-beta terms.
r_values = [np.sum(betaln(m + a, n + b) - betaln(a, b))
            for a, b in zip(a_matrix, b_matrix)]

In [270]:
# Rank genotype indices by score (ascending); the best is last.
r_map = dict(enumerate(r_values))
ranked = sorted(r_map.items(), key=lambda item: item[1])
best_index, score = ranked[-1]
best_haps = C[best_index]
ordered = ranked[-10:][::-1]  # top ten, best first

In [271]:
# Report the best genotype and the ten best-scoring candidates.
print('log odds: {}'.format(score))
print('')
print('results: ')
feature_names = filtered_sun_df.columns[1:]
for name, copies in zip(feature_names, best_haps):
    if copies > 0:
        print('{}: {}'.format(name, copies))

print(' '.join(feature_names))
print('top 10 hits:')
for rank, (pos, val) in enumerate(ordered, 1):
    print('{}: {}'.format(rank, [C[pos], pos, val]))


log odds: -79588.8003979

results: 
NAB: 4
NC: 2
ND: 3
NN: 3
NAB NC ND NN
top 10 hits:
1: [array([4, 2, 3, 3]), 527, -79588.800397898915]
2: [array([4, 2, 2, 3]), 522, -79737.745429188799]
3: [array([3, 2, 2, 2]), 389, -79902.636472890124]
4: [array([3, 2, 2, 3]), 390, -80039.282238943968]
5: [array([3, 1, 2, 2]), 363, -80048.964327161419]
6: [array([4, 2, 3, 2]), 526, -80063.855637273955]
7: [array([4, 2, 2, 4]), 523, -80281.622242225538]
8: [array([3, 2, 3, 2]), 395, -80294.668470365548]
9: [array([4, 2, 2, 2]), 521, -80497.698168314644]
10: [array([5, 2, 2, 3]), 638, -80532.726430768322]


In [212]:
## test old R on these features

Ct = C.T

# alt paratype count per (position, genotype), and total copies per genotype
features_by_loc = filtered_sun_df.set_index('loc')
num = np.dot(features_by_loc, Ct)
denom = Ct.sum(axis=0)
# (1 + alt) / (2 + total) keeps S strictly between 0 and 1
S = (1.0 + num) / (2.0 + denom)


In [221]:
S_log = np.log(S)
S_inv = np.log(1 - S)
# calculate the masking matrix based on deviance
# M is the number of alt reads, N is the number of ref reads
alt_reads = np.array(m) + 10
ref_reads = np.array(n) + 10
R = (np.dot(alt_reads, S_log) + np.dot(ref_reads, S_inv))

In [222]:
# Rank genotypes by the old R score (ascending) and report the best ones.
R_map = dict(enumerate(R))
ranked = sorted(R_map.items(), key=lambda item: item[1])
best_index, score = ranked[-1]
best_haps = C[best_index]
ordered = ranked[-10:][::-1]
print('log odds: {}'.format(score))
print('')
print('results: ')
feature_names = filtered_sun_df.columns[1:]
for name, copies in zip(feature_names, best_haps):
    if copies > 0:
        print('{}: {}'.format(name, copies))

print(' '.join(feature_names))
print('top 10 hits:')
for rank, (pos, val) in enumerate(ordered, 1):
    print('{}: {}'.format(rank, [C[pos], pos, val]))


log odds: -99419.4314475

results: 
NAB: 5
NC: 3
ND: 2
NN: 2
NAB NC ND NN
top 10 hits:
1: [array([5, 3, 2, 2]), 656, -99419.431447459021]
2: [array([5, 3, 3, 1]), 658, -99861.389458721271]
3: [array([4, 4, 2, 2]), 564, -99875.661392076465]
4: [array([5, 3, 2, 1]), 655, -99881.558633255278]
5: [array([5, 4, 2, 1]), 668, -99911.625945815176]
6: [array([4, 3, 2, 1]), 544, -100000.17029657327]
7: [array([4, 3, 2, 2]), 545, -100007.88553305517]
8: [array([5, 4, 1, 2]), 666, -100078.80157080464]
9: [array([5, 3, 1, 2]), 652, -100136.36060544752]
10: [array([4, 4, 2, 1]), 563, -100262.35489976255]


In [565]:
# sanity check -- synthetic data
# Up to three positions, two identical synthetic genomes each: the first
# position has 1 alt paratype out of 10, the others have 2.
target_coverage = 100
num_rows = len(filtered_sun_df[:3])
alt_paratypes = [1 if idx == 0 else 2 for idx in range(num_rows)]
#num_alt = 4 * s.NAB + 2 * s.NC + 2 * s.ND + 2 * s.NN

syn_k = [[num_alt] * 2 for num_alt in alt_paratypes]
syn_l = [[10 - num_alt] * 2 for num_alt in alt_paratypes]
expected_alt = [target_coverage * 1.0 * num_alt / 10 for num_alt in alt_paratypes]
syn_alt = [[tmp_alt] * 2 for tmp_alt in expected_alt]
syn_ref = [[target_coverage - tmp_alt] * 2 for tmp_alt in expected_alt]


In [567]:
# Convert the four synthetic lists to float arrays.
syn_k, syn_l, syn_alt, syn_ref = (
    np.array(values, dtype=float)
    for values in (syn_k, syn_l, syn_alt, syn_ref)
)

In [592]:
# all proposed genotypes
C = construct_C(4,4,2)

# rows are features, columns are the two paratypes
f = np.array([(1, 0), (0, 1), (0, 1)])

alt_options = np.dot(C, f.T)

# ref options -- the total number of paratypes in each row of C, repeated
# once per feature so it lines up with alt_options
copies_per_genotype = C.sum(axis=1)
num_features = alt_options.shape[1]
tot = np.array([copies_per_genotype] * num_features).T  # total matrix
ref_options = tot - alt_options  # whatever is not alt is ref


In [840]:
#precompute tot
# Per-feature synthetic read depth summed over genomes (NaN counts as 0).
alt_depth = np.nansum(syn_alt, axis=1)
ref_depth = np.nansum(syn_ref, axis=1)
tot = alt_depth + ref_depth

In [873]:
# For every proposed genotype, derive one (a, b) pair per feature from the
# synthetic training counts. The weights must be the synthetic syn_k / syn_l:
# the bare k / l names hold whatever an earlier cell left behind, which does
# not match the shape of syn_alt / syn_ref on a top-to-bottom run.
a_matrix = []
b_matrix = []
for a, b in zip(alt_options, ref_options):
    # dividing by a proposed count of 0 yields inf, which is patched below
    alt = (syn_k * syn_alt).sum(axis=1) * 1.0 / a
    ref = (syn_l * syn_ref).sum(axis=1) * 1.0 / b
    for i, (x, y) in enumerate(zip(alt, ref)):
        # an infinite side is replaced by 1% of the other side
        if np.isinf(x):
            alt[i] = 0.01 * y
        elif np.isinf(y):
            ref[i] = 0.01 * x
        # fails when both proposed counts were 0 for this feature
        assert not np.isinf(alt[i])
    u = alt / (alt + ref)
    a = u * (tot - 1)
    b = (1 - u) * (tot -1 )
    a_matrix.append(a)
    b_matrix.append(b)

In [874]:
# Stack the per-genotype rows into 2-D arrays.
a_matrix, b_matrix = np.array(a_matrix), np.array(b_matrix)

In [875]:
# Show the coverage and the first a / b entry as a spot check.
first_a = a_matrix[0][0]
first_b = b_matrix[0][0]
print('coverage: {}'.format(target_coverage))
print('A_{{0,0}}: {}'.format(first_a))
print('B_{{0,0}}: {}'.format(first_b))


coverage: 100
A_{0,0}: 1.9702970297
B_{0,0}: 197.02970297


In [876]:
# test
# observed alt (m) and ref (n) reads for the three features
m = np.array([[10, 10, 10]])
n = np.array([[90, 90, 90]])
r_values = [np.sum(betaln(m + a, n + b) - betaln(a, b))
            for a, b in zip(a_matrix, b_matrix)]

In [877]:
# Rank genotype indices by score (ascending); the best is last.
r_map = dict(enumerate(r_values))
ranked = sorted(r_map.items(), key=lambda item: item[1])
best_index, score = ranked[-1]
best_haps = C[best_index]
ordered = ranked[-10:][::-1]  # up to ten, best first

In [878]:
# Report the best genotype and the best-scoring candidates.
print('log odds: {}'.format(score))
print('')
print('results: ')
feature_names = filtered_sun_df.columns[1:]
for name, copies in zip(feature_names, best_haps):
    if copies > 0:
        print('{}: {}'.format(name, copies))

print(' '.join(feature_names))
print('top 10 hits:')
for rank, (pos, val) in enumerate(ordered, 1):
    print('{}: {}'.format(rank, [C[pos], pos, val]))


log odds: -106.641161414

results: 
NAB: 2
NC: 2
NAB NC ND NN
top 10 hits:
1: [array([2, 2]), 2, -106.64116141427075]
2: [array([3, 1]), 3, -109.8657092331448]
3: [array([1, 3]), 1, -110.33930959783766]
4: [array([4, 0]), 4, -255.49329274306922]
5: [array([0, 4]), 0, -390.63856659581404]


In [683]:
# feature matrix -- 1 A feature and 2 B features
# (one row per feature, one column per paratype)
f = np.array([(1, 0), (0, 1), (0, 1)])

In [919]:
# 2 synthetic genomes, rows are features
# both synthetic genomes therefore are expected to have 1 copy of A and 2 copies of B
k = np.array([[1, 1], [2, 2], [2, 2]], dtype=float)

In [920]:
# and therefore we expect to see 9 copies of ref in A features and 8 in B features
l = np.array([[9, 9], [8, 8], [8, 8]], dtype=float)

In [921]:
# setting coverage to 10, in our idealized scenario, actualAlt and actualRef are 10 * k and 10 * l
coverage_scale = 10
actual_alt = k * coverage_scale
actual_ref = l * coverage_scale

In [938]:
# let us propose the haplotype [2,2]
c = np.array([2, 2])
# alt paratypes per feature under this proposal; the rest of the 10 are ref
k_i = np.sum(c * f, axis=1)
l_i = 10 - k_i
# print the values just computed (the names K and L are not defined in this notebook)
print('k_i: {}\n'.format(k_i))
print('l_i: {}\n'.format(l_i))

k_i: [2 2 2]

l_i: [8 8 8]



In [939]:
# alt
# weighted alt reads summed over genomes, divided by the proposed alt count
weighted_alt = np.sum(k * actual_alt, axis=1)
alt = weighted_alt * 1.0 / k_i
print(alt)

[ 10.  40.  40.]


In [940]:
# ref
# weighted ref reads summed over genomes, divided by the proposed ref count
weighted_ref = np.sum(l * actual_ref, axis=1)
ref = weighted_ref * 1.0 / l_i
print(ref)

[ 202.5  160.   160. ]


In [941]:
# alt fraction u, then a and b as u and (1 - u) scaled by (tot - 1)
u = alt / (alt + ref)
print('u: {}'.format(u))
depth_minus_one = tot - 1
a = u * depth_minus_one
print('a: {}'.format(a))
b = (1 - u) * depth_minus_one
print('b: {}'.format(b))

u: [ 0.04705882  0.2         0.2       ]
a: [  9.36470588  39.8         39.8       ]
b: [ 189.63529412  159.2         159.2       ]


In [942]:
# fake data that matches the expected haplotype of [1, 2]
m = np.array([10, 20, 20]).reshape(1, 3)
n = 100 - m  # alt + ref add up to 100 at every feature

In [943]:
# calculate r
log_beta_diff = betaln(m + a, n + b) - betaln(a, b)
r = np.sum(log_beta_diff)
print(r)

-134.809528911


In [950]:
# repeat this for the true haplotype of [1, 2]
c = np.array([1, 2])
k_i = (c * f).sum(axis=1)
l_i = 10 - k_i
alt = np.sum(k * actual_alt, axis=1) * 1.0 / k_i
ref = np.sum(l * actual_ref, axis=1) * 1.0 / l_i
u = alt / (alt + ref)
depth_minus_one = tot - 1
a = u * depth_minus_one
b = (1 - u) * depth_minus_one
log_beta_diff = betaln(m + a, n + b) - betaln(a, b)
r = np.sum(log_beta_diff)
print(r)

-133.20237652


In [984]:
# repeat this for all possible haplotypes, in matrix form

# all proposed genotypes
C = np.array([(0, 3), (1, 2), (2, 1), (3, 0)])
all_k = C.dot(f.T)
all_l = 10 - all_k
report = [
    ('C (all proposed haplotypes):\n{}\n', C),
    ('all_k (each row is a proposed haplotype each column is a feature):\n{}\n', all_k),
    ('all_l (each row is a proposed haplotype each column is a feature):\n{}\n', all_l),
]
for template, value in report:
    print(template.format(value))

C (all proposed haplotypes):
[[0 3]
 [1 2]
 [2 1]
 [3 0]]

all_k (each row is a proposed haplotype each column is a feature):
[[0 3 3]
 [1 2 2]
 [2 1 1]
 [3 0 0]]

all_l (each row is a proposed haplotype each column is a feature):
[[10  7  7]
 [ 9  8  8]
 [ 8  9  9]
 [ 7 10 10]]



In [985]:
#precompute tot
# Per-feature synthetic read depth summed over genomes (NaN counts as 0).
alt_depth = np.nansum(syn_alt, axis=1)
ref_depth = np.nansum(syn_ref, axis=1)
tot = alt_depth + ref_depth
print(tot)

[ 200.  200.  200.]


In [989]:
# now compute a and b for each proposed haplotype
a_matrix, b_matrix = [], []
for k_i, l_i in zip(all_k, all_l):
    # calculate alt and ref as before
    alt = np.sum(k * syn_alt, axis=1) * 1.0 / k_i
    ref = np.sum(l * syn_ref, axis=1) * 1.0 / l_i
    # need to handle the case where k_i is 0 or l_i is 0:
    # an infinite side is replaced by 1% of the other side, alt first
    alt_inf = np.isinf(alt)
    ref_only_inf = np.isinf(ref) & ~alt_inf
    alt[alt_inf] = 0.01 * ref[alt_inf]
    ref[ref_only_inf] = 0.01 * alt[ref_only_inf]
    # k_i and l_i should never be 0 at the same time
    assert not np.isinf(alt).any()
    u = alt / (alt + ref)
    a = u * (tot - 1)
    b = (1 - u) * (tot - 1)
    print('alt/ref ratio for a = {} and b = {}:\n{}'.format(a, b, alt / ref))
    a_matrix.append(a)
    b_matrix.append(b)

a_matrix, b_matrix = np.array(a_matrix), np.array(b_matrix)

alt/ref ratio for a = [  1.97029703  25.32727273  25.32727273] and b = [ 197.02970297  173.67272727  173.67272727]:
[ 0.01        0.14583333  0.14583333]
alt/ref ratio for a = [ 19.9  39.8  39.8] and b = [ 179.1  159.2  159.2]:
[ 0.11111111  0.25        0.25      ]
alt/ref ratio for a = [  9.36470588  71.64        71.64      ] and b = [ 189.63529412  127.36        127.36      ]:
[ 0.04938272  0.5625      0.5625    ]
alt/ref ratio for a = [ 5.572       1.97029703  1.97029703] and b = [ 193.428       197.02970297  197.02970297]:
[ 0.02880658  0.01        0.01      ]


In [987]:
# score these matrices
r_values = [np.sum(betaln(m + a, n + b) - betaln(a, b))
            for a, b in zip(a_matrix, b_matrix)]

In [988]:
# figure out which one was the best scoring
r_map = dict(enumerate(r_values))
ranked = sorted(r_map.items(), key=lambda item: item[1])
best_index, score = ranked[-1]
best_haps = C[best_index]
ordered = ranked[-10:][::-1]
print('log odds: {}'.format(score))
print('')
print('results: ')
feature_names = filtered_sun_df.columns[1:]
for name, copies in zip(feature_names, best_haps):
    if copies > 0:
        print('{}: {}'.format(name, copies))

print(' '.join(feature_names))
print('top 10 hits:')
for rank, (pos, val) in enumerate(ordered, 1):
    print('{}: {}'.format(rank, [C[pos], pos, val]))


log odds: -133.20237652

results: 
NAB: 1
NC: 2
NAB NC ND NN
top 10 hits:
1: [array([1, 2]), 1, -133.20237652046524]
2: [array([2, 1]), 2, -143.11191242136056]
3: [array([0, 3]), 0, -143.3971861547783]
4: [array([3, 0]), 3, -173.58599608433849]


In [1046]:
# now let's scale this to 100 synthetic genomes, with 100 coverage, across our real feature set
target_coverage = 100
num_genomes = 100

# alt paratypes per position for a 4-2-2-2 genotype (10 paratypes in total)
alt_paratypes = list(4 * filtered_sun_df['NAB'] + 2 * filtered_sun_df['NC']
                     + 2 * filtered_sun_df['ND'] + 2 * filtered_sun_df['NN'])
expected_alt = [target_coverage * 1.0 * num_alt / 10 for num_alt in alt_paratypes]

# every genome is identical, so each row repeats one value num_genomes times
syn_k = np.array([[num_alt] * num_genomes for num_alt in alt_paratypes]).astype(float)
syn_l = np.array([[10 - num_alt] * num_genomes for num_alt in alt_paratypes]).astype(float)
syn_alt = np.array([[tmp_alt] * num_genomes for tmp_alt in expected_alt]).astype(float)
syn_ref = np.array([[target_coverage - tmp_alt] * num_genomes
                    for tmp_alt in expected_alt]).astype(float)

In [1047]:
# allow anywhere from 8 to 12 haplotypes
min_n, max_n = 8, 12

# range of possible genotypes
genotypes = range(0, 6)  # [0, 1, 2, 3, 4, 5]

# number of columns
num_paratypes = 4
r = [combo for combo in itertools.product(genotypes, repeat=num_paratypes)
     if min_n <= np.sum(combo) <= max_n]

C = np.array(r).astype(float)
print('first 10 proposed haplotypes:')
print(C[:10])

first 10 proposed haplotypes:
[[ 0.  0.  3.  5.]
 [ 0.  0.  4.  4.]
 [ 0.  0.  4.  5.]
 [ 0.  0.  5.  3.]
 [ 0.  0.  5.  4.]
 [ 0.  0.  5.  5.]
 [ 0.  1.  2.  5.]
 [ 0.  1.  3.  4.]
 [ 0.  1.  3.  5.]
 [ 0.  1.  4.  3.]]


In [1048]:
f = filtered_sun_df[['NAB', 'NC', 'ND', 'NN']]
all_k = np.dot(C, f.T)

# now we can't just subtract 10, because the total number of paratypes at each proposed genotype changes
num_genotypes = C.sum(axis=1)
all_l = num_genotypes[:, np.newaxis] - all_k

for label, matrix in (('all_k', all_k), ('all_l', all_l)):
    print('{}[0][:10]: {}'.format(label, matrix[0][:10]))


all_k[0][:10]: [ 5.  3.  5.  5.  5.  3.  8.  8.  8.  0.]
all_l[0][:10]: [ 3.  5.  3.  3.  3.  5.  0.  0.  0.  8.]


In [1049]:
#precompute tot
# Per-position synthetic read depth summed over genomes (NaN counts as 0).
alt_depth = np.nansum(syn_alt, axis=1)
ref_depth = np.nansum(syn_ref, axis=1)
tot = alt_depth + ref_depth
print(tot[0])

10000.0


In [1051]:
# now compute a and b for each proposed haplotype
a_matrix, b_matrix = [], []
for k_i, l_i in zip(all_k, all_l):
    # calculate alt and ref as before
    alt = np.sum(syn_k * syn_alt, axis=1) * 1.0 / k_i
    ref = np.sum(syn_l * syn_ref, axis=1) * 1.0 / l_i
    # need to handle the case where k_i is 0 or l_i is 0:
    # an infinite side is replaced by 1% of the other side, alt first
    alt_inf = np.isinf(alt)
    ref_only_inf = np.isinf(ref) & ~alt_inf
    alt[alt_inf] = 0.01 * ref[alt_inf]
    ref[ref_only_inf] = 0.01 * alt[ref_only_inf]
    # k_i and l_i should never be 0 at the same time
    assert not np.isinf(alt).any()
    u = alt / (alt + ref)
    a_matrix.append(u * (tot - 1))
    b_matrix.append((1 - u) * (tot - 1))

a_matrix, b_matrix = np.array(a_matrix), np.array(b_matrix)

In [1057]:
# score these matrices vs. synthetic data
# synthetic data is the same as training data

# first synthetic genome: one alt / ref count per position
m = syn_alt[:, 0]
n = syn_ref[:, 0]

r_values = [np.sum(betaln(m + a, n + b) - betaln(a, b))
            for a, b in zip(a_matrix, b_matrix)]

# figure out which one was the best scoring
r_map = dict(enumerate(r_values))
ranked = sorted(r_map.items(), key=lambda item: item[1])
best_index, score = ranked[-1]
best_haps = C[best_index]
ordered = ranked[-10:][::-1]
print('log odds: {}'.format(score))
print('')
print('results: ')
feature_names = filtered_sun_df.columns[1:]
for name, copies in zip(feature_names, best_haps):
    if copies > 0:
        print('{}: {}'.format(name, copies))

print(' '.join(feature_names))
print('top 10 hits:')
for rank, (pos, val) in enumerate(ordered, 1):
    print('{}: {}'.format(rank, [C[pos], pos, val]))


log odds: -51185.8148424

results: 
NAB: 4.0
NC: 2.0
ND: 2.0
NN: 2.0
NAB NC ND NN
top 10 hits:
1: [array([ 4.,  2.,  2.,  2.]), 521, -51185.814842386753]
2: [array([ 5.,  2.,  2.,  2.]), 637, -51414.567363092057]
3: [array([ 5.,  2.,  3.,  2.]), 641, -51428.771870830256]
4: [array([ 5.,  2.,  2.,  3.]), 638, -51443.314342446174]
5: [array([ 3.,  2.,  2.,  2.]), 389, -51531.480063198112]
6: [array([ 4.,  2.,  3.,  2.]), 526, -51545.330390295094]
7: [array([ 4.,  3.,  2.,  2.]), 545, -51557.311197329982]
8: [array([ 4.,  2.,  2.,  3.]), 522, -51600.629128480788]
9: [array([ 5.,  3.,  2.,  2.]), 656, -51631.923060585381]
10: [array([ 4.,  3.,  3.,  2.]), 549, -51663.076355431651]
