In [1]:
import pandas as pd
from glob import glob
import numpy as np
from tools.procOps import *
from tools.fileOps import *
import itertools
import multiprocessing
from scipy.special import *
from tools.bio import *

In [3]:
sun_df = pd.read_csv('copy_number/hg38_features_split_a_b.txt', sep='\t')
sun_df.columns = ['loc', 'NA', 'NB', 'NC', 'ND', 'NN']
sun_df['NA'] = [1 if x > 0 else 0 for x in sun_df['NA']]
sun_df['NB'] = [1 if x > 0 else 0 for x in sun_df['NB']]
sun_df['NC'] = [1 if x > 0 else 0 for x in sun_df['NC']]
sun_df['ND'] = [1 if x > 0 else 0 for x in sun_df['ND']]
sun_df['NN'] = [1 if x > 0 else 0 for x in sun_df['NN']]
sun_df.head()

Unnamed: 0,loc,NA,NB,NC,ND,NN
0,56,0,0,0,0,1
1,178,0,0,0,1,0
2,240,0,0,0,0,1
3,386,1,1,0,0,0
4,393,1,0,0,0,0


In [4]:
# remove bad positions
bad_positions = set([48646, 75180, 17731, 43539, 2967, 54815, 64033, 89900, 66088, 68137, 90156, 85939, 87309, 70198, 54452, 1594, 77455, 8253, 73792, 7776, 60995, 44613, 96353, 24138, 48718, 13391, 47185, 24659, 29269, 13913, 23133, 68191, 29280, 69729, 1635, 88677, 10513, 74860, 54893, 90226, 57973, 74358, 58487, 82556, 10517, 76419, 59524, 74373, 41094, 8843, 63630, 74383, 5267, 57493, 12439, 35992, 85862, 3235, 48297, 68722, 24754, 74420, 95925, 7356, 7871, 38598, 18209, 5841, 39122, 67796, 68309, 94430, 6370, 93478, 49894, 37095, 52457, 16620, 92397, 60142, 76015, 41712, 93480, 41716, 54006, 22990, 48376, 16597, 92418, 40711, 93484, 63242, 26380, 898, 92431, 29144, 49941, 43799, 48409, 91419, 90398, 92448, 25889, 64803, 10534, 24359, 39720, 46889, 93482, 25900, 93486, 76080, 20276, 78646, 53056, 86339, 76104, 41186, 14670, 67410, 23382, 83803, 17245, 17760, 63844, 88806, 74086, 26983, 46953, 88039, 878, 12655, 16345, 7957, 38774, 50409, 386, 1496, 41878, 66952, 393, 53655, 60815, 15250, 58691, 41454, 64406, 70039, 2459, 11679, 86432, 34203, 94628, 2469, 72614, 84903, 74664, 42921, 29100, 40370, 37299, 78266, 9149, 61385, 65996, 29133, 85454, 89553, 45522, 21459, 27096, 47065, 51164, 76769, 8674, 85806, 56292, 88037, 89169, 47080, 54766, 30196, 70649, 70651, 31230, 92159])
sun_df = sun_df[~sun_df['loc'].isin(bad_positions)]

In [5]:
# load parsed pileups
files = glob('/hive/users/ifiddes/simons_normals/96kb_consensus/*.parsed_pileup.txt')
dfs = {}
for f in files:
    n = os.path.basename(f).split('.')[0]
    dfs[n] = pd.read_csv(f, sep='\t', index_col=0)

In [6]:
# load C/D copy number estimates

files = glob('/hive/users/ifiddes/simons_normals/*.filtered.txt')

from collections import Counter
def convert(x):
    x = x.split(':')
    n, v = x
    v = int(v)
    return n, v

copy_number = {}
for x in files:
    n = os.path.basename(x).split('.')[0]
    l = open(x).next().rstrip().split()
    c = []
    for x in l[2:4]:
        _, v = convert(x)
        c.append(v)
    copy_number[n] = c

In [7]:
# filter dataframes for C = 2 and D = 2
# also filter for informative positions
filtered_dfs = {}
for n, df in dfs.iteritems():
    c = copy_number[n]
    if sum(c) != 4:
        continue
    df_m = df.merge(sun_df, on='loc')
    df_m = df_m[df_m['loc'].isin(sun_df['loc'])]
    filtered_dfs[n] = df_m

In [8]:
useful_positions = set.intersection(*[set(x['loc']) for x in filtered_dfs.itervalues()])
filtered_sun_df = sun_df[sun_df['loc'].isin(useful_positions)]

In [10]:
k = []
l = []
actual_alt = []
actual_ref = []

for n, df_m in filtered_dfs.iteritems():
    for _, s in df_m.iterrows():
        if s.coverage >= 10 and s.ratio >= 0.01:
            num_alt = 2 * s.NA + 2 * s.NB + 2 * s.NC + 2 * s.ND + 2 * s.NN  # number of alt paratypes
            num_ref = 10 - num_alt  # valid because these are all 4-2-2-2
            k.append(num_alt)
            l.append(num_ref)
            actual_alt.append(s.alt_count)
            actual_ref.append(s.ref_count)
        else:
            k.append(0)
            l.append(0)
            actual_alt.append(0)
            actual_ref.append(0)

In [11]:
k = np.array(k).reshape(df_m.shape[0], len(filtered_dfs))
l = np.array(l).reshape(df_m.shape[0], len(filtered_dfs))
actual_alt = np.array(actual_alt).reshape(df_m.shape[0], len(filtered_dfs))
actual_ref = np.array(actual_ref).reshape(df_m.shape[0], len(filtered_dfs))

In [17]:
# all proposed genotypes
def construct_C(max_n=12, min_n=8, k=5):
    list1=np.arange(0,6)
    r = []
    for i in itertools.product(list1,repeat=k):
        if min_n <= np.sum(i) <= max_n:
            r.append(i)
    return np.array(r)

C = construct_C()

In [20]:
# number of haplotypes that are expected to be alt in each C
ft = filtered_sun_df.set_index('loc').T
alt_options = np.dot(C, ft)

# ref options -- first construct vector of total number of paratypes in each row of C
tot = C.sum(axis=1)
tot = np.array([tot] * alt_options.shape[1]).T # turn into total matrix
ref_options = tot - alt_options  # remove alt to get ref counts

In [21]:
#precompute tot
tot = np.nansum(actual_alt, axis=1) + np.nansum(actual_ref, axis=1)

In [22]:
a_matrix = []
b_matrix = []
for a, b in zip(alt_options, ref_options):
    alt = (np.divide(k, np.vstack(a + 0.1)) * actual_alt).sum(axis=1)
    ref = (np.divide(l, np.vstack(b + 0.1)) * actual_ref).sum(axis=1)
    for i, (x, y) in enumerate(zip(alt, ref)):
        if np.isnan(x):
            alt[i] = 0.01 * y
        elif np.isnan(y):
            ref[i] = 0.01 * x
        assert not np.isnan(alt[i])
    u = alt / (alt + ref)
    a = u * (tot - 1)
    b = (1 - u) * (tot -1 )
    a_matrix.append(a)
    b_matrix.append(b)

In [23]:
a_matrix = np.array(a_matrix)
b_matrix = np.array(b_matrix)

In [25]:
m = []
n = []
target_coverage = 200
for _, s in filtered_sun_df.iterrows():
    x = (2.0 * s.NA + s.NB + s.NC + s.ND + s.NN) / 10 * target_coverage
    y = target_coverage - x
    m.append(x)
    n.append(y)

In [26]:
# test

r_values = []
for a, b in zip(a_matrix, b_matrix):
    r_values.append(np.sum(betaln(m + a, n + b) - betaln(a, b)))

In [27]:
r_map = {i: x for i, x in enumerate(r_values)}
best_index, score = sorted(r_map.iteritems(), key=lambda x: x[1])[-1]
best_haps = C[best_index]
ordered = sorted(r_map.iteritems(), key=lambda x: x[1])[-10:][::-1]

In [28]:
print 'log odds: {}'.format(score)
print ''
print 'results: '
for x, y in zip(filtered_sun_df.columns[1:], best_haps):
    if y > 0:
        print '{}: {}'.format(x, y)

print ' '.join(filtered_sun_df.columns[1:])
print 'top 10 hits:'
for i, x in enumerate([[C[pos], pos, val] for pos, val in ordered], 1):
    print '{}: {}'.format(i, x)


log odds: -90547.7740275

results: 
NA: 1
NB: 1
NC: 2
ND: 2
NN: 2
NA NB NC ND NN
top 10 hits:
1: [array([1, 1, 2, 2, 2]), 831, -90547.774027524792]
2: [array([1, 1, 3, 2, 2]), 857, -90602.882850657959]
3: [array([1, 1, 3, 2, 3]), 858, -90835.286303466099]
4: [array([1, 2, 3, 2, 3]), 995, -91123.109345410616]
5: [array([2, 1, 3, 2, 3]), 1535, -91314.782924877698]
6: [array([1, 2, 3, 3, 3]), 1000, -91404.069658403256]
7: [array([1, 2, 4, 2, 3]), 1019, -91486.604064697778]
8: [array([2, 1, 3, 3, 3]), 1540, -91567.670955308713]
9: [array([1, 1, 4, 2, 3]), 885, -91648.355608062688]
10: [array([2, 1, 4, 2, 3]), 1559, -91649.836772015347]


In [14]:
%connect_info

{
  "stdin_port": 44804, 
  "ip": "127.0.0.1", 
  "control_port": 37590, 
  "hb_port": 34893, 
  "signature_scheme": "hmac-sha256", 
  "key": "70a5b8cb-5372-48ff-958e-1c752754b9f0", 
  "kernel_name": "", 
  "shell_port": 33155, 
  "transport": "tcp", 
  "iopub_port": 34298
}

Paste the above JSON into a file, and connect with:
    $> ipython <app> --existing <file>
or, if you are local, you can connect with just:
    $> ipython <app> --existing /cluster/home/ifiddes/.local/share/jupyter/runtime/kernel-51b5bed3-3128-4132-a0d7-26a31128b203.json 
or even just:
    $> ipython <app> --existing 
if this is the most recent IPython session you have started.
