In [810]:
import pandas as pd
from glob import glob
import numpy as np
from tools.procOps import *
from tools.fileOps import *
from collections import *
import itertools
from scipy.special import *
from tools.bio import *

In [811]:
# load feature set
sun_df = pd.read_csv('merged_features.tsv', sep='\t')

In [813]:
# filter features such that if they are C/D/N they must be unambiguous
f = []
for loc, s in sun_df.set_index('loc').iterrows():
    if s.C > 0 or s.D > 0 or s.N > 0:
        if sum(s) > 1:
            continue
    f.append([loc] + list(s))
sun_df = pd.DataFrame(f, columns=sun_df.columns)

In [815]:
# now try this on real data

# load parsed pileups
files = glob('/hive/users/ifiddes/simons_normals/*.parsed_pileup.txt')
dfs = {}
for f in files:
    n = os.path.basename(f).split('.')[0]
    dfs[n] = pd.read_csv(f, sep='\t', index_col=0)
    
# load C/D copy number estimates

files = glob('/hive/users/ifiddes/simons_normals/*.filtered.txt')
def convert(x):
    x = x.split(':')
    n, v = x
    v = int(v)
    return n, v

copy_number = {}
for x in files:
    n = os.path.basename(x).split('.')[0]
    l = open(x).next().rstrip().split()
    c = []
    for x in l[2:4]:
        _, v = convert(x)
        c.append(v)
    copy_number[n] = c
    
    
# filter dataframes for C = 2 and D = 2
# also filter for informative positions
filtered_dfs = {}
for n, df in dfs.iteritems():
    c = copy_number[n]
    if sum(c) != 4:
        continue
    df_m = df.merge(sun_df, on='loc')
    df_m = df_m[df_m['loc'].isin(sun_df['loc'])]
    filtered_dfs[n] = df_m

In [818]:
# filter SUNs because there are 3 loci missing in the real data for some reason
filtered_sun_df = sun_df[sun_df['loc'].isin(df_m['loc'])]

In [820]:
# calculate total depth at each position
tot = []
for n, df_m in filtered_dfs.iteritems():
    tot.append(df_m.coverage)
tot = np.array(tot)
tot = tot.sum(axis=0)

In [816]:
# allow anywhere from 7 to 12 haplotypes
min_n = 7
max_n = 12

# range of possible genotypes
genotypes = range(0, 6)  # [0, 1, 2, 3, 4, 5]

# number of columns
num_paratypes = 4
r = []
for i in itertools.product(genotypes, repeat=num_paratypes):
    if min_n <= np.sum(i) <= max_n:
        r.append(i)

C = np.array(r).astype(float)
print 'first 10 proposed haplotypes:'
print C[:10]

f = filtered_sun_df[['NAB', 'NC', 'ND', 'NN']]
all_K = np.dot(C, f.T)

# now we can't just subtract 10, because the total number of paratypes at each proposed genotype changes
num_genotypes = C.sum(axis=1)
all_L = (num_genotypes - all_K.T).T

print 'all_k[0][:10]: {}'.format(all_K[0][:10])

print 'all_l[0][:10]: {}'.format(all_L[0][:10])

first 10 proposed haplotypes:
[[ 0.  0.  2.  5.]
 [ 0.  0.  3.  4.]
 [ 0.  0.  3.  5.]
 [ 0.  0.  4.  3.]
 [ 0.  0.  4.  4.]
 [ 0.  0.  4.  5.]
 [ 0.  0.  5.  2.]
 [ 0.  0.  5.  3.]
 [ 0.  0.  5.  4.]
 [ 0.  0.  5.  5.]]
all_k[0][:10]: [ 5.  2.  5.  5.  5.  2.  7.  7.  7.  0.]
all_l[0][:10]: [ 2.  5.  2.  2.  2.  5.  0.  0.  0.  7.]


In [None]:
# load training data. training data are only our solved genomes -- NA24385, NA12878, NA19240, H9 and CHM1

pileups = {'NA24385': '/hive/users/cbosworth/imputation/NA24385/'}

In [821]:
# load actual data from pileups into actual_alt, synthetic_alt, depth, s
actual_alt = []
synthetic_alt = []
depth = []
s_matrix = []  # genotype matrix

f = filtered_sun_df.set_index('loc').as_matrix()

for n, df_m in filtered_dfs.iteritems():
    for _, s in df_m.iterrows():
        if s.coverage >= 10 and s.ratio >= 0.01:
            actual_alt.append(s.alt_count)
            depth.append(s.coverage)
        else:
            depth.append(0)
            actual_alt.append(0)
    # always 4-2-2-2
    s_matrix.append([4, 2, 2, 2])

In [822]:
depth = np.array(depth).reshape(len(filtered_dfs), df_m.shape[0]).T
actual_alt = np.array(actual_alt).reshape(len(filtered_dfs), df_m.shape[0]).T
s = np.array(s_matrix).T

In [823]:
# calculate synthetic depth for all genomes
tsa = (np.multiply(depth, np.dot(f, s)) / 10).sum(axis=1)  # ploidy is always 10

ValueError: shapes (881,20) and (4,104) not aligned: 20 (dim 1) != 4 (dim 0)

In [None]:
taa = actual_alt.sum(axis=1)

In [None]:
depth_total = depth.sum(axis=1)

In [None]:
synthetic_alt = []
for h in H:
    synthetic_alt.append(np.multiply(depth_total, np.dot(f, h)) / sum(h))
synthetic_alt = np.array(synthetic_alt)

In [None]:
bias = taa / tsa

In [None]:
projected_alt = np.multiply(synthetic_alt, bias)

In [None]:
projected_ref = depth_total - projected_alt

In [None]:
u = projected_alt / (projected_alt + projected_ref)
a_matrix = u * (tot - 1)
b_matrix = (1 - u) * (tot  - 1)

In [None]:
# mix a and b slightly to give noise
delta = 0.01
aprime = (1 - delta) * a_matrix + delta * b_matrix
bprime = (1 - delta) * b_matrix + delta * a_matrix

In [None]:
# score these matrices for a random person (person #1)
m = df_m.alt_count
n = df_m.ref_count

r_values = []
for a, b in zip(aprime, bprime):
    r_values.append(np.sum(betaln(m + a, n + b) - betaln(a, b)))
    
# figure out which one was the best scoring
r_map = {i: x for i, x in enumerate(r_values)}
best_index, score = sorted(r_map.iteritems(), key=lambda x: x[1])[-1]
best_haps = H[best_index]
ordered = sorted(r_map.iteritems(), key=lambda x: x[1])[-10:][::-1]
print 'log odds: {}'.format(score)
print ''
print 'results: '
for x, y in zip(sun_df.columns[1:], best_haps):
    if y > 0:
        print '{}: {}'.format(x, y)

print ' '.join(sun_df.columns[1:])
print 'top 10 hits:'
for i, x in enumerate([[H[pos], pos, val] for pos, val in ordered], 1):
    print '{}: {}'.format(i, x)


In [None]:
# now lets try predicting copy number across the whole cohort

results = []
for g, df_m in dfs.iteritems():
    df_m = df_m[df_m['loc'].isin(filtered_sun_df['loc'])]
    m = df_m.alt_count
    n = df_m.ref_count
    r_values = []
    for a, b in zip(a_matrix, b_matrix):
        r_values.append(np.sum(betaln(m + a, n + b) - betaln(a, b)))
    # figure out which one was the best scoring
    r_map = {i: x for i, x in enumerate(r_values)}
    best_index, score = sorted(r_map.iteritems(), key=lambda x: x[1])[-1]
    best_haps = C[best_index]
    results.append([g] + list(best_haps))

In [None]:
results = pd.DataFrame(results, columns=['genome', 'NAB', 'NC', 'ND', 'NN'])
results.head()

In [None]:
Counter(results.NAB)

In [None]:
Counter(results.NC)

In [None]:
Counter(results.ND)

In [None]:
Counter(results.NN)

In [824]:
%connect_info

{
  "stdin_port": 53402, 
  "ip": "127.0.0.1", 
  "control_port": 46384, 
  "hb_port": 59930, 
  "signature_scheme": "hmac-sha256", 
  "key": "3eab0c5b-e164-4bd0-a925-70495a18f475", 
  "kernel_name": "", 
  "shell_port": 59278, 
  "transport": "tcp", 
  "iopub_port": 57040
}

Paste the above JSON into a file, and connect with:
    $> ipython <app> --existing <file>
or, if you are local, you can connect with just:
    $> ipython <app> --existing /cluster/home/ifiddes/.local/share/jupyter/runtime/kernel-7f5cc25f-c4ca-4470-88c5-94be0e177a03.json 
or even just:
    $> ipython <app> --existing 
if this is the most recent IPython session you have started.
