In [83]:
import pandas as pd
from glob import glob
import numpy as np
from tools.procOps import *
from tools.fileOps import *
import itertools
from scipy.special import beta, betaln
# Load the per-position feature table and give its columns fixed names.
# Every count column (everything except 'loc') is then doubled in place;
# presumably this converts haploid counts to diploid ones -- TODO confirm.
count_cols = ['NAB', 'NC', 'ND', 'NN']
sun_df = pd.read_csv('copy_number/hg38_features.txt', sep='\t')
sun_df.columns = ['loc'] + count_cols
sun_df[count_cols] = sun_df[count_cols] * 2
sun_df.head()

Unnamed: 0,loc,NAB,NC,ND,NN
0,56,0,0,0,2
1,178,0,0,2,0
2,240,0,0,0,2
3,386,4,0,0,0
4,393,2,0,0,0


In [3]:
from tools.bio import *
# Read the first record of the consensus FASTA; only its sequence is kept.
fa = '/hive/users/ifiddes/notch2nl_berkeley_data/imputation_pipeline/copy_number/consensus.fa'
first_record = next(read_fasta(fa, None))
_, seq = first_record

In [6]:
# Load every parsed pileup into a dict keyed by sample name, where the sample
# name is the part of the file name before its first '.'.
files = glob('/hive/users/ifiddes/simons_normals/96kb_consensus/*.parsed_pileup.txt')
dfs = {}
for pileup_path in files:
    sample = os.path.basename(pileup_path).partition('.')[0]
    dfs[sample] = pd.read_csv(pileup_path, sep='\t', index_col=0)

In [8]:
# Locate the per-sample files holding the C/D copy number estimates.
copy_number_pattern = '/hive/users/ifiddes/simons_normals/*.filtered.txt'
files = glob(copy_number_pattern)

from collections import Counter
def convert(x):
    """Split a 'name:value' token into a (name, int(value)) pair.

    Raises ValueError when the token does not contain exactly one ':' or when
    the value part is not an integer literal.
    """
    label, raw_value = x.split(':')
    return label, int(raw_value)

# Map each sample name (file name up to its first '.') to the integer values
# of the 3rd and 4th whitespace-separated 'name:value' tokens on the first
# line of its file. Per the cell header these are the C and D estimates.
copy_number = {}
for path in files:
    sample = os.path.basename(path).split('.')[0]
    # Use a context manager so the handle is closed; readline() returns ''
    # for an empty file, which yields an empty estimate list instead of raising.
    with open(path) as handle:
        fields = handle.readline().rstrip().split()
    copy_number[sample] = [convert(field)[1] for field in fields[2:4]]

In [13]:
# Keep only genomes whose estimates are exactly C = 2 and D = 2, and restrict
# each of them to informative positions (those present in sun_df).
filtered_dfs = {}
for n, df in dfs.items():
    # Require both estimates to equal 2. Testing only that they sum to 4 would
    # also admit 1/3 or 0/4 genomes, for which the fixed "/ 10" expected ratio
    # used downstream does not hold.
    if copy_number[n] != [2, 2]:
        continue
    # The default inner merge on 'loc' already drops positions absent from
    # sun_df, so no additional isin() filter is needed.
    filtered_dfs[n] = df.merge(sun_df, on='loc')

In [42]:
# For every retained genome, collect [expected ratio, observed ratio] per
# position, skipping positions whose NAB feature count is positive.
# No filtering on the ratio itself happens in this cell.
# The expected ratio is the summed feature counts divided by 10; presumably 10
# is the total copy count when C = D = 2 -- TODO confirm.
seen_alts = defaultdict(list)
for sample_df in filtered_dfs.values():
    for _, row in sample_df.iterrows():
        if not row.NAB > 0:
            expected = 1.0 * (row.NAB + row.NC + row.ND + row.NN) / 10
            seen_alts[row['loc']].append([expected, row.ratio])

In [59]:
# Per-position mean and unbiased sample variance of the observed ratios.
# Positions whose mean lies more than 0.1 away from the expected ratio, or
# that have fewer than two observations, are recorded in bad_positions.
means_variances = {}
N = len(filtered_dfs)  # number of genomes (not used as the variance denominator)
bad_positions = set()
for loc, observations in seen_alts.items():
    e, ratios = zip(*observations)
    e = e[0]  # always the same number
    # Number of genomes that actually contributed an observation here; this
    # can be smaller than N when a position is missing from some pileups.
    n_obs = len(ratios)
    mean = np.mean(ratios)
    if mean < e - 0.1 or mean > e + 0.1:
        bad_positions.add(loc)
    elif n_obs < 2:
        # A sample variance is undefined for a single observation.
        bad_positions.add(loc)
    else:
        variance = 1.0 / (n_obs - 1) * sum((x - mean) ** 2 for x in ratios)
        means_variances[loc] = [e, mean, variance]

In [60]:
# How many positions have been discarded so far because their mean ratio was
# too far from the expected one (not fixed):
print(len(bad_positions))

97


In [61]:
# Keep only positions whose variance admits a method-of-moments beta fit,
# i.e. 0 < variance < mean * (1 - mean). Everything else is discarded:
# too-high variance, and also zero or NaN variance, which would otherwise
# cause a division by zero / NaN when a and b are derived from it.
filtered_means_variances = {}
for loc, (e, mean, variance) in means_variances.items():
    if 0 < variance < mean * (1 - mean):
        filtered_means_variances[loc] = [e, mean, variance]
    else:
        bad_positions.add(loc)

In [62]:
# Cumulative number of discarded positions (bad_positions collects the
# rejects of both the mean-ratio filter and the variance filter)
print(len(bad_positions))

97


In [141]:
# Derive the a/b parameters per position from its mean and variance:
#   z = mean * (1 - mean) / variance - 1,  a = mean * z,  b = (1 - mean) * z
# and collect everything in a table ordered by position.
rows = []
for loc, (expected, mu, var) in filtered_means_variances.items():
    scale = (mu * (1 - mu)) / var - 1
    rows.append([loc, mu * scale, (1 - mu) * scale, mu, var, expected])
final_values = pd.DataFrame(
    rows, columns=['loc', 'a', 'b', 'mean', 'variance', 'expected']
).sort_values('loc')

In [87]:
def construct_C(max_n=12, min_n=8, k=4, max_copies=5):
    """Enumerate candidate count vectors.

    Builds every length-``k`` tuple whose entries are integers in
    ``0..max_copies`` and whose entries sum to a value in ``[min_n, max_n]``
    (both bounds inclusive), in ``itertools.product`` order.

    Parameters
    ----------
    max_n, min_n : int
        Inclusive upper / lower bound on the sum of each tuple.
    k : int
        Number of entries per tuple.
    max_copies : int
        Largest value a single entry may take (default 5).

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_candidates, k)``; shape ``(0, k)`` when no
        tuple satisfies the bounds.
    """
    per_entry = range(max_copies + 1)
    rows = [combo for combo in itertools.product(per_entry, repeat=k)
            if min_n <= sum(combo) <= max_n]
    # Explicit dtype/reshape keeps the result 2-D even when rows is empty.
    return np.array(rows, dtype=int).reshape(len(rows), k)

C = construct_C()

In [193]:
# Feature rows for the positions that received a/b estimates, indexed by position.
has_estimate = sun_df['loc'].isin(final_values['loc'])
filtered_features = sun_df.loc[has_estimate].set_index('loc')

In [194]:
# Turn the four count columns into 0/1 presence indicators (1 where count > 0).
indicator_cols = ['NAB', 'NC', 'ND', 'NN']
adjusted_features = filtered_features.copy()
adjusted_features[indicator_cols] = (adjusted_features[indicator_cols] > 0).astype(int)

In [223]:
# test: score every candidate row of C against one genome's pileup
data = list(dfs.values())[0]
data = data[data['loc'].isin(adjusted_features.index)]
# Select the feature rows by label in exactly the row order of `data`, so that
# row i of S and entry i of the count vectors refer to the same position
# regardless of how either table happens to be sorted.
feature_subset = adjusted_features.loc[data['loc'].values]

Ct = C.T
# num[i, j]: dot product of position i's indicators with candidate j;
# denom[j]: sum of the entries of candidate j.
num = np.dot(feature_subset, Ct)
denom = np.sum(Ct, axis=0)
# Smoothed ratio; the +1 / +2 keep S strictly inside (0, 1) so both logs are finite.
S = (1.0 + num) / (2.0 + denom)
S_log = np.log(S)
S_inv = np.log(1 - S)
M = data.ref_count
N = data.alt_count
# NOTE(review): ref_count is weighted by log(S) and alt_count by log(1 - S);
# confirm this pairing matches how S and the pileup columns are defined.
# NOTE(review): the +10 added to both counts looks like a pseudocount -- confirm.
R = (np.dot(M + 10, S_log) + np.dot(N + 10, S_inv))

In [224]:
# Look up a/b for exactly the positions present in `data`, in the row order of
# `data`, so they line up element-wise with the count vectors N and M.
# A left merge keeps the order of the left-hand keys.
ab = data[['loc']].merge(final_values[['loc', 'a', 'b']], on='loc', how='left')
assert not ab[['a', 'b']].isnull().values.any(), 'position without a/b estimate'
a = ab['a'].values
b = ab['b'].values

# betaln evaluates log(beta(.)) directly; taking np.log(beta(.)) underflows to
# -inf once the counts are large enough for beta(.) to round to 0.
R_prime = betaln(N + a, M + b) - betaln(a, b)

In [229]:
# Pick the candidate row of C with the highest R. The stable ascending sort
# followed by [-1] returns the last index among tied maxima.
R_map = dict(enumerate(R))
ranked = sorted(enumerate(R), key=lambda pair: pair[1])
best_index, score = ranked[-1]
best_haps = C[best_index]

In [231]:
# Same selection using R_prime instead of R (overwrites best_index, score and
# best_haps from the R-based selection).
# NOTE(review): R_prime is computed from per-position counts and a/b values,
# so its entries appear to be indexed by position, whereas the rows of C are
# candidate vectors -- confirm that indexing C with this best_index is intended.
R_prime_map = dict(enumerate(R_prime))
ranked_prime = sorted(enumerate(R_prime), key=lambda pair: pair[1])
best_index, score = ranked_prime[-1]
best_haps = C[best_index]