In [3]:
# Debug aid: print this kernel's connection details so an external console can attach.
# NOTE(review): the captured output below contains the kernel's message-signing key and
# local runtime paths -- clear this cell's output before sharing the notebook.
%connect_info

{
  "stdin_port": 53831, 
  "ip": "127.0.0.1", 
  "control_port": 40112, 
  "hb_port": 52643, 
  "signature_scheme": "hmac-sha256", 
  "key": "d39f3762-ee5b-49c0-a895-40b239462d63", 
  "kernel_name": "", 
  "shell_port": 48522, 
  "transport": "tcp", 
  "iopub_port": 50962
}

Paste the above JSON into a file, and connect with:
    $> ipython <app> --existing <file>
or, if you are local, you can connect with just:
    $> ipython <app> --existing /cluster/home/cbosworth/.local/share/jupyter/runtime/kernel-596d0d5d-01c8-4577-beb2-0df65ac2e8c1.json 
or even just:
    $> ipython <app> --existing 
if this is the most recent IPython session you have started.


In [1]:
from __future__ import division
import numpy as np
import argparse
import vcf
import itertools
import sys
import multiprocessing
import pandas as pd
from collections import *
from tools.bio import *
from cat.plots import *
from phase_lib import *


In [2]:
# Hand-built stand-in for command-line arguments, so the pipeline can run interactively.
#
# Inputs that were tried previously and are kept here for reference:
#   features: "onlyStop.features.tsv"
#   features: "/hive/users/ifiddes/notch2nl_berkeley_data/imputation_pipeline/H9_NA12878.features.tsv"
#   bam:      "/hive/users/cbosworth/SimonsNormals/secondTry/LP6005442-DNA_B08.sorted.2.bam"
#   bam:      "H9/E2del68_E2del19N_combined.100kb.sorted.bam"
#   pileup:   None
#
# bam=None makes the loading cell read the existing pileup table instead of building one.
# inferred_copy_numbers is consumed in the order AB, C, D, N.
args = argparse.Namespace(
    features="/hive/users/ifiddes/notch2nl_berkeley_data/imputation_pipeline/very_reduced.features.tsv",
    ratio_plot="tmp.pdf",
    inferred_copy_numbers=[4, 2, 2, 2],
    bam=None,
    paratype_pseudo=0.01,
    read_pseudo=0,
    consensus_fasta='/hive/users/cbosworth/refs/notch/notch2_aligned_consensus.fasta',
    pileup_converter='/cluster/home/ifiddes/pileup2base/pileup2base.pl',
    save_pileup=None,
    pileup='/hive/users/ifiddes/notch2nl_berkeley_data/imputation_pipeline/H9.pileup.txt'
)

In [22]:
def construct_C(inferred_copy_numbers, filtered_features):
    """Enumerate every haplotype-count vector consistent with the inferred copy numbers.

    Each column of ``filtered_features`` is assigned to a paralog group using the
    first character of the second ``_``-separated token of its name; ``A`` and ``B``
    are pooled into ``AB``. Within a group, every way of distributing that group's
    inferred copy number over its columns is enumerated, and the result is the
    cartesian product of those per-group enumerations.

    Parameters
    ----------
    inferred_copy_numbers : sequence
        Copy numbers for the groups ``AB``, ``C``, ``D``, ``N``, in that order.
        Each entry is passed through ``int()``.
    filtered_features : object with a ``columns`` attribute (e.g. a DataFrame)
        Only the column names are read.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per allowed configuration and one column per
        feature column, in the original column order.

    Raises
    ------
    KeyError
        If a column belongs to a group that has no inferred copy number, or if a
        group has a positive copy number but no columns to carry it.

    Notes
    -----
    A group with no columns and a copy number of 0 contributes nothing instead
    of raising. The function no longer prints the feature table while running.
    """
    group_order = ['AB', 'C', 'D', 'N']
    copies_by_group = {group: int(copies)
                       for group, copies in zip(group_order, inferred_copy_numbers)}

    # Record, per group, the positions of its columns in the original table.
    positions_by_group = {}
    for position, column in enumerate(filtered_features.columns):
        paralog = column.split('_')[1][0]
        group = 'AB' if paralog in ('A', 'B') else paralog
        positions_by_group.setdefault(group, []).append(position)

    unknown = sorted(set(positions_by_group) - set(copies_by_group))
    if unknown:
        raise KeyError('no inferred copy number for paralog group(s): {}'.format(', '.join(unknown)))

    def allocations(copies, n_columns):
        """All length-``n_columns`` tuples of non-negative ints that sum to ``copies``."""
        rows = [combo
                for combo in itertools.product(range(copies + 1), repeat=n_columns)
                if sum(combo) == copies]
        # Explicit reshape keeps the array 2-D even when it has no rows or no columns.
        return np.array(rows, dtype=int).reshape(len(rows), n_columns)

    def array_product(a1, a2):
        """Row-wise cartesian product: every row of ``a1`` concatenated with every row of ``a2``."""
        m1, n1 = a1.shape
        m2, n2 = a2.shape
        out = np.zeros((m1, m2, n1 + n2), dtype=int)
        out[:, :, :n1] = a1[:, None, :]
        out[:, :, n1:] = a2
        return out.reshape(m1 * m2, n1 + n2)

    # Start from a single empty row so each group can be folded in uniformly.
    combined = np.zeros((1, 0), dtype=int)
    column_order = []
    for group in group_order:
        positions = positions_by_group.get(group, [])
        copies = copies_by_group.get(group, 0)
        if not positions:
            if copies != 0:
                raise KeyError('paralog group {} has inferred copy number {} but no feature columns'
                               .format(group, copies))
            continue
        combined = array_product(combined, allocations(copies, len(positions)))
        column_order.extend(positions)

    # The columns are currently grouped AB, C, D, N; restore the original column order.
    restore = np.argsort(column_order)
    return combined[:, restore]

In [23]:
# Reference sequence: take the first record of the consensus FASTA. The record name is
# bound to a throwaway name rather than `_`, which IPython uses for the last output.
seq_name, seq = next(read_fasta(args.consensus_fasta, None))

if args.bam is not None:
    # Build the per-position count table from the BAM.
    pileup_recs = make_pileup(args.bam)
    df = convert_pileup(pileup_recs, args.pileup_converter)
    data = parse_converted_pileup(df, seq)
    if args.save_pileup is not None:
        # Write to the path that was just checked; the original wrote to args.pileup,
        # which is the *input* path and may be None when a BAM is supplied.
        data.to_csv(args.save_pileup, sep='\t')
else:
    # Reuse a previously saved table.
    data = pd.read_csv(args.pileup, sep='\t', index_col=0)

features = pd.read_csv(args.features, sep='\t', index_col=0)
# Keep only positions present in both tables, in case the pileup is missing some.
positions = set(features.index) & set(data['loc'])
filtered_data = data[data['loc'].isin(positions)]
filtered_features = features[features.index.isin(positions)]
# NOTE(review): later cells combine filtered_features and filtered_data row by row
# (positionally). Presumably both tables list each position once and in the same
# order -- confirm, since nothing here enforces it.

In [25]:
# C can be rebuilt from scratch with:
#C = construct_C(args.inferred_copy_numbers, filtered_features)
# Here a previously saved copy is loaded instead.
import cPickle as pickle
# NOTE(review): unpickling can execute arbitrary code -- only load a file you produced yourself.
# Open in binary mode and close the handle deterministically.
with open('very_reduced.features.precomputed_C_4_2_2_2.pickle', 'rb') as pickle_handle:
    C = pickle.load(pickle_handle)

In [26]:
# Per-site pseudocount: small where a site's features fall in exactly one paralog
# group, large where they are spread over several groups (or over none).

# Map each paralog group to its inferred copy number (order: AB, C, D, N).
inferred_copy_numbers = {x: int(y) for x, y in zip(['AB', 'C', 'D', 'N'], args.inferred_copy_numbers)}
total_copies = sum(inferred_copy_numbers.values())

# Paralog letter of every feature column, with A and B pooled into AB.
features_by_paralog = [x.split('_')[1][0] for x in filtered_features.columns]
features_by_paralog = [x if x not in ['A', 'B'] else 'AB' for x in features_by_paralog]

# Column positions belonging to each group.
feature_groups = defaultdict(list)
for i, f in enumerate(features_by_paralog):
    feature_groups[f].append(i)

# feature_groups holds integer column *positions*, so select with .iloc; plain
# filtered_features[[...]] would look the integers up as column labels.
ab = filtered_features.iloc[:, feature_groups['AB']].sum(axis=1)
c = filtered_features.iloc[:, feature_groups['C']].sum(axis=1)
d = filtered_features.iloc[:, feature_groups['D']].sum(axis=1)
n = filtered_features.iloc[:, feature_groups['N']].sum(axis=1)

# A site is unambiguous when exactly one group has a positive sum and the rest are zero.
notAmbig = (((ab > 0) & (c == 0) & (d == 0) & (n == 0)) |
            ((ab == 0) & (c > 0) & (d == 0) & (n == 0)) |
            ((ab == 0) & (c == 0) & (d > 0) & (n == 0)) |
            ((ab == 0) & (c == 0) & (d == 0) & (n > 0)))
ambig = ~notAmbig

# 1 - 0.99 * True -> 0.01 for unambiguous sites; 1 - 0.99 * False -> 1.0 for the rest.
# This replaces the scalar pseudocount from the configuration cell with a per-site array.
args.paratype_pseudo=np.asarray(1-0.99*notAmbig)
args.paratype_pseudo[:10]

array([ 0.01,  0.01,  0.01,  1.  ,  0.01,  1.  ,  0.01,  0.01,  0.01,  0.01])

In [27]:
# Tally how many sites received each pseudocount value.
from collections import Counter
Counter(args.paratype_pseudo)

Counter({0.010000000000000009: 1016, 1.0: 235})

In [28]:
# Score every candidate configuration (row of C) against the observed read counts.
Ct = C.T

# num[site, config]: dot product of each site's feature row with each configuration.
num = np.dot(filtered_features, Ct)
# Column sum of the first configuration only.
# NOTE(review): presumably every configuration sums to the same total copy number -- confirm.
denom = np.sum(Ct, axis=0)[0]

# Expand the scalar to one entry per site. This needs args.paratype_pseudo to be the
# per-site array set earlier in the notebook: list + ndarray falls through to numpy's
# elementwise add, so the result is (column sum + pseudocount) for each site.
denom=[denom]*len(args.paratype_pseudo)+args.paratype_pseudo
# NOTE(review): denom already contains one pseudocount at this point, so the denominator
# below works out to (column sum + 3 * pseudocount), not (column sum + 2 * pseudocount).
# Confirm that this is intended.

# S[config, site]: smoothed ratio of num to the padded denominator.
S = (args.paratype_pseudo + num.T) / ( (2.0*args.paratype_pseudo) + denom)

# Elementwise logs of S and of its complement.
S_log = np.log(S)
S_inv = np.log(1 - S)

# M is the number of alt reads, N is the number of ref reads
M = 1.0 * filtered_data.alt_count
N = 1.0 * filtered_data.ref_count

# R[config]: sum over sites of (M + read_pseudo) * log(S) + (N + read_pseudo) * log(1 - S).
# The trailing .T is a no-op if R is one-dimensional.
R = (np.dot(M + args.read_pseudo, S_log.T) + np.dot(N + args.read_pseudo, S_inv.T)).T

In [29]:
# Index every configuration's score, then take the highest-scoring one
# (the last entry after an ascending sort by score).
R_map = dict(enumerate(R))
best_index, score = sorted(R_map.items(), key=lambda item: item[1])[-1]
best_haps = C[best_index]
best_s = S[best_index]

# Observed counts per site.
n = filtered_data['coverage']
actual_alt = filtered_data['alt_count']
actual_ref = filtered_data['ref_count']

# Counts expected under the best configuration.
expected_alt = np.multiply(best_s, n)
expected_ref = n - expected_alt

# Residual of the alt count, scaled by sqrt(n * s * (1 - s)), and its mean square over sites.
deviance = (expected_alt - actual_alt) / np.sqrt(n * best_s * (1 - best_s))
variance = sum(np.multiply(deviance, deviance)) / len(n)

In [30]:
# (index label, deviance) pairs for every site, ordered from the smallest deviance up.
q = sorted(zip(deviance.index, list(deviance)), key=lambda pair: pair[-1])

In [31]:
# Summary report. Every print takes exactly one argument, so the output is the same
# whether print is a statement or a function.
print('log odds: {}'.format(score))
print('variance: {}'.format(variance))

# q is sorted ascending by deviance, so these are the five most negative values.
print('5 lowest variance sites: ')
for pos, var in q[:5]:
    print('{}: {}'.format(pos, var))
print('')

# Feature columns that receive at least one copy in the best configuration.
print('results: ')
for x, y in zip(filtered_features.columns, best_haps):
    if y > 0:
        print('{}: {}'.format(x, y))

# Ten best-scoring configurations, best first: [configuration, index into C, score].
print('top 10 hits:')
ordered = sorted(R_map.items(), key=lambda item: item[1])[-10:][::-1]
print(' '.join(features.columns))
for rank, (pos, val) in enumerate(ordered, 1):
    print('{}: {}'.format(rank, [C[pos], pos, val]))

log odds: -1843316.49056
variance: 3782.94476791
5 lowest variance sites: 
46024: -1549.10993118
50119: -905.841481676
27923: -653.79968399
86723: -574.888438265
21662: -385.349471998

results: 
CHM1_N_c3: 1
NA12878_D2_c6: 2
NA24385_N1_c3: 1
NA24385_C1_c2: 1
H9_A1_c8: 1
H9_B1_c1: 1
H9_B2_c3: 1
H9_B3_c7: 1
H9_C1_c4: 1
top 10 hits:
CHM1_A_c5 CHM1_B_c4 CHM1_N_c3 NA12878_A1_c7 NA12878_B1_c3 NA12878_B2_c5 NA12878_D2_c6 NA24385_B1_c4 NA24385_B2_c5 NA24385_A1_c0 NA24385_A2_c6 NA24385_N1_c3 NA24385_C1_c2 H9_A1_c8 H9_B1_c1 H9_B2_c3 H9_B3_c7 H9_C1_c4
1: [array([0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]), 184, -1843316.4905618769]
2: [array([0, 1, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1]), 9139, -1844190.1227900477]
3: [array([0, 1, 1, 0, 0, 0, 2, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1]), 9364, -1845158.2444901722]
4: [array([0, 1, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]), 9130, -1845742.1339456602]
5: [array([0, 0, 1, 0, 0, 0, 2, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1]), 679, -18466

In [13]:
# Debug aid: print this kernel's connection details so an external console can attach.
# NOTE(review): the captured output below contains the kernel's message-signing key and
# local runtime paths -- clear this cell's output before sharing the notebook.
%connect_info

{
  "stdin_port": 58075, 
  "ip": "127.0.0.1", 
  "control_port": 39227, 
  "hb_port": 60785, 
  "signature_scheme": "hmac-sha256", 
  "key": "303b584f-317c-4c17-9a35-e4b0cc3cbe57", 
  "kernel_name": "", 
  "shell_port": 38236, 
  "transport": "tcp", 
  "iopub_port": 58976
}

Paste the above JSON into a file, and connect with:
    $> ipython <app> --existing <file>
or, if you are local, you can connect with just:
    $> ipython <app> --existing /cluster/home/ifiddes/.local/share/jupyter/runtime/kernel-0e062627-c986-404b-bc23-c03bd04ca292.json 
or even just:
    $> ipython <app> --existing 
if this is the most recent IPython session you have started.


In [47]:
# Shape check: num has one row per site and one column per candidate configuration.
num.shape

(1251, 16380)

In [48]:
# Shape check: after the per-site pseudocount is added, denom has one entry per site.
denom.shape

(1251,)

In [50]:
# Shape check: Ct is C transposed, so one row per feature column and one column per configuration.
Ct.shape

(18, 16380)