Kd Analysis
===

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import glob
import numpy as np
import matplotlib.pyplot as plt
import random
import itertools
from collections import defaultdict, Counter
from IPython.display import HTML, Image
from adapters_cython import simple_hamming_distance
from champ import misc, intensity, seqtools

Run-Specific Section
===

### Parameters

In [None]:
# --- User-supplied settings: fill in the empty strings before running. ---
# `date` names the per-run subdirectory under 'results' and 'figs' (see datadir / figdir below).
date = ''
# `project_name` selects the read-name directory under /shared.
project_name = ''
# Keys used to look up the target and negative-control entries in `targets` below.
target_name = ''
neg_control_target_name = ''
all_channels = ['']
# Channel handed to KdFitGenome in the final section of the notebook.
data_channel = ''
# NOTE(review): target_sequence_file is assigned here but never read in the visible notebook.
target_sequence_file = ''
read_name_dir = os.path.join('/shared', project_name, 'read_names')
nonneg_lda_weights_fpath = '/shared/bLDA_coef_nonneg.txt'
read_names_by_seq_fpath = os.path.join(read_name_dir, 'read_names_by_seq.txt')


# --- Derived names and output locations. ---
out_fname = 'target_{}_genome_Kds.txt'.format(target_name)

# NOTE(review): `targets` is not assigned anywhere in the visible notebook, so these
# lookups raise NameError on a fresh kernel. Presumably it should be loaded from
# target_sequence_file -- confirm and add the loading step.
target = targets[target_name]
neg_control_target = targets[neg_control_target_name]
datadir = os.path.join('results', date)
figdir = os.path.join('figs', date)
custom_fig_dir = os.path.join(figdir, 'custom')
custom_results_dir = os.path.join(datadir, 'custom')

# Create the custom figure/result directories if they do not exist yet.
for dpath in [custom_fig_dir, custom_results_dir]:
    if not os.path.isdir(dpath):
        os.makedirs(dpath)
out_fpath = os.path.join(custom_results_dir, out_fname)

# Echo the run configuration so it is recorded in the notebook output.
print 'Image Collection Date:', date
print 'Sequencing Project Name:', project_name
print 'Target "{}":'.format(target_name), target
print 'Neg control target "{}":'.format(neg_control_target_name), neg_control_target
print 'Channels:', all_channels
print 'Protein channel:', data_channel
print 'Output file:', out_fpath

Load Data
===

In [None]:
def load_read_names(fpath):
    """Return the set of whitespace-stripped lines found in the text file `fpath`.

    The file is opened in a `with` block so the handle is closed as soon as
    the set has been built, instead of being left for garbage collection.
    """
    with open(fpath) as f:
        return set(line.strip() for line in f)


# Read-name list files; all of them live in read_name_dir.
all_read_name_fpath = os.path.join(read_name_dir, 'all_read_names.txt')
target_read_name_fpath = os.path.join(read_name_dir, 'target_{}_read_names.txt'.format(target_name.lower()))
perfect_target_read_name_fpath = os.path.join(read_name_dir, 'perfect_target_{}_read_names.txt'.format(target_name.lower()))
# The negative control uses the 'perfect_target_' file of the negative-control target.
neg_control_target_read_name_fpath = os.path.join(read_name_dir, 'perfect_target_{}_read_names.txt'.format(neg_control_target_name.lower()))
phiX_read_name_fpath = os.path.join(read_name_dir, 'phix_read_names.txt')

# One set of read names per file.
all_read_names = load_read_names(all_read_name_fpath)
target_read_names = load_read_names(target_read_name_fpath)
perfect_target_read_names = load_read_names(perfect_target_read_name_fpath)
neg_control_target_read_names = load_read_names(neg_control_target_read_name_fpath)
phiX_read_names = load_read_names(phiX_read_name_fpath)

In [None]:
h5_fpaths = glob.glob(os.path.join(datadir, '*.h5'))
i = 0
while i < len(h5_fpaths):
    if 'PhiX' in h5_fpaths[i] or 'chip' in h5_fpaths[i]:
        h5_fpaths.pop(i)
    else:
        i += 1
h5_fpaths.sort(key=misc.parse_concentration)
h5_fpaths = h5_fpaths[:-1]
for fpath in h5_fpaths:
    print misc.parse_concentration(fpath), fpath

In [None]:
# One results directory per HDF5 file, named after the file's basename without
# its extension.
# NOTE(review): datadir is already os.path.join('results', date) and
# results_dir_name is date, so each path is results/<date>/<date>/<name> --
# confirm the repeated date component is intended.
results_dir_name = date
results_dirs = []
for h5_fpath in h5_fpaths:
    h5_stem = os.path.splitext(os.path.basename(h5_fpath))[0]
    results_dirs.append(os.path.join(datadir, results_dir_name, h5_stem))

In [None]:
print 'Loading data...'
print
int_scores = hdf5_intensity_scores.IntensityScores(h5_fpaths)
int_scores.get_LDA_scores(results_dirs, nonneg_lda_weights_fpath)

In [None]:
print 'Normalizing data...'
# Normalization is given the perfect-target read names (presumably as the
# reference population -- confirm against IntensityScores.normalize_scores).
int_scores.normalize_scores(perfect_target_read_names)

In [None]:
# Visual check of the aligned images.
# NOTE(review): the arguments look like matplotlib colour ('b', 'r') and marker
# ('o', '*') codes -- confirm against plot_aligned_images.
int_scores.plot_aligned_images('br', 'o*')

In [None]:
# Plot the normalization constants (presumably those produced by
# normalize_scores above -- confirm).
int_scores.plot_normalization_constants()

In [None]:
# Print a per-channel read summary as a sanity check on the loaded data.
int_scores.print_reads_per_channel()

In [None]:
# Cutoff is three less than the number of HDF5 files loaded.
# NOTE(review): presumably a read must be present in at least this many images
# to count as "good" -- confirm against build_good_read_names. The 3 is a magic
# number, and the cutoff is <= 0 when three or fewer files were loaded.
good_num_ims_cutoff = len(h5_fpaths) - 3
int_scores.build_good_read_names(good_num_ims_cutoff)

In [None]:
# Short alias for the good-read collection held on int_scores (presumably
# populated by build_good_read_names -- confirm).
good_read_names = int_scores.good_read_names

In [None]:
# Perfect-target reads that are also in the good-read collection.
good_perfect_read_names = perfect_target_read_names & good_read_names
# Report both counts so an unexpectedly small overlap is visible.
print 'Good Reads:', len(good_read_names)
print 'Good Perfect Reads:', len(good_perfect_read_names)

In [None]:
# Build the score lookup on int_scores (by its name, keyed by channel and read
# name -- confirm against IntensityScores).
int_scores.build_score_given_read_name_given_channel()

Fit Genomic Kds
===

In [None]:
import KdFitGenome

In [None]:
# Kd/ABA table in the custom results directory; passed to KdFitGenome below.
Kd_fpath = os.path.join(custom_results_dir, 'LDA_Imin_const_Imax_adjusted_Kds_and_ABAs.txt')
# BAM file of GRCh38 mappings for this sequencing project; the fit is run over it.
bam_fpath = os.path.join(datadir, project_name, 'GRCh38_mappings', 'GRCh38.p2_genomic_mappings.bam')

In [None]:
# Offsets 5, 10, ..., 95 (the stop value 100 is excluded).
# NOTE(review): units and meaning are defined by KdFitGenome -- confirm.
directional_Kd_offsets = range(5, 100, 5)

In [None]:
# Build the genome Kd fitter from the loaded intensity scores, the HDF5 file
# list, the data channel, the Kd table path and the directional offsets.
kdgf = KdFitGenome.KdFitGenome(int_scores,
                               h5_fpaths,
                               data_channel,
                               Kd_fpath,
                               directional_Kd_offsets)

In [None]:
# Run the fit over the BAM file; out_fpath is the output path printed in the
# parameters cell.
kdgf.fit_Kds_in_bam_and_write_results(bam_fpath, out_fpath)