Kd Analysis
===

In [None]:
# Run configuration: fill these in before running the rest of the notebook.
# Used below as the subdirectory name under 'results' and 'figs'.
date = ''
# Used below to locate /shared/<project_name>/read_names.
project_name = ''
# Key into the targets YAML file for the target of interest.
target_name = ''
# Key into the targets YAML file for the negative control target.
neg_control_target_name = ''
# Channel names; in the visible part of this notebook they are only printed.
all_channels = ['']
# Channel whose scores are collated, filtered and written out below.
data_channel = ''
# YAML file that is indexed by target name below.
target_sequence_file = "/shared/targets.yml"
nonneg_lda_weights_fpath = '/shared/yeast_beast_LDA_weights.txt'  # for microscope 3
# nonneg_lda_weights_fpath = '/shared/bLDA_coef_nonneg.txt'  # for microscope 2 and 4

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import glob
import numpy as np
import matplotlib.pyplot as plt
import random
import itertools
from collections import defaultdict, Counter
from IPython.display import HTML, Image
from champ import misc, intensity, initialize, seqtools, adapters_cython
import yaml

# Directory holding the read-name files for this sequencing project.
read_name_dir = os.path.join('/shared', project_name, 'read_names')
# Input file passed to build_interesting_sequences further down.
read_names_by_seq_fpath = os.path.join(read_name_dir, 'read_names_by_seq.txt')
# Base name of the output file; joined with the custom results directory later.
out_fname = 'LDA_intensity_scores.txt'

Run Specific Section
===

### Parameters

In [None]:
with open(target_sequence_file) as f:
    targets = yaml.load(f)

target = targets[target_name]
neg_control_target = targets[neg_control_target_name]
datadir = os.path.join('results', date)
figdir = os.path.join('figs', date)
custom_fig_dir = os.path.join(figdir, 'custom')
custom_results_dir = os.path.join(datadir, 'custom')
for dpath in [custom_fig_dir, custom_results_dir]:
    if not os.path.isdir(dpath):
        os.makedirs(dpath)
out_fpath = os.path.join(custom_results_dir, out_fname)

print 'Image Collection Date:', date
print 'Sequencing Project Name:', project_name
print 'Target "{}":'.format(target_name), target
print 'Neg control target "{}":'.format(neg_control_target_name), neg_control_target
print 'Channels:', all_channels
print 'Protein channel:', data_channel
print 'Output file:', out_fpath

### Sequence of Interest Function

In [None]:
# Build each family of target variants with the seqtools helpers, then pool
# them all into interesting_seqs.

# Complement stretches for every argument value from 1 to len(target).
stretch = set(itertools.chain.from_iterable(
    seqtools.get_stretch_of_complement_seqs(target, n)
    for n in range(1, len(target) + 1)
))

# Contiguous insertions first, then general insertions, each for 1 and 2.
insertions = set()
for make_insertion_seqs in (seqtools.get_contiguous_insertion_seqs,
                            seqtools.get_insertion_seqs):
    for n in (1, 2):
        insertions.update(make_insertion_seqs(target, n))

deletions = set(itertools.chain.from_iterable(
    seqtools.get_deletion_seqs(target, n) for n in (1, 2)
))

mismatches = set(itertools.chain.from_iterable(
    seqtools.get_mismatch_seqs(target, n) for n in (1, 2)
))

six_n_pam = seqtools.get_randomized_pam_seqs(target, 4, 6)

# Every sequence listed in the targets file, including the target itself.
other_targets = set(targets.values())

interesting_seqs = set()
for seq_group in (other_targets, stretch, insertions, deletions, mismatches, six_n_pam):
    interesting_seqs.update(seq_group)

print("Interesting sequences: %d" % len(interesting_seqs))

# Create Interesting Sequences Files

The `read_names_by_seq.txt` file often contains sequences with extra bases on either end of the sequence we actually care about; in other words, the reads are not being parsed properly. This did not happen previously, and the cause is still unknown. As a workaround, we go through every sequence in that file and check whether it contains a sequence of interest as a substring. This way, we generate a custom file, `interesting_reads_by_seq.txt`, that contains an exact mapping between interesting sequences and their read names.

In [None]:
from champ.seqtools import build_interesting_sequences

# Map each interesting sequence to its read names, then write one
# tab-separated line per sequence: the sequence followed by its read names.
interesting_read_names = build_interesting_sequences(read_names_by_seq_fpath, interesting_seqs)
with open('interesting_reads_by_seq.txt', 'w') as f:
    f.writelines(
        "%s\t%s\n" % (sequence, "\t".join(read_names))
        for sequence, read_names in interesting_read_names.items()
    )

In [None]:
# Decide how many insertions or deletions to allow: sequence lengths within
# three bases of the target length, and a maximum Hamming distance of 7.
target_len = len(target)
min_len, max_len = target_len - 3, target_len + 3
max_ham = 7

def is_interesting_seq(seq):
    """Return True if `seq` is in the global `interesting_seqs`, else False.

    The global is looked up at call time, so the result follows whatever
    `interesting_seqs` is currently bound to.
    """
    # Always return a real bool; falling off the end used to return None.
    return seq in interesting_seqs

Load Data
===

In [None]:
def _load_read_names(fpath):
    """Return the set of whitespace-stripped lines in the file at `fpath`.

    The file is closed as soon as it has been read, instead of being left
    open until garbage collection.
    """
    with open(fpath) as f:
        return set(line.strip() for line in f)


# Paths of the read-name files inside read_name_dir.
all_read_name_fpath = os.path.join(read_name_dir, 'all_read_names.txt')
target_read_name_fpath = os.path.join(read_name_dir, 'target_{}_read_names.txt'.format(target_name.lower()))
perfect_target_read_name_fpath = os.path.join(read_name_dir, 'perfect_target_{}_read_names.txt'.format(target_name.lower()))
# The negative control uses the 'perfect_target_' file of the control target.
neg_control_target_read_name_fpath = os.path.join(read_name_dir, 'perfect_target_{}_read_names.txt'.format(neg_control_target_name.lower()))
phiX_read_name_fpath = os.path.join(read_name_dir, 'phix_read_names.txt')

# Load each file into a set of read names and report its size.
all_read_names = _load_read_names(all_read_name_fpath)
print("All read names: %d" % len(all_read_names))
target_read_names = _load_read_names(target_read_name_fpath)
print("Target read names: %d" % len(target_read_names))
perfect_target_read_names = _load_read_names(perfect_target_read_name_fpath)
print("Perfect target read names: %d" % len(perfect_target_read_names))
neg_control_target_read_names = _load_read_names(neg_control_target_read_name_fpath)
print("Negative control read names: %d" % len(neg_control_target_read_names))
phiX_read_names = _load_read_names(phiX_read_name_fpath)
print("Phix read names: %d" % len(phiX_read_names))

In [None]:
h5_fpaths = glob.glob('*.h5')
i = 0
while i < len(h5_fpaths):
    if 'PhiX' in h5_fpaths[i] or 'chip' in h5_fpaths[i]:
        h5_fpaths.pop(i)
    else:
        i += 1
h5_fpaths.sort(key=misc.parse_concentration)
for fpath in h5_fpaths:
    print misc.parse_concentration(fpath), fpath

In [None]:
results_dir_name = date

# One results directory per h5 file: 'results/<h5 file name without extension>'.
results_dirs = []
for h5_fpath in h5_fpaths:
    h5_stem = os.path.splitext(os.path.basename(h5_fpath))[0]
    results_dirs.append(os.path.join('results', h5_stem))

for results_dir in results_dirs:
    print(results_dir)

In [None]:
# Build the IntensityScores object from the sorted h5 files and load the LDA
# scores from the per-file results directories using the chosen weights file.
print 'Loading data...'
int_scores = intensity.IntensityScores(h5_fpaths)
int_scores.get_LDA_scores(results_dirs, nonneg_lda_weights_fpath)

In [None]:
# NOTE(review): `time` is not used anywhere in the visible part of this notebook.
import time
# Normalize the loaded scores; the method takes no arguments here.
print 'Normalizing data...'
int_scores.normalize_scores()
print 'Done normalizing.'

In [None]:
# Diagnostic plot of the aligned images.
# NOTE(review): 'br' and 'o*' look like matplotlib color and marker codes - confirm.
int_scores.plot_aligned_images('br', 'o*')

In [None]:
# Diagnostic plot of the normalization constants set by normalize_scores().
int_scores.plot_normalization_constants()

In [None]:
# Print a per-channel read summary as a sanity check.
int_scores.print_reads_per_channel()

In [None]:
# The cutoff is three fewer than the number of h5 files.
# NOTE(review): presumably the minimum number of h5 files a read must appear
# in to count as "good" - confirm against build_good_read_names.
good_num_ims_cutoff = len(h5_fpaths) - 3
int_scores.build_good_read_names(good_num_ims_cutoff)

In [None]:
# Keep a local alias to the read names populated by build_good_read_names().
good_read_names = int_scores.good_read_names

In [None]:
# Perfect-target reads that are also good reads (set intersection).
good_perfect_read_names = perfect_target_read_names & good_read_names
print 'Good Reads:', len(good_read_names)
print 'Good Perfect Reads:', len(good_perfect_read_names)

In [None]:
# Populate int_scores.score_given_read_name_in_channel, which the filtering
# cell below indexes by h5 file and then by channel.
int_scores.build_score_given_read_name_given_channel()

Collating by Sequence
===

In [None]:
# Find only read names with cascade scores
print("Starting")
aligned_read_names = []
for h5_fpath in h5_fpaths:
    sys.stdout.write('.')
    sys.stdout.flush()
    for d in int_scores.scores[h5_fpath][data_channel].values():
        for read_name in d.keys():
            aligned_read_names.append(read_name)
aligned_read_names = set(aligned_read_names)
print '\nAligned reads in protein channel:', len(aligned_read_names)

In [None]:
# Fall back to the default maximum Hamming distance when the parameter cell
# that defines max_ham has not been run. Only NameError is caught, so other
# exceptions (including KeyboardInterrupt) are no longer swallowed.
try:
    max_ham
except NameError:
    max_ham = 7

In [None]:
# Collate aligned reads by sequence, reading the custom file written earlier
# and passing is_interesting_seq and max_ham on to the helper.
# NOTE(review): the result is used below as a mapping from sequence to a set
# of read names - confirm against seqtools.build_read_names_given_seq.
print 'Collating Reads by Sequence'
interesting_reads = seqtools.build_read_names_given_seq(target,
                                                        'interesting_reads_by_seq.txt',
                                                        aligned_read_names,
                                                        is_interesting_seq,
                                                        max_ham)

# Number of distinct sequences collected.
print(len(interesting_reads))

In [None]:
# Add the negative control read names to the entry for the control sequence.
# NOTE(review): this raises KeyError if interesting_reads is a plain dict with
# no entry for neg_control_target; presumably it is a defaultdict(set) - confirm.
interesting_reads[neg_control_target].update(neg_control_target_read_names)

# Filter Reads

We remove reads whose intensities are outliers among the reads that share the same sequence, and then discard sequences that are left with fewer than `min_reads` (5) reads.

In [None]:
# Minimum number of reads a sequence needs, before and after filtering.
min_reads = 5
tukey_contant = 1.5  # Read acceptable if in range [q1 - tukey_contant * iqr, q3 + tukey_contant * iqr]

In [None]:
print 'Filtering reads by intensity and seqs by final read count'
interesting_seqs = set()
for i, (seq, read_names) in enumerate(interesting_reads.items()):
    if i % 10000 == 0:
        sys.stdout.write('.')
        sys.stdout.flush()
    if len(read_names) < min_reads:
        continue
    for h5_fpath in h5_fpaths:
        if data_channel not in int_scores.score_given_read_name_in_channel[h5_fpath]:
            continue
        score_given_read_name = int_scores.score_given_read_name_in_channel[h5_fpath][data_channel]
        intensities = [
            score_given_read_name[read_name]
            for read_name in read_names
            if read_name in score_given_read_name
        ]
        if len(intensities) < min_reads:
            continue
        q1 = np.percentile(intensities, 25)
        q3 = np.percentile(intensities, 75)
        iqr = q3 - q1
        min_range, max_range = (q1 - tukey_contant * iqr, q3 + tukey_contant * iqr)
        new_read_names = set()
        for read_name in read_names:
            try:
                if min_range <= score_given_read_name[read_name] <= max_range:
                    new_read_names.add(read_name)
            except KeyError:
                pass
            
    interesting_reads[seq] = new_read_names
    if len(new_read_names) >= min_reads:
        interesting_seqs.add(seq)

In [None]:
# Plot the library composition by Hamming distance from the target, using the
# filtered reads and the filtered set of sequences.
fig, ax = plt.subplots(figsize=(7, 6))
seqtools.plot_library_comp_by_hamming_distance(ax,
                                               target,
                                               max_ham,
                                               min_reads,
                                               interesting_reads,
                                               interesting_seqs)
ax.set_title('Target {} Library'.format(target_name), fontsize=20)

In [None]:
# Despite the 'Seqs' label, this is the number of read names stored for the
# negative control sequence.
print 'Negative Control Seqs:', len(interesting_reads[neg_control_target])

# Write Output

In [None]:
concentrations = map(misc.parse_concentration, h5_fpaths)
print 'Concentrations:'
concentrations

In [None]:
# The per-file trait written with the scores: one concentration per h5 file.
trait_name = 'concentration_pM'
trait_list = concentrations
# Run metadata passed along to the writer.
attrs_dict = {
    'target': target, 
    'target_name': target_name,
    'neg_control_target': neg_control_target,
    'neg_control_target_name': neg_control_target_name,
}

# Write the data_channel values for the filtered sequences and reads to out_fpath.
int_scores.write_values_by_seq(
    course_trait_name=trait_name,
    course_trait_list=trait_list,
    h5_fpaths=h5_fpaths,
    attrs_dict=attrs_dict,
    channel_of_interest=data_channel,
    seqs_of_interest=interesting_seqs,
    read_names_given_seq=interesting_reads,
    out_fpath=out_fpath,       
)