In [2]:
import PyPDF2
from PIL import Image

import os
import sys
import warnings
import matplotlib
import numpy as np
from os import path

from skimage import data
from skimage import transform
from skimage.draw import circle
from skimage.util import img_as_float
from skimage.color import rgb2gray
from skimage.exposure import rescale_intensity

from skimage.feature import match_template # (only works for single match)?
from scipy import ndimage as ndi
import matplotlib.pyplot as plt
from skimage.feature import peak_local_max
from skimage import data, img_as_float

from sklearn.cluster import KMeans

from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")

In [3]:
def load_april_2016_images():
    filenames = os.listdir('../data/april_2016_gels_renamed/')
    filenames = [x for x in filenames if 'pep1' in x]
    filenames = [x for x in filenames if not 'big' in x]
    
    imgs_april = []
    for filename in filenames:
        img = load_and_process_image('../data/april_2016_gels_renamed/' + filename)
        imgs_april.append(img)
        
    return imgs_april
    

def load_nov_2016_images():
    imgs_nov = []
    imgs_nov_idx = [1,6,12,21,41,42,51,52,56,83,84,89,90,96,97,106,123,131,136,152,153,156,157] # 7, 22
    
    for idx in imgs_nov_idx:
        filename = '../data/gels_nov_2016/Im{} - p. {}.png'.format(idx, idx)
        img = load_and_process_image(filename)
        imgs_nov.append(img)
        
    return imgs_nov


def load_and_process_image(filename):
    shape = (1276, 2100)

    cur_im = data.imread(filename, flatten=True)
    cur_im = img_as_float(cur_im)
    cur_im = rescale_intensity(cur_im)
    cur_im = rgb2gray(cur_im)

    cur_im = transform.resize(cur_im, output_shape=shape) # todo
    return cur_im


imgs_nov = load_nov_2016_images()
imgs_april = load_april_2016_images()
all_images = imgs_nov + imgs_april

In [4]:
import os
import sys
# Allow to import local python modules here in Jupyter
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from app.utils.preprocessing import *

### Grab Albumin roi

In [5]:
alb = imgs_nov[0][327:398,460:507]

### Find ROI in gel

In [74]:
def find_matches(img, template):
    overlap_thresh = 20
    result = match_template(img, template)
    xy_max = np.unravel_index(np.argsort(result.ravel())[-500:], result.shape)
    found = np.zeros(img.shape)

    from pdb import set_trace
    set_trace()
    
    # Don't include same ROI twice
    xy_dedup = []
    for x, y in zip(xy_max[0], xy_max[1]):
        # Maximum number of lanes
        if len(xy_dedup) >= 28: break
        # Minimum correlation
        if result[x, y] < .8: break
            
        overlap = found[x : x + alb.shape[0], y : y + alb.shape[1]]
        if not np.sum(overlap) > overlap_thresh:
            found[x : x + template.shape[0], y : y + template.shape[1]] = 1

            x_cen = x + int(template.shape[0] / 2)
            y_cen = y + int(template.shape[1] / 2)
            xy_dedup.append((x_cen, y_cen))
            
    return xy_dedup

### View marks

In [7]:
def mark_match_rois(img, marker_points):
    for x, y in marker_points:
        rr, cc = circle(x, y, 5)
        img[rr, cc] = 1
    return img

### Extract lanes above Alb roi

In [51]:
def extract_lanes_using_markers(img, markers):
    lanes = []
    i = 1
    # Weight X dimension higher than Y dimension
    markers_sorted = sorted(markers, key=lambda x: x[1] + 10*x[0])
    
    for x, y in markers_sorted:
        roi = img[x - 70 : x + 10, y - 10 : y + 10]
        lanes.append((roi, i))
        i += 1
    return lanes

In [9]:
def plot_lanes(lanes, labels):
    count = len(lanes)
    plt.figure(figsize=(20, 20))
    
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    for i, lane in enumerate(lanes):
        cols = 40
        rows = int(count / cols) + 1
        ax = plt.subplot(rows, cols, 1 + i)
        
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_title(labels[i])

        plt.imshow(lane)

In [75]:
# Choose nov, april, or all_images
to_analyze = all_images[9:10]

all_markers = [find_matches(img, alb) for img in to_analyze]

marked = [mark_match_rois(img.copy(), markers) for img, markers in zip(to_analyze, all_markers)]

lanes_per_gel = [extract_lanes_using_markers(img, markers) for img, markers in zip(to_analyze, all_markers)]

# Flatten
all_lanes = [item for sublist in lanes_per_gel for item in sublist]
len(all_lanes)

> <ipython-input-74-d876186c9d3b>(11)find_matches()
-> xy_dedup = []
(Pdb) xy_max
(array([ 510, 1157,  513, 1157, 1157,  513,   82,  514,  333,  336,  510,
        512,   82,  514,  511,  510,  511,  336,  514,  510,  510,  335,
         81,  513,  514,  334,  336,  510,  511,  335,   82,  513,  512,
        336,  511,   82,  514,  512,  514,  333,  512,  336,   81,  512,
        512,  336,  513,  512,  513,  333,  336, 1004,  513,   82,  514,
        511,  511,  512,  336,  512,  514,  336,  333,   82,  335,  512,
        336,  336,  335,  333,  336,  336, 1004,  336,  336,  336,  333,
         81,  512,   82,  513,  512,  333,  512,  334,  512,  512,  335,
       1006,  512,  512,  513,  336,  514,  514,  513,  512,  513,   81,
       1005,   82,  512,   82,  336,  514,  512,  335,  513,  336,  335,
        513,  335,  512,  336,  512,  335,   82,   81,  336, 1006,  336,
        334,  512,  513,   82,   82,   82,  336,   82,  335,  334,   81,
        513,  336,  513,   82,  336, 1004

KeyboardInterrupt: 

### Cluster out the bad lanes

In [53]:
# Gold std lane
plt.imshow(all_lanes[10][0])
plt.show()

SystemExit: 0

In [35]:
lanes_flat = [lanes[0].ravel() for lanes in all_lanes]
km = KMeans(n_clusters = 2)
km.fit(lanes_flat)

gld = all_lanes[0][0]
gld_label = np.argmin([np.sum((x.reshape(gld.shape) - gld)**2) for x in km.cluster_centers_])

labeled = [(img_and_lane_number, label) for img_and_lane_number, label in zip(lanes_flat, km.labels_)]
lanes_good = np.array(all_lanes)[km.labels_ == gld_label]
lanes_bad = np.array(all_lanes)[km.labels_ != gld_label]

start = 0
good_lanes_per_gel = []
bad_lanes_per_gel = []
for lanes in lanes_per_gel:
    current_labels = km.labels_[start : start + len(lanes)]
    good_inds = [current_labels == gld_label]
    bad_inds = [current_labels != gld_label]
    good_lanes_per_gel.append(np.array(lanes)[good_inds])
    bad_lanes_per_gel.append(np.array(lanes)[bad_inds])
    start += len(lanes)

len(lanes_good), len(lanes_bad)

(821, 343)

In [70]:
num = 9
len(all_markers[num]), len(lanes_per_gel[num]), len(good_lanes_per_gel[num]), len(bad_lanes_per_gel[num])

(16, 16, 0, 16)

### Plot

In [76]:
to_plot = 9

plt.imshow(marked[to_plot])

plot_lanes([x[0] for x in lanes_per_gel[to_plot]], [str(x) + 'a' for x in range(len(lanes_per_gel[to_plot]))])

plot_lanes([x[0] for x in good_lanes_per_gel[to_plot]], [str(x + 1) + 'g' for x in range(len(good_lanes_per_gel))])

plot_lanes([x[0] for x in bad_lanes_per_gel[to_plot]], [str(x + 1) + 'b' for x in range(len(bad_lanes_per_gel))])

plt.show()

SystemExit: 0

In [None]:
plt.imshow(marked[4])
plt.show()

### Load Labels

In [23]:
from datetime import datetime
from labels_collection import get_labels, get_dz_labels

april_2016_labels = get_labels(datetime(2016, 4, 1), datetime(2016, 4, 30))
april_2016_dz_labels = get_dz_labels(april_2016_labels)

nov_2016_labels = get_labels(datetime(2016, 11, 1), datetime(2016, 11, 30))
nov_2016_dz_labels = get_dz_labels(nov_2016_labels)

In [26]:
nov_2016_dz_labels

{'2016-11-01': [2, 3, 5, 7, 14, 16, 21],
 '2016-11-02': [2, 3, 11, 13, 17, 18, 20, 23, 25, 28],
 '2016-11-03': [2, 3, 6, 8, 10, 13, 28],
 '2016-11-04': [2, 5, 8, 14, 16, 17, 18, 22, 28],
 '2016-11-07': [2],
 '2016-11-08': [2, 3, 7, 10, 11, 20, 22, 23, 28],
 '2016-11-09': [2, 6, 9, 12, 13, 19, 22, 23, 28],
 '2016-11-10': [2, 3, 5, 6, 7, 9, 11, 14, 19, 20, 23],
 '2016-11-11': [2],
 '2016-11-12': [2],
 '2016-11-14': [2],
 '2016-11-15': [2, 3, 6, 7, 12, 13, 14, 16, 18, 19, 20, 24],
 '2016-11-16': [2, 4, 10, 12, 13, 15, 18, 26],
 '2016-11-17': [2, 4, 5, 6, 7, 8, 10, 18, 20],
 '2016-11-18': [2, 3, 4, 5, 9, 11, 13, 14, 26],
 '2016-11-21': [2, 4, 6, 7, 9, 10, 14, 15, 21, 24],
 '2016-11-22': [2, 9, 12, 19, 23],
 '2016-11-23': [2, 9, 13, 21],
 '2016-11-25': [2, 11, 25],
 '2016-11-28': [2],
 '2016-11-29': [2, 4, 7, 8, 10, 17, 21, 22, 24, 26, 28],
 '2016-11-30': [2, 4, 7, 8, 9, 11, 16, 19, 25, 28]}

In [25]:
X = [zip(calc_lane_means([z[0] for z in x]), [z[1] for z in x]) for x in good_lanes_per_gel]
X_flat = [z[0] for x in X for z in x]

print(len(imgs_nov), len(nov_2016_dz_labels.keys()))

y = []
for i, means in enumerate(X[0 : len(imgs_nov)]):
    date = nov_2016_dz_labels.keys()[i]
    for j in range(len(means)):
        if j in nov_2016_dz_labels[date]:
            y.append(1)
        else:
            y.append(0)

for i, means in enumerate(X[len(imgs_nov) : ]):
    date = april_2016_dz_labels.keys()[i]
    for j, k in means:
        if k in april_2016_dz_labels[date]:
            y.append(1)
        else:
            y.append(0)

len(X), len(X_flat), len(y)

(25, 22)


IndexError: list index out of range

### Classification

In [223]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split


x_train, x_test, y_train, y_test = train_test_split(X_flat, y)

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.70491803278688525

In [None]:
plt.figure(figsize=(20, 16))

X_means_df = pd.DataFrame(X_flat)

ctrl = X_means_df[np.array(y) == 0]
print len(ctrl), len(y)
plt.subplot(211)
ctrl.T.plot(alpha=.1, color='blue', ax=plt.gca(), legend=None, label='ctrl')

dz = X_means_df[np.array(y) == 1]
print len(dz), len(y)
plt.subplot(212)
dz.T.plot(alpha=.1, color='red', ax=plt.gca(), legend=None, label='dz')

plt.show()