In [1]:
# Conclusion: filter out nuclei with eccentricity > 0.69 or major_axis_length > 128
# (the Filter section below keeps eccentricity < 0.69 and major_axis_length < 128).

In [56]:
import numpy as np
import pandas as pd

from bokeh.plotting import show
from bokeh.models.annotations import Title
from bokeh.models import Plot, ColumnDataSource, Ellipse, Grid, LinearAxis, Text
from bokeh.io import output_notebook, export_png

from harrison_functions.utils.std.text import camel_to_snake_case
from harrison_functions.utils.plotting.plotly import plot_single_scatter, save_fig_as_png

pd.options.display.max_columns = None
output_notebook()

# troubleshooting
# from selenium import webdriver
# driver = webdriver.Firefox(executable_path='/home/harrisonized/geckodriver')

In [3]:
# NOTE(review): not referenced in the cells visible here; presumably toggles
# exporting figures to disk -- confirm before relying on it.
save = False

In [4]:
def find_nearest_point(point, points:list):
    """Return the entry of ``points`` closest to ``point`` (Euclidean distance).

    This is a single linear scan, i.e. O(n) in the number of candidates.
    For the small inputs used here (about 20 nuclei per image) that is
    plenty; a spatial index would only pay off for much larger inputs.

    Parameters
    ----------
    point : sequence
        The query location; only ``point[0]`` (x) and ``point[1]`` (y) are read.
    points : iterable of (x, y) pairs
        Candidate locations. Each item must unpack into exactly two values.

    Returns
    -------
    tuple
        ``(x, y)`` of the nearest candidate. On ties the first candidate
        encountered wins, because only a strictly smaller distance replaces
        the current best.

    Raises
    ------
    ValueError
        If no candidate lies at a finite distance, e.g. ``points`` is empty
        or every distance evaluates to NaN/inf.

    >>> find_nearest_point(
    ...     point=(281.415801, 135.945238),
    ...     points=[(693.094713, 59.080090), (295.184921, 118.996760), (282.528024, 182.998269)],
    ... )
    (295.184921, 118.99676)
    """

    closest_point = None
    d = np.inf
    for x, y in points:
        d_current = np.sqrt((point[0]-x)**2+(point[1]-y)**2)
        if d_current < d:
            closest_point = (x, y)
            d = d_current

    # Previously this fell through to an UnboundLocalError; fail with a
    # message that tells the caller what was wrong with the input instead.
    if closest_point is None:
        raise ValueError(
            "find_nearest_point: no candidate at a finite distance "
            "(points is empty or contains only NaN/inf coordinates)"
        )

    return closest_point

In [5]:
# Columns selected from the puncta table (order is preserved downstream).
puncta_cols = [
    # identifiers
    'image_number', 'object_number', 'parent_manual_nuclei',
    # location and bounding box
    'center_x', 'center_y',
    'bounding_box_min_x', 'bounding_box_max_x',
    'bounding_box_min_y', 'bounding_box_max_y',
    'bounding_box_area',
    # shape
    'orientation', 'major_axis_length', 'minor_axis_length',
    'area', 'convex_area', 'perimeter',
    'eccentricity', 'form_factor', 'compactness',
    # intensity
    'integrated_intensity', 'min_intensity', 'max_intensity',
    'mean_intensity', 'median_intensity',
    # edge intensity
    'edge_integrated_intensity', 'edge_min_intensity',
    'edge_max_intensity', 'edge_mean_intensity',
]

# Data

In [18]:
# read in data
# Load the nuclei table first, then the puncta table, from their CSV files.
nuclei, puncta = (
    pd.read_csv(csv_path)
    for csv_path in ("data/nuclei_subset.csv", "data/puncta_subset.csv")
)

In [20]:
# ----------------------------------------------------------------------
# Reassign nuclei
#
# Purpose: give every punctum a `nuclei_object_number` by matching the mean
# position of each (image_number, parent_manual_nuclei) group of puncta to the
# nearest nucleus center in the same image, then drop puncta whose own center
# falls outside that nucleus' bounding box.
#
# This cell overwrites both `puncta` and `puncta_centers`, so it is not
# idempotent: re-run it only after re-reading the data.

# Mean (center_x, center_y) of the puncta in each
# (image_number, parent_manual_nuclei) group; reset_index gives a RangeIndex.
puncta_centers = (
    puncta
    .groupby(["image_number", "parent_manual_nuclei"])[["center_x", "center_y"]]
    .mean()
    .reset_index()
)
# Pack the two coordinates into one [x, y] list per row for find_nearest_point.
puncta_centers['center'] = puncta_centers[['center_x', 'center_y']].apply(list, axis=1)


# use find_nearest_point to find the center of the closest nuclei
# there are more nuclei than puncta, so this is fine
# Candidates are restricted to nuclei with the same image_number as the row.
# NOTE(review): the right-hand DataFrame is built from a plain list, so it has
# a default RangeIndex; the assignment presumably lines rows up with
# puncta_centers by index -- confirm puncta_centers still has its RangeIndex.
puncta_centers[["closest_nuclei_x", "closest_nuclei_y"]] = pd.DataFrame(
    puncta_centers[['image_number', 'center']].apply(
    lambda x: find_nearest_point(
        point=x['center'],
        points=nuclei.loc[(nuclei['image_number']==x['image_number']),
                          ["center_x", "center_y"]].to_records(index=False)
    )
    , axis=1).to_list(),
    columns=["closest_nuclei_x", "closest_nuclei_y"],
)

# left join nuclei_table on closest_nuclei_x and closest_nuclei_y
# The join matches on exact coordinate equality plus image_number; the
# closest_nuclei_* values were taken from `nuclei` itself just above.
# NOTE(review): only the 'object_number' column of the merged frame is kept and
# assigned back as a column. That assumes the merge returns exactly one row per
# puncta_centers row, in the same order -- TODO confirm that
# (image_number, center_x, center_y) is unique in `nuclei`.
puncta_centers['nuclei_object_number'] = pd.merge(
    left=puncta_centers[["closest_nuclei_x", "closest_nuclei_y", 'image_number', 'parent_manual_nuclei']],
    right=nuclei[['center_x', 'center_y', 'image_number', 'object_number']],
    left_on=["closest_nuclei_x", "closest_nuclei_y", 'image_number',],
    right_on=['center_x', 'center_y', 'image_number',],
    how='left',
    suffixes=('', '_nuclei')
)['object_number']


# add back to puncta
# Every punctum inherits the nuclei_object_number of its
# (image_number, parent_manual_nuclei) group; puncta is cut down to puncta_cols.
puncta = pd.merge(
    left=puncta[puncta_cols],
    right=puncta_centers[['image_number', 'parent_manual_nuclei', 'nuclei_object_number']],
    left_on=['image_number', 'parent_manual_nuclei'],
    right_on=['image_number', 'parent_manual_nuclei',],
    how='left',
    suffixes=('', '_')
)


# filter puncta that are too far away from the nuclei
# Bring in the assigned nucleus' bounding box. Column names that exist on both
# sides keep the puncta value unsuffixed; the nuclei value gets '_nuclei'.
puncta = pd.merge(
    left=puncta[list(puncta_cols)+['nuclei_object_number']],
    right=nuclei[['image_number', 'object_number', 'bounding_box_min_x', 'bounding_box_max_x', 'bounding_box_min_y', 'bounding_box_max_y']],
    left_on=['image_number', 'nuclei_object_number'],
    right_on=['image_number', 'object_number'],
    how='left',
    suffixes=('', '_nuclei')
)  # left join nuclei data

# Keep puncta whose center lies inside the nucleus bounding box (bounds
# inclusive). Rows with no matched nucleus have NaN bounds, compare False,
# and are therefore dropped as well.
puncta = puncta[
    (puncta['center_x'] >= puncta['bounding_box_min_x_nuclei']) & 
    (puncta['center_x'] <= puncta['bounding_box_max_x_nuclei']) &
    (puncta['center_y'] >= puncta['bounding_box_min_y_nuclei']) &
    (puncta['center_y'] <= puncta['bounding_box_max_y_nuclei'])
].copy()  # filter


# regenerate puncta_centers using filtered data
# Now grouped by the reassigned nucleus; this replaces the earlier
# puncta_centers frame, so its 'center' and closest_nuclei_* columns are gone.
puncta_centers = (
    puncta
    .groupby(["image_number", "nuclei_object_number"])[["center_x", "center_y"]]
    .mean()
    .reset_index()
)

# Filter

In [8]:
# filters
# Keep only nuclei below both shape thresholds: eccentricity < 0.69 and
# major_axis_length < 128.
nuclei_tmp = nuclei[
    (nuclei['eccentricity'] < 0.69)
    & (nuclei['major_axis_length'] < 128)
].copy()

# Restrict puncta to those assigned to a nucleus that passed the filter.
# The join has to start from nuclei_tmp: joining from the unfiltered `nuclei`
# (as this cell did before) left nuclei_tmp unused and made the filter a no-op.
# puncta's own 'object_number' column is dropped on the right side, so the
# resulting 'object_number' is the nucleus' number.
# NOTE(review): later cells that join against `nuclei` rather than `nuclei_tmp`
# will still list excluded nuclei, now with no puncta attached -- confirm
# whether those cells should switch to nuclei_tmp too.
puncta = pd.merge(
    left=nuclei_tmp[["image_number", 'object_number']],
    right=puncta.loc[:, puncta.columns != 'object_number'],
    left_on=["image_number", 'object_number'],
    right_on=['image_number', 'nuclei_object_number'],
    how="left",
).dropna(subset=['nuclei_object_number'])  # drop nuclei that have no puncta (acts as an inner join)

# EDA

In [None]:
# Total puncta area and integrated intensity per (image, nucleus) pair.
puncta_summary = puncta.groupby(
    by=["image_number", "nuclei_object_number"],
    as_index=False,  # keep the grouping keys as ordinary columns
)[["area", "integrated_intensity"]].sum()

In [51]:
# One row per nucleus: nucleus area next to the puncta totals. Nuclei without
# any puncta get zero totals, and nuclei_object_number is backfilled from the
# nucleus' own object_number.
summary = (
    nuclei[['image_number', 'object_number', 'area']]
    .merge(
        puncta_summary,
        how='left',
        left_on=['image_number', 'object_number'],
        right_on=['image_number', 'nuclei_object_number'],
        suffixes=('_nuclei', '_puncta'),
    )
    .fillna({'area_puncta': 0, 'integrated_intensity': 0})
    .assign(nuclei_object_number=lambda frame: frame['object_number'])  # fillna
)

In [63]:
# Scatter of the per-nucleus puncta totals: total area on x, total integrated
# intensity on y.
fig = plot_single_scatter(
    summary,
    title='Puncta Stats',
    x='area_puncta',
    xlabel='Total Area',
    y='integrated_intensity',
    ylabel='Total Integrated Intensity',
)

# Bare expression so the notebook renders the figure as the cell output.
fig