In [1]:
# Conclusions: filter out nuclei with eccentricity >= 0.69 or major_axis_length >= 128
# (i.e. keep only nuclei with eccentricity < 0.69 and major_axis_length < 128)

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import t

from bokeh.plotting import show
from bokeh.models.annotations import Title
from bokeh.models import Plot, ColumnDataSource, Ellipse, Grid, LinearAxis, Text
from bokeh.io import output_notebook, export_png

from harrison_functions.utils.std.text import camel_to_snake_case
from harrison_functions.utils.plotting.plotly import plot_single_scatter, save_fig_as_png

pd.options.display.max_columns = None
output_notebook()

# troubleshooting
# from selenium import webdriver
# driver = webdriver.Firefox(executable_path='/home/harrisonized/geckodriver')

In [3]:
# toggle read by the `if save:` guards in the plotting cells; False means figures are not written to disk
save=False

In [4]:
def find_nearest_point(point, points:list):
    """Return the candidate in ``points`` with the smallest Euclidean distance to ``point``.

    This is a single linear scan, i.e. O(n) in the number of candidates.
    That is plenty for this dataset (about 20 nuclei per image).

    Parameters
    ----------
    point : sequence
        The query position; only ``point[0]`` (x) and ``point[1]`` (y) are read.
    points : iterable of (x, y) pairs
        Candidate positions. Each item must unpack into exactly two values.

    Returns
    -------
    tuple
        ``(x, y)`` of the nearest candidate. On ties the first one encountered wins.

    Raises
    ------
    ValueError
        If ``points`` is empty, or no candidate has a distance below infinity
        (e.g. every coordinate is NaN).

    >>> find_nearest_point(
    ...     point=(281.415801, 135.945238),
    ...     points=[(693.094713, 59.080090), (295.184921, 118.996760), (282.528024, 182.998269)],
    ... )
    (295.184921, 118.99676)
    """

    closest_point = None
    d = np.inf
    for x, y in points:
        d_current = np.sqrt((point[0]-x)**2+(point[1]-y)**2)
        # strict "<" keeps the first candidate on ties; NaN distances never win
        if d_current < d:
            closest_point = (x, y)
            d = d_current

    if closest_point is None:
        # previously this surfaced as an UnboundLocalError on the return line
        raise ValueError("points must contain at least one candidate at a finite distance")

    return closest_point

def flatten_columns(multicols):
    """Collapse 2-level column labels into single ``<level0>_<level1>`` strings.

    The first level is lower-cased, the two levels are joined with an
    underscore, and any leading/trailing underscores are stripped (so an
    empty second level yields just the first level).
    """
    flat_names = []
    for cols in multicols:
        top, bottom = cols[0], cols[1]
        joined = f'{top.lower()}_{bottom}'
        flat_names.append(joined.strip('_'))
    return flat_names

In [5]:
# columns of the puncta table that are kept when `puncta` is re-merged below
puncta_cols = [
    'image_number',
    'object_number',
    'parent_manual_nuclei',
    'center_x',
    'center_y',
    'bounding_box_min_x',
    'bounding_box_max_x',
    'bounding_box_min_y',
    'bounding_box_max_y',
    'bounding_box_area',
    'orientation',
    'major_axis_length',
    'minor_axis_length',
    'area',
    'convex_area',
    'perimeter',
    'eccentricity',
    'form_factor',
    'compactness',
    'integrated_intensity',
    'min_intensity',
    'max_intensity',
    'mean_intensity',
    'median_intensity',
    'edge_integrated_intensity',
    'edge_min_intensity',
    'edge_max_intensity',
    'edge_mean_intensity',
]

# Data

In [6]:
# read in data
# paths are relative to the notebook's working directory
# `nuclei` and `puncta` are the two input tables for everything below; both are
# expected to carry image_number, object_number, center_x/center_y and bounding-box columns
nuclei = pd.read_csv("data/nuclei_subset.csv")
puncta = pd.read_csv("data/puncta_subset.csv")

In [7]:
# ----------------------------------------------------------------------
# Reassign nuclei
#
# Puncta are grouped by (image_number, parent_manual_nuclei); each group's mean
# position is matched to the nearest nucleus center in the same image, and that
# nucleus's object_number is attached to every punctum in the group. Puncta whose
# own center lies outside the assigned nucleus's bounding box are then dropped.
# NOTE: this cell overwrites `puncta` and `puncta_centers`.

# mean (x, y) of the puncta in each (image, parent nucleus) group
puncta_centers = (
    puncta
    .groupby(["image_number", "parent_manual_nuclei"])[["center_x", "center_y"]]
    .mean()
    .reset_index()
)
# pack both coordinates into one list-valued column so a row can be passed as `point`
puncta_centers['center'] = puncta_centers[['center_x', 'center_y']].apply(list, axis=1)


# use find_nearest_point to find the center of the closest nuclei
# there are more nuclei than puncta, so this is fine
# candidates are restricted to nuclei from the same image as the puncta group
# the assignment aligns on index: both sides carry a default 0..n-1 index here
puncta_centers[["closest_nuclei_x", "closest_nuclei_y"]] = pd.DataFrame(
    puncta_centers[['image_number', 'center']].apply(
    lambda x: find_nearest_point(
        point=x['center'],
        points=nuclei.loc[(nuclei['image_number']==x['image_number']),
                          ["center_x", "center_y"]].to_records(index=False)
    )
    , axis=1).to_list(),
    columns=["closest_nuclei_x", "closest_nuclei_y"],
)

# left join nuclei_table on closest_nuclei_x and closest_nuclei_y
# the coordinates were copied from `nuclei`, so the nucleus is looked up by exact match
# NOTE(review): assumes nucleus centers are unique within an image; a duplicate would add
# rows to the merge result and misalign the column assignment -- confirm
puncta_centers['nuclei_object_number'] = pd.merge(
    left=puncta_centers[["closest_nuclei_x", "closest_nuclei_y", 'image_number', 'parent_manual_nuclei']],
    right=nuclei[['center_x', 'center_y', 'image_number', 'object_number']],
    left_on=["closest_nuclei_x", "closest_nuclei_y", 'image_number',],
    right_on=['center_x', 'center_y', 'image_number',],
    how='left',
    suffixes=('', '_nuclei')
)['object_number']


# add back to puncta
# every punctum inherits the nuclei_object_number found for its (image, parent nucleus) group
puncta = pd.merge(
    left=puncta[puncta_cols],
    right=puncta_centers[['image_number', 'parent_manual_nuclei', 'nuclei_object_number']],
    left_on=['image_number', 'parent_manual_nuclei'],
    right_on=['image_number', 'parent_manual_nuclei',],
    how='left',
    suffixes=('', '_')
)


# filter puncta that are too far away from the nuclei
# columns coming from `nuclei` that clash with puncta columns get the '_nuclei' suffix
# (e.g. bounding_box_min_x_nuclei), which the filter below relies on
puncta = pd.merge(
    left=puncta[list(puncta_cols)+['nuclei_object_number']],
    right=nuclei[['image_number', 'object_number', 'bounding_box_min_x', 'bounding_box_max_x', 'bounding_box_min_y', 'bounding_box_max_y']],
    left_on=['image_number', 'nuclei_object_number'],
    right_on=['image_number', 'object_number'],
    how='left',
    suffixes=('', '_nuclei')
)  # left join nuclei data

# keep only puncta whose center lies inside the assigned nucleus's bounding box (edges inclusive)
puncta = puncta[
    (puncta['center_x'] >= puncta['bounding_box_min_x_nuclei']) & 
    (puncta['center_x'] <= puncta['bounding_box_max_x_nuclei']) &
    (puncta['center_y'] >= puncta['bounding_box_min_y_nuclei']) &
    (puncta['center_y'] <= puncta['bounding_box_max_y_nuclei'])
].copy()  # filter


# regenerate puncta_centers using filtered data
# now keyed by the reassigned nuclei_object_number instead of parent_manual_nuclei
puncta_centers = (
    puncta
    .groupby(["image_number", "nuclei_object_number"])[["center_x", "center_y"]]
    .mean()
    .reset_index()
)

# Filter

In [35]:
# filters
# keep nuclei that are below both shape thresholds
low_eccentricity = nuclei['eccentricity'] < 0.69
short_major_axis = nuclei['major_axis_length'] < 128
nuclei_tmp = nuclei[low_eccentricity & short_major_axis].copy()

# restrict puncta to those assigned to a surviving nucleus:
# the puncta's own 'object_number' is dropped first so the nucleus's
# 'object_number' can take its place, then nuclei without any puncta
# (no match in the left join) are removed again via dropna
puncta_without_object_number = puncta.loc[:, puncta.columns != 'object_number']
puncta = (
    nuclei_tmp[["image_number", 'object_number']]
    .merge(
        puncta_without_object_number,
        left_on=["image_number", 'object_number'],
        right_on=['image_number', 'nuclei_object_number'],
        how="left",
    )
    .dropna(subset=['nuclei_object_number'])
)  # left join without duplicates

# Draw Boundaries around Puncta

In [36]:
# One row per (image, nucleus): total puncta area, puncta count, total integrated
# intensity, and the mean / standard deviation of the puncta centers.
# String aliases are used instead of builtin `sum` / `np.mean` / `np.std`: passing
# those callables to agg is deprecated in recent pandas, and once they are called
# directly np.std (ddof=0) would give 0 instead of NaN for single-punctum groups,
# which would silently disable the NaN fallback below.
puncta_summary = puncta.groupby(["image_number", "nuclei_object_number"]).agg(
        {
            "area": ["sum", "count"],
            "integrated_intensity": "sum",
            "center_x": ["mean", "std"],
            "center_y": ["mean", "std"],
        }
    ).reset_index()
puncta_summary.columns = flatten_columns(puncta_summary.columns)

# derive effective radius
# combined spread of the puncta centers in x and y
puncta_summary["center_std"] = np.sqrt(puncta_summary["center_x_std"]**2+puncta_summary["center_y_std"]**2)
# NOTE(review): t.ppf(0.90, 2) is the one-sided 90th percentile of a t distribution with
# df fixed at 2, independent of area_count -- confirm this is the intended "90% CI" factor
puncta_summary["effective_radius_puncta"] = puncta_summary["center_std"] * t.ppf(0.90, 2)  # 90% CI

# fillna
# groups with a single punctum have no standard deviation (NaN); fall back to the radius
# of a circle with the same total area. 3.14159 is kept as-is so the values stay
# comparable with effective_radius_nuclei, which uses the same constant.
puncta_summary["effective_radius_puncta"] = puncta_summary["effective_radius_puncta"].fillna(
    np.sqrt(puncta_summary["area_sum"] / 3.14159)
)

# square box of half-width effective_radius_puncta around the mean puncta position
puncta_summary["bounding_box_min_x"] = puncta_summary["center_x_mean"] - puncta_summary["effective_radius_puncta"]
puncta_summary["bounding_box_max_x"] = puncta_summary["center_x_mean"] + puncta_summary["effective_radius_puncta"]
puncta_summary["bounding_box_min_y"] = puncta_summary["center_y_mean"] - puncta_summary["effective_radius_puncta"]
puncta_summary["bounding_box_max_y"] = puncta_summary["center_y_mean"] + puncta_summary["effective_radius_puncta"]

puncta_summary

Unnamed: 0,image_number,nuclei_object_number,area_sum,area_count,integrated_intensity_sum,center_x_mean,center_x_std,center_y_mean,center_y_std,center_std,effective_radius_puncta,bounding_box_min_x,bounding_box_max_x,bounding_box_min_y,bounding_box_max_y
0,3,2.0,36.0,10,0.826917,281.415801,3.973904,135.945238,3.504848,5.298667,9.991262,271.424539,291.407063,125.953976,145.936500
1,3,3.0,41.0,6,1.006195,308.584722,2.340527,197.160417,4.868488,5.401874,10.185872,298.398850,318.770594,186.974545,207.346289
2,3,5.0,8.0,1,0.211475,314.875000,,197.875000,,,1.595770,313.279230,316.470770,196.279230,199.470770
3,3,7.0,62.0,5,1.647883,1101.156032,2.224032,380.154921,3.402007,4.064477,7.664051,1093.491981,1108.820083,372.490870,387.818972
4,3,10.0,286.0,28,8.031144,223.689128,9.258510,448.006676,8.897830,12.841005,24.213231,199.475897,247.902359,423.793446,472.219907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,20,49.0,4.0,1,0.091157,661.500000,,867.500000,,,1.128380,660.371620,662.628380,866.371620,868.628380
93,20,50.0,37.0,10,0.870710,142.769881,7.776392,923.256310,6.621799,10.213740,19.259213,123.510668,162.029094,903.997096,942.515523
94,20,51.0,175.0,16,4.513405,1060.745804,3.282882,907.430531,5.931118,6.779046,12.782693,1047.963111,1073.528497,894.647838,920.213224
95,20,52.0,19.0,15,0.433860,256.533333,14.516575,889.666667,11.482388,18.508814,34.900554,221.632779,291.433887,854.766113,924.567221


In [50]:
# turns figure export on for the `if save:` guards in the plotting cells below
# (overrides the save=False set near the top of the notebook)
save=True

In [37]:
# get puncta boundaries for image 3
# each puncta_summary bounding box becomes one circle-shape dict (x0/x1/y0/y1 corners),
# collected in `shapes` for fig.layout.update(shapes=...)
bbox_to_corner = {
    "bounding_box_min_x": "x0",
    "bounding_box_max_x": "x1",
    "bounding_box_min_y": "y0",
    "bounding_box_max_y": "y1",
}


def _as_circle(row):
    """Build one shape dict: fixed circle styling plus the row's corner coordinates."""
    circle = {"type": "circle", 'xref': "x", 'yref': "y", 'line': {'width': 1.5}}
    circle.update(dict(row))
    return circle


image_3_boxes = (
    puncta_summary
    .loc[puncta_summary['image_number']==3, list(bbox_to_corner)]
    .rename(columns=bbox_to_corner)
)
shapes = list(image_3_boxes.apply(_as_circle, axis=1))

In [51]:
# plot puncta
# scatter of the puncta centers in image 3, with the circles from `shapes` drawn on top
image_3_puncta = puncta[puncta['image_number']==3].copy()
fig = plot_single_scatter(
    image_3_puncta,
    x='center_x',
    y='center_y',
    xlabel='x',
    ylabel='y',
    title='Puncta',
)

# equal x/y scaling; the y range is given high-to-low
# (presumably to match image coordinates with the origin at the top-left -- confirm)
fig.layout.update(
    xaxis=dict(range=[-50, 1250], constrain="domain"),
    yaxis=dict(range=[1050, -50], scaleanchor='x', scaleratio=1),
    shapes=shapes,
    height=700,
)
fig.update_traces(marker={'size': 3})

if save:
    save_fig_as_png(fig, 'figures/bounding_boxes/puncta_bounding_box_3.png', height=800, scale=1)

fig

# Scatter Plots

In [52]:
# Total integrated intensity vs. total puncta area, one point per (image, nucleus).
# `puncta_summary` is plotted instead of `summary`: `summary` is only created in a
# later cell, so this cell raised NameError on Restart & Run All. Both frames carry
# the area_sum and integrated_intensity_sum columns used here.
fig = plot_single_scatter(
    puncta_summary,
    x='area_sum',
    y='integrated_intensity_sum',
    xlabel='Total Area',
    ylabel='Total Integrated Intensity',
    title='Puncta Total Intensity vs. Total Area'
)

if save:
    save_fig_as_png(fig, 'figures/scatter/scatter_intensity_area.png', height=800, scale=1)

fig

In [53]:
# radius of the circle whose area equals the nucleus area: r = sqrt(area / pi)
nuclei['effective_radius_nuclei'] = np.sqrt(nuclei['area'] / 3.14159)

In [55]:
# One row per nucleus, with its puncta aggregates attached where available.
# NOTE(review): this joins against the unfiltered `nuclei`, not the thresholded
# `nuclei_tmp`, so nuclei removed by the shape filter show up here as "no puncta" -- confirm.
#
# The fillna keys must match the merged column names. Nothing but image_number overlaps
# between the two frames, so the suffixes are never applied and the puncta aggregates keep
# their flattened names (area_sum, integrated_intensity_sum). The previous keys
# ('area_puncta', 'integrated_intensity') matched no column, so nothing was filled.
summary = pd.merge(
    left=nuclei[['image_number', 'object_number', 'area', 'effective_radius_nuclei']],
    right=puncta_summary,
    left_on=['image_number', 'object_number'],
    right_on=['image_number', 'nuclei_object_number'],
    suffixes=('_nuclei', '_puncta'),
    how='left'
).fillna({'area_sum': 0, 'integrated_intensity_sum': 0})

summary['nuclei_object_number'] = summary['object_number']  # fillna
# ratio of puncta radius to nucleus radius; nuclei without puncta get 0
summary['pct_puncta'] = (
    summary['effective_radius_puncta'] / summary['effective_radius_nuclei']
).fillna(0)

In [56]:
# total integrated intensity against the puncta / nucleus effective-radius ratio
fig = plot_single_scatter(
    summary,
    x='pct_puncta',
    y='integrated_intensity_sum',
    title='Puncta Total Intensity vs. Effective Radius Ratio',
    xlabel='Effective Radius Ratio Puncta / Nucleus',
    ylabel='Total Integrated Intensity',
)

if save:
    save_fig_as_png(
        fig,
        'figures/scatter/scatter_intensity_radius_ratio.png',
        height=800,
        scale=1,
    )

fig

In [43]:
# check why some are gt 1
# rows where the puncta effective radius exceeds the nucleus effective radius
puncta_radius_exceeds_nucleus = summary['effective_radius_puncta'] > summary['effective_radius_nuclei']
summary[puncta_radius_exceeds_nucleus]

Unnamed: 0,image_number,object_number,area,effective_radius_nuclei,nuclei_object_number,area_sum,area_count,integrated_intensity_sum,center_x_mean,center_x_std,center_y_mean,center_y_std,center_std,effective_radius_puncta,bounding_box_min_x,bounding_box_max_x,bounding_box_min_y,bounding_box_max_y,pct_puncta
70,18,11,4201,36.568032,11,11.0,6.0,0.322499,317.0,27.299166,345.138889,20.251863,33.990917,64.093887,252.906113,381.093887,281.045002,409.232776,1.75273
73,18,14,7615,49.233442,14,6.0,5.0,0.175021,221.4,20.305172,395.1,17.096783,26.544303,50.052417,171.347583,271.452417,345.047583,445.152417,1.016635
98,20,10,4872,39.380288,10,17.0,9.0,0.386191,670.185185,15.198786,208.851852,20.511082,25.528564,48.137121,622.048064,718.322306,160.714731,256.988973,1.222366
115,20,27,4565,38.119363,27,46.0,21.0,1.051164,621.704762,13.251852,565.461905,17.551588,21.992495,41.469446,580.235315,663.174208,523.992458,606.931351,1.087884
116,20,28,5749,42.778091,28,15.0,9.0,0.341527,319.518519,21.523799,550.592593,13.758864,25.54565,48.16934,271.349178,367.687859,502.423253,598.761933,1.126028
135,20,47,7351,48.372492,47,17.0,7.0,0.395056,490.452381,18.939649,846.588435,19.741262,27.357408,51.585623,438.866758,542.038004,795.002813,898.174058,1.066425
