In [1]:
import numpy as np
import pandas as pd

from harrison_functions.utils.std.text import camel_to_snake_case
from harrison_functions.utils.plotting.plotly import plot_single_scatter

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.set_option("display.max_columns", None)

In [2]:
# When True, the filtered nuclei/puncta subsets are written to CSV under data/.
save = False

In [3]:
# File names of the images hand-picked for exploratory data analysis (EDA).
# The nuclei table is later filtered to rows whose file_name_tif is in this list.
image_subset = [
    "sSP67_18_B6_CRE_003.tif",
    "sSP79_24_B6_CRE_001.tif",
    "sSP91_48_B6_CRE_006.tif",
    "sSP95_48_SAT_CRE_004.tif",
]

# Filter Nuclei Data

In [4]:
# Load the nuclei table and normalise its headers: convert each header with
# camel_to_snake_case, then collapse "__" into "_".
nuclei = pd.read_csv("data/nuclei.csv").rename(
    columns=lambda header: camel_to_snake_case(header).replace("__", "_")
)

In [5]:
# Hand-picked nuclei columns, grouped by what they measure.
# NOTE(review): the column names look like a CellProfiler export; the
# descriptions below follow CellProfiler's MeasureObjectSizeShape docs -- confirm.
nuclei_keep_cols = [
    # --- identifiers ---
    "image_number",
    "object_number",
    "path_name_tif",
    "file_name_tif",

    # --- position and bounding box (useful for selecting frames / cropping images) ---
    "area_shape_center_x",
    "area_shape_center_y",
    "area_shape_bounding_box_minimum_x",
    "area_shape_bounding_box_maximum_x",
    "area_shape_bounding_box_minimum_y",
    "area_shape_bounding_box_maximum_y",
    "area_shape_bounding_box_area",  # (max_x - min_x) * (max_y - min_y)

    # --- size ---
    "area_shape_orientation",  # angle in degrees, [-90, 90], between the x-axis and the major axis
    "area_shape_major_axis_length",  # "diameter" a
    "area_shape_minor_axis_length",  # "diameter" b
    "area_shape_area",  # actual area of the object; area < pi*(a/2)*(b/2) in the previewed rows
    "area_shape_convex_area",  # area of the object's convex hull (not the bounding box); >= area
    "area_shape_perimeter",  # perimeter of the object itself; ~ 2*pi*np.sqrt(((a/2)**2+(b/2)**2)/2)

    # --- shape / eccentricity ---
    "area_shape_eccentricity",  # np.sqrt(1 - (b**2 / a**2))
    "area_shape_form_factor",  # 4*pi*area/perimeter**2; equals 1 for a perfectly circular object
    "area_shape_compactness",  # 1/form_factor in this data (older docs: mean squared distance
                               # of the object's pixels from the centroid divided by the area)

    # ----------------------------------------------------------------------
    # Deliberately left out

    # ratios (derivable from the columns above)
    # "area_shape_equivalent_diameter",  # diameter of the circle with the same area: area = pi*(d/2)**2
    # "area_shape_extent",  # area / bounding_box_area
    # "area_shape_solidity",  # area / convex_area
    # "area_shape_euler_number",  # always 1

    # max distance between tangent lines
    # "area_shape_max_feret_diameter",  # similar to major_axis_length
    # "area_shape_min_feret_diameter",  # similar to minor_axis_length

    # distances to outside the radius
    # "area_shape_mean_radius",
    # "area_shape_median_radius",
    # "area_shape_maximum_radius",

    # redundant
    # "location_center_x", "location_center_y",
    # "location_center_z",  # always 0
    # "number_object_number",  # equal to object_number
]

# Copy so that later edits to nuclei_subset never touch the full nuclei table.
nuclei_subset = nuclei[nuclei_keep_cols].copy()

In [6]:
# Shorten the column names (drop the "area_shape_" prefix, abbreviate
# minimum/maximum to min/max) and keep only the nuclei that belong to the
# images selected for EDA. Done as one chained expression so the cell gives
# the same result however many times it is re-run.
nuclei_subset = (
    nuclei_subset
    .rename(
        columns=lambda col: (
            camel_to_snake_case(col)
            .replace("area_shape_", "")
            .replace("minimum", "min")
            .replace("maximum", "max")
        )
    )
    # isin() already yields a boolean mask; no `== True` comparison needed.
    .loc[lambda df: df["file_name_tif"].isin(image_subset)]
    .copy()
)

if save:
    # index=False is the documented way to leave the row index out of the CSV
    # (index=None only worked because None is falsy).
    nuclei_subset.to_csv("data/nuclei_subset.csv", index=False)

In [7]:
# Peek at the first rows of the filtered nuclei table.
# NOTE(review): path_name_tif holds absolute local paths, which end up in the
# rendered output -- consider dropping that column before sharing the notebook.
nuclei_subset.head(n=5)

Unnamed: 0,image_number,object_number,path_name_tif,file_name_tif,center_x,center_y,bounding_box_min_x,bounding_box_max_x,bounding_box_min_y,bounding_box_max_y,bounding_box_area,orientation,major_axis_length,minor_axis_length,area,convex_area,perimeter,eccentricity,form_factor,compactness
96,3,1,/Users/sarahpyfrom/Dropbox/Mac/Desktop/Sarah_S...,sSP67_18_B6_CRE_003.tif,693.094713,59.08009,655,733,19,98,6162,-45.901057,83.647997,67.946172,4445,4584,257.764502,0.583258,0.840689,1.1895
97,3,2,/Users/sarahpyfrom/Dropbox/Mac/Desktop/Sarah_S...,sSP67_18_B6_CRE_003.tif,295.184921,118.99676,255,334,81,161,6320,62.309321,82.042679,72.995776,4629,4861,269.078211,0.456489,0.803415,1.244687
98,3,3,/Users/sarahpyfrom/Dropbox/Mac/Desktop/Sarah_S...,sSP67_18_B6_CRE_003.tif,282.528024,182.998269,246,326,139,230,7280,28.7466,87.722871,71.947327,4621,5266,300.569588,0.572125,0.64277,1.555766
99,3,4,/Users/sarahpyfrom/Dropbox/Mac/Desktop/Sarah_S...,sSP67_18_B6_CRE_003.tif,211.894644,190.457987,151,268,117,271,18018,27.182444,161.416808,89.13511,10830,11781,442.700577,0.833709,0.694413,1.440064
100,3,5,/Users/sarahpyfrom/Dropbox/Mac/Desktop/Sarah_S...,sSP67_18_B6_CRE_003.tif,345.912244,212.319708,297,393,171,253,7872,-56.899251,92.188865,74.995123,5333,5644,291.119841,0.581573,0.790748,1.264626


# Puncta Data

In [8]:
# Load the puncta table and normalise its headers: convert each header with
# camel_to_snake_case, then collapse "__" into "_".
puncta = pd.read_csv("data/puncta.csv").rename(
    columns=lambda header: camel_to_snake_case(header).replace("__", "_")
)

In [9]:
# Hand-picked puncta columns, mirroring the nuclei selection plus the xist
# intensity measurements. The trailing .copy() matches the nuclei cell and
# guarantees that the in-place column renaming done next acts on an
# independent frame rather than on something derived from `puncta`.
puncta_subset = puncta[[
    # index cols
    # (parent_manual_nuclei presumably identifies the nucleus a punctum belongs to -- TODO confirm)
    "image_number", "object_number", "parent_manual_nuclei",

    # square areas
    "area_shape_center_x",
    "area_shape_center_y",
    "area_shape_bounding_box_minimum_x",
    "area_shape_bounding_box_maximum_x",
    "area_shape_bounding_box_minimum_y",
    "area_shape_bounding_box_maximum_y",
    "area_shape_bounding_box_area",

    # size measures
    "area_shape_orientation",
    "area_shape_major_axis_length",
    "area_shape_minor_axis_length",
    "area_shape_area",
    "area_shape_convex_area",
    "area_shape_perimeter",

    # measures of eccentricity
    "area_shape_eccentricity",
    "area_shape_form_factor",
    "area_shape_compactness",

    # intensities
    "intensity_integrated_intensity_masked_xist",
    "intensity_min_intensity_masked_xist",
    "intensity_max_intensity_masked_xist",
    "intensity_mean_intensity_masked_xist",
    "intensity_median_intensity_masked_xist",

    # edge intensities
    # (presumably the same statistics restricted to the object's edge pixels -- TODO confirm)
    "intensity_integrated_intensity_edge_masked_xist",
    "intensity_min_intensity_edge_masked_xist",
    "intensity_max_intensity_edge_masked_xist",
    "intensity_mean_intensity_edge_masked_xist",


    # ----------------------------------------------------------------------
    # Don't need

    # ratios
    # "area_shape_equivalent_diameter",
    # "area_shape_solidity",
    # "area_shape_extent",
    # "area_shape_euler_number",

    # max distance between tangent lines
    # "area_shape_min_feret_diameter",
    # "area_shape_max_feret_diameter",

    # distances to outside the radius
    # "area_shape_mean_radius",
    # "area_shape_median_radius",
    # "area_shape_maximum_radius",

    # intensities
    # "intensity_mad_intensity_masked_xist",  # median absolute deviation (MAD) of the intensities within the object
    # "intensity_std_intensity_masked_xist",
    # "intensity_mass_displacement_masked_xist",  # distance between the centers of gravity in the gray-level representation of the object and the binary representation of the object.
    # "intensity_lower_quartile_intensity_masked_xist",
    # "intensity_upper_quartile_intensity_masked_xist",
    # "intensity_std_intensity_edge_masked_xist",

    # redundant
    # "location_center_x",  # same as area_shape_center_x
    # "location_center_y",  # same as area_shape_center_y
    # "location_center_z",  # always 0
    # "location_center_mass_intensity_x_masked_xist",  # similar to center_x with more digits
    # "location_center_mass_intensity_y_masked_xist",  # similar to center_y with more digits
    # "location_center_mass_intensity_z_masked_xist",  # always 0
    # "location_max_intensity_x_masked_xist",  # similar to area_shape_center_x, but rounded
    # "location_max_intensity_y_masked_xist",  # similar to area_shape_center_y, but rounded
    # "location_max_intensity_z_masked_xist",  # always 0
    # "number_object_number", # redundant
]].copy()

In [10]:
def _clean_puncta_column(col):
    """Return the short, readable name for one puncta column.

    Drops the "area_shape_" prefix and the "_masked_xist" suffix, abbreviates
    minimum/maximum to min/max, and turns intensity columns such as
    "intensity_integrated_intensity_edge" into "edge_integrated_intensity".

    Only names that still start with "intensity_" are rewritten, so an
    already-cleaned name (e.g. "integrated_intensity") is returned unchanged.
    That keeps this cell safe to re-run: stripping "_intensity" from every
    column on each run would otherwise reduce the names to "integrated",
    "min", ... on the second execution.
    """
    name = (
        camel_to_snake_case(col)
        .replace("area_shape_", "")
        .replace("_masked_xist", "")
        .replace("minimum", "min")
        .replace("maximum", "max")
    )
    if name.startswith("intensity_"):
        # "intensity_integrated_intensity_edge"
        #   -> "intensity_integrated_edge"   (drop the inner "_intensity")
        #   -> "edge_integrated_intensity"   (reverse the "_"-separated parts)
        name = "_".join(name.replace("_intensity", "").split("_")[::-1])
    return name


# Clean the column names, then keep only the puncta whose image also appears
# in the filtered nuclei table.
puncta_subset = (
    puncta_subset
    .rename(columns=_clean_puncta_column)
    # isin() already yields a boolean mask; no `== True` comparison needed.
    .loc[lambda df: df["image_number"].isin(nuclei_subset["image_number"].unique())]
    .copy()
)

if save:
    # index=False is the documented way to leave the row index out of the CSV
    # (index=None only worked because None is falsy).
    puncta_subset.to_csv("data/puncta_subset.csv", index=False)

In [11]:
# Peek at the first rows of the filtered puncta table.
# NOTE: puncta with a perimeter of 0 (e.g. the single-pixel objects shown
# below) have form_factor = inf; handle these before any numeric analysis.
puncta_subset.head(n=5)

Unnamed: 0,image_number,object_number,parent_manual_nuclei,center_x,center_y,bounding_box_min_x,bounding_box_max_x,bounding_box_min_y,bounding_box_max_y,bounding_box_area,orientation,major_axis_length,minor_axis_length,area,convex_area,perimeter,eccentricity,form_factor,compactness,integrated_intensity,min_intensity,max_intensity,mean_intensity,median_intensity,edge_integrated_intensity,edge_min_intensity,edge_max_intensity,edge_mean_intensity
1684,3,1,2,275.0,131.0,275,276,131,132,1,45.0,0.0,0.0,1,1,0.0,0.0,inf,0.0,0.023011,0.023011,0.023011,0.023011,0.023011,0.023011,0.023011,0.023011,0.023011
1685,3,2,2,280.0,131.0,280,281,131,132,1,45.0,0.0,0.0,1,1,0.0,0.0,inf,0.0,0.022431,0.022431,0.022431,0.022431,0.022431,0.022431,0.022431,0.022431,0.022431
1686,3,3,2,285.5,133.0,285,287,133,134,2,90.0,2.0,0.0,2,2,0.0,1.0,inf,0.0,0.045701,0.022522,0.023178,0.02285,0.023178,0.045701,0.022522,0.023178,0.02285
1687,3,4,2,280.142857,134.785714,278,283,133,138,25,-43.349067,5.195207,3.772838,14,16,12.485281,0.687467,1.128603,0.886051,0.321813,0.022461,0.024674,0.022987,0.022843,0.227497,0.022461,0.023072,0.02275
1688,3,5,2,277.0,136.0,277,278,136,137,1,45.0,0.0,0.0,1,1,0.0,0.0,inf,0.0,0.022675,0.022675,0.022675,0.022675,0.022675,0.022675,0.022675,0.022675,0.022675
