In [1]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher


Rules
- Drop Correlation features
- Drop Object Number features
- Drop features with X,Y suffix
- Drop correlations between same compartment features
- Drop correlations between same Feature categories
- Keep only one row per feature group
- Keep only highly correlated/anticorrelated features


In [2]:
feature_name_cols = ["Compartment", "Feature", "Feature_1", "Feature_2", "Feature_3", "Channel", "Suffix"]

In [3]:
orf_feature_df = pd.read_parquet("input/orf_feature_wide.parquet").rename(
    columns={"Mask": "Compartment"}
)

# Split Feature further
orf_feature_df[["Feature_1", "Feature_2", "Feature_3"]] = (
    orf_feature_df.Feature.str.split("_", expand=True)
)

# Remove leading underscore from column values

orf_feature_df["Suffix"] = orf_feature_df["Suffix"].apply(lambda x: x.lstrip("_"))

# Create feature name
orf_feature_df["feature_name"] = orf_feature_df[feature_name_cols].agg(
    lambda x: "_".join(filter(None, x)), axis=1
)

orf_feature_df.head()

Unnamed: 0,Compartment,Feature,Channel,Suffix,column_0,column_1,column_2,column_3,column_4,column_5,...,column_3630,column_3631,column_3632,column_3633,column_3634,column_3635,Feature_1,Feature_2,Feature_3,feature_name
0,Cells,AreaShapeCompactness,,,1.0,-0.15137,-0.101507,-0.059984,-0.057121,-0.10545,...,0.022632,-0.061216,-0.005541,-0.093061,-0.108426,-0.075014,AreaShapeCompactness,,,Cells_AreaShapeCompactness_AreaShapeCompactness
1,Cells,AreaShapeZernike,,6_6,-0.15137,1.0,-0.119333,0.031895,0.037169,-0.025448,...,0.000272,-0.045915,-0.003861,-0.02643,-0.077761,-0.06411,AreaShapeZernike,,,Cells_AreaShapeZernike_AreaShapeZernike_6_6
2,Cells,Texture_Correlation,AGP,3_01_256,-0.101507,-0.119333,1.0,-0.074286,-0.229202,0.20807,...,-0.270294,-0.092594,-0.175566,0.078074,0.023241,-0.097288,Texture,Correlation,,Cells_Texture_Correlation_Texture_Correlation_...
3,Cells,Texture_DifferenceEntropy,Mito,5_03_256,-0.059984,0.031895,-0.074286,1.0,0.248083,0.573112,...,0.097819,0.136732,0.066876,0.777491,0.513563,0.16853,Texture,DifferenceEntropy,,Cells_Texture_DifferenceEntropy_Texture_Differ...
4,Cells,Texture_Entropy,ER,10_00_256,-0.057121,0.037169,-0.229202,0.248083,1.0,0.000372,...,0.555658,0.723166,0.506062,0.050585,0.065895,0.478547,Texture,Entropy,,Cells_Texture_Entropy_Texture_Entropy_ER_10_00...


In [4]:
# Rename column names 

feature_names_df = orf_feature_df[feature_name_cols + ["feature_name"]].copy()
correlation_df = orf_feature_df.drop(columns=feature_name_cols).set_index("feature_name")
correlation_df.columns = feature_names_df.feature_name.to_list()

In [5]:
# Remove Correlation features
feature_names_df = feature_names_df[
    ~feature_names_df.Feature.str.contains("Correlation")
]

# Remove Object Number features

feature_names_df = feature_names_df[
    ~feature_names_df.Feature.str.contains("ObjectNumber")
]

feature_names_df = feature_names_df[
    ~feature_names_df.Feature.str.contains("Object_Number")
]

feature_names_df = feature_names_df[
    ~feature_names_df.Suffix.str.contains("X")
]

feature_names_df = feature_names_df[
    ~feature_names_df.Suffix.str.contains("Y")
]


correlation_df = correlation_df.loc[
    feature_names_df.feature_name.to_list(), feature_names_df.feature_name.to_list()
]

In [6]:
# Stack the dataframe

correlation_df_stacked = (
    correlation_df.mask(np.tril(np.ones(correlation_df.shape)).astype(bool))
    .stack()
    .reset_index()
    .rename(
        columns={
            0: "correlation",
            "feature_name": "feature_name_1",
            "level_1": "feature_name_2",
        }
    )
).sort_values(by=["correlation"], ascending=False)
correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation
2823858,Cytoplasm_AreaShapeBoundingBoxArea_AreaShapeBo...,Cells_AreaShapeBoundingBoxArea_AreaShapeBoundi...,1.000000
3406061,Cells_NeighborsSecondClosestDistance_Neighbors...,Cells_NeighborsSecondClosestDistance_Neighbors...,1.000000
879837,Cells_NeighborsAngleBetweenNeighbors_Neighbors...,Cells_NeighborsAngleBetweenNeighbors_Neighbors...,1.000000
1861691,Cells_AreaShapeMaxFeretDiameter_AreaShapeMaxFe...,Cytoplasm_AreaShapeMaxFeretDiameter_AreaShapeM...,1.000000
2970722,Cells_NeighborsFirstClosestDistance_NeighborsF...,Cells_NeighborsFirstClosestDistance_NeighborsF...,1.000000
...,...,...,...
858224,Cytoplasm_RadialDistribution_FracAtD_RadialDis...,Cytoplasm_RadialDistribution_FracAtD_RadialDis...,-0.990500
4937895,Nuclei_Texture_InverseDifferenceMoment_Texture...,Nuclei_Texture_Entropy_Texture_Entropy_Mito_3_...,-0.990562
2033056,Nuclei_Texture_Entropy_Texture_Entropy_AGP_3_0...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.990900
4745297,Nuclei_Texture_Entropy_Texture_Entropy_Mito_3_...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.990912


In [7]:
# Add other feature name columns

correlation_df_stacked = (
    correlation_df_stacked.merge(
        feature_names_df, left_on="feature_name_1", right_on="feature_name", how="left"
    )
    .drop(columns=["feature_name", "Feature"])
    .rename(
        columns={
            "Compartment": "Compartment_1",
            "Channel": "Channel_1",
            "Suffix": "Suffix_1",
            "Feature_1": "Feature_1_1",
            "Feature_2": "Feature_2_1",
            "Feature_3": "Feature_3_1",
        }
    )
    .merge(feature_names_df, left_on="feature_name_2", right_on="feature_name", how="left")
    .drop(columns=["feature_name", "Feature"])
    .rename(
        columns={
            "Compartment": "Compartment_2",
            "Channel": "Channel_2",
            "Suffix": "Suffix_2",
            "Feature_1": "Feature_1_2",
            "Feature_2": "Feature_2_2",
            "Feature_3": "Feature_3_2",
        }
    )
)
correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2
0,Cytoplasm_AreaShapeBoundingBoxArea_AreaShapeBo...,Cells_AreaShapeBoundingBoxArea_AreaShapeBoundi...,1.000000,Cytoplasm,AreaShapeBoundingBoxArea,,,,,Cells,AreaShapeBoundingBoxArea,,,,
1,Cells_NeighborsSecondClosestDistance_Neighbors...,Cells_NeighborsSecondClosestDistance_Neighbors...,1.000000,Cells,NeighborsSecondClosestDistance,,,,Adjacent,Cells,NeighborsSecondClosestDistance,,,,5
2,Cells_NeighborsAngleBetweenNeighbors_Neighbors...,Cells_NeighborsAngleBetweenNeighbors_Neighbors...,1.000000,Cells,NeighborsAngleBetweenNeighbors,,,,Adjacent,Cells,NeighborsAngleBetweenNeighbors,,,,5
3,Cells_AreaShapeMaxFeretDiameter_AreaShapeMaxFe...,Cytoplasm_AreaShapeMaxFeretDiameter_AreaShapeM...,1.000000,Cells,AreaShapeMaxFeretDiameter,,,,,Cytoplasm,AreaShapeMaxFeretDiameter,,,,
4,Cells_NeighborsFirstClosestDistance_NeighborsF...,Cells_NeighborsFirstClosestDistance_NeighborsF...,1.000000,Cells,NeighborsFirstClosestDistance,,,,5,Cells,NeighborsFirstClosestDistance,,,,Adjacent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5086450,Cytoplasm_RadialDistribution_FracAtD_RadialDis...,Cytoplasm_RadialDistribution_FracAtD_RadialDis...,-0.990500,Cytoplasm,RadialDistribution,FracAtD,,DNA,3of4,Cytoplasm,RadialDistribution,FracAtD,,DNA,4of4
5086451,Nuclei_Texture_InverseDifferenceMoment_Texture...,Nuclei_Texture_Entropy_Texture_Entropy_Mito_3_...,-0.990562,Nuclei,Texture,InverseDifferenceMoment,,Mito,5_02_256,Nuclei,Texture,Entropy,,Mito,3_02_256
5086452,Nuclei_Texture_Entropy_Texture_Entropy_AGP_3_0...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.990900,Nuclei,Texture,Entropy,,AGP,3_03_256,Nuclei,Texture,InverseDifferenceMoment,,AGP,5_03_256
5086453,Nuclei_Texture_Entropy_Texture_Entropy_Mito_3_...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.990912,Nuclei,Texture,Entropy,,Mito,3_00_256,Nuclei,Texture,InverseDifferenceMoment,,Mito,5_00_256


In [8]:
# Drop correlations between same compartment features

correlation_df_stacked = correlation_df_stacked.query("Compartment_1!=Compartment_2")

correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2
0,Cytoplasm_AreaShapeBoundingBoxArea_AreaShapeBo...,Cells_AreaShapeBoundingBoxArea_AreaShapeBoundi...,1.000000,Cytoplasm,AreaShapeBoundingBoxArea,,,,,Cells,AreaShapeBoundingBoxArea,,,,
3,Cells_AreaShapeMaxFeretDiameter_AreaShapeMaxFe...,Cytoplasm_AreaShapeMaxFeretDiameter_AreaShapeM...,1.000000,Cells,AreaShapeMaxFeretDiameter,,,,,Cytoplasm,AreaShapeMaxFeretDiameter,,,,
7,Cytoplasm_AreaShapeMinFeretDiameter_AreaShapeM...,Cells_AreaShapeMinFeretDiameter_AreaShapeMinFe...,1.000000,Cytoplasm,AreaShapeMinFeretDiameter,,,,,Cells,AreaShapeMinFeretDiameter,,,,
8,Cytoplasm_Intensity_MinIntensityEdge_Intensity...,Cells_Intensity_MinIntensityEdge_Intensity_Min...,1.000000,Cytoplasm,Intensity,MinIntensityEdge,,DNA,,Cells,Intensity,MinIntensityEdge,,DNA,
9,Cytoplasm_NumberObject_NumberObject_Number,Nuclei_NumberObject_NumberObject_Number,1.000000,Cytoplasm,NumberObject,,,,Number,Nuclei,NumberObject,,,,Number
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5086181,Cytoplasm_Texture_DifferenceEntropy_Texture_Di...,Cells_Texture_InverseDifferenceMoment_Texture_...,-0.984246,Cytoplasm,Texture,DifferenceEntropy,,AGP,3_01_256,Cells,Texture,InverseDifferenceMoment,,AGP,3_01_256
5086182,Cells_Texture_SumEntropy_Texture_SumEntropy_DN...,Cytoplasm_Texture_InverseDifferenceMoment_Text...,-0.984253,Cells,Texture,SumEntropy,,DNA,3_01_256,Cytoplasm,Texture,InverseDifferenceMoment,,DNA,5_01_256
5086216,Cells_Texture_InverseDifferenceMoment_Texture_...,Cytoplasm_Texture_DifferenceEntropy_Texture_Di...,-0.984951,Cells,Texture,InverseDifferenceMoment,,AGP,3_00_256,Cytoplasm,Texture,DifferenceEntropy,,AGP,5_00_256
5086269,Cells_Texture_InverseDifferenceMoment_Texture_...,Cytoplasm_Texture_DifferenceEntropy_Texture_Di...,-0.985737,Cells,Texture,InverseDifferenceMoment,,AGP,3_00_256,Cytoplasm,Texture,DifferenceEntropy,,AGP,3_00_256


In [9]:
# Drop correlations between same Feature_1 features

correlation_df_stacked = correlation_df_stacked.query("Feature_1_1!=Feature_1_2")

correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2
13,Nuclei_ParentNucleiIncludingEdges_ParentNuclei...,Cells_ParentCellsIncludingEdges_ParentCellsInc...,1.000000,Nuclei,ParentNucleiIncludingEdges,,,,,Cells,ParentCellsIncludingEdges,,,,
7499,Cells_Intensity_MeanIntensity_Intensity_MeanIn...,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,0.993355,Cells,Intensity,MeanIntensity,,Mito,,Cytoplasm,Texture,SumAverage,,Mito,10_02_256
7523,Cells_Intensity_MeanIntensity_Intensity_MeanIn...,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,0.993325,Cells,Intensity,MeanIntensity,,Mito,,Cytoplasm,Texture,SumAverage,,Mito,10_01_256
7632,Cells_Intensity_MeanIntensity_Intensity_MeanIn...,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,0.993194,Cells,Intensity,MeanIntensity,,Mito,,Cytoplasm,Texture,SumAverage,,Mito,10_03_256
7643,Cells_Intensity_MedianIntensity_Intensity_Medi...,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,0.993175,Cells,Intensity,MedianIntensity,,AGP,,Cytoplasm,Texture,SumAverage,,AGP,5_03_256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5079083,Nuclei_Texture_InfoMeas1_Texture_InfoMeas1_RNA...,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,-0.936294,Nuclei,Texture,InfoMeas1,,RNA,5_01_256,Cells,Intensity,MaxIntensity,,RNA,
5080829,Nuclei_Texture_InfoMeas1_Texture_InfoMeas1_DNA...,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,-0.948157,Nuclei,Texture,InfoMeas1,,DNA,10_02_256,Cells,Intensity,MaxIntensity,,DNA,
5080867,Nuclei_Texture_InfoMeas1_Texture_InfoMeas1_DNA...,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,-0.948325,Nuclei,Texture,InfoMeas1,,DNA,10_00_256,Cells,Intensity,MaxIntensity,,DNA,
5080909,Nuclei_Texture_InfoMeas1_Texture_InfoMeas1_DNA...,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,-0.948594,Nuclei,Texture,InfoMeas1,,DNA,5_01_256,Cells,Intensity,MaxIntensity,,DNA,


In [10]:
# Compute absolute correlation

correlation_df_stacked = correlation_df_stacked.assign(
    abs_correlation=lambda x: np.abs(x["correlation"])
)

correlation_df_stacked.sort_values(by=["abs_correlation"], ascending=False, inplace=True)

In [11]:
# Drop Duplicates

correlation_df_stacked = correlation_df_stacked.drop_duplicates(
    subset=["Feature_1_1", "Feature_1_2", "Feature_2_1", "Feature_2_2"], keep="first"
)

correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2,abs_correlation
13,Nuclei_ParentNucleiIncludingEdges_ParentNuclei...,Cells_ParentCellsIncludingEdges_ParentCellsInc...,1.000000,Nuclei,ParentNucleiIncludingEdges,,,,,Cells,ParentCellsIncludingEdges,,,,,1.000000
7499,Cells_Intensity_MeanIntensity_Intensity_MeanIn...,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,0.993355,Cells,Intensity,MeanIntensity,,Mito,,Cytoplasm,Texture,SumAverage,,Mito,10_02_256,0.993355
7643,Cells_Intensity_MedianIntensity_Intensity_Medi...,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,0.993175,Cells,Intensity,MedianIntensity,,AGP,,Cytoplasm,Texture,SumAverage,,AGP,5_03_256,0.993175
7710,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,Cells_Intensity_MedianIntensity_Intensity_Medi...,0.993064,Cytoplasm,Texture,SumAverage,,AGP,10_00_256,Cells,Intensity,MedianIntensity,,AGP,,0.993064
7865,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,Cells_Intensity_MeanIntensity_Intensity_MeanIn...,0.992825,Cytoplasm,Texture,SumAverage,,Mito,3_02_256,Cells,Intensity,MeanIntensity,,Mito,,0.992825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2842882,Nuclei_ObjectSkeletonNumberNonTrunkBranches_Ob...,Cytoplasm_AreaShapeCompactness_AreaShapeCompac...,0.002645,Nuclei,ObjectSkeletonNumberNonTrunkBranches,,,,mito_skel,Cytoplasm,AreaShapeCompactness,,,,,0.002645
2933375,Nuclei_AreaShapeEquivalentDiameter_AreaShapeEq...,Cytoplasm_AreaShapeOrientation_AreaShapeOrient...,-0.002319,Nuclei,AreaShapeEquivalentDiameter,,,,,Cytoplasm,AreaShapeOrientation,,,,,0.002319
2917953,Cells_AreaShapeOrientation_AreaShapeOrientation,Nuclei_ObjectSkeletonTotalObjectSkeletonLength...,-0.001368,Cells,AreaShapeOrientation,,,,,Nuclei,ObjectSkeletonTotalObjectSkeletonLength,,,,mito_skel,0.001368
2917687,Cells_AreaShapeOrientation_AreaShapeOrientation,Nuclei_ObjectSkeletonNumberNonTrunkBranches_Ob...,-0.001352,Cells,AreaShapeOrientation,,,,,Nuclei,ObjectSkeletonNumberNonTrunkBranches,,,,mito_skel,0.001352


In [12]:
# Keep only highly correlated features

correlation_df_stacked.sort_values(by=["correlation"], ascending=False, inplace=True)
correlation_df_stacked = correlation_df_stacked.query("correlation>0.75 or correlation<-0.75")

correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2,abs_correlation
13,Nuclei_ParentNucleiIncludingEdges_ParentNuclei...,Cells_ParentCellsIncludingEdges_ParentCellsInc...,1.000000,Nuclei,ParentNucleiIncludingEdges,,,,,Cells,ParentCellsIncludingEdges,,,,,1.000000
7499,Cells_Intensity_MeanIntensity_Intensity_MeanIn...,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,0.993355,Cells,Intensity,MeanIntensity,,Mito,,Cytoplasm,Texture,SumAverage,,Mito,10_02_256,0.993355
7643,Cells_Intensity_MedianIntensity_Intensity_Medi...,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,0.993175,Cells,Intensity,MedianIntensity,,AGP,,Cytoplasm,Texture,SumAverage,,AGP,5_03_256,0.993175
7710,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,Cells_Intensity_MedianIntensity_Intensity_Medi...,0.993064,Cytoplasm,Texture,SumAverage,,AGP,10_00_256,Cells,Intensity,MedianIntensity,,AGP,,0.993064
7865,Cytoplasm_Texture_SumAverage_Texture_SumAverag...,Cells_Intensity_MeanIntensity_Intensity_MeanIn...,0.992825,Cytoplasm,Texture,SumAverage,,Mito,3_02_256,Cells,Intensity,MeanIntensity,,Mito,,0.992825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5075103,Cytoplasm_Texture_InverseDifferenceMoment_Text...,Cells_Intensity_MADIntensity_Intensity_MADInte...,-0.900308,Cytoplasm,Texture,InverseDifferenceMoment,,Mito,5_03_256,Cells,Intensity,MADIntensity,,Mito,,0.900308
5077690,Nuclei_Texture_InfoMeas1_Texture_InfoMeas1_DNA...,Cells_Intensity_StdIntensity_Intensity_StdInte...,-0.924803,Nuclei,Texture,InfoMeas1,,DNA,10_02_256,Cells,Intensity,StdIntensity,,DNA,,0.924803
5077849,Cells_Intensity_StdIntensity_Intensity_StdInte...,Nuclei_Texture_InfoMeas1_Texture_InfoMeas1_DNA...,-0.926236,Cells,Intensity,StdIntensity,,DNA,,Nuclei,Texture,InfoMeas1,,DNA,5_03_256,0.926236
5078973,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,Nuclei_Texture_InfoMeas1_Texture_InfoMeas1_RNA...,-0.935466,Cells,Intensity,MaxIntensity,,RNA,,Nuclei,Texture,InfoMeas1,,RNA,5_03_256,0.935466


In [13]:
correlation_df_stacked[["feature_name_1", "feature_name_2", "correlation"]].to_csv('output/orf_top_correlations.csv', index=False)