In [1]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher


Rules
- Drop Correlation features
- Drop Object Number features
- Drop features with X,Y suffix
- Drop correlations between same compartment features
- Drop correlations between same Feature categories
- Keep only one row per feature group
- Keep only highly correlated/anticorrelated features


In [2]:
feature_name_cols = ["Compartment", "Feature", "Feature_1", "Feature_2", "Feature_3", "Channel", "Suffix"]

In [3]:
crispr_feature_df = pd.read_parquet("input/crispr_feature_wide.parquet").rename(
    columns={"Mask": "Compartment"}
)

# Split Feature further
crispr_feature_df[["Feature_1", "Feature_2", "Feature_3"]] = (
    crispr_feature_df.Feature.str.split("_", expand=True)
)

# Remove leading underscore from column values

crispr_feature_df["Suffix"] = crispr_feature_df["Suffix"].apply(lambda x: x.lstrip("_"))

# Create feature name
crispr_feature_df["feature_name"] = crispr_feature_df[feature_name_cols].agg(
    lambda x: "_".join(filter(None, x)), axis=1
)

crispr_feature_df.head()

Unnamed: 0,Compartment,Feature,Channel,Suffix,column_0,column_1,column_2,column_3,column_4,column_5,...,column_3645,column_3646,column_3647,column_3648,column_3649,column_3650,Feature_1,Feature_2,Feature_3,feature_name
0,Cells,Texture_Correlation,ER,10_02_256,1.0,0.507748,0.606947,0.002188,0.486636,0.603111,...,0.176715,0.463619,-0.30122,0.039268,0.203778,0.043879,Texture,Correlation,,Cells_Texture_Correlation_Texture_Correlation_...
1,Cytoplasm,AreaShapeBoundingBoxArea,,,0.507748,1.0,0.785837,-0.001925,0.575591,0.294705,...,0.054823,-0.145195,-0.243519,-0.059198,0.125026,-0.160831,AreaShapeBoundingBoxArea,,,Cytoplasm_AreaShapeBoundingBoxArea_AreaShapeBo...
2,Cytoplasm,Intensity_MassDisplacement,RNA,,0.606947,0.785837,1.0,0.002393,0.549004,0.280987,...,0.191329,-0.130523,-0.461357,0.196529,0.190383,0.022665,Intensity,MassDisplacement,,Cytoplasm_Intensity_MassDisplacement_Intensity...
3,Cytoplasm,Texture_Contrast,DNA,10_02_256,0.002188,-0.001925,0.002393,1.0,-0.002864,-0.002334,...,0.005742,0.005219,-0.001404,0.008478,-0.00091,0.087698,Texture,Contrast,,Cytoplasm_Texture_Contrast_Texture_Contrast_DN...
4,Cytoplasm,Texture_Correlation,Mito,10_01_256,0.486636,0.575591,0.549004,-0.002864,1.0,0.478242,...,0.048198,0.003907,-0.157372,0.046812,0.0835,-0.040116,Texture,Correlation,,Cytoplasm_Texture_Correlation_Texture_Correlat...


In [4]:
# Rename column names 

feature_names_df = crispr_feature_df[feature_name_cols + ["feature_name"]].copy()
correlation_df = crispr_feature_df.drop(columns=feature_name_cols).set_index("feature_name")
correlation_df.columns = feature_names_df.feature_name.to_list()

In [5]:
# Remove Correlation features
feature_names_df = feature_names_df[
    ~feature_names_df.Feature.str.contains("Correlation")
]

# Remove Object Number features

feature_names_df = feature_names_df[
    ~feature_names_df.Feature.str.contains("ObjectNumber")
]

feature_names_df = feature_names_df[
    ~feature_names_df.Feature.str.contains("Object_Number")
]

feature_names_df = feature_names_df[
    ~feature_names_df.Suffix.str.contains("X")
]

feature_names_df = feature_names_df[
    ~feature_names_df.Suffix.str.contains("Y")
]


correlation_df = correlation_df.loc[
    feature_names_df.feature_name.to_list(), feature_names_df.feature_name.to_list()
]

In [6]:
# Stack the dataframe

correlation_df_stacked = (
    correlation_df.mask(np.tril(np.ones(correlation_df.shape)).astype(bool))
    .stack()
    .reset_index()
    .rename(
        columns={
            0: "correlation",
            "feature_name": "feature_name_1",
            "level_1": "feature_name_2",
        }
    )
).sort_values(by=["correlation"], ascending=False)
correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation
1821117,Cells_NeighborsFirstClosestDistance_NeighborsF...,Cells_NeighborsFirstClosestDistance_NeighborsF...,1.000000
4809812,Cells_NeighborsSecondClosestDistance_Neighbors...,Cells_NeighborsSecondClosestDistance_Neighbors...,1.000000
5055090,Cells_NeighborsAngleBetweenNeighbors_Neighbors...,Cells_NeighborsAngleBetweenNeighbors_Neighbors...,1.000000
3356164,Cytoplasm_RadialDistributionFracAtD_RadialDist...,Cytoplasm_RadialDistributionFracAtD_RadialDist...,1.000000
871324,Cytoplasm_RadialDistributionMeanFrac_RadialDis...,Cytoplasm_RadialDistributionMeanFrac_RadialDis...,1.000000
...,...,...,...
2977468,Cytoplasm_RadialDistributionFracAtD_RadialDist...,Cytoplasm_RadialDistributionMeanFrac_RadialDis...,-0.997748
820909,Cytoplasm_RadialDistributionFracAtD_RadialDist...,Cytoplasm_RadialDistributionFracAtD_RadialDist...,-0.997794
3166497,Cytoplasm_RadialDistributionFracAtD_RadialDist...,Cytoplasm_RadialDistributionMeanFrac_RadialDis...,-0.997799
2640397,Cytoplasm_RadialDistributionMeanFrac_RadialDis...,Cytoplasm_RadialDistributionFracAtD_RadialDist...,-0.997951


In [7]:
# Add other feature name columns

correlation_df_stacked = (
    correlation_df_stacked.merge(
        feature_names_df, left_on="feature_name_1", right_on="feature_name", how="left"
    )
    .drop(columns=["feature_name", "Feature"])
    .rename(
        columns={
            "Compartment": "Compartment_1",
            "Channel": "Channel_1",
            "Suffix": "Suffix_1",
            "Feature_1": "Feature_1_1",
            "Feature_2": "Feature_2_1",
            "Feature_3": "Feature_3_1",
        }
    )
    .merge(feature_names_df, left_on="feature_name_2", right_on="feature_name", how="left")
    .drop(columns=["feature_name", "Feature"])
    .rename(
        columns={
            "Compartment": "Compartment_2",
            "Channel": "Channel_2",
            "Suffix": "Suffix_2",
            "Feature_1": "Feature_1_2",
            "Feature_2": "Feature_2_2",
            "Feature_3": "Feature_3_2",
        }
    )
)
correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2
0,Cells_NeighborsFirstClosestDistance_NeighborsF...,Cells_NeighborsFirstClosestDistance_NeighborsF...,1.000000,Cells,NeighborsFirstClosestDistance,,,,Adjacent,Cells,NeighborsFirstClosestDistance,,,,5
1,Cells_NeighborsSecondClosestDistance_Neighbors...,Cells_NeighborsSecondClosestDistance_Neighbors...,1.000000,Cells,NeighborsSecondClosestDistance,,,,Adjacent,Cells,NeighborsSecondClosestDistance,,,,5
2,Cells_NeighborsAngleBetweenNeighbors_Neighbors...,Cells_NeighborsAngleBetweenNeighbors_Neighbors...,1.000000,Cells,NeighborsAngleBetweenNeighbors,,,,5,Cells,NeighborsAngleBetweenNeighbors,,,,Adjacent
3,Cytoplasm_RadialDistributionFracAtD_RadialDist...,Cytoplasm_RadialDistributionFracAtD_RadialDist...,1.000000,Cytoplasm,RadialDistributionFracAtD,,,,mito_tubeness_1of20,Cytoplasm,RadialDistributionFracAtD,,,,mito_tubeness_1of16
4,Cytoplasm_RadialDistributionMeanFrac_RadialDis...,Cytoplasm_RadialDistributionMeanFrac_RadialDis...,1.000000,Cytoplasm,RadialDistributionMeanFrac,,,,mito_tubeness_1of20,Cytoplasm,RadialDistributionMeanFrac,,,,mito_tubeness_1of16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5134405,Cytoplasm_RadialDistributionFracAtD_RadialDist...,Cytoplasm_RadialDistributionMeanFrac_RadialDis...,-0.997748,Cytoplasm,RadialDistributionFracAtD,,,,mito_tubeness_20of20,Cytoplasm,RadialDistributionMeanFrac,,,,mito_tubeness_18of20
5134406,Cytoplasm_RadialDistributionFracAtD_RadialDist...,Cytoplasm_RadialDistributionFracAtD_RadialDist...,-0.997794,Cytoplasm,RadialDistributionFracAtD,,,,mito_tubeness_18of20,Cytoplasm,RadialDistributionFracAtD,,,,mito_tubeness_16of16
5134407,Cytoplasm_RadialDistributionFracAtD_RadialDist...,Cytoplasm_RadialDistributionMeanFrac_RadialDis...,-0.997799,Cytoplasm,RadialDistributionFracAtD,,,,mito_tubeness_16of16,Cytoplasm,RadialDistributionMeanFrac,,,,mito_tubeness_14of16
5134408,Cytoplasm_RadialDistributionMeanFrac_RadialDis...,Cytoplasm_RadialDistributionFracAtD_RadialDist...,-0.997951,Cytoplasm,RadialDistributionMeanFrac,,,,mito_tubeness_17of20,Cytoplasm,RadialDistributionFracAtD,,,,mito_tubeness_16of16


In [8]:
# Drop correlations between same compartment features

correlation_df_stacked = correlation_df_stacked.query("Compartment_1!=Compartment_2")

correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2
5,Nuclei_NumberObject_NumberObject_Number,Cytoplasm_NumberObject_NumberObject_Number,0.999999,Nuclei,NumberObject,,,,Number,Cytoplasm,NumberObject,,,,Number
27,Cells_AreaShapeMinFeretDiameter_AreaShapeMinFe...,Cytoplasm_AreaShapeMinFeretDiameter_AreaShapeM...,0.999984,Cells,AreaShapeMinFeretDiameter,,,,,Cytoplasm,AreaShapeMinFeretDiameter,,,,
44,Cells_AreaShapeMaxFeretDiameter_AreaShapeMaxFe...,Cytoplasm_AreaShapeMaxFeretDiameter_AreaShapeM...,0.999966,Cells,AreaShapeMaxFeretDiameter,,,,,Cytoplasm,AreaShapeMaxFeretDiameter,,,,
140,Cytoplasm_AreaShapeBoundingBoxArea_AreaShapeBo...,Cells_AreaShapeBoundingBoxArea_AreaShapeBoundi...,0.999931,Cytoplasm,AreaShapeBoundingBoxArea,,,,,Cells,AreaShapeBoundingBoxArea,,,,
735,Cytoplasm_Intensity_MinIntensityEdge_Intensity...,Cells_Intensity_MinIntensityEdge_Intensity_Min...,0.999735,Cytoplasm,Intensity,MinIntensityEdge,,ER,,Cells,Intensity,MinIntensityEdge,,ER,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5132443,Cells_Texture_DifferenceEntropy_Texture_Differ...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.979870,Cells,Texture,DifferenceEntropy,,DNA,10_03_256,Nuclei,Texture,InverseDifferenceMoment,,DNA,3_03_256
5132455,Cells_Texture_DifferenceEntropy_Texture_Differ...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.979962,Cells,Texture,DifferenceEntropy,,DNA,10_03_256,Nuclei,Texture,InverseDifferenceMoment,,DNA,5_01_256
5132462,Cells_Texture_DifferenceEntropy_Texture_Differ...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.980010,Cells,Texture,DifferenceEntropy,,DNA,10_01_256,Nuclei,Texture,InverseDifferenceMoment,,DNA,5_03_256
5132598,Cells_Texture_DifferenceEntropy_Texture_Differ...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.980713,Cells,Texture,DifferenceEntropy,,DNA,10_03_256,Nuclei,Texture,InverseDifferenceMoment,,DNA,5_03_256


In [9]:
# Drop correlations between same Feature_1 features

correlation_df_stacked = correlation_df_stacked.query("Feature_1_1!=Feature_1_2")

correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2
3051,Cytoplasm_AreaShapeMaxFeretDiameter_AreaShapeM...,Cells_AreaShapeMajorAxisLength_AreaShapeMajorA...,0.998344,Cytoplasm,AreaShapeMaxFeretDiameter,,,,,Cells,AreaShapeMajorAxisLength,,,,
4588,Nuclei_ParentNucleiIncludingEdges_ParentNuclei...,Cytoplasm_NumberObject_NumberObject_Number,0.997139,Nuclei,ParentNucleiIncludingEdges,,,,,Cytoplasm,NumberObject,,,,Number
5413,Cells_AreaShapeMinorAxisLength_AreaShapeMinorA...,Cytoplasm_AreaShapeMinFeretDiameter_AreaShapeM...,0.996440,Cells,AreaShapeMinorAxisLength,,,,,Cytoplasm,AreaShapeMinFeretDiameter,,,,
5920,Cells_AreaShapeMaxFeretDiameter_AreaShapeMaxFe...,Cytoplasm_AreaShapeMajorAxisLength_AreaShapeMa...,0.995947,Cells,AreaShapeMaxFeretDiameter,,,,,Cytoplasm,AreaShapeMajorAxisLength,,,,
6716,Cytoplasm_AreaShapeMinorAxisLength_AreaShapeMi...,Cells_AreaShapeMinFeretDiameter_AreaShapeMinFe...,0.995160,Cytoplasm,AreaShapeMinorAxisLength,,,,,Cells,AreaShapeMinFeretDiameter,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5128061,Nuclei_Texture_InverseDifferenceMoment_Texture...,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,-0.958140,Nuclei,Texture,InverseDifferenceMoment,,DNA,5_01_256,Cells,Intensity,MaxIntensity,,DNA,
5128126,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.958498,Cells,Intensity,MaxIntensity,,DNA,,Nuclei,Texture,InverseDifferenceMoment,,DNA,5_03_256
5128165,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.958674,Cells,Intensity,MaxIntensity,,DNA,,Nuclei,Texture,InverseDifferenceMoment,,DNA,10_02_256
5128364,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.959684,Cells,Intensity,MaxIntensity,,DNA,,Nuclei,Texture,InverseDifferenceMoment,,DNA,10_00_256


In [10]:
# Compute absolute correlation

correlation_df_stacked = correlation_df_stacked.assign(
    abs_correlation=lambda x: np.abs(x["correlation"])
)

correlation_df_stacked.sort_values(by=["abs_correlation"], ascending=False, inplace=True)

In [11]:
# Drop Duplicates

correlation_df_stacked = correlation_df_stacked.drop_duplicates(
    subset=["Feature_1_1", "Feature_1_2", "Feature_2_1", "Feature_2_2"], keep="first"
)

correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2,abs_correlation
3051,Cytoplasm_AreaShapeMaxFeretDiameter_AreaShapeM...,Cells_AreaShapeMajorAxisLength_AreaShapeMajorA...,0.998344,Cytoplasm,AreaShapeMaxFeretDiameter,,,,,Cells,AreaShapeMajorAxisLength,,,,,0.998344
4588,Nuclei_ParentNucleiIncludingEdges_ParentNuclei...,Cytoplasm_NumberObject_NumberObject_Number,0.997139,Nuclei,ParentNucleiIncludingEdges,,,,,Cytoplasm,NumberObject,,,,Number,0.997139
5413,Cells_AreaShapeMinorAxisLength_AreaShapeMinorA...,Cytoplasm_AreaShapeMinFeretDiameter_AreaShapeM...,0.996440,Cells,AreaShapeMinorAxisLength,,,,,Cytoplasm,AreaShapeMinFeretDiameter,,,,,0.996440
8965,Cells_AreaShapeArea_AreaShapeArea,Cytoplasm_Intensity_IntegratedIntensity_Intens...,0.992905,Cells,AreaShapeArea,,,,,Cytoplasm,Intensity,IntegratedIntensity,,DNA,,0.992905
10014,Cells_AreaShapeEquivalentDiameter_AreaShapeEqu...,Cytoplasm_AreaShapeMinFeretDiameter_AreaShapeM...,0.991642,Cells,AreaShapeEquivalentDiameter,,,,,Cytoplasm,AreaShapeMinFeretDiameter,,,,,0.991642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2921217,Nuclei_ObjectSkeletonNumberNonTrunkBranches_Ob...,Cells_ChildrenCytoplasm_ChildrenCytoplasm_Count,0.001466,Nuclei,ObjectSkeletonNumberNonTrunkBranches,,,,mito_skel,Cells,ChildrenCytoplasm,,,,Count,0.001466
2961130,Cells_NeighborsFirstClosestDistance_NeighborsF...,Nuclei_AreaShapeOrientation_AreaShapeOrientation,-0.001345,Cells,NeighborsFirstClosestDistance,,,,Adjacent,Nuclei,AreaShapeOrientation,,,,,0.001345
2960348,Nuclei_ObjectSkeletonNumberBranchEnds_ObjectSk...,Cells_NeighborsAngleBetweenNeighbors_Neighbors...,-0.001279,Nuclei,ObjectSkeletonNumberBranchEnds,,,,mito_skel,Cells,NeighborsAngleBetweenNeighbors,,,,5,0.001279
2957310,Nuclei_ObjectSkeletonTotalObjectSkeletonLength...,Cells_AreaShapeEquivalentDiameter_AreaShapeEqu...,-0.001022,Nuclei,ObjectSkeletonTotalObjectSkeletonLength,,,,mito_skel,Cells,AreaShapeEquivalentDiameter,,,,,0.001022


In [12]:
# Keep only highly correlated features

correlation_df_stacked.sort_values(by=["correlation"], ascending=False, inplace=True)
correlation_df_stacked = correlation_df_stacked.query("correlation>0.75 or correlation<-0.75")

correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2,abs_correlation
3051,Cytoplasm_AreaShapeMaxFeretDiameter_AreaShapeM...,Cells_AreaShapeMajorAxisLength_AreaShapeMajorA...,0.998344,Cytoplasm,AreaShapeMaxFeretDiameter,,,,,Cells,AreaShapeMajorAxisLength,,,,,0.998344
4588,Nuclei_ParentNucleiIncludingEdges_ParentNuclei...,Cytoplasm_NumberObject_NumberObject_Number,0.997139,Nuclei,ParentNucleiIncludingEdges,,,,,Cytoplasm,NumberObject,,,,Number,0.997139
5413,Cells_AreaShapeMinorAxisLength_AreaShapeMinorA...,Cytoplasm_AreaShapeMinFeretDiameter_AreaShapeM...,0.996440,Cells,AreaShapeMinorAxisLength,,,,,Cytoplasm,AreaShapeMinFeretDiameter,,,,,0.996440
8965,Cells_AreaShapeArea_AreaShapeArea,Cytoplasm_Intensity_IntegratedIntensity_Intens...,0.992905,Cells,AreaShapeArea,,,,,Cytoplasm,Intensity,IntegratedIntensity,,DNA,,0.992905
10014,Cells_AreaShapeEquivalentDiameter_AreaShapeEqu...,Cytoplasm_AreaShapeMinFeretDiameter_AreaShapeM...,0.991642,Cells,AreaShapeEquivalentDiameter,,,,,Cytoplasm,AreaShapeMinFeretDiameter,,,,,0.991642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5125701,Cells_Texture_InverseDifferenceMoment_Texture_...,Nuclei_Intensity_UpperQuartileIntensity_Intens...,-0.948061,Cells,Texture,InverseDifferenceMoment,,DNA,10_01_256,Nuclei,Intensity,UpperQuartileIntensity,,DNA,,0.948061
5126804,Cells_Texture_InverseDifferenceMoment_Texture_...,Nuclei_Intensity_MeanIntensity_Intensity_MeanI...,-0.952922,Cells,Texture,InverseDifferenceMoment,,DNA,10_01_256,Nuclei,Intensity,MeanIntensity,,DNA,,0.952922
5128061,Nuclei_Texture_InverseDifferenceMoment_Texture...,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,-0.958140,Nuclei,Texture,InverseDifferenceMoment,,DNA,5_01_256,Cells,Intensity,MaxIntensity,,DNA,,0.958140
5128364,Cells_Intensity_MaxIntensity_Intensity_MaxInte...,Nuclei_Texture_InverseDifferenceMoment_Texture...,-0.959684,Cells,Intensity,MaxIntensity,,DNA,,Nuclei,Texture,InverseDifferenceMoment,,DNA,10_00_256,0.959684


In [13]:
correlation_df_stacked[["feature_name_1", "feature_name_2", "correlation"]].to_csv('output/crispr_top_correlations.csv', index=False)