In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Columns that describe a feature (as opposed to the numeric columns of the
# wide table); used later to separate metadata from values.
feature_name_cols = [
    "Compartment",
    "Feature",
    "Feature_1",
    "Feature_2",
    "Feature_3",
    "Channel",
    "Suffix",
]

# Accumulator for the feature pairs chosen further down; starts with no rows
# and no columns.
selected_feature_names_df = pd.DataFrame()

In [3]:
# Load the wide feature table; the "Mask" column is referred to as
# "Compartment" for the rest of the notebook.
orf_feature_df = pd.read_parquet("input/orf_feature_wide.parquet").rename(
    columns={"Mask": "Compartment"}
)

# Split Feature into at most three underscore-separated parts.
# `n=2` caps the split so a name with more than two underscores keeps its
# remainder in Feature_3, and `reindex` pads with empty columns when no name
# has three parts. Without these, the three-column assignment below fails
# whenever the longest name does not have exactly three parts.
feature_parts = (
    orf_feature_df["Feature"]
    .str.split("_", n=2, expand=True)
    .reindex(columns=range(3))
)
orf_feature_df[["Feature_1", "Feature_2", "Feature_3"]] = feature_parts

# Remove leading underscores from Suffix. The vectorised string accessor
# leaves missing values missing instead of raising AttributeError.
orf_feature_df["Suffix"] = orf_feature_df["Suffix"].str.lstrip("_")

# Create feature name: join the non-empty string parts with "_".
# Missing parts (None / NaN) and empty strings are skipped; NaN must be
# excluded explicitly because it is truthy and cannot be joined.
feature_name_parts = [
    "Compartment",
    "Feature_1",
    "Feature_2",
    "Feature_3",
    "Channel",
    "Suffix",
]
orf_feature_df["feature_name"] = orf_feature_df[feature_name_parts].agg(
    lambda parts: "_".join(
        part for part in parts if isinstance(part, str) and part
    ),
    axis=1,
)

orf_feature_df.head()

Unnamed: 0,Compartment,Feature,Channel,Suffix,column_0,column_1,column_2,column_3,column_4,column_5,...,column_3630,column_3631,column_3632,column_3633,column_3634,column_3635,Feature_1,Feature_2,Feature_3,feature_name
0,Cells,AreaShapeCompactness,,,1.0,-0.15137,-0.101507,-0.059984,-0.057121,-0.10545,...,0.022632,-0.061216,-0.005541,-0.093061,-0.108426,-0.075014,AreaShapeCompactness,,,Cells_AreaShapeCompactness
1,Cells,AreaShapeZernike,,6_6,-0.15137,1.0,-0.119333,0.031895,0.037169,-0.025448,...,0.000272,-0.045915,-0.003861,-0.02643,-0.077761,-0.06411,AreaShapeZernike,,,Cells_AreaShapeZernike_6_6
2,Cells,Texture_Correlation,AGP,3_01_256,-0.101507,-0.119333,1.0,-0.074286,-0.229202,0.20807,...,-0.270294,-0.092594,-0.175566,0.078074,0.023241,-0.097288,Texture,Correlation,,Cells_Texture_Correlation_AGP_3_01_256
3,Cells,Texture_DifferenceEntropy,Mito,5_03_256,-0.059984,0.031895,-0.074286,1.0,0.248083,0.573112,...,0.097819,0.136732,0.066876,0.777491,0.513563,0.16853,Texture,DifferenceEntropy,,Cells_Texture_DifferenceEntropy_Mito_5_03_256
4,Cells,Texture_Entropy,ER,10_00_256,-0.057121,0.037169,-0.229202,0.248083,1.0,0.000372,...,0.555658,0.723166,0.506062,0.050585,0.065895,0.478547,Texture,Entropy,,Cells_Texture_Entropy_ER_10_00_256


In [4]:
# Separate the wide table into two pieces:
#   * feature_names_df - the descriptive columns plus the assembled name
#   * correlation_df   - everything else, indexed by feature name
feature_names_df = orf_feature_df.loc[:, feature_name_cols + ["feature_name"]].copy()

correlation_df = orf_feature_df.set_index("feature_name").drop(
    columns=feature_name_cols
)

# Relabel the value columns with the feature names, position by position.
# NOTE(review): this assumes the value columns are in the same order as the
# rows - confirm against how the parquet file was produced.
correlation_df.columns = feature_names_df["feature_name"].to_list()

In [5]:
# Build one exclusion mask instead of filtering step by step.
feature_text = feature_names_df["Feature"]
suffix_text = feature_names_df["Suffix"]

is_excluded = (
    # Correlation features
    feature_text.str.contains("Correlation")
    # Object Number features (both spellings)
    | feature_text.str.contains("ObjectNumber")
    | feature_text.str.contains("Object_Number")
    # Any feature whose Suffix contains "X" or "Y"
    # (presumably coordinate-style features - TODO confirm)
    | suffix_text.str.contains("X")
    | suffix_text.str.contains("Y")
)

feature_names_df = feature_names_df[~is_excluded]

# Restrict the matrix to the surviving features on both axes.
kept_feature_names = feature_names_df["feature_name"].to_list()
correlation_df = correlation_df.loc[kept_feature_names, kept_feature_names]

In [6]:
# Turn the square matrix into a long table with one row per feature pair.
# The lower triangle *including the diagonal* is blanked out first, so only
# entries above the diagonal remain and each pair is listed once.
lower_triangle_and_diagonal = np.tril(np.ones(correlation_df.shape)).astype(bool)

# Names produced by stack()/reset_index() -> names used in the rest of the notebook.
stacked_column_names = {
    "feature_name": "feature_name_1",
    "level_1": "feature_name_2",
    0: "correlation",
}

upper_triangle_df = correlation_df.mask(lower_triangle_and_diagonal)
correlation_df_stacked = (
    upper_triangle_df.stack()
    .reset_index()
    .rename(columns=stacked_column_names)
    .sort_values(by=["correlation"], ascending=False)  # strongest positive first
)
correlation_df_stacked

Unnamed: 0,feature_name_1,feature_name_2,correlation
2823858,Cytoplasm_AreaShapeBoundingBoxArea,Cells_AreaShapeBoundingBoxArea,1.000000
3406061,Cells_NeighborsSecondClosestDistance_Adjacent,Cells_NeighborsSecondClosestDistance_5,1.000000
879837,Cells_NeighborsAngleBetweenNeighbors_Adjacent,Cells_NeighborsAngleBetweenNeighbors_5,1.000000
1861691,Cells_AreaShapeMaxFeretDiameter,Cytoplasm_AreaShapeMaxFeretDiameter,1.000000
2970722,Cells_NeighborsFirstClosestDistance_5,Cells_NeighborsFirstClosestDistance_Adjacent,1.000000
...,...,...,...
858224,Cytoplasm_RadialDistribution_FracAtD_DNA_3of4,Cytoplasm_RadialDistribution_FracAtD_DNA_4of4,-0.990500
4937895,Nuclei_Texture_InverseDifferenceMoment_Mito_5_...,Nuclei_Texture_Entropy_Mito_3_02_256,-0.990562
2033056,Nuclei_Texture_Entropy_AGP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_AGP_5_0...,-0.990900
4745297,Nuclei_Texture_Entropy_Mito_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mito_5_...,-0.990912


In [7]:
# Attach the descriptive columns (compartment, feature parts, channel, suffix)
# for both members of every pair. The same left-merge is done twice: once
# keyed on feature_name_1 (columns get a "_1" suffix) and once keyed on
# feature_name_2 (columns get a "_2" suffix).
feature_annotations_df = feature_names_df.drop(columns=["Feature"])

first_member_columns = {
    "Compartment": "Compartment_1",
    "Channel": "Channel_1",
    "Suffix": "Suffix_1",
    "Feature_1": "Feature_1_1",
    "Feature_2": "Feature_2_1",
    "Feature_3": "Feature_3_1",
}
second_member_columns = {
    "Compartment": "Compartment_2",
    "Channel": "Channel_2",
    "Suffix": "Suffix_2",
    "Feature_1": "Feature_1_2",
    "Feature_2": "Feature_2_2",
    "Feature_3": "Feature_3_2",
}

for pair_key, renamed_columns in (
    ("feature_name_1", first_member_columns),
    ("feature_name_2", second_member_columns),
):
    correlation_df_stacked = correlation_df_stacked.merge(
        feature_annotations_df.rename(columns=renamed_columns),
        left_on=pair_key,
        right_on="feature_name",
        how="left",
    ).drop(columns=["feature_name"])  # the join key from the right side is redundant

correlation_df_stacked.head()

Unnamed: 0,feature_name_1,feature_name_2,correlation,Compartment_1,Feature_1_1,Feature_2_1,Feature_3_1,Channel_1,Suffix_1,Compartment_2,Feature_1_2,Feature_2_2,Feature_3_2,Channel_2,Suffix_2
0,Cytoplasm_AreaShapeBoundingBoxArea,Cells_AreaShapeBoundingBoxArea,1.0,Cytoplasm,AreaShapeBoundingBoxArea,,,,,Cells,AreaShapeBoundingBoxArea,,,,
1,Cells_NeighborsSecondClosestDistance_Adjacent,Cells_NeighborsSecondClosestDistance_5,1.0,Cells,NeighborsSecondClosestDistance,,,,Adjacent,Cells,NeighborsSecondClosestDistance,,,,5
2,Cells_NeighborsAngleBetweenNeighbors_Adjacent,Cells_NeighborsAngleBetweenNeighbors_5,1.0,Cells,NeighborsAngleBetweenNeighbors,,,,Adjacent,Cells,NeighborsAngleBetweenNeighbors,,,,5
3,Cells_AreaShapeMaxFeretDiameter,Cytoplasm_AreaShapeMaxFeretDiameter,1.0,Cells,AreaShapeMaxFeretDiameter,,,,,Cytoplasm,AreaShapeMaxFeretDiameter,,,,
4,Cells_NeighborsFirstClosestDistance_5,Cells_NeighborsFirstClosestDistance_Adjacent,1.0,Cells,NeighborsFirstClosestDistance,,,,5,Cells,NeighborsFirstClosestDistance,,,,Adjacent


One pair of texture features between ER and RNA (ideally the same feature for both channels, to keep it simple to explain). Most likely it will be related to cell area and not terribly interesting, but I could see a gene that ‘breaks’ the correlation between ER and RNA being interesting, so it’s worth doing.

In [8]:
# Candidate ER-vs-RNA texture pairs: same compartment, same suffix, and a
# correlation above 0.9. Each criterion is applied as its own query, in order.
#
# NOTE(review): only rows with the ER feature in feature_name_1 and the RNA
# feature in feature_name_2 are matched. Each pair is stored once, so pairs
# held in the opposite orientation are not considered - confirm this is intended.
# NOTE(review): Feature_2_* is not constrained, so the two features are not
# required to be the same texture measurement.
er_rna_texture_criteria = (
    "Channel_1 == 'ER' and Channel_2 == 'RNA'",
    "Feature_1_1 =='Texture' and Feature_1_2 == 'Texture'",
    "Compartment_1==Compartment_2",
    "Suffix_1==Suffix_2",
    "correlation>0.9",
)

df = correlation_df_stacked
for criterion in er_rna_texture_criteria:
    df = df.query(criterion)

In [9]:
# Record the first candidate pair. The candidates presumably still carry the
# descending-correlation order from the earlier sort, which would make this
# the most strongly correlated ER/RNA texture pair.
top_pair = df.iloc[0]

top_pair_df = pd.DataFrame(
    {
        "feature_name_1": top_pair["feature_name_1"],
        "feature_name_2": top_pair["feature_name_2"],
    },
    index=[0],
)

selected_feature_names_df = pd.concat(
    [selected_feature_names_df, top_pair_df],
    ignore_index=True,
)

selected_feature_names_df.head()

Unnamed: 0,feature_name_1,feature_name_2
0,Nuclei_Texture_AngularSecondMoment_ER_10_01_256,Nuclei_Texture_AngularSecondMoment_RNA_10_01_256


The 'neat' one mentioned above (in CRISPR): `Nuclei_Granularity_1_RNA` and `Cytoplasm_AreaShape_Solidity`

In [10]:
# Hand-picked pair: RNA granularity in the nucleus vs cytoplasm solidity.
# NOTE(review): these names are typed by hand and are not checked against the
# feature_name values assembled earlier in this notebook - confirm they match
# the naming expected by whatever reads the exported CSV.
feature_name_1, feature_name_2 = (
    "Nuclei_Granularity_1_RNA",
    "Cytoplasm_AreaShape_Solidity",
)

manual_pair_df = pd.DataFrame(
    [{"feature_name_1": feature_name_1, "feature_name_2": feature_name_2}]
)
selected_feature_names_df = pd.concat(
    [selected_feature_names_df, manual_pair_df], ignore_index=True
)

selected_feature_names_df.head()

Unnamed: 0,feature_name_1,feature_name_2
0,Nuclei_Texture_AngularSecondMoment_ER_10_01_256,Nuclei_Texture_AngularSecondMoment_RNA_10_01_256
1,Nuclei_Granularity_1_RNA,Cytoplasm_AreaShape_Solidity


Nucleus area vs cell area

In [11]:
# Hand-picked pair: nucleus area vs cell area.
# NOTE(review): the names assembled earlier in this notebook appear without an
# underscore after "AreaShape" (e.g. Cells_AreaShapeBoundingBoxArea) - confirm
# the spelling used here is the one the consumer of the CSV expects.
feature_name_1, feature_name_2 = "Nuclei_AreaShape_Area", "Cells_AreaShape_Area"

manual_pair_df = pd.DataFrame(
    [{"feature_name_1": feature_name_1, "feature_name_2": feature_name_2}]
)
selected_feature_names_df = pd.concat(
    [selected_feature_names_df, manual_pair_df], ignore_index=True
)

selected_feature_names_df.head()

Unnamed: 0,feature_name_1,feature_name_2
0,Nuclei_Texture_AngularSecondMoment_ER_10_01_256,Nuclei_Texture_AngularSecondMoment_RNA_10_01_256
1,Nuclei_Granularity_1_RNA,Cytoplasm_AreaShape_Solidity
2,Nuclei_AreaShape_Area,Cells_AreaShape_Area


In [12]:
# Hand-picked pair: integrated DNA intensity in the nucleus vs cell area.
feature_name_1, feature_name_2 = (
    "Nuclei_Intensity_IntegratedIntensity_DNA",
    "Cells_AreaShape_Area",
)

manual_pair_df = pd.DataFrame(
    [{"feature_name_1": feature_name_1, "feature_name_2": feature_name_2}]
)
selected_feature_names_df = pd.concat(
    [selected_feature_names_df, manual_pair_df], ignore_index=True
)

selected_feature_names_df.head()

Unnamed: 0,feature_name_1,feature_name_2
0,Nuclei_Texture_AngularSecondMoment_ER_10_01_256,Nuclei_Texture_AngularSecondMoment_RNA_10_01_256
1,Nuclei_Granularity_1_RNA,Cytoplasm_AreaShape_Solidity
2,Nuclei_AreaShape_Area,Cells_AreaShape_Area
3,Nuclei_Intensity_IntegratedIntensity_DNA,Cells_AreaShape_Area


In [13]:
# Hand-picked pair: integrated RNA intensity in the nucleus vs cell area.
feature_name_1, feature_name_2 = (
    "Nuclei_Intensity_IntegratedIntensity_RNA",
    "Cells_AreaShape_Area",
)

manual_pair_df = pd.DataFrame(
    [{"feature_name_1": feature_name_1, "feature_name_2": feature_name_2}]
)
selected_feature_names_df = pd.concat(
    [selected_feature_names_df, manual_pair_df], ignore_index=True
)

selected_feature_names_df.head()

Unnamed: 0,feature_name_1,feature_name_2
0,Nuclei_Texture_AngularSecondMoment_ER_10_01_256,Nuclei_Texture_AngularSecondMoment_RNA_10_01_256
1,Nuclei_Granularity_1_RNA,Cytoplasm_AreaShape_Solidity
2,Nuclei_AreaShape_Area,Cells_AreaShape_Area
3,Nuclei_Intensity_IntegratedIntensity_DNA,Cells_AreaShape_Area
4,Nuclei_Intensity_IntegratedIntensity_RNA,Cells_AreaShape_Area


In [14]:
# Write the selected feature pairs to CSV (without the row index).
# The target folder is created first: to_csv does not create missing
# directories, so the write would otherwise fail when `output/` is absent.
selected_features_path = Path("output/selected_feature_names.csv")
selected_features_path.parent.mkdir(parents=True, exist_ok=True)
selected_feature_names_df.to_csv(selected_features_path, index=False)