In [1]:
import pandas as pd
import utils
import numpy as np

In [2]:
# Groupings iterated by the per-compartment comparison.
compartments = [
    "Cells",
    "Cytoplasm",
    "Nuclei",
]
# Single feature group used for the per-compartment comparison.
feature_group = "AreaShape"

# Groupings iterated by the per-channel comparison.
channels = [
    "DNA",
    "RNA",
    "Mito",
    "ER",
    "AGP",
]
feature_group_list = [
    "Texture",
    "Intensity",
    "RadialDistribution",
]

In [3]:
# Read one gene symbol per line. Blank lines (e.g. a trailing empty line) are
# skipped so that an empty string never ends up in the gene set, where it would
# add a stray "-" to the printed header and an empty entry to the symbol filter.
with open("output/SLC-OR_genes.txt") as f:
    slc_or_genes = [symbol for symbol in (line.strip() for line in f) if symbol]

In [4]:
# Gene sets analysed below: the symbols read from the SLC-OR genes file,
# followed by a literal list of eight gene symbols.
orf_genesets = [
    slc_or_genes,
    [
        "YAP1",
        "WWTR1",
        "VGLL4",
        "PRKCE",
        "STK3",
        "CEP72",
        "IL20RB",
        "MTMR9",
    ],
]

In [5]:
# Annotation table; only the join key, gene symbol and perturbation type are read.
orf_metadata_df = pd.read_csv(
    "../00.download-and-process-annotations/output/orf_metadata.tsv.gz",
    usecols=["Metadata_JCP2022", "Metadata_Symbol", "Metadata_pert_type"],
    sep="\t",
)

# Attach symbol and perturbation type to every profile row. The inner join
# keeps only profiles whose Metadata_JCP2022 also appears in the annotations.
orf_profiles_with_feature_names_df = pd.merge(
    pd.read_parquet("../profiles/profiles_wellpos_cc_var_mad_outlier_orf.parquet"),
    orf_metadata_df,
    on="Metadata_JCP2022",
    how="inner",
)

orf_profiles_with_feature_names_df.head()

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,...,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,Metadata_Symbol,Metadata_pert_type
0,source_4,BR00117035,A01,JCP2022_905588,1.976969,2.47889,-0.027964,-0.219052,-0.387556,-0.408526,...,-3.679746,-3.644897,-3.663296,-3.716321,-3.636625,-3.645543,-3.727525,-3.679245,CDK9,trt
1,source_4,BR00117036,A01,JCP2022_905588,0.609293,1.348788,-1.335341,-0.904727,-1.493824,-1.208351,...,2.437158,2.377791,2.409654,2.368619,2.390954,2.298647,2.406632,2.438143,CDK9,trt
2,source_4,BR00121558,A01,JCP2022_905588,-0.185963,1.084949,0.525013,-0.442693,0.36696,-0.529556,...,-0.525705,-0.495094,-0.506351,-0.553593,-0.530804,-0.562515,-0.519971,-0.535011,CDK9,trt
3,source_4,BR00121559,A01,JCP2022_905588,1.975644,2.602919,2.095917,-3.154853,1.84804,-3.808197,...,-0.86896,-0.879897,-0.857301,-0.868555,-0.879776,-0.898712,-0.885029,-0.847506,CDK9,trt
4,source_4,BR00121560,A01,JCP2022_905588,-3.07734,-2.134534,-2.492584,-0.933778,-2.360221,-0.558088,...,-0.916399,-0.913091,-0.958685,-0.89614,-0.88787,-0.906178,-0.915392,-0.889229,CDK9,trt


In [6]:
feature_names = utils.get_featurecols(orf_profiles_with_feature_names_df)

# Parse every feature name into a single-row frame, then concatenate once.
# Growing the frame with pd.concat inside the loop copies all previous rows on
# every iteration (quadratic in the number of features).
parsed_feature_frames = [
    pd.DataFrame(utils.parse_cp_features(feature_name), index=[0])
    for feature_name in feature_names
]

# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no feature columns at all.
feature_name_interpretation_df = (
    pd.concat(parsed_feature_frames, ignore_index=True)
    if parsed_feature_frames
    else pd.DataFrame()
)

feature_name_interpretation_df.head()

Unnamed: 0,feature,compartment,feature_group,feature_type,channel
0,Cells_AreaShape_Area,Cells,AreaShape,Area,XNONE
1,Cells_AreaShape_BoundingBoxArea,Cells,AreaShape,BoundingBoxArea,XNONE
2,Cells_AreaShape_BoundingBoxMaximum_X,Cells,AreaShape,BoundingBoxMaximum,XNONE
3,Cells_AreaShape_BoundingBoxMaximum_Y,Cells,AreaShape,BoundingBoxMaximum,XNONE
4,Cells_AreaShape_BoundingBoxMinimum_X,Cells,AreaShape,BoundingBoxMinimum,XNONE


In [7]:
# For every gene set, compare the consensus profile of its genes with the
# consensus negative-control profile from the same plates. One cosine
# similarity is reported per compartment, restricted to `feature_group` columns.
for geneset in orf_genesets:
    # Profile selection and consensus building do not depend on the
    # compartment, so they are done once per gene set instead of once per
    # compartment.
    geneset_profiles = orf_profiles_with_feature_names_df.query(
        "Metadata_Symbol in @geneset"
    )
    plates_with_genes = list(np.unique(geneset_profiles.Metadata_Plate.to_list()))
    negative_control_profiles = orf_profiles_with_feature_names_df.query(
        "Metadata_Plate in @plates_with_genes"
    ).query("Metadata_pert_type=='negcon'")

    # Two-stage consensus for the genes: first per perturbation
    # (Metadata_JCP2022), then across perturbations via Metadata_pert_type.
    consensus_profiles = utils.consensus(
        utils.consensus(geneset_profiles, "Metadata_JCP2022"),
        "Metadata_pert_type",
    )
    consensus_negative_control_profiles = utils.consensus(
        negative_control_profiles, "Metadata_pert_type"
    )

    # Row/column labels handed to utils.cosine_similarity (was named `id`,
    # which shadowed the builtin).
    profile_labels = ["genes", "negcon"]

    # Collect one record per compartment and build the frame once at the end
    # rather than growing it with pd.concat inside the loop.
    similarity_records = []
    for compartment in compartments:
        feature_cols = (
            feature_name_interpretation_df.query("feature_group==@feature_group")
            .query("compartment==@compartment")
            .feature.to_list()
        )

        # Absolute feature values of the first (index 0) row of each consensus
        # frame, stacked as [genes, negcon].
        feature_values = np.asarray(
            [
                np.abs(
                    utils.get_featuredata(consensus_profiles[feature_cols])
                ).values[0],
                np.abs(
                    utils.get_featuredata(
                        consensus_negative_control_profiles[feature_cols]
                    )
                ).values[0],
            ]
        )

        # Off-diagonal entry [0, 1] is the genes-vs-negcon similarity.
        cosine_sim = utils.cosine_similarity(profile_labels, feature_values).values[
            0, 1
        ]

        similarity_records.append(
            {
                "Compartment": compartment,
                "Feature_group": feature_group,
                "Similarity": cosine_sim,
            }
        )

    compartment_feature_group_df = pd.DataFrame(similarity_records).pivot(
        index="Feature_group", columns="Compartment", values="Similarity"
    )

    print("-".join(geneset))
    print(compartment_feature_group_df.to_markdown())

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

OR2L13-OR2C1-SLC10A3-SLC35F5-SLC22A12-SLC16A14-OR2A4-OR5L1-SLC22A14-OR7E2P-OR6N1-OR2A2-OR52I2-OR6B2-OR56A1-OR2B11-OR2AE1-OR4N4-OR10S1-OR4D6-OR13J1-OR13A1-OR14I1-OR12D3-OR5C1-SLC7A9-SLC22A13-OR2S2-SLC35E2A-OR10H2-SLC29A3-SLCO6A1-SLC7A7-SLC28A2-SLC49A4-SLC17A2-SLC35G5
| Feature_group   |    Cells |   Cytoplasm |   Nuclei |
|:----------------|---------:|------------:|---------:|
| AreaShape       | 0.670766 |    0.681845 | 0.817445 |


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

YAP1-WWTR1-VGLL4-PRKCE-STK3-CEP72-IL20RB-MTMR9
| Feature_group   |    Cells |   Cytoplasm |   Nuclei |
|:----------------|---------:|------------:|---------:|
| AreaShape       | 0.462371 |    0.532238 | 0.582476 |


In [8]:
# For every gene set, compare the consensus profile of its genes with the
# consensus negative-control profile from the same plates. One cosine
# similarity is reported per (channel, feature group) pair.
for geneset in orf_genesets:
    # Profile selection and consensus building do not depend on the channel or
    # feature group, so they are done once per gene set instead of once per
    # (channel, feature group) pair.
    geneset_profiles = orf_profiles_with_feature_names_df.query(
        "Metadata_Symbol in @geneset"
    )
    plates_with_genes = list(np.unique(geneset_profiles.Metadata_Plate.to_list()))
    negative_control_profiles = orf_profiles_with_feature_names_df.query(
        "Metadata_Plate in @plates_with_genes"
    ).query("Metadata_pert_type=='negcon'")

    # Two-stage consensus for the genes: first per perturbation
    # (Metadata_JCP2022), then across perturbations via Metadata_pert_type.
    consensus_profiles = utils.consensus(
        utils.consensus(geneset_profiles, "Metadata_JCP2022"),
        "Metadata_pert_type",
    )
    consensus_negative_control_profiles = utils.consensus(
        negative_control_profiles, "Metadata_pert_type"
    )

    # Row/column labels handed to utils.cosine_similarity (was named `id`,
    # which shadowed the builtin).
    profile_labels = ["genes", "negcon"]

    # Collect one record per pair and build the frame once at the end rather
    # than growing it with pd.concat inside the loop.
    similarity_records = []
    for channel in channels:
        # The loop variable is deliberately NOT called `feature_group`: a
        # module-level for-loop rebinds that name, which would overwrite the
        # notebook-wide `feature_group` setting and change the result of the
        # per-compartment cell if it is re-run after this one.
        for channel_feature_group in feature_group_list:
            feature_cols = (
                feature_name_interpretation_df.query(
                    "feature_group==@channel_feature_group"
                )
                .query("channel==@channel")
                .feature.to_list()
            )

            # Absolute feature values of the first (index 0) row of each
            # consensus frame, stacked as [genes, negcon].
            feature_values = np.asarray(
                [
                    np.abs(
                        utils.get_featuredata(consensus_profiles[feature_cols])
                    ).values[0],
                    np.abs(
                        utils.get_featuredata(
                            consensus_negative_control_profiles[feature_cols]
                        )
                    ).values[0],
                ]
            )

            # Off-diagonal entry [0, 1] is the genes-vs-negcon similarity.
            cosine_sim = utils.cosine_similarity(
                profile_labels, feature_values
            ).values[0, 1]

            similarity_records.append(
                {
                    "Channel": channel,
                    "Feature_group": channel_feature_group,
                    "Similarity": cosine_sim,
                }
            )

    channel_feature_group_df = pd.DataFrame(similarity_records).pivot(
        index="Feature_group", columns="Channel", values="Similarity"
    )

    print("-".join(geneset))
    print(channel_feature_group_df.to_markdown())

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

OR2L13-OR2C1-SLC10A3-SLC35F5-SLC22A12-SLC16A14-OR2A4-OR5L1-SLC22A14-OR7E2P-OR6N1-OR2A2-OR52I2-OR6B2-OR56A1-OR2B11-OR2AE1-OR4N4-OR10S1-OR4D6-OR13J1-OR13A1-OR14I1-OR12D3-OR5C1-SLC7A9-SLC22A13-OR2S2-SLC35E2A-OR10H2-SLC29A3-SLCO6A1-SLC7A7-SLC28A2-SLC49A4-SLC17A2-SLC35G5
| Feature_group      |      AGP |      DNA |       ER |     Mito |      RNA |
|:-------------------|---------:|---------:|---------:|---------:|---------:|
| Intensity          | 0.663982 | 0.638312 | 0.451621 | 0.748014 | 0.678516 |
| RadialDistribution | 0.628601 | 0.82015  | 0.741996 | 0.769545 | 0.612776 |
| Texture            | 0.713483 | 0.319648 | 0.692508 | 0.770683 | 0.622415 |


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

YAP1-WWTR1-VGLL4-PRKCE-STK3-CEP72-IL20RB-MTMR9
| Feature_group      |      AGP |      DNA |       ER |     Mito |      RNA |
|:-------------------|---------:|---------:|---------:|---------:|---------:|
| Intensity          | 0.668385 | 0.625085 | 0.689311 | 0.799832 | 0.67024  |
| RadialDistribution | 0.623159 | 0.729679 | 0.703727 | 0.53702  | 0.619651 |
| Texture            | 0.673136 | 0.678875 | 0.566791 | 0.751775 | 0.719063 |
