In [1]:
import pandas as pd
from pycytominer.cyto_utils import write_gct
import utils

In [2]:
# Pipeline identifier per perturbation modality. The value is interpolated
# into the profile and phenotypic-activity file names used in this notebook.
pipelines = dict(
    ORF="wellpos_cc_var_mad_outlier_featselect_sphering_harmony",
    CRISPR="wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected",
)

### ORF profiles

In [3]:
# Load the ORF profile table for the pipeline selected in `pipelines`.
orf_profiles_df = pd.read_parquet(
    path=f'../profiles/profiles_{pipelines["ORF"]}.parquet',
)
# Displayed as the cell output: (rows, columns).
orf_profiles_df.shape

(81660, 726)

In [4]:
# Phenotypic-activity statistics for the ORF pipeline.
# Steps: read the five needed columns, keep rows where `below_corrected_p`
# is True, drop that flag column, then prefix the statistic columns with
# "Metadata_" so they are treated as metadata downstream.
orf_phenotypic_activity_df = (
    pd.read_csv(
        f'../03.retrieve-annotations/output/phenotypic-activity-{pipelines["ORF"]}.csv.gz',
        usecols=[
            "Metadata_JCP2022",
            "mean_average_precision",
            "p_value",
            "corrected_p_value",
            "below_corrected_p",
        ],
    )
    .loc[lambda activity: activity["below_corrected_p"].eq(True)]
    .drop(columns="below_corrected_p")
    .rename(
        columns={
            "mean_average_precision": "Metadata_mean_average_precision",
            "p_value": "Metadata_p_value",
            "corrected_p_value": "Metadata_corrected_p_value",
        }
    )
)
# Displayed as the cell output: (rows, columns).
orf_phenotypic_activity_df.shape

(7817, 4)

In [5]:
# ORF annotation table (tab-separated; compression is inferred from the
# ".gz" suffix by pandas).
orf_metadata_df = pd.read_csv(
    "../00.download-and-process-annotations/output/orf_metadata.tsv.gz",
    delimiter="\t",
)
# Displayed as the cell output: (rows, columns).
orf_metadata_df.shape

(15142, 26)

In [6]:
# Two joins on `Metadata_JCP2022`:
#   1. inner call  - left join: every profile row keeps its place and gains
#      the annotation columns from `orf_metadata_df`;
#   2. outer call  - inner join: only rows whose JCP ID appears in the
#      filtered phenotypic-activity table survive.
orf_df = pd.merge(
    left=pd.merge(
        left=orf_profiles_df,
        right=orf_metadata_df,
        on="Metadata_JCP2022",
        how="left",
    ),
    right=orf_phenotypic_activity_df,
    on="Metadata_JCP2022",
    how="inner",
)
# Displayed as the cell output: (rows, columns).
orf_df.shape

(39350, 754)

In [7]:
# Consensus profile: collapse the merged ORF table with the project helper,
# grouping on `Metadata_Symbol`.
# NOTE(review): this rebinds `orf_df`, so re-running the cell passes the
# already-collapsed table back into `utils.consensus`.
orf_df = utils.consensus(
    orf_df,
    group_by_feature="Metadata_Symbol",
)
# Displayed as the cell output: (rows, columns).
orf_df.shape

(7031, 754)

In [8]:
# Export the ORF consensus table as a GCT file. Columns whose name starts
# with "X_" are passed as features; every other column is passed as metadata.
write_gct(
    orf_df,
    f'../profiles/orf_phenotypically_active_{pipelines["ORF"]}.gct',
    features=[column for column in orf_df.columns if column.startswith("X_")],
    meta_features=[
        column for column in orf_df.columns if not column.startswith("X_")
    ],
)

### CRISPR profiles

In [9]:
# Load the CRISPR profile table for the pipeline selected in `pipelines`.
crispr_profiles_df = pd.read_parquet(
    path=f'../profiles/profiles_{pipelines["CRISPR"]}.parquet',
)
# Displayed as the cell output: (rows, columns).
crispr_profiles_df.shape

(51185, 263)

In [10]:
# Phenotypic-activity statistics for the CRISPR pipeline.
# Steps: read the five needed columns, keep rows where `below_corrected_p`
# is True, drop that flag column, then prefix the statistic columns with
# "Metadata_" so they are treated as metadata downstream.
crispr_phenotypic_activity_df = (
    pd.read_csv(
        f'../03.retrieve-annotations/output/phenotypic-activity-{pipelines["CRISPR"]}.csv.gz',
        usecols=[
            "Metadata_JCP2022",
            "mean_average_precision",
            "p_value",
            "corrected_p_value",
            "below_corrected_p",
        ],
    )
    .loc[lambda activity: activity["below_corrected_p"].eq(True)]
    .drop(columns="below_corrected_p")
    .rename(
        columns={
            "mean_average_precision": "Metadata_mean_average_precision",
            "p_value": "Metadata_p_value",
            "corrected_p_value": "Metadata_corrected_p_value",
        }
    )
)
# Displayed as the cell output: (rows, columns).
crispr_phenotypic_activity_df.shape

(5546, 4)

In [11]:
# CRISPR annotation table (tab-separated; compression is inferred from the
# ".gz" suffix by pandas).
crispr_metadata_df = pd.read_csv(
    "../00.download-and-process-annotations/output/crispr_metadata.tsv.gz",
    delimiter="\t",
)
# Displayed as the cell output: (rows, columns).
crispr_metadata_df.shape

(7977, 17)

In [12]:
# Two joins on `Metadata_JCP2022`:
#   1. inner call  - left join: every profile row keeps its place and gains
#      the annotation columns from `crispr_metadata_df`;
#   2. outer call  - inner join: only rows whose JCP ID appears in the
#      filtered phenotypic-activity table survive.
crispr_df = pd.merge(
    left=pd.merge(
        left=crispr_profiles_df,
        right=crispr_metadata_df,
        on="Metadata_JCP2022",
        how="left",
    ),
    right=crispr_phenotypic_activity_df,
    on="Metadata_JCP2022",
    how="inner",
)
# Displayed as the cell output: (rows, columns).
crispr_df.shape

(29909, 282)

In [13]:
# Consensus profile: collapse the merged CRISPR table with the project helper,
# grouping on `Metadata_Symbol`.
# NOTE(review): this rebinds `crispr_df`, so re-running the cell passes the
# already-collapsed table back into `utils.consensus`.
crispr_df = utils.consensus(
    crispr_df,
    group_by_feature="Metadata_Symbol",
)
# Displayed as the cell output: (rows, columns).
crispr_df.shape

(5546, 282)

In [14]:
# Export the CRISPR consensus table as a GCT file. Columns whose name starts
# with "X_" are passed as features; every other column is passed as metadata.
write_gct(
    crispr_df,
    f'../profiles/crispr_phenotypically_active_{pipelines["CRISPR"]}.gct',
    features=[column for column in crispr_df.columns if column.startswith("X_")],
    meta_features=[
        column for column in crispr_df.columns if not column.startswith("X_")
    ],
)