In [1]:
import pandas as pd
import utils
import numpy as np
import warnings
from tqdm import tqdm
from copairs.map import average_precision, mean_average_precision
import logging

# Root logging format: level, timestamp, logger name, then the message.
logging.basicConfig(format="%(levelname)s:%(asctime)s:%(name)s:%(message)s")
# Let the copairs logger emit INFO-level messages.
logging.getLogger("copairs").setLevel(logging.INFO)

# Keep FutureWarning messages out of the cell outputs.
warnings.simplefilter("ignore", FutureWarning)

In [2]:
# Suffix naming the processing steps applied to the profiles; it selects the
# input parquet file and is embedded in the output file name.
operations = "wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected"
# Passed as `batch_size` to copairs' average_precision.
batch_size = 20000
# Passed as `null_size` to copairs' mean_average_precision.
null_size = 20000
# Passed as `threshold` to copairs' mean_average_precision.
fdr = 0.1

In [3]:
# CRISPR control reagents: one (JCP2022 id, perturbation type) pair per row.

crispr_controls_df = pd.DataFrame(
    [
        ("JCP2022_805264", "poscon"),
        ("JCP2022_800001", "negcon"),
        ("JCP2022_800002", "negcon"),
    ],
    columns=["Metadata_JCP2022", "Metadata_pert_type"],
    index=[0, 1, 2],
)

### Prepare the data

#### Read the CRISPR parquet file

In [4]:
# Load the profile table whose file name carries the `operations` suffix.
crispr_df = pd.read_parquet(f"../profiles/profiles_{operations}.parquet")
crispr_df.shape

(51185, 263)

#### Add annotations

In [5]:
# CRISPR reagent annotations. Control reagents get their perturbation type from
# crispr_controls_df; every reagent without a match is labelled "trt".
crispr_metdata_df = pd.read_csv(
    "../00.download-and-process-annotations/output/crispr_metadata.tsv.gz", sep="\t"
)
crispr_metdata_df = crispr_metdata_df.merge(
    crispr_controls_df, on="Metadata_JCP2022", how="left"
)
crispr_metdata_df = crispr_metdata_df.fillna(value={"Metadata_pert_type": "trt"})

# Compound ids found on the plates: JCP2022_999999 is labelled "empty", and
# every other compound is labelled "poscon".
compound_metadata_df = pd.read_csv(
    "../datasets/metadata/compound.csv.gz", usecols=["Metadata_JCP2022"]
)
compound_metadata_df["Metadata_pert_type"] = np.where(
    compound_metadata_df["Metadata_JCP2022"] == "JCP2022_999999", "empty", "poscon"
)

# Stack both annotation tables, keeping the union of their columns.
metadata_df = pd.concat(
    [crispr_metdata_df, compound_metadata_df], join="outer", ignore_index=True
)

# Attach the annotations; profiles without a matching id are dropped (inner join).
crispr_df = crispr_df.merge(metadata_df, on="Metadata_JCP2022", how="inner")
crispr_df.shape

(51185, 279)

#### Remove `poscon` wells.

In [6]:
# Keep every profile whose perturbation type is not "poscon", then renumber rows.
crispr_df = crispr_df.loc[crispr_df["Metadata_pert_type"] != "poscon"].reset_index(
    drop=True
)
crispr_df.shape

(50616, 279)

#### Remove features with `nan` values.
These need to be removed as the `nan` values will cause the mean average precision calculation to fail.

In [7]:
# Project helper from utils; it prints the list of features it removed (an
# empty list means nothing was dropped).
# NOTE(review): presumably returns the frame without those columns — confirm in utils.
crispr_df = utils.remove_nan_features(crispr_df)
crispr_df.shape

Removed nan features: []


(50616, 279)

#### Remove reagents with only one replicate

In [8]:
# Count profiles per reagent once and reuse the result.
replicate_counts = crispr_df["Metadata_JCP2022"].value_counts()

# The reagent ids are the index of the counts Series. Reading them from the
# index (rather than `reset_index()["index"]`) works on both pandas 1.x and
# 2.x: since pandas 2.0 the column produced by reset_index() is named after the
# original Series, so looking up "index" raises a KeyError.
reagents_with_one_replicate = replicate_counts[replicate_counts == 1].index.tolist()

# Replicate-based average precision needs at least two profiles per reagent,
# so drop the reagents that have only one.
crispr_df = crispr_df[~crispr_df["Metadata_JCP2022"].isin(reagents_with_one_replicate)]
crispr_df.shape

(50613, 279)

### Calculate mAP for each CRISPR perturbation

In [9]:
# Flag negative-control wells: 1 for "negcon" profiles, 0 for everything else.
crispr_df["Metadata_negcon"] = np.where(
    crispr_df["Metadata_pert_type"].eq("negcon"), 1, 0
)

In [10]:
# Pair definitions handed to copairs' average_precision, in the order
# pos_sameby, pos_diffby, neg_sameby, neg_diffby.
# NOTE(review): per copairs conventions, "sameby" columns must match within a
# pair and "diffby" columns must differ — confirm against the copairs docs.
pos_sameby = ["Metadata_JCP2022"]  # positive pairs: profiles of the same reagent
pos_diffby = []
neg_sameby = ["Metadata_Plate"]  # negative pairs: profiles from the same plate ...
neg_diffby = ["Metadata_negcon"]  # ... that differ in the 0/1 negative-control flag

In [11]:
# Split the profiles with the project helpers in utils: one frame for the
# metadata columns and one for the feature columns. Note that `metadata_df` is
# rebound here and no longer refers to the annotation table.
metadata_df = utils.get_metadata(crispr_df)
feature_df = utils.get_featuredata(crispr_df)
# copairs takes the features as a plain NumPy array.
feature_values = feature_df.to_numpy()

In [12]:
# Average precision per profile; the four pair-definition lists are passed
# positionally, in the order the copairs function expects them.
result = average_precision(
    metadata_df,
    feature_values,
    pos_sameby,
    pos_diffby,
    neg_sameby,
    neg_diffby,
    batch_size=batch_size,
)

INFO:2024-05-01 15:47:31,628:copairs:Indexing metadata...
INFO:2024-05-01 15:47:31,676:copairs:Finding positive pairs...
INFO:2024-05-01 15:47:36,795:copairs:Finding negative pairs...
INFO:2024-05-01 15:47:37,504:copairs:Computing positive similarities...


  0%|          | 0/753 [00:00<?, ?it/s]

INFO:2024-05-01 15:47:45,754:copairs:Computing negative similarities...


  0%|          | 0/59 [00:00<?, ?it/s]

INFO:2024-05-01 15:47:46,441:copairs:Building rank lists...
INFO:2024-05-01 15:47:59,210:copairs:Computing average precision...
INFO:2024-05-01 15:47:59,628:copairs:Creating result DataFrame...
INFO:2024-05-01 15:47:59,632:copairs:Finished.


In [13]:
# Drop the negative-control rows from the scores and renumber the remaining rows.
result = result.loc[result["Metadata_pert_type"] != "negcon"].reset_index(drop=True)

In [14]:
# Aggregate the per-profile scores by `pos_sameby`, using a null distribution
# of `null_size` samples, `fdr` as the threshold, and a fixed seed.
agg_result = mean_average_precision(
    result, pos_sameby, null_size=null_size, threshold=fdr, seed=12527
)
# Make sure the score column is named "mean_average_precision".
agg_result = agg_result.rename(
    columns={"average_precision": "mean_average_precision"}
)

INFO:2024-05-01 15:47:59,741:copairs:Computing null_dist...


  0%|          | 0/87 [00:00<?, ?it/s]

INFO:2024-05-01 15:47:59,796:copairs:Computing p-values...


  0%|          | 0/7971 [00:00<?, ?it/s]

In [15]:
# Write the aggregated mAP table as a gzip-compressed CSV (pandas infers the
# compression from the .gz suffix); the `output/` directory must already exist.
agg_result.to_csv(f"output/phenotypic-activity-{operations}.csv.gz", index=False)