In [1]:
import pandas as pd
import utils
import numpy as np
import warnings
import random
from tqdm import tqdm
import random
from copairs.map import average_precision, mean_average_precision
import logging

# Emit log records as LEVEL:timestamp:logger:message, and let the "copairs"
# logger report INFO-level progress messages.
logging.basicConfig(
    format="%(levelname)s:%(asctime)s:%(name)s:%(message)s",
)
copairs_logger = logging.getLogger("copairs")
copairs_logger.setLevel(logging.INFO)

# Keep FutureWarning messages out of the cell outputs.
warnings.simplefilter("ignore", category=FutureWarning)

In [2]:
# Run configuration.
# Name of the preprocessing pipeline; it selects the input profiles file and
# is embedded in the output file name.
operations = "wellpos_var_mad_int_featselect_harmony_PCA"
# Passed as `batch_size` to copairs' average_precision.
batch_size = 20_000
# Passed as `null_size` to copairs' mean_average_precision.
null_size = 20_000
# Passed as `threshold` to copairs' mean_average_precision.
fdr = 0.1

### Prepare the data

#### Read the ORF parquet file

In [3]:
# Load the profiles written by the pipeline named in `operations`.
orf_df = pd.read_parquet(f"../profiles/profiles_{operations}.parquet")

In [4]:
# Display the column index of the loaded profiles.
orf_df.columns

Index(['Metadata_Source', 'Metadata_Plate', 'Metadata_Well',
       'Metadata_JCP2022', 'Metadata_broad_sample', 'Metadata_Name',
       'Metadata_Vector', 'Metadata_Transcript', 'Metadata_Symbol',
       'Metadata_NCBI_Gene_ID',
       ...
       '354', '355', '356', '357', '358', '359', '360', '361', '362', '363'],
      dtype='object', length=385)

#### Remove empty wells

In [5]:
# Filter through the project helper (defined in utils, not shown here), then
# display the resulting (rows, columns) shape.
orf_df = utils.remove_empty_wells(orf_df)
orf_df.shape

(81490, 385)

#### Remove `poscon` wells.

In [6]:
# Keep every well whose perturbation type is not "poscon" and renumber the
# rows from 0; then display the resulting shape.
not_poscon = orf_df["Metadata_pert_type"] != "poscon"
orf_df = orf_df.loc[not_poscon].reset_index(drop=True)
orf_df.shape

(79560, 385)

#### Remove `BAD CONSTRUCT` wells.

In [7]:
# Keep every well whose broad sample is not labelled "BAD CONSTRUCT" and
# renumber the rows from 0; then display the resulting shape.
not_bad_construct = orf_df["Metadata_broad_sample"] != "BAD CONSTRUCT"
orf_df = orf_df.loc[not_bad_construct].reset_index(drop=True)
orf_df.shape

(79560, 385)

#### Remove features with `nan` values.
These need to be removed as the `nan` values will cause the mean average precision calculation to fail.

In [8]:
# Drop NaN-containing feature columns via the project helper (defined in
# utils, not shown here); the helper prints the list of removed features.
orf_df = utils.remove_nan_features(orf_df)

Removed nan features: []


#### Remove low infection efficiency wells

In [9]:
# Attach each plate's plate-map name to the profiles before filtering with
# `remove_low_infection_efficiency_wells` (presumably the helper relies on
# this column -- confirm in utils).
plate_map_df = (
    pd.read_csv(
        "../00.download-and-process-annotations/input/experiment-metadata.tsv",
        sep="\t",
        usecols=["Plate_Map_Name", "Assay_Plate_Barcode"],
    )
    .rename(
        columns={
            "Assay_Plate_Barcode": "Metadata_Plate",
            "Plate_Map_Name": "Metadata_plate_map_name",
        }
    )
    # Exact repeats of a (plate map, plate) row carry no information, but in
    # a left merge they would duplicate every well of that plate.
    .drop_duplicates()
)

# `validate` makes pandas raise a MergeError if a plate barcode still maps to
# more than one plate map, instead of silently multiplying wells and biasing
# the downstream mAP calculation.
orf_df = orf_df.merge(
    plate_map_df, on="Metadata_Plate", how="left", validate="many_to_one"
)

# Filter with the project helper, then display the resulting shape.
orf_df = utils.remove_low_infection_efficiency_wells(orf_df)
orf_df.shape

(72343, 386)

### Calculate mAP for each ORF perturbation

In [10]:
# Flag negative-control wells: 1 where the perturbation type is "negcon",
# 0 for everything else.
is_negcon = orf_df["Metadata_pert_type"].eq("negcon")
orf_df["Metadata_negcon"] = np.where(is_negcon, 1, 0)

In [11]:
# Pair-matching rules handed to copairs below.
# Positive pairs: profiles sharing the same JCP2022 identifier.
pos_sameby, pos_diffby = ["Metadata_JCP2022"], []
# Negative pairs: profiles on the same plate whose negcon flag differs.
neg_sameby, neg_diffby = ["Metadata_Plate"], ["Metadata_negcon"]

In [12]:
# Split the profiles into metadata and feature tables with the project
# helpers, and expose the features as a plain NumPy array for copairs.
metadata_df = utils.get_metadata(orf_df)
feature_df = utils.get_featuredata(orf_df)
feature_values = feature_df.to_numpy()

In [13]:
# Average precision per profile, using the pair rules defined above.
result = average_precision(
    metadata_df,
    feature_values,
    pos_sameby,
    pos_diffby,
    neg_sameby,
    neg_diffby,
    batch_size=batch_size,
)

INFO:2024-04-05 15:22:02,145:copairs:Indexing metadata...


INFO:2024-04-05 15:22:02,202:copairs:Finding positive pairs...
INFO:2024-04-05 15:22:02,850:copairs:Finding negative pairs...
INFO:2024-04-05 15:22:03,564:copairs:Computing positive similarities...


  0%|          | 0/76 [00:00<?, ?it/s]

INFO:2024-04-05 15:22:05,262:copairs:Computing negative similarities...


  0%|          | 0/52 [00:00<?, ?it/s]

INFO:2024-04-05 15:22:06,357:copairs:Building rank lists...
INFO:2024-04-05 15:22:08,085:copairs:Computing average precision...
INFO:2024-04-05 15:22:08,150:copairs:Creating result DataFrame...
INFO:2024-04-05 15:22:08,154:copairs:Finished.


In [14]:
# Drop the negcon rows so that only the remaining perturbations are
# aggregated in the next step; renumber the rows from 0.
not_negcon = result["Metadata_pert_type"] != "negcon"
result = result.loc[not_negcon].reset_index(drop=True)

In [15]:
# Aggregate the per-profile scores by `pos_sameby` (one row per perturbation)
# with a fixed seed, then rename the score column to reflect that it now
# holds a mean.
agg_result = mean_average_precision(
    result,
    pos_sameby,
    null_size=null_size,
    threshold=fdr,
    seed=12527,
)
agg_result = agg_result.rename(
    columns={"average_precision": "mean_average_precision"}
)

INFO:2024-04-05 15:22:08,304:copairs:Computing null_dist...


  0%|          | 0/18 [00:00<?, ?it/s]

INFO:2024-04-05 15:22:08,359:copairs:Computing p-values...


  0%|          | 0/13739 [00:00<?, ?it/s]

In [16]:
# Write the aggregated table without the index; pandas infers gzip
# compression from the ".gz" suffix.
output_path = f"output/phenotypic-activity-{operations}.csv.gz"
agg_result.to_csv(output_path, index=False)