In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

import utils

# Log records are rendered as LEVEL:timestamp:logger-name:message.
LOG_FORMAT = "%(levelname)s:%(asctime)s:%(name)s:%(message)s"
logging.basicConfig(format=LOG_FORMAT)

# Let INFO-level records from the "copairs" logger through, so its progress
# messages appear in the cell output.
copairs_logger = logging.getLogger("copairs")
copairs_logger.setLevel(logging.INFO)

In [2]:
# Suffix interpolated into the input file names (profiles parquet and
# phenotypic-activity CSV); presumably the chain of preprocessing steps that
# produced those files.
operations = "wellpos_cc_var_mad_outlier_featselect_sphering_harmony"

# Forwarded as `batch_size` to utils.cosine_similarity further down.
batch_size = 20_000

### Prepare the data

#### Read the ORF parquet file

In [3]:
# Load the ORF profiles for the pipeline variant selected by `operations`.
profiles_path = f"../profiles/profiles_{operations}.parquet"
orf_df = pd.read_parquet(profiles_path)

#### Remove empty wells and `negcon`s

In [4]:
# Project helper; per the section heading it drops empty wells and negative
# controls. The trailing expression displays the remaining (rows, columns).
orf_df = orf_df.pipe(utils.remove_negcon_empty_wells)
orf_df.shape

(77810, 743)

#### Remove `poscon` wells.

In [5]:
# Drop positive-control wells, then rebuild the index so it stays contiguous.
orf_df = (
    orf_df
    .query('Metadata_pert_type!="poscon"')
    .reset_index(drop=True)
)
orf_df.shape

(75880, 743)

#### Remove `BAD CONSTRUCT` wells.

In [6]:
# Drop wells whose sample is labelled "BAD CONSTRUCT", then rebuild the index.
# In the recorded run the shape is the same before and after this filter.
orf_df = (
    orf_df
    .query('Metadata_broad_sample!="BAD CONSTRUCT"')
    .reset_index(drop=True)
)
orf_df.shape

(75880, 743)

#### Remove low infection efficiency wells

In [7]:
# Project helper; per the section heading it drops wells with low infection
# efficiency. The trailing expression displays the remaining (rows, columns).
orf_df = orf_df.pipe(utils.remove_low_infection_efficiency_wells)
orf_df.shape

(69058, 743)

#### Remove ORF reagents without a phenotypic activity

In [8]:
# Read the per-reagent phenotypic-activity table and keep only the rows whose
# `below_corrected_p` flag is True. The flag is renamed with a `Metadata_`
# prefix before filtering (presumably so it is treated as metadata rather than
# as a feature once merged into the profiles -- TODO confirm).
activity_path = (
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations}.csv.gz"
)
orf_phenotypic_activity_df = (
    pd.read_csv(activity_path, usecols=["Metadata_JCP2022", "below_corrected_p"])
    .rename(columns={"below_corrected_p": "Metadata_below_corrected_p"})
    .query("Metadata_below_corrected_p==True")
)

orf_phenotypic_activity_df.shape

(7817, 2)

In [9]:
# Inner join on the reagent id: only profiles whose reagent appears in the
# phenotypically-active table survive. The join also adds the
# `Metadata_below_corrected_p` column to the profiles.
orf_df = pd.merge(
    orf_df,
    orf_phenotypic_activity_df,
    on="Metadata_JCP2022",
    how="inner",
).reset_index(drop=True)

orf_df.shape

(39325, 744)

### Create cosine similarity matrix

#### Add Approved Symbols

In [10]:
# Look up the approved gene symbol for every reagent id.
orf_metadata = pd.read_csv(
    "../00.download-and-process-annotations/output/orf.csv.gz",
    usecols=["Metadata_JCP2022", "Metadata_Approved_Symbol"],
)

n_rows_before_annotation = len(orf_df)
orf_df = orf_df.merge(orf_metadata, on="Metadata_JCP2022", how="left")

# A left join used purely for annotation must not change the number of
# profiles. If a reagent id is repeated in the annotation file, the matching
# profiles would be silently duplicated and over-weighted in the consensus
# step, so fail loudly instead.
assert len(orf_df) == n_rows_before_annotation, (
    "Annotation merge changed the number of profiles "
    f"({n_rows_before_annotation} -> {len(orf_df)}); "
    "check for duplicate Metadata_JCP2022 values in orf.csv.gz"
)

# Prefer the approved symbol; fall back to the symbol already present in the
# profiles when no approved symbol is available.
orf_df = orf_df.assign(
    Metadata_Gene_Symbol=lambda x: np.where(
        x.Metadata_Approved_Symbol.isna(), x.Metadata_Symbol, x.Metadata_Approved_Symbol
    )
)

orf_df.shape

(39325, 746)

#### Create consensus profiles

In [11]:
# Collapse the profiles to one consensus row per `Metadata_Gene_Symbol` using
# the project helper (how replicates are aggregated is defined in utils).
consensus_df = orf_df.pipe(utils.consensus, "Metadata_Gene_Symbol")
consensus_df.shape

(7031, 746)

In [12]:
# Split the consensus table into the feature matrix and the matching labels.
feature_df = utils.get_featuredata(consensus_df)
feature_values = feature_df.values

# One gene symbol per consensus row, in row order.
# NOTE(review): `id` shadows the builtin of the same name; it is kept because
# the similarity cell below reads this variable.
id = consensus_df["Metadata_Gene_Symbol"].tolist()

In [13]:
# Pairwise cosine similarity between consensus profiles, labelled by `id`.
# This is the project helper in `utils`, not the sklearn function of the same
# name imported at the top of the notebook.
cosine_sim_df = utils.cosine_similarity(id, feature_values, batch_size=batch_size)

INFO:2024-04-11 18:03:09,360:copairs:Computing positive similarities...


  0%|          | 0/1236 [00:00<?, ?it/s]

In [14]:
# Preview the first five rows of the similarity matrix.
cosine_sim_df.head(5)

Unnamed: 0,CDK9,CDX1,DIDO1,MORF4L1,ZNF816,CEBPG,SATB2,GMEB1,ZNF511,ZNF449,...,RSRC1,WDR53,HSD17B3,SLC39A9,TNF,OSER1,MRAP2,DNAAF6,LY6D,SSBP3
CDK9,1.0,-0.036738,0.024192,0.088266,0.118091,0.19383,-0.095525,-0.055995,0.072834,-0.154795,...,0.128611,0.019198,-0.018367,0.047726,0.065281,-0.056427,0.04859,0.09442,-0.014681,0.078272
CDX1,-0.036738,1.0,0.055702,-0.092422,-0.065295,-0.088568,0.025723,0.095202,0.098659,0.084236,...,-0.015004,-0.041229,0.047989,-0.011991,-0.041313,0.021274,0.02288,0.103896,0.169554,0.018814
DIDO1,0.024192,0.055702,1.0,0.054351,-0.03991,0.029791,0.013561,-0.046964,0.088997,-0.019238,...,-0.049064,-0.039306,-0.035729,0.005856,0.004698,-0.150168,-0.049035,-0.001288,0.040873,0.034702
MORF4L1,0.088266,-0.092422,0.054351,1.0,0.37592,0.22984,-0.067568,-0.004566,0.004135,-0.011579,...,0.00798,0.092175,-0.101405,0.125299,0.095949,0.071784,0.091268,0.135257,0.055546,-0.048352
ZNF816,0.118091,-0.065295,-0.03991,0.37592,1.0,0.277695,-0.125863,-0.022127,-0.028693,-0.01987,...,0.112145,-0.00904,-0.028831,0.138181,0.089911,-0.038642,0.075395,0.129067,0.002618,-0.069227


In [15]:
# Persist the similarity matrix; index=True keeps the row labels (the gene
# symbols) in the file.
output_path = Path("cos_sim/cosine_similarities_genes_orf.parquet")

# Create the output directory if it is missing so the write does not fail on a
# fresh checkout; this is a no-op when the directory already exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

cosine_sim_df.to_parquet(output_path, index=True)