In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import utils

# Render log records as LEVEL:timestamp:logger-name:message.
logging.basicConfig(format="%(levelname)s:%(asctime)s:%(name)s:%(message)s")

# Set the "copairs" logger to INFO so its INFO-level messages are emitted.
copairs_logger = logging.getLogger("copairs")
copairs_logger.setLevel(logging.INFO)

In [2]:
# Passed as `batch_size` to utils.cosine_similarity further down.
batch_size = 20_000
# Pipeline tag interpolated into the profile and phenotypic-activity file names.
operations = "wellpos_cc_var_mad_outlier_featselect_sphering_harmony"

### Prepare the data

#### Read the ORF parquet file

In [3]:
# Profiles file produced by the pipeline named in `operations`.
profiles_path = f"../profiles/profiles_{operations}.parquet"
orf_df = pd.read_parquet(profiles_path)

#### Remove empty wells and `negcon`s

In [4]:
# Apply the project helper that drops empty wells and negcons, then show the
# resulting (rows, columns).
orf_df = orf_df.pipe(utils.remove_negcon_empty_wells)
orf_df.shape

(77810, 743)

#### Remove `poscon` wells.

In [5]:
# Keep every well whose perturbation type is not "poscon" and renumber the
# rows from 0.
orf_df = (
    orf_df
    .loc[lambda wells: wells["Metadata_pert_type"].ne("poscon")]
    .reset_index(drop=True)
)
orf_df.shape

(75880, 743)

#### Remove `BAD CONSTRUCT` wells.

In [6]:
# Keep every well whose broad sample is not labelled "BAD CONSTRUCT" and
# renumber the rows from 0.
# NOTE(review): the recorded output shows the same row count before and after
# this filter (75880) — confirm the label and column are the intended ones.
orf_df = (
    orf_df
    .loc[lambda wells: wells["Metadata_broad_sample"].ne("BAD CONSTRUCT")]
    .reset_index(drop=True)
)
orf_df.shape

(75880, 743)

#### Remove low infection efficiency wells

In [7]:
# Apply the project helper that removes low-infection-efficiency wells, then
# show the resulting (rows, columns).
orf_df = orf_df.pipe(utils.remove_low_infection_efficiency_wells)
orf_df.shape

(69058, 743)

#### Remove ORF reagents without a phenotypic activity

In [8]:
# Phenotypic-activity table for the same pipeline tag as the profiles.
phenotypic_activity_path = (
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations}.csv.gz"
)

orf_phenotypic_activity_df = (
    pd.read_csv(
        phenotypic_activity_path,
        usecols=["Metadata_JCP2022", "below_corrected_p"],
    )
    # Presumably prefixed so the flag counts as a metadata column after the
    # merge below — confirm against utils.
    .rename(columns={"below_corrected_p": "Metadata_below_corrected_p"})
    # Keep only the reagents whose flag is True.
    .loc[lambda activity: activity["Metadata_below_corrected_p"].eq(True)]
)

orf_phenotypic_activity_df.shape

(7817, 2)

In [9]:
# Inner join on the reagent identifier: only wells whose reagent is listed in
# the filtered activity table are kept.
orf_df = orf_df.merge(
    orf_phenotypic_activity_df,
    how="inner",
    on="Metadata_JCP2022",
).reset_index(drop=True)

orf_df.shape

(39325, 744)

### Create cosine similarity matrix

#### Add Approved Symbols

In [10]:
# Annotation table: reagent identifier -> approved gene symbol.
orf_metadata = pd.read_csv(
    "../00.download-and-process-annotations/output/orf.csv.gz",
    usecols=["Metadata_JCP2022", "Metadata_Approved_Symbol"],
)

# Left join keeps every profile row, including reagents with no annotation.
orf_df = orf_df.merge(orf_metadata, how="left", on="Metadata_JCP2022")

# Use the approved symbol where present; otherwise fall back to the symbol
# already stored in the profiles (Metadata_Symbol).
orf_df = orf_df.assign(
    Metadata_Gene_Symbol=np.where(
        orf_df["Metadata_Approved_Symbol"].isna(),
        orf_df["Metadata_Symbol"],
        orf_df["Metadata_Approved_Symbol"],
    )
)

orf_df.shape

(39325, 746)

#### Create consensus profiles

In [11]:
# Build consensus profiles with the project helper, keyed on the gene symbol
# column, then show the resulting (rows, columns).
consensus_df = orf_df.pipe(utils.consensus, "Metadata_Gene_Symbol")
consensus_df.shape

(7031, 746)

In [12]:
# Column lists handed to utils.cosine_similarity. Only the positive
# "differ by" rule is populated: positive pairs must differ in gene symbol.
pos_sameby, pos_diffby = [], ["Metadata_Gene_Symbol"]
# No rules for negative pairs.
neg_sameby, neg_diffby = [], []

In [13]:
# Split the consensus table with the project helpers (presumably into
# metadata columns and feature columns — confirm in utils).
metadata_df, feature_df = (
    utils.get_metadata(consensus_df),
    utils.get_featuredata(consensus_df),
)
# Array form of the features, as passed to utils.cosine_similarity.
feature_values = feature_df.values

In [14]:
# Positional order expected by the call below: positive same/differ rules,
# then negative same/differ rules.
pair_rules = (pos_sameby, pos_diffby, neg_sameby, neg_diffby)

cosine_sim_df = utils.cosine_similarity(
    metadata_df, feature_values, *pair_rules, batch_size=batch_size
)

INFO:2024-04-05 15:36:06,803:copairs:Finding positive pairs...
INFO:2024-04-05 15:36:54,218:copairs:Computing positive similarities...


  0%|          | 0/1236 [00:00<?, ?it/s]

In [15]:
# Preview the first five rows of the similarity matrix.
cosine_sim_df.head(n=5)

Unnamed: 0,CDK9,CDX1,DIDO1,MORF4L1,ZNF816,CEBPG,SATB2,GMEB1,ZNF511,ZNF449,...,RSRC1,WDR53,HSD17B3,SLC39A9,TNF,OSER1,MRAP2,DNAAF6,LY6D,SSBP3
CDK9,1.0,-0.036738,0.024192,0.088266,0.118091,0.19383,-0.095525,-0.055995,0.072834,-0.154795,...,0.077817,-0.017365,0.066567,0.111152,0.030896,0.011113,0.055405,-0.062637,0.026236,0.066541
CDX1,-0.036738,1.0,0.044687,0.004917,-0.045187,0.168101,-0.045539,0.004681,-0.017317,0.0585,...,-0.012456,0.044695,0.049403,-0.070566,-0.049111,0.099573,0.067627,-0.044814,-0.026791,0.152591
DIDO1,0.024192,0.044687,1.0,-0.009977,0.015594,0.066995,0.002727,0.025855,-0.047823,-0.000241,...,0.010941,-0.0921,-0.040281,-0.022955,0.113931,0.013767,0.034814,-0.005883,-0.020173,-0.015723
MORF4L1,0.088266,0.004917,-0.009977,1.0,0.193922,0.214003,0.344931,0.245831,-0.050684,-0.082916,...,-0.069621,0.093656,0.16732,0.047869,0.074827,0.043172,0.050987,0.012764,-0.052054,-0.11202
ZNF816,0.118091,-0.045187,0.015594,0.193922,1.0,0.038202,-0.042108,-0.009093,0.009383,0.022128,...,-0.10214,-0.025482,0.040352,0.138435,0.049009,0.118845,0.133849,-0.104738,0.00939,-0.002249


In [16]:
# Persist the similarity matrix, keeping the index so the row labels are saved.
# The path has no placeholders, so it is a plain path rather than an f-string.
cosine_sim_path = Path("cos_sim") / "cosine_similarities_genes_orf.parquet"
# Create the output folder if needed so a fresh checkout can Restart & Run All.
cosine_sim_path.parent.mkdir(parents=True, exist_ok=True)
cosine_sim_df.to_parquet(cosine_sim_path, index=True)