In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import utils

# Root logger layout: LEVEL:timestamp:logger-name:message.
logging.basicConfig(
    format="%(levelname)s:%(asctime)s:%(name)s:%(message)s",
)
# Let INFO-level records from the "copairs" logger through so its progress
# messages appear in the cell output.
copairs_logger = logging.getLogger("copairs")
copairs_logger.setLevel(logging.INFO)

In [2]:
# Name of the preprocessing pipeline; it is interpolated into the file names of
# the profiles parquet and the phenotypic-activity CSV read by later cells.
operations = "wellpos_var_mad_int_featselect_harmony_PCA_corrected"

# Forwarded as `batch_size` to utils.cosine_similarity.
batch_size = 20_000

### Prepare the data

#### Read the CRISPR parquet file

In [3]:
# Profiles file for the pipeline named in `operations`.
profiles_path = f"../profiles/profiles_{operations}.parquet"
crispr_df = pd.read_parquet(profiles_path)

#### Remove empty wells and `negcon`s

In [4]:
# Hand the frame to the project helper that drops negcon and empty wells
# (per the section heading; the helper itself lives in utils).
crispr_df = crispr_df.pipe(utils.remove_negcon_empty_wells)
crispr_df.shape

(43707, 309)

#### Remove `poscon` wells.

In [5]:
# Keep every row whose symbol is not PLK1 (the poscon wells being removed in
# this section), then renumber the index from 0.
crispr_df = (
    crispr_df
    .query('Metadata_Symbol!="PLK1"')
    .reset_index(drop=True)
)
crispr_df.shape

(43138, 309)

#### Remove CRISPR reagents without phenotypic activity

In [6]:
# Phenotypic-activity table for the same pipeline as the profiles.
phenotypic_activity_path = (
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations}.csv.gz"
)

# Read only the reagent id and the `below_corrected_p` flag, give the flag a
# Metadata_ prefix, and keep the rows where it is True.
crispr_phenotypic_activity_df = (
    pd.read_csv(
        phenotypic_activity_path,
        usecols=["Metadata_JCP2022", "below_corrected_p"],
    )
    .rename(columns={"below_corrected_p": "Metadata_below_corrected_p"})
    .query("Metadata_below_corrected_p==True")
)

crispr_phenotypic_activity_df.shape

(5685, 2)

In [7]:
# Inner join on the reagent id: only profiles whose reagent appears in the
# filtered activity table survive, and the activity flag column is appended.
crispr_df = pd.merge(
    crispr_df,
    crispr_phenotypic_activity_df,
    on="Metadata_JCP2022",
    how="inner",
).reset_index(drop=True)

crispr_df.shape

(30686, 310)

### Create cosine similarity matrix

#### Add Approved Symbols

In [8]:
# Annotation table: reagent id -> approved gene symbol.
annotation_columns = ["Metadata_JCP2022", "Metadata_Approved_Symbol"]
crispr_metadata = pd.read_csv(
    "../00.download-and-process-annotations/output/crispr.csv.gz",
    usecols=annotation_columns,
)

# Replace the profiles' own `Metadata_Approved_symbol` column (lower-case "s")
# with `Metadata_Approved_Symbol` from the annotation table. The left join
# keeps every profile row even when the reagent has no annotation.
crispr_df = pd.merge(
    crispr_df.drop(columns=["Metadata_Approved_symbol"]),
    crispr_metadata,
    on="Metadata_JCP2022",
    how="left",
)

# Gene symbol = approved symbol when present, otherwise fall back to
# Metadata_Symbol.
approved_symbol = crispr_df["Metadata_Approved_Symbol"]
crispr_df = crispr_df.assign(
    Metadata_Gene_Symbol=np.where(
        approved_symbol.isna(), crispr_df["Metadata_Symbol"], approved_symbol
    )
)

crispr_df.shape

(30686, 311)

#### Create consensus profiles

In [9]:
# Build consensus profiles keyed on the gene symbol via the project helper
# (the recorded output shows one row per gene symbol: 5684 rows).
consensus_df = crispr_df.pipe(utils.consensus, "Metadata_Gene_Symbol")
consensus_df.shape

(5684, 311)

In [10]:
# Column lists handed to utils.cosine_similarity for pair matching.
# NOTE(review): presumably "sameby" columns must match and "diffby" columns
# must differ within a pair (copairs convention) — confirm against utils.
pos_diffby = ["Metadata_Gene_Symbol"]
pos_sameby, neg_sameby, neg_diffby = [], [], []

In [11]:
# Split the consensus table with the project helpers, then expose the feature
# block as a plain NumPy array for the similarity computation.
metadata_df, feature_df = (
    utils.get_metadata(consensus_df),
    utils.get_featuredata(consensus_df),
)
feature_values = feature_df.values

In [12]:
# Pairwise cosine similarities between consensus profiles.
# Arguments: metadata and features, then the four pair-matching column lists,
# then the batch size configured at the top of the notebook.
cosine_sim_df = utils.cosine_similarity(
    metadata_df, feature_values,
    pos_sameby, pos_diffby, neg_sameby, neg_diffby,
    batch_size=batch_size,
)

INFO:2024-04-05 15:39:07,211:copairs:Finding positive pairs...
INFO:2024-04-05 15:39:38,184:copairs:Computing positive similarities...


  0%|          | 0/808 [00:00<?, ?it/s]

In [13]:
# Preview the first five rows of the similarity matrix.
cosine_sim_df.head(n=5)

Unnamed: 0,PALS2,MRPL58,GUCY1B1,TAFAZZIN,MARS1,SARS1,ATP5PO,EPRS1,GFUS,TMT1B,...,MIB2,SPSB1,TP53I3,PLD5,SFXN1,BZW2,SIX6,SMARCA4,TGIF1,SPTLC1
PALS2,1.0,-0.422487,-0.210183,-0.740775,-0.055936,-0.068931,-0.587807,-0.41472,-0.572635,-0.622593,...,0.148122,-0.355132,0.178931,0.42399,0.559796,0.53907,-0.545599,0.478428,-0.359071,0.58536
MRPL58,-0.422487,1.0,0.185705,0.595375,-0.467842,0.327467,0.553897,0.217571,-0.570478,0.01195,...,-0.389247,-0.051933,0.214238,0.303927,0.275834,0.223305,-0.128052,0.03601,-0.507883,0.394999
GUCY1B1,-0.210183,0.185705,1.0,0.517195,0.008332,0.337339,0.381405,0.196439,0.523371,0.456519,...,-0.4879,0.540347,0.637117,-0.173665,0.622148,0.167815,-0.660105,0.482375,0.073153,-0.163348
TAFAZZIN,-0.740775,0.595375,0.517195,1.0,0.020713,0.709119,0.61483,0.001276,-0.270886,0.057081,...,0.014641,0.188011,-0.252593,-0.049781,0.085963,-0.121196,0.149903,-0.316598,-0.062708,0.188427
MARS1,-0.055936,-0.467842,0.008332,0.020713,1.0,0.101256,-0.196862,-0.042089,-0.354583,-0.003254,...,-0.014736,-0.166872,-0.267012,0.328517,0.045806,0.047072,0.176365,0.24771,-0.119467,0.016594


In [14]:
# Persist the similarity matrix; index=True keeps the row labels (the gene
# symbols shown in the preview above) in the file.
cos_sim_path = Path("cos_sim/cosine_similarities_genes_crispr.parquet")

# Create the output directory first: on a fresh checkout a missing `cos_sim/`
# would otherwise make the write fail after the long similarity computation.
cos_sim_path.parent.mkdir(parents=True, exist_ok=True)

cosine_sim_df.to_parquet(cos_sim_path, index=True)