In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import utils

# Root logger format: LEVEL:timestamp:logger-name:message.
log_format = "%(levelname)s:%(asctime)s:%(name)s:%(message)s"
logging.basicConfig(format=log_format)

# Raise the `copairs` logger to INFO so its progress messages are shown.
copairs_logger = logging.getLogger("copairs")
copairs_logger.setLevel(logging.INFO)

In [2]:
# Suffix identifying which processed profile set to load; it is interpolated
# into both the profiles parquet path and the phenotypic-activity CSV path.
operations = "wellpos_var_mad_int_featselect_harmony_PCA_corrected"

# Forwarded as `batch_size` to utils.cosine_similarity.
batch_size = 20_000

### Prepare the data

#### Read the CRISPR parquet file

In [3]:
# Load the CRISPR profiles produced by the `operations` processing variant.
profiles_path = f"../profiles/profiles_{operations}.parquet"
crispr_df = pd.read_parquet(profiles_path)

#### Remove empty wells and `negcon`s

In [4]:
# Filter the profiles through the project helper, then show the resulting
# (rows, columns) so the effect of the filter is visible in the output.
crispr_df = crispr_df.pipe(utils.remove_negcon_empty_wells)
crispr_df.shape

(43707, 309)

#### Remove `poscon` wells.

In [5]:
# Keep only rows whose Metadata_Symbol is not PLK1, then renumber the index
# from zero so later positional operations see a contiguous range.
crispr_df = (
    crispr_df
    .query('Metadata_Symbol!="PLK1"')
    .reset_index(drop=True)
)
crispr_df.shape

(43138, 309)

#### Remove CRISPR reagents without phenotypic activity

In [6]:
# Phenotypic-activity table for the same `operations` variant as the profiles.
phenotypic_activity_path = (
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations}.csv.gz"
)

# Read only the reagent id and the activity flag, prefix the flag with
# `Metadata_` so it is named like the other metadata columns, and keep only
# the rows where the flag is True.
# NOTE(review): `below_corrected_p` presumably marks reagents whose corrected
# p-value passed the threshold -- confirm against 03.retrieve-annotations.
crispr_phenotypic_activity_df = (
    pd.read_csv(
        phenotypic_activity_path,
        usecols=["Metadata_JCP2022", "below_corrected_p"],
    )
    .rename(columns={"below_corrected_p": "Metadata_below_corrected_p"})
    .query("Metadata_below_corrected_p==True")
)

crispr_phenotypic_activity_df.shape

(5685, 2)

In [7]:
# Inner join on the reagent id: profiles whose Metadata_JCP2022 is absent from
# the filtered activity table are dropped, and the activity flag column is
# appended to the ones that remain.
crispr_df = pd.merge(
    crispr_df,
    crispr_phenotypic_activity_df,
    on="Metadata_JCP2022",
    how="inner",
).reset_index(drop=True)

crispr_df.shape

(30686, 310)

### Create cosine similarity matrix

#### Add Approved Symbols

In [8]:
# Reagent id -> approved gene symbol lookup from the annotation step.
crispr_metadata = pd.read_csv(
    "../00.download-and-process-annotations/output/crispr.csv.gz",
    usecols=["Metadata_JCP2022", "Metadata_Approved_Symbol"],
)

# 1. Drop the existing `Metadata_Approved_symbol` column (note the lowercase
#    "s"; it is a different column name from the one merged in below).
# 2. Left-join the approved symbols so every profile row is preserved.
# 3. Build Metadata_Gene_Symbol: use the approved symbol when present,
#    otherwise fall back to the original Metadata_Symbol.
crispr_df = (
    crispr_df.drop(columns=["Metadata_Approved_symbol"])
    .merge(crispr_metadata, on="Metadata_JCP2022", how="left")
    .assign(
        Metadata_Gene_Symbol=lambda frame: np.where(
            frame["Metadata_Approved_Symbol"].isna(),
            frame["Metadata_Symbol"],
            frame["Metadata_Approved_Symbol"],
        )
    )
)

crispr_df.shape

(30686, 311)

#### Create consensus profiles

In [9]:
# Collapse the profiles with the project helper, keyed on the gene symbol
# column built in the previous step, and display the resulting shape.
consensus_df = crispr_df.pipe(utils.consensus, "Metadata_Gene_Symbol")
consensus_df.shape

(5684, 311)

In [10]:
# Gene symbols of the consensus profiles, in row order.
# NOTE(review): `id` shadows the Python builtin; the name is kept because the
# following cell passes it to utils.cosine_similarity.
id = consensus_df["Metadata_Gene_Symbol"].tolist()

# Feature columns of the consensus profiles, as selected by the project
# helper, plus their raw array form for the similarity computation.
feature_df = utils.get_featuredata(consensus_df)
feature_values = feature_df.values

In [11]:
# Compute the similarity table from the gene symbols and their feature rows,
# passing the configured `batch_size` through to the helper.
cosine_sim_df = utils.cosine_similarity(id, feature_values, batch_size=batch_size)

INFO:2024-04-11 18:04:30,971:copairs:Computing positive similarities...


  0%|          | 0/808 [00:00<?, ?it/s]

In [12]:
# Preview the first five rows of the similarity table.
cosine_sim_df.head(n=5)

Unnamed: 0,PALS2,MRPL58,GUCY1B1,TAFAZZIN,MARS1,SARS1,ATP5PO,EPRS1,GFUS,TMT1B,...,MIB2,SPSB1,TP53I3,PLD5,SFXN1,BZW2,SIX6,SMARCA4,TGIF1,SPTLC1
PALS2,1.0,-0.643831,0.720265,-0.422487,-0.509733,-0.210183,0.735649,0.328283,-0.008024,-0.740775,...,-0.500447,-0.575982,0.053975,0.2776,0.502114,0.435335,0.580798,0.023257,0.556337,0.734259
MRPL58,-0.643831,1.0,-0.626703,0.465669,0.531746,0.56102,-0.469886,0.070868,-0.037122,0.6196,...,0.278111,0.323475,-0.192634,-0.048423,-0.295869,-0.502622,-0.445301,0.00361,-0.5045,-0.594612
GUCY1B1,0.720265,-0.626703,1.0,-0.46793,-0.564229,-0.232995,0.712205,0.206827,-0.226712,-0.774118,...,-0.457503,-0.53927,0.149425,0.334244,0.660528,0.510027,0.734846,-0.143996,0.653741,0.730705
TAFAZZIN,-0.422487,0.465669,-0.46793,1.0,0.424794,0.525662,-0.390304,-0.136345,-0.303524,0.569585,...,-0.010277,0.389118,-0.554688,0.001064,-0.19741,-0.161778,-0.340859,0.098475,-0.661722,-0.367321
MARS1,-0.509733,0.531746,-0.564229,0.424794,1.0,0.564323,-0.398502,0.049872,-0.065045,0.484347,...,0.068258,0.193165,-0.242758,-0.155394,-0.264613,-0.327999,-0.441528,-0.016923,-0.611758,-0.367407


In [13]:
# Persist the similarity table, keeping the index (index=True) so the row
# labels are written alongside the values.
output_path = Path("cos_sim/cosine_similarities_genes_crispr.parquet")

# `to_parquet` does not create missing parent directories; make sure the
# output folder exists so a fresh checkout does not fail after the expensive
# similarity computation has already run.
output_path.parent.mkdir(parents=True, exist_ok=True)

cosine_sim_df.to_parquet(output_path, index=True)