In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import utils

# Configure the root handler to print records as
# LEVEL:timestamp:logger-name:message.
logging.basicConfig(
    format="%(levelname)s:%(asctime)s:%(name)s:%(message)s",
)
# Lower only the "copairs" logger to INFO so its progress messages are shown.
logging.getLogger(name="copairs").setLevel(level=logging.INFO)

In [2]:
# Suffix of the processed profile file to load; it is interpolated into the
# parquet file name when the profiles are read.
operations = (
    "wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected"
)
# Forwarded as `batch_size` to utils.cosine_similarity.
batch_size = 20_000

### Prepare the data

#### Read the CRISPR parquet file

In [3]:
# The profile file name is derived from the `operations` suffix.
profiles_path = f"../profiles/profiles_{operations}.parquet"
crispr_df = pd.read_parquet(profiles_path)

# Preview the first rows (metadata columns followed by X_* feature columns).
crispr_df.head()

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022,X_1,X_2,X_3,X_4,X_5,X_6,...,X_250,X_251,X_252,X_253,X_254,X_255,X_256,X_257,X_258,X_259
0,source_13,CP-CC9-R1-01,A02,JCP2022_800002,-0.223417,-0.049487,-0.826231,0.345093,-0.273055,-0.186349,...,-0.041767,0.073491,-0.157532,-0.071469,-0.037753,0.053746,0.023982,0.156385,-0.06443,0.004538
1,source_13,CP-CC9-R1-01,K14,JCP2022_804257,-0.066288,0.076715,-1.067553,-0.367578,0.122728,-0.823368,...,0.034426,0.033929,0.102059,0.144348,0.208035,0.050269,0.001003,0.02716,-0.040426,-0.049395
2,source_13,CP-CC9-R1-01,L23,JCP2022_800002,-0.079349,-0.016958,-0.277558,-0.625543,-0.236846,-0.140912,...,0.025519,0.134598,-0.018768,-0.079568,-0.214031,0.031374,-0.079467,0.001434,0.063802,-0.003757
3,source_13,CP-CC9-R1-01,K02,JCP2022_800001,-0.271417,-0.117471,0.462461,-0.48033,-0.406793,-0.69113,...,-0.129316,0.224771,0.204519,-0.08196,0.038231,0.081065,-0.009409,0.077791,-0.043725,0.08684
4,source_13,CP-CC9-R1-01,I08,JCP2022_800001,0.008612,-0.059106,-1.081568,-1.246591,0.025878,-0.241614,...,0.005117,-0.077804,0.038525,0.089097,0.074524,-0.120086,-0.038295,0.024144,0.140383,-0.023736


### Create cosine similarity matrix

#### Add Approved Symbols

In [4]:
# Load only the annotation columns needed to attach a gene symbol to each profile.
symbol_columns = ["Metadata_Approved_Symbol", "Metadata_Symbol"]
crispr_metadata = pd.read_csv(
    "../00.download-and-process-annotations/output/crispr_metadata.tsv.gz",
    sep="\t",
    usecols=["Metadata_JCP2022", *symbol_columns],
)

# Make the cell safe to re-run: it overwrites `crispr_df` with its own result,
# so on a second execution the symbol columns already exist. Merging again
# would then produce `_x`/`_y` suffixed duplicates and the
# `Metadata_Approved_Symbol` lookup below would fail. Dropping any previously
# added columns first (a no-op on the first run) avoids that.
# NOTE(review): a left merge repeats profile rows if `crispr_metadata` holds
# more than one row per Metadata_JCP2022 — confirm the key is unique there.
crispr_df = (
    crispr_df.drop(
        columns=[*symbol_columns, "Metadata_Gene_Symbol"], errors="ignore"
    )
    .merge(crispr_metadata, on="Metadata_JCP2022", how="left")
    # Prefer the approved symbol; fall back to the raw symbol where it is missing.
    .assign(
        Metadata_Gene_Symbol=lambda x: x.Metadata_Approved_Symbol.fillna(
            x.Metadata_Symbol
        )
    )
)

crispr_df.shape

(51185, 266)

#### Create consensus profiles

In [5]:
# Build consensus profiles grouped on the derived gene symbol column.
# NOTE(review): the aggregation itself (e.g. median vs. mean, and how metadata
# columns are carried over) lives in utils.consensus and is not visible here —
# confirm before interpreting downstream values.
consensus_df = utils.consensus(crispr_df, "Metadata_Gene_Symbol")
# Display (rows, columns) of the consensus frame as a quick sanity check.
consensus_df.shape

(7977, 266)

In [6]:
# Split the consensus frame into the numeric matrix and the row labels that
# the similarity computation takes as separate arguments.
# NOTE(review): presumably get_featuredata keeps only the non-metadata (X_*)
# columns — confirm in utils.
feature_df = utils.get_featuredata(consensus_df)
feature_values = feature_df.to_numpy()
# NOTE: `id` shadows the Python builtin of the same name; the name is kept
# because the following cell reads it.
id = consensus_df["Metadata_Gene_Symbol"].tolist()

In [7]:
# Compute the pairwise cosine similarities between consensus profiles, labelled
# by gene symbol on both axes.
# NOTE(review): how `batch_size` chunks the work is defined in
# utils.cosine_similarity — confirm there before tuning it.
cosine_sim_df = utils.cosine_similarity(id, feature_values, batch_size=batch_size)

INFO:2024-06-25 17:37:21,828:copairs:Computing positive similarities...


  0%|          | 0/1591 [00:00<?, ?it/s]

In [8]:
# Preview the first rows of the similarity matrix.
cosine_sim_df.head()

Unnamed: 0,non-targeting,PALS2,no-guide,VNN3P,MRE11,SEPTIN5,SEPTIN1,AFG2B,MRPL58,GUCY1B1,...,SFXN1,BZW2,SLC25A22,GUCA1A,SIX6,SMARCA4,TGIF1,LRSAM1,INPP5B,SPTLC1
non-targeting,1.0,-0.059666,0.554028,-0.04492,0.084458,0.198055,0.163457,0.04502,0.218653,-0.030881,...,0.00168,0.029651,0.004144,-0.038176,0.017746,0.082222,0.133552,0.256799,0.150499,-0.148298
PALS2,-0.059666,1.0,0.068846,0.257723,-0.055745,0.160068,0.066599,0.22666,-0.096645,0.182161,...,0.033832,0.003136,0.077804,0.18082,0.108754,-0.149984,0.164068,-0.032468,0.032573,-0.004394
no-guide,0.554028,0.068846,1.0,-0.021055,-0.020687,0.118124,-0.013529,0.061264,0.2244,-0.030042,...,-0.00075,0.091534,0.099934,-0.056077,0.029897,0.240102,0.050657,0.138434,-0.004588,0.064069
VNN3P,-0.04492,0.257723,-0.021055,1.0,0.089644,0.067335,-0.025578,0.123663,0.071904,-0.054312,...,-0.197055,0.110045,0.065167,0.082572,0.00901,-0.067736,-0.049077,-0.073309,0.013022,0.079246
MRE11,0.084458,-0.055745,-0.020687,0.089644,1.0,0.071703,-0.21677,-0.076589,-0.111722,-0.066612,...,-0.073299,0.049652,-0.065445,0.165883,0.172696,-0.025268,0.084685,0.012524,0.104712,-0.056433


In [9]:
# Output location of the gene-by-gene similarity matrix. A plain string is
# enough here because the file name contains no interpolated values.
cos_sim_path = Path("cos_sim/cosine_similarities_genes_crispr-all.parquet")
# to_parquet does not create missing parent directories, so on a fresh
# checkout the write would fail; create cos_sim/ first (no-op if it exists).
cos_sim_path.parent.mkdir(parents=True, exist_ok=True)
# index=True keeps the row labels (gene symbols) in the written file.
cosine_sim_df.to_parquet(cos_sim_path, index=True)