In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import utils

# Root logging format: level, timestamp, logger name, message.
LOG_FORMAT = "%(levelname)s:%(asctime)s:%(name)s:%(message)s"
logging.basicConfig(format=LOG_FORMAT)

# Let INFO-level messages from the "copairs" logger through.
copairs_logger = logging.getLogger("copairs")
copairs_logger.setLevel(logging.INFO)

In [2]:
# Processing-pipeline tag; it is interpolated into the profiles file name read below.
operations = "wellpos_cc_var_mad_outlier_featselect_sphering_harmony_crispr"
# Forwarded as `batch_size` to `utils.cosine_similarity`.
batch_size = 20_000

### Prepare the data

#### Read the CRISPR parquet file

In [3]:
# Profiles file produced by the pipeline named in `operations`.
profiles_path = f"../profiles/profiles_{operations}.parquet"
crispr_df = pd.read_parquet(profiles_path)
crispr_df.head()

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022,X_1,X_2,X_3,X_4,X_5,X_6,...,X_590,X_591,X_592,X_593,X_594,X_595,X_596,X_597,X_598,X_599
0,source_13,CP-CC9-R1-01,A02,JCP2022_800002,0.072467,-0.158762,-0.017511,0.106918,-0.061986,-0.187484,...,-0.053909,-0.09402,-0.059859,0.002499,-0.029754,0.107617,-0.043598,-0.173765,0.15761,-0.199753
1,source_13,CP-CC9-R1-01,K20,JCP2022_804622,-0.084254,-0.280737,0.203017,-0.457098,0.465258,-0.588459,...,-0.22616,-0.225858,-0.050087,0.107821,0.006811,0.071454,-0.090616,-0.248816,0.032766,0.014232
2,source_13,CP-CC9-R1-01,K19,JCP2022_805152,-0.220093,-0.07763,0.258283,0.265943,0.286596,-0.011306,...,-0.24162,-0.047951,-0.174141,0.146646,0.058707,0.037053,-0.031482,-0.257376,-0.012701,0.246745
3,source_13,CP-CC9-R1-01,K18,JCP2022_802786,0.02933,-0.098859,0.101108,0.208802,0.02612,-0.4085,...,-0.158171,-0.191018,0.001785,0.22397,0.292346,0.282254,-0.081359,-0.202753,0.245245,0.280027
4,source_13,CP-CC9-R1-01,K17,JCP2022_804790,0.048176,0.187429,0.123768,0.170143,0.0859,-0.501685,...,-0.186205,-0.052261,-0.162797,0.077316,0.146613,0.012979,0.14572,-0.124978,0.011479,-0.141255


### Create cosine similarity matrix

#### Add Approved Symbols

In [4]:
# Gene-symbol annotations keyed by JCP2022 ID; only the columns used below are loaded.
symbol_columns = ["Metadata_Approved_Symbol", "Metadata_Symbol"]
crispr_metadata = pd.read_csv(
    "../00.download-and-process-annotations/output/crispr_metadata.tsv.gz",
    sep="\t",
    usecols=["Metadata_JCP2022", *symbol_columns],
)

# This cell rebinds `crispr_df`, so re-running it would merge the symbol columns a
# second time (pandas would add `_x`/`_y` suffixes and the attribute lookups below
# would fail). Dropping any columns left by a previous run keeps the cell idempotent;
# on a fresh kernel the drop is a no-op.
crispr_df = crispr_df.drop(
    columns=[*symbol_columns, "Metadata_Gene_Symbol"], errors="ignore"
).merge(crispr_metadata, on="Metadata_JCP2022", how="left")

# Prefer the approved symbol; fall back to Metadata_Symbol where it is missing.
crispr_df = crispr_df.assign(
    Metadata_Gene_Symbol=lambda x: np.where(
        x.Metadata_Approved_Symbol.isna(), x.Metadata_Symbol, x.Metadata_Approved_Symbol
    )
)

crispr_df.shape

(51185, 606)

#### Create consensus profiles

In [5]:
# Build consensus profiles from the annotated frame, handing the gene-symbol column
# to `utils.consensus` (presumably the grouping key — confirm in utils).
gene_symbol_column = "Metadata_Gene_Symbol"
consensus_df = utils.consensus(crispr_df, gene_symbol_column)
consensus_df.shape

(7977, 606)

In [6]:
# Feature columns of the consensus profiles, as returned by `utils.get_featuredata`,
# and their underlying array.
feature_df = utils.get_featuredata(consensus_df)
feature_values = feature_df.values

# Gene symbols in consensus-profile row order.
# NOTE(review): `id` shadows the Python builtin; the name is kept because the
# cosine-similarity cell that follows references it.
gene_symbol_series = consensus_df["Metadata_Gene_Symbol"]
id = list(gene_symbol_series)

In [7]:
# Similarities between consensus profiles, labelled with the gene symbols in `id`;
# `batch_size` is forwarded unchanged to the helper.
cosine_sim_df = utils.cosine_similarity(id, feature_values, batch_size=batch_size)

INFO:2024-09-11 17:18:28,324:copairs:Computing positive similarities...


  0%|          | 0/1591 [00:00<?, ?it/s]

In [8]:
# Preview the first five rows of the similarity table.
cosine_sim_df.head(n=5)

Unnamed: 0,non-targeting,NMNAT2,PIK3AP1,GOT1,OLR1,CHAT,GNA14,PALS2,MCM6,RAB21,...,SFXN1,BZW2,SLC25A22,GUCA1A,SIX6,SMARCA4,TGIF1,LRSAM1,INPP5B,SPTLC1
non-targeting,1.0,0.000198,-0.155372,-0.142131,-0.134412,-0.089354,-0.109164,0.028733,0.188457,-0.001575,...,0.014156,-0.076763,0.029949,0.00476,0.011702,0.079479,0.043226,0.198848,0.028288,-0.092342
NMNAT2,0.000198,1.0,0.317265,0.293904,-0.034857,0.329257,0.349793,0.190879,0.305049,0.048298,...,0.007679,-0.152132,-0.036735,0.098488,-0.1383,-0.075837,0.161809,0.247273,0.111211,-0.037786
PIK3AP1,-0.155372,0.317265,1.0,0.459136,0.125333,0.473906,0.23893,0.105264,0.037498,0.144313,...,0.266539,0.013586,0.112145,0.188767,0.138911,-0.062433,0.174342,0.005957,0.033262,0.145084
GOT1,-0.142131,0.293904,0.459136,1.0,0.249129,0.396783,0.424852,0.1278,0.198454,0.339,...,0.240508,0.078663,0.093205,0.261955,0.092529,0.046214,0.02414,-0.013267,0.00169,0.120112
OLR1,-0.134412,-0.034857,0.125333,0.249129,1.0,0.19115,0.308245,0.183074,-0.048855,0.251311,...,0.209641,0.212062,0.014415,0.014148,0.174049,0.007347,0.0649,0.105497,0.044881,0.126412


In [9]:
# Save the similarity table, keeping its index in the file.
output_path = Path("cos_sim") / "cosine_similarities_genes_crispr-all-no-correction.parquet"
# `to_parquet` does not create missing directories, so make sure the target exists
# (otherwise the write fails on a fresh checkout).
output_path.parent.mkdir(parents=True, exist_ok=True)
cosine_sim_df.to_parquet(output_path, index=True)