In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import utils

# Configure the root logger's message layout, then raise the "copairs" logger
# to INFO so its progress messages are shown.
log_format = "%(levelname)s:%(asctime)s:%(name)s:%(message)s"
logging.basicConfig(format=log_format)

copairs_logger = logging.getLogger("copairs")
copairs_logger.setLevel(logging.INFO)

In [2]:
# Tag naming the processed profile set to analyse; it is interpolated into the
# profile parquet and phenotypic-activity file names read further down.
operations = "wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected"
# Batch size passed to utils.cosine_similarity when computing the similarities.
batch_size = 20000

### Prepare the data

#### Read the CRISPR parquet file

In [3]:
# Load the profile table that corresponds to the configured `operations` tag.
profiles_path = f"../profiles/profiles_{operations}.parquet"
crispr_df = pd.read_parquet(profiles_path)

#### Remove CRISPR reagents without phenotypic activity

In [4]:
# Read the per-reagent phenotypic activity calls, keeping only the identifier
# and the significance flag.
activity_path = (
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations}.csv.gz"
)
activity_calls = pd.read_csv(
    activity_path,
    usecols=["Metadata_JCP2022", "below_corrected_p"],
)

# Prefix the flag with "Metadata_" so it is named like the other metadata
# columns, then keep only the reagents whose flag is True.
activity_calls = activity_calls.rename(
    columns={"below_corrected_p": "Metadata_below_corrected_p"}
)
crispr_phenotypic_activity_df = activity_calls.query(
    "Metadata_below_corrected_p==True"
)

crispr_phenotypic_activity_df.shape

(5546, 2)

In [5]:
# Inner join on the reagent identifier: profiles whose reagent is absent from
# the filtered activity table are dropped. The index is renumbered afterwards.
crispr_df = pd.merge(
    crispr_df,
    crispr_phenotypic_activity_df,
    how="inner",
    on="Metadata_JCP2022",
).reset_index(drop=True)

crispr_df.shape

(29909, 264)

### Create cosine similarity matrix

#### Add Approved Symbols

In [6]:
# Symbol annotations for each reagent (tab-separated, gzip-compressed).
crispr_metadata = pd.read_csv(
    "../00.download-and-process-annotations/output/crispr_metadata.tsv.gz",
    sep="\t",
    usecols=["Metadata_JCP2022", "Metadata_Approved_Symbol", "Metadata_Symbol"],
)

# Left join keeps every profile row; Metadata_Gene_Symbol takes the approved
# symbol when one is present and falls back to Metadata_Symbol otherwise.
crispr_df = crispr_df.merge(
    crispr_metadata, on="Metadata_JCP2022", how="left"
).assign(
    Metadata_Gene_Symbol=lambda merged: np.where(
        merged["Metadata_Approved_Symbol"].notna(),
        merged["Metadata_Approved_Symbol"],
        merged["Metadata_Symbol"],
    )
)

crispr_df.shape

(29909, 267)

#### Create consensus profiles

In [7]:
# Build consensus profiles keyed on the gene symbol using the project helper.
# The row count drops from 29909 to 5546 (see the shapes printed in this
# notebook); the aggregation itself is defined in utils.consensus.
consensus_df = utils.consensus(crispr_df, "Metadata_Gene_Symbol")
consensus_df.shape

(5546, 267)

In [8]:
# Split the consensus table into the inputs for the similarity computation:
# the feature values as an array and the gene symbols as row labels.
feature_df = utils.get_featuredata(consensus_df)
feature_values = feature_df.to_numpy()
# NOTE: `id` shadows the Python builtin; the name is kept because the
# similarity cell that follows reads it.
id = consensus_df["Metadata_Gene_Symbol"].tolist()

In [9]:
# Compute the pairwise cosine similarities between consensus profiles,
# labelled by gene symbol; the work is done in batches of `batch_size`.
cosine_sim_df = utils.cosine_similarity(id, feature_values, batch_size=batch_size)

INFO:2024-05-02 12:09:53,813:copairs:Computing positive similarities...


  0%|          | 0/769 [00:00<?, ?it/s]

In [10]:
# Preview the first rows of the similarity matrix (rows and columns are gene symbols).
cosine_sim_df.head()

Unnamed: 0,PALS2,SEPTIN1,AFG2B,MRPL58,TAFAZZIN,MARS1,SARS1,ATP5PO,EPRS1,GFUS,...,TAPBP,TP53I3,PLD5,SFXN1,SLC25A22,GUCA1A,SIX6,SMARCA4,TGIF1,SPTLC1
PALS2,1.0,0.066599,0.22666,-0.096645,0.060595,-0.140241,-0.166251,0.351551,0.104548,0.175636,...,0.063718,0.022331,0.038044,0.033832,0.077804,0.18082,0.108754,-0.149984,0.164068,-0.004394
SEPTIN1,0.066599,1.0,0.300627,0.105448,-0.049673,0.131625,0.159204,0.077655,-0.085825,-0.294846,...,-0.004568,0.097005,-0.045114,0.256349,0.114172,-0.250846,0.15765,-0.13625,-0.038479,-0.032529
AFG2B,0.22666,0.300627,1.0,0.050718,0.012705,-0.381485,-0.378036,0.008984,-0.308267,0.014491,...,0.027969,0.035475,0.061099,0.121436,-0.173788,-0.143466,0.1466,-0.196483,0.129058,-0.089391
MRPL58,-0.096645,0.105448,0.050718,1.0,0.126722,0.179809,0.352486,-0.023718,0.309831,-0.034218,...,-0.093693,0.114346,-0.085495,-0.036108,0.124639,-0.14903,-0.105554,0.076929,-0.076278,0.037285
TAFAZZIN,0.060595,-0.049673,0.012705,0.126722,1.0,0.112294,0.137403,0.119867,0.144929,-0.002554,...,-0.080834,-0.006972,0.031903,0.057618,0.058439,0.114054,-0.156385,0.022209,-0.241812,0.031025


In [11]:
# Persist the gene-by-gene cosine similarity matrix.
# The output directory is created first so the cell also runs on a fresh
# checkout; writing would otherwise fail when cos_sim/ is missing.
output_path = Path("cos_sim/cosine_similarities_genes_crispr.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=True stores the row labels (gene symbols) alongside the values.
cosine_sim_df.to_parquet(output_path, index=True)