In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

import utils

# Print log records as LEVEL:timestamp:logger-name:message, and lower the
# threshold of the "copairs" logger to INFO so its progress messages are shown.
log_format = "%(levelname)s:%(asctime)s:%(name)s:%(message)s"
logging.basicConfig(format=log_format)

copairs_logger = logging.getLogger("copairs")
copairs_logger.setLevel(logging.INFO)

In [2]:
# Identifier embedded in the profiles filename (`profiles_{operations}.parquet`);
# presumably the chain of preprocessing steps applied to the profiles.
operations = "wellpos_cc_var_mad_outlier_featselect_sphering_harmony"

# Forwarded as `batch_size` to utils.cosine_similarity further down.
batch_size = 20_000

### Prepare the data

#### Read the ORF parquet file

In [3]:
# Load the profile table whose filename is selected by `operations`.
profiles_path = f"../profiles/profiles_{operations}.parquet"
orf_df = pd.read_parquet(profiles_path)

# Show (rows, columns) as a quick sanity check on the load.
orf_df.shape

(81660, 726)

### Create cosine similarity matrix

#### Add Approved Symbols

In [4]:
# Load only the join key and the two symbol columns from the ORF metadata table.
orf_metadata = pd.read_csv(
    "../00.download-and-process-annotations/output/orf_metadata.tsv.gz",
    sep="\t",
    usecols=["Metadata_JCP2022", "Metadata_Approved_Symbol", "Metadata_Symbol"],
)

# This cell reassigns `orf_df`, so it must be safe to re-run: drop any symbol
# columns left over from a previous execution before merging. Otherwise the
# merge would emit `_x`/`_y` suffixed duplicates and the attribute access on
# `Metadata_Approved_Symbol` below would fail. On a first run the drop is a no-op.
symbol_columns = ["Metadata_Approved_Symbol", "Metadata_Symbol"]
n_profiles = len(orf_df)

orf_annotated = (
    orf_df.drop(columns=symbol_columns, errors="ignore")
    .merge(orf_metadata, on="Metadata_JCP2022", how="left")
    .assign(
        # Prefer the approved symbol; fall back to the plain symbol when the
        # approved one is missing.
        Metadata_Gene_Symbol=lambda x: x.Metadata_Approved_Symbol.fillna(
            x.Metadata_Symbol
        )
    )
)

# A left merge only keeps the row count when each Metadata_JCP2022 matches at
# most one metadata row; fail loudly instead of silently duplicating profiles.
assert len(orf_annotated) == n_profiles, (
    "merging orf_metadata changed the number of profiles; "
    "Metadata_JCP2022 is not unique in orf_metadata"
)

orf_df = orf_annotated
orf_df.shape

(81660, 729)

#### Create consensus profiles

In [5]:
# Build consensus profiles by passing the gene-symbol column to utils.consensus
# as the grouping key (presumably one output row per distinct gene symbol).
# NOTE(review): the aggregation itself (e.g. median vs. mean) is defined in
# utils.consensus and is not visible here — confirm there.
consensus_df = utils.consensus(orf_df, "Metadata_Gene_Symbol")
consensus_df.shape

(12602, 729)

In [6]:
# Split the consensus table into the feature values and the gene-symbol labels
# that are handed to utils.cosine_similarity in the next cell.
# NOTE(review): utils.get_featuredata presumably keeps only the non-metadata
# feature columns — confirm in utils.
feature_df = utils.get_featuredata(consensus_df)
feature_values = feature_df.values
# NOTE(review): `id` shadows the Python builtin of the same name for the rest of
# the kernel session; the name is kept because the following cell reads it.
id = list(consensus_df["Metadata_Gene_Symbol"])

In [7]:
# Compute the cosine-similarity table for the consensus profiles, labelled by
# the gene symbols in `id`. This calls the project helper
# utils.cosine_similarity, not the sklearn function imported at the top.
# NOTE(review): the log output suggests the helper delegates to copairs and
# processes pairs in chunks of `batch_size` — confirm in utils.
cosine_sim_df = utils.cosine_similarity(
    id,
    feature_values,
    batch_size=batch_size,
)

INFO:2024-05-17 10:11:52,876:copairs:Computing positive similarities...


  0%|          | 0/3970 [00:00<?, ?it/s]

In [8]:
# Preview the first rows of the similarity table.
cosine_sim_df.head()

Unnamed: 0,CDK9,ZNF597,CDX1,LUCIFERASE,SIX2,HSF5,ATF7,DIDO1,RAI1,HcRed,...,HPGD,HSD17B3,RAB32,SLC39A9,TNF,OSER1,MRAP2,DNAAF6,LY6D,COQ4
CDK9,1.0,-0.053557,-0.036738,0.185173,0.053934,-0.033102,-0.02088,0.024192,0.102774,0.188763,...,0.040449,-0.018367,0.028402,0.047726,0.065281,-0.056427,0.04859,0.09442,-0.014681,0.003006
ZNF597,-0.053557,1.0,0.069338,0.094281,0.036989,0.18069,0.047083,0.095729,0.081758,0.062295,...,0.021428,0.064239,0.004501,0.066912,-0.142149,-0.046353,0.065912,-0.042277,0.068089,-0.004047
CDX1,-0.036738,0.069338,1.0,0.053382,0.170138,0.124885,0.115133,0.055702,0.083784,0.109262,...,0.010269,0.047989,-0.032858,-0.011991,-0.041313,0.021274,0.02288,0.103896,0.169554,0.061498
LUCIFERASE,0.185173,0.094281,0.053382,1.0,0.099741,0.133293,0.148306,0.182818,0.190666,0.607766,...,0.184806,-0.034328,0.200412,0.107746,0.003294,0.048373,0.169493,0.122674,0.071305,0.122501
SIX2,0.053934,0.036989,0.170138,0.099741,1.0,0.153347,0.068425,0.043138,0.097845,0.108223,...,0.035675,-0.1724,-0.060922,0.094394,-0.016155,-0.021136,0.007049,0.093166,0.078007,0.002403


In [9]:
# Persist the similarity table, keeping the index so the row labels are
# written alongside the columns.
output_path = Path("cos_sim/cosine_similarities_genes_orf-all.parquet")

# Create the output directory if needed so a fresh checkout can run this cell
# without a manual `mkdir`; a no-op when the directory already exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

cosine_sim_df.to_parquet(output_path, index=True)