# Creating Similarity Matrix using Cosin Similarity

NOTES:

I realized that there are 121,000 different cells, so to plot a cosin similarity heatmap, there would be around 1 billion different scalars to deal with

Because of this, I am going to filter out the cells that are associated with the stages of the double injury, and just plot a heatmap of those

Hopefully, the number of cells is small enough that it is time efficient, but large enough that it looks clean.

In [1]:
import pandas as pd
import numpy as np

import scanpy as sc # to read seurat object (so.Robj)
import scipy
import anndata

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

#default plt to sns
sns.set(font_scale = 1.5)
sns.set_theme()

import os

In [2]:
#read data
print("ETA: ~40 sec")
h5ad = "scrna_data/stage5.h5ad"
seurat_clusters = sc.read_h5ad(h5ad)
print("h5ad import successful \n")

#extracting metadata
metadata = sc.get.obs_df(seurat_clusters, keys = ['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.rpl', 'percent.rps', 'time', 'location', 'RNA_snn_res.0.5', 'seurat_clusters', 'RNA_snn_res.1.8'])

#genes
gene_index = seurat_clusters.var.index.to_numpy()
print("first 5 genes:", gene_index[0:5])

#metadata
orig_ident = metadata["orig.ident"].to_numpy()
ident_index = metadata.index.to_numpy()
print("sample metadata:", orig_ident[0:5])

#expression matrix
expr_data = scipy.sparse.csc_matrix(seurat_clusters.X)
print("\nexpr_data:", expr_data.shape)

#NOTE: EXPR DATA IS SCIPY SPARSE MATRIX

#SPARSE DATA FRAME CAUSE FILE TOO BIG
#expr_data = expr_data.tocsc()
gene_expr_data = pd.DataFrame.sparse.from_spmatrix(expr_data)
print("gene df:", gene_expr_data.shape)

#replace index, column headings
gene_expr_data.index = ident_index
gene_expr_data.columns = gene_index
gene_expr_data["ident"] = orig_ident


ETA: ~40 sec



This is where adjacency matrices should go now.
  warn(


h5ad import successful 

first 5 genes: ['TTN' 'LOC100513133' 'LOC110257246' 'CTDSP1' 'ANKRD1']
sample metadata: ['AR1_MI28_P30_8064AZ' 'AR1_MI28_P30_8064AZ' 'AR1_MI28_P30_8064AZ'
 'AR1_MI28_P30_8064AZ' 'AR1_MI28_P30_8064AZ']

expr_data: (121239, 2000)
gene df: (121239, 2000)


In [3]:
gene_expr_data

Unnamed: 0,TTN,LOC100513133,LOC110257246,CTDSP1,ANKRD1,RPP30,TPM1,MYH7,DDX5,RBPMS,...,ELAVL2,LOC110260055,SH3GL2,LOC102166958,TRIM69,CRSP3,SLC25A31,LOC102161303,LOC100515185,ident
AR1_MI28_P30_8064AZ_TTTGTTGTCCATCTGC,1.697666,0.770168,-1.246079,0.696817,-1.167797,-1.013148,-1.906602,-2.166894,0.893988,0.408413,...,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGTTGTCAAATGAG,-0.327701,-0.037845,-0.382901,-0.715539,0.880698,0.343318,-0.311018,0.472611,-0.361272,0.060178,...,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGTTGCATGGCCAC,1.209177,1.256926,0.870256,-1.246729,1.530894,1.890148,-1.906602,0.916366,0.767533,0.637964,...,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGTTGAGGGCAGAG,0.138056,0.915296,0.616571,-1.475042,1.192702,0.659002,0.635606,-2.166894,-1.984069,0.828888,...,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGGTTTCACGTCCT,1.253481,0.202796,0.676249,-0.569255,1.173452,0.584889,-0.339262,0.699816,-0.072271,0.509724,...,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214,AR1_MI28_P30_8064AZ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FH_Embryo_FH3_TAAGCACTCATTGCGA,0.611586,-0.383652,0.379707,-0.256823,-1.167797,-1.013148,0.885588,-0.409514,-0.303753,0.473320,...,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214,FH_Embryo_FH3
FH_Embryo_FH3_TAAGCCACAAGGTCGA,-1.431611,-1.950535,0.130116,-0.308941,-0.535375,-0.297280,0.198273,-0.506293,0.006462,-0.130211,...,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214,FH_Embryo_FH3
FH_Embryo_FH3_TCACATTTCTGGGCGT,-0.884059,-0.425313,0.171121,-0.688443,-1.167797,-1.013148,0.941532,-0.117165,-1.984069,-0.161183,...,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214,FH_Embryo_FH3
FH_Embryo_FH3_TAAGCCATCGAAACAA,-0.829695,-2.725287,0.256042,-0.965918,-1.167797,-1.013148,-0.155203,-0.303930,-1.984069,-2.139322,...,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214,FH_Embryo_FH3


In [4]:
metadata

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.rpl,percent.rps,time,location,RNA_snn_res.0.5,seurat_clusters,RNA_snn_res.1.8
AR1_MI28_P30_8064AZ_TTTGTTGTCCATCTGC,AR1_MI28_P30_8064AZ,1386.0,877,0.0,0.505051,0.216450,P30,AZ,0,7,10
AR1_MI28_P30_8064AZ_TTTGTTGTCAAATGAG,AR1_MI28_P30_8064AZ,2080.0,1103,0.0,0.432692,0.096154,P30,AZ,0,0,10
AR1_MI28_P30_8064AZ_TTTGTTGCATGGCCAC,AR1_MI28_P30_8064AZ,2134.0,1251,0.0,0.515464,0.421743,P30,AZ,0,5,3
AR1_MI28_P30_8064AZ_TTTGTTGAGGGCAGAG,AR1_MI28_P30_8064AZ,1291.0,865,0.0,0.542215,0.309837,P30,AZ,0,12,10
AR1_MI28_P30_8064AZ_TTTGGTTTCACGTCCT,AR1_MI28_P30_8064AZ,4320.0,2265,0.0,0.208333,0.254630,P30,AZ,0,5,3
...,...,...,...,...,...,...,...,...,...,...,...
FH_Embryo_FH3_TAAGCACTCATTGCGA,FH_Embryo_FH3,1930.0,1152,0.0,1.191710,0.414508,FH,Whole,23,4,0
FH_Embryo_FH3_TAAGCCACAAGGTCGA,FH_Embryo_FH3,6534.0,3265,0.0,0.306091,0.214264,FH,Whole,27,40,28
FH_Embryo_FH3_TCACATTTCTGGGCGT,FH_Embryo_FH3,1357.0,933,0.0,1.252763,0.221076,FH,Whole,23,2,16
FH_Embryo_FH3_TAAGCCATCGAAACAA,FH_Embryo_FH3,1696.0,1220,0.0,0.176887,0.176887,FH,Whole,14,32,22


## Filtering the Data

In [5]:
double_injury_stages = set(orig_ident)
double_injury_stages = list(double_injury_stages)

day1 = [i for i in double_injury_stages if "CTL-P1" in i]
control_day28 = [i for i in double_injury_stages if "CTL-P28" in i]
control_day56 = [i for i in double_injury_stages if "CTL-P56" in i]
control_model = [day1, control_day28, control_day56]

print("Control Model:")
print("day1:", day1)
print("control_day28:", control_day28[0:5])
print("control_day56:", control_day56[0:5])


first_injury_day28 = [i for i in double_injury_stages if "AR1_P28" in i]
first_injury_day56 = [i for i in double_injury_stages if "AR1_P56" in i]
single_injury_model = [day1, first_injury_day28, first_injury_day56]

print("\nFirst Injury Model:")
print("day1:", day1)
print("first_injury_day28:", first_injury_day28[0:5])
print("first_injury_day56:", first_injury_day56[0:5])

double_injury_day28 = first_injury_day28
double_injury_day30 = [i for i in double_injury_stages if "AR1_MI28_P30" in i]
double_injury_day35 = [i for i in double_injury_stages if "AR1_MI28_P35" in i]
double_injury_day42 = [i for i in double_injury_stages if "AR1_MI28_P42" in i]
double_injury_day56 = [i for i in double_injury_stages if "AR1_MI28_P56" in i]
double_injury_model = [day1, double_injury_day28, double_injury_day30, double_injury_day35, double_injury_day42, double_injury_day56]

print("\nDouble Injury Model:")
print("day1:", day1)
print("first_injury_day28:", first_injury_day28[0:5])
print("double_injury_day30:", double_injury_day30[0:5])
print("double_injury_day35:", double_injury_day35[0:5])
print("double_injury_day42:", double_injury_day42[0:5])
print("double_injury_day56:", double_injury_day56[0:5])

Control Model:
day1: ['CTL-P1_8094', 'CTL-P1_8095', 'CTL-P1_8026_p1']
control_day28: ['CTL-P28_8046_RZ', 'CTL-P28_8046_BZ']
control_day56: ['CTL-P56_8052_AZ', 'CTL-P56_8052_RZ']

First Injury Model:
day1: ['CTL-P1_8094', 'CTL-P1_8095', 'CTL-P1_8026_p1']
first_injury_day28: ['AR1_P28_8030_RZ', 'AR1_P28_8014RZ', 'AR1_P28_8014BZ', 'AR1_P28_\t8030_CZ']
first_injury_day56: ['AR1_P56_8097RZ', 'AR1_P56_8096CZ', 'AR1_P56_8096RZ', 'AR1_P56_8097CZ']

Double Injury Model:
day1: ['CTL-P1_8094', 'CTL-P1_8095', 'CTL-P1_8026_p1']
first_injury_day28: ['AR1_P28_8030_RZ', 'AR1_P28_8014RZ', 'AR1_P28_8014BZ', 'AR1_P28_\t8030_CZ']
double_injury_day30: ['AR1_MI28_P30_8064RZ', 'AR1_MI28_P30_8064CZ', 'AR1_MI28_P30_8064AZ']
double_injury_day35: ['AR1_MI28_P35_8095AZ', 'AR1_MI28_P35_8065AZ', 'AR1_MI28_P35_8095BZ', 'AR1_MI28_P35_8065CZ', 'AR1_MI28_P35_8065RZ']
double_injury_day42: ['AR1_MI28_P42_8094RZ', 'AR1_MI28_P42_8094BZ', 'AR1_MI28_P42_8094AZ']
double_injury_day56: ['AR1_MI28_P56_8060RZ', 'AR1_MI28_P56_8060

In [6]:
double_injury_stages = []
for i in double_injury_model:
    for j in i:
        double_injury_stages.append(j)

#get rid of control p1
double_injury_stages = double_injury_stages[3:]
double_injury_stages

['AR1_P28_8030_RZ',
 'AR1_P28_8014RZ',
 'AR1_P28_8014BZ',
 'AR1_P28_\t8030_CZ',
 'AR1_MI28_P30_8064RZ',
 'AR1_MI28_P30_8064CZ',
 'AR1_MI28_P30_8064AZ',
 'AR1_MI28_P35_8095AZ',
 'AR1_MI28_P35_8065AZ',
 'AR1_MI28_P35_8095BZ',
 'AR1_MI28_P35_8065CZ',
 'AR1_MI28_P35_8065RZ',
 'AR1_MI28_P35_8095RZ',
 'AR1_MI28_P42_8094RZ',
 'AR1_MI28_P42_8094BZ',
 'AR1_MI28_P42_8094AZ',
 'AR1_MI28_P56_8060RZ',
 'AR1_MI28_P56_8060IZ',
 'AR1_MI28_P56_8060AZ',
 'AR1_MI28_P56_7995_BZ',
 'AR1_MI28_P56_7995_RZ']

In [13]:
double_injury_gene_expression = gene_expr_data[gene_expr_data["ident"].isin(double_injury_stages)]
double_injury_gene_expression = double_injury_gene_expression.drop("ident", axis=1)

In [14]:
double_injury_gene_expression

Unnamed: 0,TTN,LOC100513133,LOC110257246,CTDSP1,ANKRD1,RPP30,TPM1,MYH7,DDX5,RBPMS,...,LOC110261361,ELAVL2,LOC110260055,SH3GL2,LOC102166958,TRIM69,CRSP3,SLC25A31,LOC102161303,LOC100515185
AR1_MI28_P30_8064AZ_TTTGTTGTCCATCTGC,1.697666,0.770168,-1.246079,0.696817,-1.167797,-1.013148,-1.906602,-2.166894,0.893988,0.408413,...,-0.004879,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214
AR1_MI28_P30_8064AZ_TTTGTTGTCAAATGAG,-0.327701,-0.037845,-0.382901,-0.715539,0.880698,0.343318,-0.311018,0.472611,-0.361272,0.060178,...,-0.004879,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214
AR1_MI28_P30_8064AZ_TTTGTTGCATGGCCAC,1.209177,1.256926,0.870256,-1.246729,1.530894,1.890148,-1.906602,0.916366,0.767533,0.637964,...,-0.004879,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214
AR1_MI28_P30_8064AZ_TTTGTTGAGGGCAGAG,0.138056,0.915296,0.616571,-1.475042,1.192702,0.659002,0.635606,-2.166894,-1.984069,0.828888,...,-0.004879,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214
AR1_MI28_P30_8064AZ_TTTGGTTTCACGTCCT,1.253481,0.202796,0.676249,-0.569255,1.173452,0.584889,-0.339262,0.699816,-0.072271,0.509724,...,-0.004879,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AR1_P28_8030_RZ_CAAGACTTCGAAGAAT,0.854501,0.336325,-0.019811,-0.075418,0.437257,0.605845,0.700467,1.022302,-1.984069,0.853153,...,-0.004879,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214
AR1_P28_8030_RZ_TGGCGTGCACAACCGC,0.304568,1.254133,-0.031365,0.284706,-1.167797,-1.013148,-0.614623,0.098600,-0.148904,0.486386,...,-0.004879,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214
AR1_P28_8030_RZ_TTCCTTCGTATGCGGA,1.188570,1.122099,0.382559,-0.485922,0.376746,-1.013148,0.485823,1.390713,0.449160,-0.028186,...,-0.004879,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214
AR1_P28_8030_RZ_TTCCTTCTCCGTGTAA,1.289793,0.872047,0.306062,-1.307921,0.831363,0.294917,0.755251,-1.036553,0.723184,0.697352,...,-0.004879,-0.010554,-0.012284,-0.004765,-0.013779,-0.006163,-0.004045,-0.007989,-0.004059,-0.006214


## Creating Heatmap using Similarity Matrix

In [15]:
double_injury_gene_expression = double_injury_gene_expression.to_numpy()
double_injury_gene_expression

array([[ 1.69766599,  0.7701677 , -1.24607904, ..., -0.00798947,
        -0.00405917, -0.00621407],
       [-0.32770103, -0.03784509, -0.38290124, ..., -0.00798947,
        -0.00405917, -0.00621407],
       [ 1.20917674,  1.25692625,  0.87025606, ..., -0.00798947,
        -0.00405917, -0.00621407],
       ...,
       [ 1.18857015,  1.12209918,  0.38255857, ..., -0.00798947,
        -0.00405917, -0.00621407],
       [ 1.28979319,  0.87204697,  0.30606248, ..., -0.00798947,
        -0.00405917, -0.00621407],
       [ 0.19737853,  1.33278891, -0.86149992, ..., -0.00798947,
        -0.00405917, -0.00621407]])

In [22]:
for i in tqdm(double_injury_gene_expression, desc = "calculating cosin similarity"):
    for j in double_injury_gene_expression:
        pass


calculating cosin similarity:   2%|▏         | 1463/61630 [00:07<04:48, 208.22it/s]


KeyboardInterrupt: 