In [1]:
import pandas as pd
import numpy as np

import scanpy as sc # to read seurat object (so.Robj)
import scipy
import anndata

import matplotlib.pyplot as plt
import seaborn as sns
import fastcluster

from tqdm import tqdm

#default plt to sns
sns.set(font_scale = 1.5)
sns.set_theme()

import os

from neo4j import GraphDatabase

In [2]:
driver = GraphDatabase.driver(uri = "bolt://localhost:7687", auth = ("neo4j","snf"))
driver.verify_connectivity()

## Constraints

In [3]:
query = "CREATE CONSTRAINT ON (c:cardiomyocite) ASSERT c.id  IS UNIQUE"
        
with driver.session() as session:
    info = session.run(query)

## Deploying Nodes

In [4]:
#read data
print("ETA: ~40 sec")
h5ad = "../scrna_data/stage5.h5ad"
seurat_clusters = sc.read_h5ad(h5ad)
print("h5ad import successful \n")

#extracting metadata
metadata = sc.get.obs_df(seurat_clusters, keys = ['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.rpl', 'percent.rps', 'time', 'location', 'RNA_snn_res.0.5', 'seurat_clusters', 'RNA_snn_res.1.8'])

#genes
gene_index = seurat_clusters.var.index.to_numpy()
print("first 5 genes:", gene_index[0:5])

#metadata
orig_ident = metadata["orig.ident"].to_numpy()
ident_index = metadata.index.to_numpy()
print("sample metadata:", orig_ident[0:5])

#expression matrix
expr_data = scipy.sparse.csc_matrix(seurat_clusters.X)
print("\nexpr_data:", expr_data.shape)

#NOTE: EXPR DATA IS SCIPY SPARSE MATRIX

#SPARSE DATA FRAME CAUSE FILE TOO BIG
#expr_data = expr_data.tocsc()
gene_expr_data = pd.DataFrame.sparse.from_spmatrix(expr_data)
print("gene df:", gene_expr_data.shape)

#replace index, column headings
gene_expr_data.index = ident_index
gene_expr_data.columns = gene_index
gene_expr_data["ident"] = orig_ident


ETA: ~40 sec



This is where adjacency matrices should go now.
  warn(


h5ad import successful 

first 5 genes: ['TTN' 'LOC100513133' 'LOC110257246' 'CTDSP1' 'ANKRD1']
sample metadata: ['AR1_MI28_P30_8064AZ' 'AR1_MI28_P30_8064AZ' 'AR1_MI28_P30_8064AZ'
 'AR1_MI28_P30_8064AZ' 'AR1_MI28_P30_8064AZ']


Filter Data/Metadata

In [None]:
double_injury_stages = set(orig_ident)
double_injury_stages = list(double_injury_stages)

day1 = [i for i in double_injury_stages if "CTL-P1" in i]
control_day28 = [i for i in double_injury_stages if "CTL-P28" in i]
control_day56 = [i for i in double_injury_stages if "CTL-P56" in i]
control_model = [day1, control_day28, control_day56]

print("Control Model:")
print("day1:", day1)
print("control_day28:", control_day28[0:5])
print("control_day56:", control_day56[0:5])


first_injury_day28 = [i for i in double_injury_stages if "AR1_P28" in i]
first_injury_day56 = [i for i in double_injury_stages if "AR1_P56" in i]
single_injury_model = [day1, first_injury_day28, first_injury_day56]

print("\nFirst Injury Model:")
print("day1:", day1)
print("first_injury_day28:", first_injury_day28[0:5])
print("first_injury_day56:", first_injury_day56[0:5])

double_injury_day28 = first_injury_day28
double_injury_day30 = [i for i in double_injury_stages if "AR1_MI28_P30" in i]
double_injury_day35 = [i for i in double_injury_stages if "AR1_MI28_P35" in i]
double_injury_day42 = [i for i in double_injury_stages if "AR1_MI28_P42" in i]
double_injury_day56 = [i for i in double_injury_stages if "AR1_MI28_P56" in i]
double_injury_model = [day1, double_injury_day28, double_injury_day30, double_injury_day35, double_injury_day42, double_injury_day56]

print("\nDouble Injury Model:")
print("day1:", day1)
print("first_injury_day28:", first_injury_day28[0:5])
print("double_injury_day30:", double_injury_day30[0:5])
print("double_injury_day35:", double_injury_day35[0:5])
print("double_injury_day42:", double_injury_day42[0:5])
print("double_injury_day56:", double_injury_day56[0:5])

Control Model:
day1: ['CTL-P1_8026_p1', 'CTL-P1_8095', 'CTL-P1_8094']
control_day28: ['CTL-P28_8046_BZ', 'CTL-P28_8046_RZ']
control_day56: ['CTL-P56_8052_RZ', 'CTL-P56_8052_AZ']

First Injury Model:
day1: ['CTL-P1_8026_p1', 'CTL-P1_8095', 'CTL-P1_8094']
first_injury_day28: ['AR1_P28_8014RZ', 'AR1_P28_8014BZ', 'AR1_P28_\t8030_CZ', 'AR1_P28_8030_RZ']
first_injury_day56: ['AR1_P56_8096CZ', 'AR1_P56_8097RZ', 'AR1_P56_8097CZ', 'AR1_P56_8096RZ']

Double Injury Model:
day1: ['CTL-P1_8026_p1', 'CTL-P1_8095', 'CTL-P1_8094']
first_injury_day28: ['AR1_P28_8014RZ', 'AR1_P28_8014BZ', 'AR1_P28_\t8030_CZ', 'AR1_P28_8030_RZ']
double_injury_day30: ['AR1_MI28_P30_8064CZ', 'AR1_MI28_P30_8064RZ', 'AR1_MI28_P30_8064AZ']
double_injury_day35: ['AR1_MI28_P35_8065AZ', 'AR1_MI28_P35_8065RZ', 'AR1_MI28_P35_8095BZ', 'AR1_MI28_P35_8095RZ', 'AR1_MI28_P35_8065CZ']
double_injury_day42: ['AR1_MI28_P42_8094AZ', 'AR1_MI28_P42_8094RZ', 'AR1_MI28_P42_8094BZ']
double_injury_day56: ['AR1_MI28_P56_7995_RZ', 'AR1_MI28_P56_806

In [None]:
control = []
single_injury = []
double_injury = []

#iterate over each stage
for i in control_model:
    control += i

for i in double_injury_model:
    double_injury += i

for i in single_injury_model:
    single_injury += i

print(double_injury)

['CTL-P1_8026_p1', 'CTL-P1_8095', 'CTL-P1_8094', 'AR1_P28_8014RZ', 'AR1_P28_8014BZ', 'AR1_P28_\t8030_CZ', 'AR1_P28_8030_RZ', 'AR1_MI28_P30_8064CZ', 'AR1_MI28_P30_8064RZ', 'AR1_MI28_P30_8064AZ', 'AR1_MI28_P35_8065AZ', 'AR1_MI28_P35_8065RZ', 'AR1_MI28_P35_8095BZ', 'AR1_MI28_P35_8095RZ', 'AR1_MI28_P35_8065CZ', 'AR1_MI28_P35_8095AZ', 'AR1_MI28_P42_8094AZ', 'AR1_MI28_P42_8094RZ', 'AR1_MI28_P42_8094BZ', 'AR1_MI28_P56_7995_RZ', 'AR1_MI28_P56_8060RZ', 'AR1_MI28_P56_8060IZ', 'AR1_MI28_P56_7995_BZ', 'AR1_MI28_P56_8060AZ']


In [None]:
metadata = metadata.drop(columns=metadata.columns.to_list()[1:], axis = 1)
metadata

Unnamed: 0,orig.ident
AR1_MI28_P30_8064AZ_TTTGTTGTCCATCTGC,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGTTGTCAAATGAG,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGTTGCATGGCCAC,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGTTGAGGGCAGAG,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGGTTTCACGTCCT,AR1_MI28_P30_8064AZ
...,...
FH_Embryo_FH3_TAAGCACTCATTGCGA,FH_Embryo_FH3
FH_Embryo_FH3_TAAGCCACAAGGTCGA,FH_Embryo_FH3
FH_Embryo_FH3_TCACATTTCTGGGCGT,FH_Embryo_FH3
FH_Embryo_FH3_TAAGCCATCGAAACAA,FH_Embryo_FH3


In [None]:
#experiment and day columns
from curses import meta


experiment = []
cell_day = []

for i in tqdm(metadata["orig.ident"], desc = "adding node data"):
    #find metadata
    if i in control: 
        experiment.append("control")
    elif i in single_injury: 
        experiment.append("single_injury")
    elif i in double_injury: 
        experiment.append("double_injury")
    else:
        experiment.append("fetal")

    #find cell day
    if "P" in i:
        day = i[i.find("P"): i.find("P") + 3]
        day = day.strip("_")
        cell_day.append(day)
    else:
        cell_day.append("NA")

metadata["cell_day"] = cell_day
metadata["experiment"] = experiment
metadata["id"] = metadata.index.to_list()
metadata = metadata.reset_index()
metadata

adding node data: 100%|██████████| 121239/121239 [00:00<00:00, 860687.69it/s]


Unnamed: 0,index,orig.ident,cell_day,experiment,id
0,AR1_MI28_P30_8064AZ_TTTGTTGTCCATCTGC,AR1_MI28_P30_8064AZ,P3,double_injury,AR1_MI28_P30_8064AZ_TTTGTTGTCCATCTGC
1,AR1_MI28_P30_8064AZ_TTTGTTGTCAAATGAG,AR1_MI28_P30_8064AZ,P3,double_injury,AR1_MI28_P30_8064AZ_TTTGTTGTCAAATGAG
2,AR1_MI28_P30_8064AZ_TTTGTTGCATGGCCAC,AR1_MI28_P30_8064AZ,P3,double_injury,AR1_MI28_P30_8064AZ_TTTGTTGCATGGCCAC
3,AR1_MI28_P30_8064AZ_TTTGTTGAGGGCAGAG,AR1_MI28_P30_8064AZ,P3,double_injury,AR1_MI28_P30_8064AZ_TTTGTTGAGGGCAGAG
4,AR1_MI28_P30_8064AZ_TTTGGTTTCACGTCCT,AR1_MI28_P30_8064AZ,P3,double_injury,AR1_MI28_P30_8064AZ_TTTGGTTTCACGTCCT
...,...,...,...,...,...
121234,FH_Embryo_FH3_TAAGCACTCATTGCGA,FH_Embryo_FH3,,fetal,FH_Embryo_FH3_TAAGCACTCATTGCGA
121235,FH_Embryo_FH3_TAAGCCACAAGGTCGA,FH_Embryo_FH3,,fetal,FH_Embryo_FH3_TAAGCCACAAGGTCGA
121236,FH_Embryo_FH3_TCACATTTCTGGGCGT,FH_Embryo_FH3,,fetal,FH_Embryo_FH3_TCACATTTCTGGGCGT
121237,FH_Embryo_FH3_TAAGCCATCGAAACAA,FH_Embryo_FH3,,fetal,FH_Embryo_FH3_TAAGCCATCGAAACAA


Deploy Nodes

In [None]:
def create_nodes(tx, uid, orig_ident, cell_day, experiment) ->  None:
    query = "MERGE (p:cardiomyocite {id: $uid, orig_ident: $orig_ident, cell_day: $cell_day, experiment: $experiment})"
    tx.run(query, uid = uid, orig_ident = orig_ident, cell_day = cell_day, experiment = experiment)

In [None]:
for i in tqdm(range(len(metadata)), desc = "deploying nodes"):
    driver.session().execute_write(create_nodes, metadata["index"][i], metadata["orig.ident"][i], metadata["cell_day"][i], metadata["experiment"][i])


deploying nodes:   2%|▏         | 2330/121239 [01:11<1:00:31, 32.74it/s]


KeyboardInterrupt: 

## Using Cosine Similarity Scores as Edges

In [None]:
#TODO:
#siliarlity scores as edges
#set a threshold!!
#add in scdrs similarity scores