Author: Irsyad Adam

Created 6/13/2022

In [4]:
import pandas as pd
import numpy as np

import pyreadr # to read r object (stage5.rds)
import scanpy as sc # to read seurat object (so.Robj)
import scipy
import anndata

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

# Use seaborn's theme as the matplotlib default, with enlarged fonts.
# font_scale is passed to set_theme directly: sns.set is an alias of
# sns.set_theme, so a bare sns.set_theme() call made afterwards would reset
# font_scale to its default of 1 and silently drop the 1.5 scaling.
sns.set_theme(font_scale = 1.5)

import os

### Goal: Extract All Relevant Genes for Each Stage of Double Injury

1. Process h5ad file for gene expression matrix
2. Filter out cardiomyocytes for each stage of double injury
3. Extract any genes that have expression
4. Compile .txt files and write them to the <code>gene_lists/</code> folder

#### 1. Data Preprocessing

In [5]:
# --- Load the AnnData object from the h5ad file ---
print("ETA: ~40 sec")
h5ad = "../../double_injury/double_injury_seurat_h5ad/double_injury.h5ad"
seurat_clusters = sc.read_h5ad(h5ad)
print("h5ad import successful \n")

# --- Per-observation metadata pulled out of .obs ---
metadata = sc.get.obs_df(
    seurat_clusters,
    keys = [
        'orig.ident',
        'nCount_RNA',
        'nFeature_RNA',
        'percent.mt',
        'percent.rpl',
        'percent.rps',
        'time',
        'location',
        'RNA_snn_res.0.5',
        'seurat_clusters',
        'RNA_snn_res.1.8',
        'ident',
    ],
)

# --- Gene names: the index of .var, used below as column labels ---
gene_index = seurat_clusters.var.index.to_numpy()
print("first 5 genes:", gene_index[:5])

# --- Sample label ("orig.ident") and row label of every observation ---
orig_ident = metadata["orig.ident"].to_numpy()
ident_index = metadata.index.to_numpy()
print("sample metadata:", orig_ident[:5])

# --- Expression matrix ---
# NOTE: this is a scipy sparse matrix.
expr_data = seurat_clusters.X
print("\nexpr_data:", expr_data.shape)

# Wrap the matrix in a sparse-backed DataFrame (the data is too large to
# densify), labelling rows with the observation index and columns with the
# gene names in the same step.
gene_expr_data = pd.DataFrame.sparse.from_spmatrix(
    expr_data,
    index = ident_index,
    columns = gene_index,
)
print("gene df:", gene_expr_data.shape)

# Append the sample label as an extra, final column.
gene_expr_data["orig.ident"] = orig_ident

ETA: ~40 sec
h5ad import successful 

first 5 genes: ['LOC102163816' 'TTN' 'LOC100513133' 'LOC110257246' 'NEBL']
sample metadata: ['AR1_MI28_P30_8064AZ' 'AR1_MI28_P30_8064AZ' 'AR1_MI28_P30_8064AZ'
 'AR1_MI28_P30_8064AZ' 'AR1_MI28_P30_8064AZ']

expr_data: (121239, 21513)
gene df: (121239, 21513)


In [6]:
# Preview the first five rows of the expression frame.
gene_expr_data.head(5)

Unnamed: 0,LOC102163816,TTN,LOC100513133,LOC110257246,NEBL,ABLIM1,CTDSP1,ANKRD1,MYBPC3,L1TD1,...,BRINP3,ADRA2A,MMP12,LOC100156522,LOC102166602,LOC110260742,ODF3B,LOC106505534,LOC110255780,orig.ident
AR1_MI28_P30_8064AZ_TTTGTTGTCCATCTGC,187.0,82.0,9.0,1.0,8.0,7.0,6.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGTTGTCAAATGAG,665.0,43.0,5.0,3.0,8.0,5.0,3.0,4.0,2.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGTTGCATGGCCAC,397.0,98.0,25.0,8.0,5.0,4.0,2.0,11.0,8.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGTTGAGGGCAGAG,222.0,34.0,10.0,4.0,2.0,2.0,1.0,4.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AR1_MI28_P30_8064AZ
AR1_MI28_P30_8064AZ_TTTGGTTTCACGTCCT,769.0,203.0,14.0,14.0,20.0,14.0,7.0,13.0,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AR1_MI28_P30_8064AZ


In [7]:
# Preview the first five rows of the metadata frame.
metadata.head(5)

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.rpl,percent.rps,time,location,RNA_snn_res.0.5,seurat_clusters,RNA_snn_res.1.8,ident
AR1_MI28_P30_8064AZ_TTTGTTGTCCATCTGC,AR1_MI28_P30_8064AZ,1386.0,877,0.0,0.505051,0.21645,P30,AZ,0,10,10,10
AR1_MI28_P30_8064AZ_TTTGTTGTCAAATGAG,AR1_MI28_P30_8064AZ,2080.0,1103,0.0,0.432692,0.096154,P30,AZ,0,10,10,10
AR1_MI28_P30_8064AZ_TTTGTTGCATGGCCAC,AR1_MI28_P30_8064AZ,2134.0,1251,0.0,0.515464,0.421743,P30,AZ,0,3,3,3
AR1_MI28_P30_8064AZ_TTTGTTGAGGGCAGAG,AR1_MI28_P30_8064AZ,1291.0,865,0.0,0.542215,0.309837,P30,AZ,0,10,10,10
AR1_MI28_P30_8064AZ_TTTGGTTTCACGTCCT,AR1_MI28_P30_8064AZ,4320.0,2265,0.0,0.208333,0.25463,P30,AZ,0,3,3,3


In [8]:
# Unique sample labels, sorted so that the stage lists below (and their
# printed output) come out in the same order on every run; converting a set
# of strings straight to a list gives no such guarantee.
double_injury_stages = sorted(set(orig_ident))

# NOTE(review): the recorded output shows one label containing a tab
# ('AR1_P28_\t8030_CZ'). Substring matching still picks it up here, but it
# probably needs cleaning before being used in a file or folder name.


def _samples_matching(tag: str) -> list:
    """Return the labels in double_injury_stages that contain `tag`."""
    return [sample for sample in double_injury_stages if tag in sample]


# ---- Control model: day 1 plus uninjured day 28 / day 56 ----
day1 = _samples_matching("CTL-P1")
control_day28 = _samples_matching("CTL-P28")
control_day56 = _samples_matching("CTL-P56")
control_model = [day1, control_day28, control_day56]

print("Control Model:")
print("day1:", day1)
print("control_day28:", control_day28[0:5])
print("control_day56:", control_day56[0:5])


# ---- First (single) injury model ----
first_injury_day28 = _samples_matching("AR1_P28")
first_injury_day56 = _samples_matching("AR1_P56")
single_injury_model = [day1, first_injury_day28, first_injury_day56]

print("\nFirst Injury Model:")
print("day1:", day1)
print("first_injury_day28:", first_injury_day28[0:5])
print("first_injury_day56:", first_injury_day56[0:5])

# ---- Double injury model ----
# Day 28 of the double-injury series reuses the first-injury day-28 samples
# (same list object, as before).
double_injury_day28 = first_injury_day28
double_injury_day30 = _samples_matching("AR1_MI28_P30")
double_injury_day35 = _samples_matching("AR1_MI28_P35")
double_injury_day42 = _samples_matching("AR1_MI28_P42")
double_injury_day56 = _samples_matching("AR1_MI28_P56")
double_injury_model = [
    day1,
    double_injury_day28,
    double_injury_day30,
    double_injury_day35,
    double_injury_day42,
    double_injury_day56,
]

print("\nDouble Injury Model:")
print("day1:", day1)
print("first_injury_day28:", first_injury_day28[0:5])
print("double_injury_day30:", double_injury_day30[0:5])
print("double_injury_day35:", double_injury_day35[0:5])
print("double_injury_day42:", double_injury_day42[0:5])
print("double_injury_day56:", double_injury_day56[0:5])

Control Model:
day1: ['CTL-P1_8095', 'CTL-P1_8026_p1', 'CTL-P1_8094']
control_day28: ['CTL-P28_8046_RZ', 'CTL-P28_8046_BZ']
control_day56: ['CTL-P56_8052_AZ', 'CTL-P56_8052_RZ']

First Injury Model:
day1: ['CTL-P1_8095', 'CTL-P1_8026_p1', 'CTL-P1_8094']
first_injury_day28: ['AR1_P28_8014BZ', 'AR1_P28_8030_RZ', 'AR1_P28_8014RZ', 'AR1_P28_\t8030_CZ']
first_injury_day56: ['AR1_P56_8097CZ', 'AR1_P56_8096CZ', 'AR1_P56_8097RZ', 'AR1_P56_8096RZ']

Double Injury Model:
day1: ['CTL-P1_8095', 'CTL-P1_8026_p1', 'CTL-P1_8094']
first_injury_day28: ['AR1_P28_8014BZ', 'AR1_P28_8030_RZ', 'AR1_P28_8014RZ', 'AR1_P28_\t8030_CZ']
double_injury_day30: ['AR1_MI28_P30_8064CZ', 'AR1_MI28_P30_8064RZ', 'AR1_MI28_P30_8064AZ']
double_injury_day35: ['AR1_MI28_P35_8065AZ', 'AR1_MI28_P35_8095BZ', 'AR1_MI28_P35_8065RZ', 'AR1_MI28_P35_8095RZ', 'AR1_MI28_P35_8065CZ']
double_injury_day42: ['AR1_MI28_P42_8094BZ', 'AR1_MI28_P42_8094AZ', 'AR1_MI28_P42_8094RZ']
double_injury_day56: ['AR1_MI28_P56_8060AZ', 'AR1_MI28_P56_7995

#### 2/3. Data Filtering + Extraction

In [9]:
# One single-entry dict per injury model: the key is the model label and the
# value lists the stage names belonging to that model, in time order.
control_model_mask = {
    "control": ["day1", "control_day28", "control_day56"],
}
single_injury_model_mask = {
    "single_injury": ["day1", "first_injury_day28", "first_injury_day56"],
}
double_injury_model_mask = {
    "double_injury": [
        "day1",
        "double_injury_day28",
        "double_injury_day30",
        "double_injury_day35",
        "double_injury_day42",
        "double_injury_day56",
    ],
}

def extract_gene_lists(control_model, mask: dict) -> None:
    """Extract gene lists for one injury model (body incomplete in this view).

    As written here, the function only stores the output folder name in a
    local variable and returns None; neither argument is read yet.

    Parameters
    ----------
    control_model
        Despite the name, callers are presumably meant to pass any of the
        model lists (control / single injury / double injury) -- TODO confirm.
    mask : dict
        Presumably one of the ``*_model_mask`` dicts mapping a model label to
        its stage names -- TODO confirm against the full implementation.

    NOTE(review): the recorded run of this function raised
    ``OSError: [WinError 123]`` for the name ``'control_day28:'``. The code
    that builds that name is not visible here; the trailing colon looks like
    the likely cause and should be checked.
    """

    #-------------------------------------------
    # Folder (relative to the working directory) where gene lists are stored
    data_folder = "gene_lists/"
    
    #-------------------------------------------

    

In [10]:
# Run the gene-list extraction for the control model and its stage-name mask.
extract_gene_lists(control_model, control_model_mask)

OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'control_day28:'