# DAISY RNAseq data projection

This notebook projects RNA sequencing data into a latent space whose dimensions represent gene modules that are co-expressed across various conditions, cell types, etc.

Sakaiza Rasolofomanana Rajery

12/19/2024

## load packages

In [16]:
library(tidyverse,warn.conflicts=FALSE)
library(reticulate)
library(MASS,warn.conflicts=FALSE)
library(devtools,warn.conflicts=FALSE)
library(PLIER,warn.conflicts=FALSE)
#install_github("wgmao/PLIER")

In [17]:
# source plier_util.R, which is expected to provide GetNewDataB()
# (called further down to perform the projection)
source('scripts/plier_util.R')

In [18]:
# GetOrderedRowNormEM (helper from Marc): row-normalize an expression matrix
# and restrict it, together with the PLIER loading matrix Z, to the genes the
# two have in common, so that both end up with the same genes in the same
# row order.
#
# Args:
#   exprs.mat:   expression matrix with genes in rows and gene identifiers as
#                row names (same identifier type as the row names of
#                plier.model$Z).
#   plier.model: list holding at least `Z`, the loading matrix with gene
#                identifiers as row names. Other elements are passed through
#                unchanged.
#
# Returns:
#   A list with two elements:
#     plier.model         - the input model with `Z` reduced to shared genes
#     exprs.norm.filtered - the normalized expression matrix reduced to the
#                           same genes, in the same order
#
# Errors:
#   Stops when either matrix has no row names or when the two matrices share
#   no gene identifier, instead of silently returning zero-row matrices.
GetOrderedRowNormEM <- function(exprs.mat, plier.model) {
  z.mat <- plier.model$Z

  # Matching is done purely on row names, so both matrices need them
  if (is.null(rownames(exprs.mat)) || is.null(rownames(z.mat))) {
    stop(
      "exprs.mat and plier.model$Z must both have gene identifiers as row names",
      call. = FALSE
    )
  }

  no_overlap_msg <- paste0(
    "exprs.mat and plier.model$Z have no genes in common; ",
    "check that both use the same gene identifier type"
  )

  # Fail early, before any normalization work, when nothing can match
  if (length(intersect(rownames(exprs.mat), rownames(z.mat))) == 0) {
    stop(no_overlap_msg, call. = FALSE)
  }

  # Z-score normalization. The namespace-qualified call fails loudly when
  # PLIER is not installed (require() would only emit a warning).
  exprs.norm <- PLIER::rowNorm(exprs.mat)
  # Drop genes whose normalized row contains missing values
  exprs.norm <- na.omit(exprs.norm)

  # Genes present in both the normalized expression data and the model
  common_genes <- intersect(rownames(exprs.norm), rownames(z.mat))
  if (length(common_genes) == 0) {
    stop(no_overlap_msg, call. = FALSE)
  }

  # Indexing both matrices by the same vector gives them identical row order;
  # drop = FALSE keeps them matrices even if a single gene/column remains
  exprs.norm.filtered <- exprs.norm[common_genes, , drop = FALSE]
  plier.model$Z <- z.mat[common_genes, , drop = FALSE]

  # Return the updated plier.model and the filtered exprs.norm
  list(plier.model = plier.model, exprs.norm.filtered = exprs.norm.filtered)
}

## load data: RNA-seq, matrices, name to ID, metadata

### RNA_seq data

In [19]:
# Folder holding the DAISY RNA-seq exports. Set the DAISY_RNA_DIR environment
# variable to read from another location; when it is not set, the original
# OneDrive folder is used.
path_to_rna <- Sys.getenv(
  "DAISY_RNA_DIR",
  unset = "/Users/rasolofs/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/DAISY RNA Phenoplier"
)
#raw value at visit 1 and 2
#RNAseq_1_Raw <- readRDS("RNA_Visit_1.RDS")
#RNAseq_2_Raw <- readRDS("RNA_Visit_2.RDS")

#r-log adjusted for age, sex and ancestry at visit 1 and 2
# file.path() joins folder and file name with the platform's separator
RNAseq_1_res <- as.matrix(
  readRDS(file.path(path_to_rna, "RNA_Visit_1_Residuals.RDS"))
)
#RNAseq_2_res <- readRDS("RNA_Visit_2_Residuals.RDS")

#RNA_annot <- readRDS("Expression_Annotation.RDS")
#RNA_pheno <- read_csv("DAISY_RNASeq_Phenotype_Deidentified.csv")

# Quick look at the loaded matrix (genes in rows, samples in columns)
str(RNAseq_1_res)
head(RNAseq_1_res, 5)
dim(RNAseq_1_res)

 num [1:22236, 1:136] 0.031 -0.0478 -0.0553 0.0398 -0.1755 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:22236] "ENSG00000000003" "ENSG00000000419" "ENSG00000000457" "ENSG00000000460" ...
  ..$ : chr [1:136] "00041-0" "00110-0" "00139-0" "00159-0" ...


Unnamed: 0,00041-0,00110-0,00139-0,00159-0,00174-0,00177-0,00181-0,00234-0,00250-0,00266-0,⋯,39376-0,39390-0,50136-0,62291-0,81801-0,84002-0,84689-0,84731-0,84860-0,85365-0
ENSG00000000003,0.03095809,-0.07856905,0.1918862,-0.171697815,-0.05682264,-0.14436092,-0.1597369,0.15962816,0.04597725,-0.05037662,⋯,0.03481634,-0.11353805,-0.07546308,-0.24782141,-0.181882227,-0.15831031,0.042505051,0.004158849,-0.04953921,-0.07543666
ENSG00000000419,-0.04781262,0.22526729,0.1496638,0.008030305,0.07860273,-0.65566596,-0.07836598,-0.08440686,-0.07336101,0.01694056,⋯,0.03302391,-0.17723135,-0.20278517,0.05153863,0.027867915,0.14438615,0.18105738,-0.06017142,0.0227541,0.09297875
ENSG00000000457,-0.05528318,-0.0653531,0.1215783,0.042311336,-0.13641096,-0.05600516,-0.11315866,-0.02458583,0.17706909,-0.08072746,⋯,-0.05381721,-0.06526485,0.0607646,0.07453007,0.118070226,-0.02070351,-0.071758383,0.01358122,-0.08240612,0.05418788
ENSG00000000460,0.03975773,-0.03598026,0.1845973,-0.036771847,0.11839537,-0.75807634,-0.07701233,0.05610122,-0.20000546,0.26409694,⋯,0.08086216,0.0493724,-0.06226473,0.11234436,0.116699931,0.04048291,0.318198228,-0.003560572,0.09052353,0.07681467
ENSG00000000938,-0.17548539,0.13802186,-0.1033869,0.120510893,-0.11488039,-0.16473877,-0.11642225,-0.04561926,-0.01971724,-0.14657057,⋯,-0.08194073,-0.11987964,-0.04133224,0.21522278,0.006778182,0.1091256,-0.009714053,-0.062825872,-0.17994881,0.04227325


### gene name to Ensembl ID file

In [20]:
# Read the pickled Ensembl ID -> gene name mapping through reticulate
py_data <- py_load_object("data/input/genes_mapping_id_to_name.pkl")
# Turn the mapping into a data frame and transpose it so that every
# Ensembl ID becomes a row holding its gene name in a single column
data_t <- py_data %>%
  data.frame() %>%
  t() %>%
  data.frame()
names(data_t) <- "gene"
# Move the Ensembl IDs out of the row names into a regular column
name_to_ID <- data_t %>% rownames_to_column(var = "EnsemblID")
head(name_to_ID, 5)

Unnamed: 0_level_0,EnsemblID,gene
Unnamed: 0_level_1,<chr>,<chr>
1,ENSG00000000457,SCYL3
2,ENSG00000000938,FGR
3,ENSG00000000971,CFH
4,ENSG00000001036,FUCA2
5,ENSG00000001084,GCLC


### metadata file

In [21]:
# Import the model metadata file needed to re-build the PLIER model.
# The object is printed to show its elements (L1, L2 and L3 are the ones
# read from it when the model list is assembled later on).
metadata <- readRDS("data/input/multiplier_model_metadata.rds")
print(metadata)

$L1
[1] 120.5661

$L2
[1] 241.1322

$L3
[1] 0.01269676



### MultiPLIER matrices

In [22]:
# Load the two matrices needed to rebuild the PLIER model and preview the
# first five rows of each.
# B matrix: latent variables (rows) x samples (columns)
b_model <- readRDS("data/input/multiplier_model_b.rds")
head(b_model,5)
# Z matrix: genes (rows, labelled with gene names) x latent variables (columns)
z_model <- readRDS("data/input/multiplier_model_z.rds")
head(z_model,5)

Unnamed: 0,SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,SRP000599.SRR013552,SRP000599.SRR013553,SRP000599.SRR013554,SRP000599.SRR013555,SRP000599.SRR013556,SRP000599.SRR013557,SRP000599.SRR013558,⋯,SRP035599.SRR1139372,SRP035599.SRR1139393,SRP035599.SRR1139388,SRP035599.SRR1139378,SRP035599.SRR1139399,SRP035599.SRR1139386,SRP035599.SRR1139375,SRP035599.SRR1139382,SRP035599.SRR1139356,SRP035599.SRR1139370
"1,REACTOME_MRNA_SPLICING",-0.059296689,-0.047909034,-0.049366085,-0.065078034,-0.036394186,-0.046432986,-0.0409805,-0.040068202,-0.046137392,-0.048547681,⋯,0.02821953,0.035137107,0.06507733,0.07814365,0.092361864,0.069042346,0.090913845,0.096341467,0.13111465,0.171751422
"2,SVM Monocytes",0.006212678,0.003625471,0.006604582,0.009258006,0.005061427,0.004132735,0.008950264,0.007226716,0.007240987,0.005709697,⋯,-0.050455152,-0.03450197,-0.03364029,-0.049702173,-0.037425739,-0.050069528,-0.022575052,-0.055091302,-0.05686929,-0.01807257
"3,REACTOME_TRANSMISSION_ACROSS_CHEMICAL_SYNAPSES",-0.026105335,-0.03223206,-0.020621382,-0.027598555,-0.035248076,-0.038700769,-0.032527087,-0.030592727,-0.028937277,-0.02740566,⋯,-0.028609689,-0.033449754,-0.030583001,-0.032399106,-0.029365381,-0.025405876,-0.033657228,-0.03131768,-0.03092424,-0.027868614
"4,REACTOME_NEURONAL_SYSTEM",-0.022079745,-0.00897091,-0.020341711,-0.016260213,-0.003022898,0.002442659,-0.020457842,-0.023735309,-0.021581483,-0.022477572,⋯,-0.037122216,-0.029658154,-0.036349546,-0.039253549,-0.035204624,-0.036345061,-0.03451388,-0.035925708,-0.04035837,-0.031131153
LV 5,0.007663157,0.007036176,0.006608393,0.003446311,0.006340665,0.007106127,0.007930485,0.009164026,0.008023601,0.007937586,⋯,-0.003055909,-0.004783739,-0.004352417,-0.004159541,-0.001084991,-0.001884109,-0.003561052,-0.003546184,-0.01210732,-0.001192709


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
GAS6,0.0,0,0.03943774,0,0.05047625,0.0,0,0.0,0.5909494,0.0,⋯,0.0501251,0.0,0.033407371,0.0,0.0,0.005963392,0.34736209,0,0.0,0.0
MMP14,0.0,0,0.0,0,0.07007159,0.0,0,0.004904131,1.7201788,2.42359463,⋯,0.0,0.0,0.001007286,0.0,0.03574724,0.0,0.0,0,0.01497801,0.0
DSP,0.0,0,0.0,0,0.0,0.04169683,0,0.005718149,0.0,0.0,⋯,0.02085321,0.0,0.0,0.0,0.0,0.00577444,0.0,0,0.0,0.4164045
MARCKSL1,0.3052117,0,0.0,0,0.0,0.0,0,0.0,0.1618435,0.14947148,⋯,0.02713418,0.05271997,0.0,0.03018947,0.06088351,0.0,0.0,0,0.0,0.44848
SPARC,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.01401441,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.06777859,0,0.12241734,0.0626649


## Label the Z matrix rows with Ensembl IDs instead of gene names

In [23]:
# Turn the Z matrix into a data frame that carries the gene names (its row
# names) as an explicit `gene` column, so it can be joined on that column
z_df <- as.data.frame(z_model)
z_df <- cbind(gene = rownames(z_model), z_df)
# Join with the gene name <-> Ensembl ID table; only genes present in both
# tables are kept
EnsID_df <- merge(x = z_df, y = name_to_ID, by = "gene")
# Use the Ensembl IDs as row labels
row.names(EnsID_df) <- EnsID_df[["EnsemblID"]]
# Keep only the loading columns (everything except the two identifier
# columns) and convert back to a matrix
z_matrix <- as.matrix(
  EnsID_df[, !is.element(colnames(EnsID_df), c("gene", "EnsemblID"))]
)

# Inspect the relabelled matrix
str(z_matrix)
head(z_matrix, 5)
dim(z_matrix)

 num [1:6455, 1:987] 0 0.2712 0.0051 0.513 1.0082 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:6455] "ENSG00000175899" "ENSG00000094914" "ENSG00000129673" "ENSG00000090861" ...
  ..$ : chr [1:987] "V1" "V2" "V3" "V4" ...


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V978,V979,V980,V981,V982,V983,V984,V985,V986,V987
ENSG00000175899,0.0,0.0,0.0,0.0784275,0,0.0,0.0,0.04677228,0.0,0.0110329,⋯,0.01091,0.0,0.052869436,0.012748534,0,0.01398231,0.1522409,0.0,0.0,0.10888358
ENSG00000094914,0.271162234,0.0,0.0,0.0,0,0.0,0.0,0.0,1.5133366,0.0,⋯,0.0,0.0,0.0,0.001385031,0,0.0,0.71189565,0.03053447,0.4471055,0.42468522
ENSG00000129673,0.005098885,0.3084969,0.02881539,0.0,0,0.0,0.3717253,0.0,0.1352375,0.0,⋯,0.117163,0.01881498,0.0,0.193141835,0,0.04040077,0.18043565,0.0,0.0,0.04206449
ENSG00000090861,0.512978085,0.0,0.45128547,0.31956792,0,0.0,0.0,0.0,0.0,0.0,⋯,0.0,0.0,0.006207784,0.021274385,0,0.0,0.06214269,0.0,0.0,0.36489213
ENSG00000124608,1.008156895,0.0,0.19742406,0.02483182,0,0.5708101,0.0,0.2286746,0.0,0.1162435,⋯,0.0,0.0,0.153344992,0.0,0,0.0,1.00457811,0.01447016,0.0,0.0


In [24]:
# Assemble the PLIER model as a plain list: the Ensembl-labelled loadings (Z),
# the latent-variable matrix (B) and the three values L1, L2, L3 taken from
# the metadata
plier.model <- list(
  Z = z_matrix,
  B = b_model,
  L1 = metadata$L1,
  L2 = metadata$L2,
  L3 = metadata$L3
)

In [25]:
# List the objects in the current workspace to check that all variables
# created so far are loaded
ls()

In [26]:
# Align the DAISY expression matrix with the PLIER model: the helper
# normalizes the expression rows and restricts both matrices to shared genes
ordered_mat <- GetOrderedRowNormEM(
  exprs.mat = RNAseq_1_res,
  plier.model = plier.model
)

# PLIER model whose Z matrix now matches the filtered expression matrix
ordered_model <- ordered_mat[["plier.model"]]

# normalized gene expression matrix limited to the shared genes
ordered_DAISY_gen_expr <- ordered_mat[["exprs.norm.filtered"]]

In [27]:
# Project the gene expression matrix into the PLIER model.
# GetNewDataB() is not defined in this notebook; presumably it comes from
# scripts/plier_util.R and returns latent variables x samples - TODO confirm
projection <- GetNewDataB(ordered_DAISY_gen_expr,ordered_model)

In [28]:
# Save the projection as a data frame
df_projection <- as.data.frame(projection)
# saveRDS() errors when the target folder is missing, so make sure it exists
# (no effect and no warning when the folder is already there)
dir.create("output", showWarnings = FALSE, recursive = TRUE)
saveRDS(df_projection, "output/projection_1.rds")

In [29]:
# Show the number of rows and columns of the projection
dim(df_projection)

In [30]:
# Preview the first ten rows of the projection
head(df_projection, 10)

Unnamed: 0_level_0,00041-0,00110-0,00139-0,00159-0,00174-0,00177-0,00181-0,00234-0,00250-0,00266-0,⋯,39376-0,39390-0,50136-0,62291-0,81801-0,84002-0,84689-0,84731-0,84860-0,85365-0
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
"1,REACTOME_MRNA_SPLICING",0.045748028,-0.08426191,-0.077697184,-0.078539199,0.022015152,0.114903283,0.033916849,-0.0007790358,-0.0550538495,0.307223472,⋯,-0.020805125,0.200051932,0.0745357673,0.0790468803,-0.116613355,-0.024169235,-0.0259985,0.03331409,0.039020003,-0.039547995
"2,SVM Monocytes",0.104307804,0.02627734,-0.049913741,0.266457766,-0.008530261,-0.048283019,-0.052436882,-0.0500619312,0.1014881826,-0.106715583,⋯,-0.110671875,0.097926147,-0.0420160001,-0.0271421926,0.084166292,0.102093727,0.02434837,-0.03841368,-0.217559542,-0.190521648
"3,REACTOME_TRANSMISSION_ACROSS_CHEMICAL_SYNAPSES",-0.034338781,-0.0214153,-0.102291965,-0.00426107,-0.006897807,0.043636271,-0.041807814,0.0247366996,0.0033781163,-0.040374505,⋯,-0.003532239,0.025953059,-0.0321982346,0.0007537681,-0.027967879,0.02562363,-0.1051815,-0.0299277,0.005014077,-0.022716857
"4,REACTOME_NEURONAL_SYSTEM",0.056377294,0.04507367,0.097356702,0.041259268,0.027453577,-0.041370318,0.049110456,0.0497693154,0.0498141323,-0.022980039,⋯,0.010174065,0.008385931,-0.0202101409,-0.1275247825,0.049648173,0.012382789,0.0554013,0.03620698,0.011494181,-0.002648721
LV 5,-0.006014439,0.00770212,-0.023535277,0.010785959,0.017448294,-0.060001645,-0.019730027,0.0274240952,0.0753578701,0.002400883,⋯,0.006542319,0.012716513,-0.0071833708,0.0320062254,-0.006807858,-0.01310825,0.02927054,-0.02252472,0.012608763,0.021911458
LV 6,0.013141592,0.04091529,0.033234436,-0.026479309,-0.004379551,0.061410724,0.008241706,0.0501137283,0.031309532,-0.003194558,⋯,-0.018788937,0.005204376,0.0004564716,-0.0563713341,0.02014282,0.05810139,0.02155339,0.03912967,0.00847216,0.018468082
"7,IRIS_Neutrophil-Resting",-0.158699344,0.06129902,-0.067108808,0.043252439,-0.080901625,-0.089031326,-0.066598034,-0.0180259054,0.007287385,-0.059118798,⋯,0.001568902,-0.108232463,-0.0593649686,0.172560986,-0.015811326,-0.015747363,0.02271727,-0.04820677,-0.104396531,0.10322553
"8,KEGG_PPAR_SIGNALING_PATHWAY",-0.014859012,0.03218188,0.008868332,0.005756415,-0.007961799,-0.100436616,0.026326809,0.0203580538,-0.0004896021,0.006190327,⋯,-0.017983589,-0.00940778,0.009748989,-0.0206361462,-0.089435105,0.010622735,0.02822831,0.00188233,0.013371025,-0.032186313
LV 9,-0.032401545,-0.0818652,-0.086891463,-0.028086861,-0.013467576,0.007585578,-0.007937155,0.0006474471,-0.0177954469,-0.059505412,⋯,-0.028677074,0.081923299,-0.018320375,0.0725068819,-0.058726972,-0.001301881,-0.13552119,-0.01361991,0.014036729,0.016115704
"10,SVM Macrophages M0",-0.01260643,0.01204398,-0.008437336,0.030484368,0.012146043,-0.0610661,-0.041958257,-0.026445082,-0.0019364648,0.010642824,⋯,-0.029438743,0.034574516,-0.0440265411,0.014650792,-0.002235631,0.060245855,-0.01860208,-0.01231577,-0.042654096,0.016234614
