## Load Python packages

In [1]:
import pandas as pd
import os
import glob
import re
import csv

## Load R packages

In [4]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [5]:
%%R

# Add a user-specific package directory to R's library search path.
# NOTE(review): this is an absolute, machine-specific path (presumably where
# SummarizedExperiment and recount3 are installed) -- adjust it when running
# on another system; .libPaths() ignores directories that do not exist.
.libPaths("/mnt/gs21/scratch/naultran/Rlocal4.3.2")

In [None]:
%%R

library(SummarizedExperiment)
library(recount3)

# Test API from Recount3

In [None]:
%%R
# Smoke test of the recount3 download path: fetch the gene-level data for a
# single hard-coded SRA project and show its study titles.
tryCatch({
    # Same call signature that the data-building cells use, with fixed values.
    rse_gene <- recount3::create_rse_manual(
        project = "SRP104670",
        project_home = "data_sources/sra",
        organism = "mouse",
        annotation = "gencode_v23",
        type = "gene")
    
    # Last expression of the block: the study title recorded for each sample.
    colData(rse_gene)$sra.study_title
    
    },error=function(cond){
        # On failure, show the condition instead of aborting the cell.
        print(cond)
    })


# Produce rse_gene.Rdata File

> This generates the input data for GNN training and validation; it contains all the tissues but excludes the treated samples.

In [None]:
%%R

# Build the training/validation rse_gene object.
#
# For every recount3 project whose study title appears in the curated sample
# sheet (and that is not too large), download the gene-level data, keep only
# the curated samples, attach the sheet's annotations to colData, and
# column-bind the result onto `final_result`, which is saved at the end.

# Lists all projects
recount_data <- read.csv("../inputs/recount3_selection_2024-04-12.csv")

# Specifies samples to exclude treated samples
# NOTE(review): sep='.' is unusual for a .csv file -- confirm that this is
# really the delimiter used by CleanedSamplesGNN_v3.csv.
cleaned_data <- read.csv('../inputs/CleanedSamplesGNN_v3.csv', sep='.')

# Projects with more samples than this are skipped.
max_samples <- 600

# Output directory. Set the RSE_GENE_OUTPUT_DIR environment variable to write
# somewhere else; when it is unset or empty the original location is used.
output_dir <- Sys.getenv("RSE_GENE_OUTPUT_DIR")
if (!nzchar(output_dir)) {
    output_dir <- "/Users/kejiyuan/Desktop/RanceLab/reticula_new/rse_gene"
}

# Run accessions of the curated samples (does not change between projects).
sample_list <- cleaned_data[,'SRR']

final_result <- NULL

print(nrow(recount_data))

# seq_len() yields an empty sequence for an empty table (1:0 would not).
for (i in seq_len(nrow(recount_data))) {
    # Only projects whose study title is listed in the curated sheet are used.
    if (!(recount_data[i, 'study_title'] %in% cleaned_data$Title)) {
        next
    }
    print(paste("Current running", i))

    if (recount_data$n_samples[i] > max_samples) {
        next
    }

    tryCatch({
        rse_gene <- recount3::create_rse_manual(
            project = recount_data$project[i],
            project_home = recount_data$project_home[i],
            organism = recount_data$organism[i],
            annotation = "gencode_v23",
            type = "gene"
        )

        # Keep only the curated samples that are present in this project.
        keep <- intersect(sample_list, colnames(rse_gene))
        temp_rse_gene <- rse_gene[,keep]

        # Join the curated annotations onto colData. merge() may reorder the
        # rows, so they are put back in the column order of temp_rse_gene
        # (despite its name, gene_order holds sample IDs, not genes).
        gene_order <- colData(temp_rse_gene)$external_id
        cur_obj <- merge(colData(temp_rse_gene), cleaned_data, by.x="external_id", by.y = "SRR", all.x=TRUE)
        ordered_indices <- match(gene_order, cur_obj$external_id)
        cur_obj <- cur_obj[ordered_indices, ]

        # NOTE(review): the merged table may not carry the original sample
        # rownames -- confirm colnames(temp_rse_gene) survive this assignment.
        colData(temp_rse_gene) <- cur_obj

        if (is.null(final_result)) {
            final_result <- temp_rse_gene
        } else {
            final_result <- cbind(final_result, temp_rse_gene)
        }
    },
    error = function(cond){
        # Report which project failed and why, then continue with the next.
        print("-----------------")
        print(recount_data$n_samples[i])
        print(recount_data$project[i])
        print(i)
        print(cond)
    })
}

if (is.null(final_result)) {
    warning("No project produced any data; final_result is NULL")
}

save(final_result, file = file.path(output_dir, "rse_gene.Rdata"))

# Produce GEO_model_validation_rse_gene.Rdata

> This produces the rse_gene object used for testing (TCDD-treated samples).

In [None]:
%%R

# Build the test-set rse_gene object.
#
# For each of the hard-coded recount3 projects below, download the gene-level
# data, keep only the samples listed in the validation sheet, attach the
# sheet's annotations to colData, and column-bind the result onto
# `final_result`, which is saved at the end.

recount_data <- read.csv("../inputs/recount3_selection_2024-04-12.csv")
cleaned_data <- read.csv('../inputs/GEO_model_validation.txt', sep='\t')

# Output directory. Set the RSE_GENE_OUTPUT_DIR environment variable to write
# somewhere else; when it is unset or empty the original location is used.
output_dir <- Sys.getenv("RSE_GENE_OUTPUT_DIR")
if (!nzchar(output_dir)) {
    output_dir <- "/Users/kejiyuan/Desktop/RanceLab/reticula_new/rse_gene"
}

# Run accessions of the validation samples (does not change between projects).
sample_list <- cleaned_data[,'SRR']

final_result <- NULL

print(nrow(recount_data))
needed_project_ids <- c('SRP131784', 'SRP090688', 'SRP161461', 'SRP075814', 'SRP049440')

# seq_len() yields an empty sequence for an empty table (1:0 would not).
for (i in seq_len(nrow(recount_data))) {
    print(paste("Current running", i))
    # Check if the current project ID is one of the needed ones
    if (!(recount_data$project[i] %in% needed_project_ids)) {
        next  # Skip the rest of this iteration if the project ID is not needed
    }

    tryCatch({
        rse_gene <- recount3::create_rse_manual(
            project = recount_data$project[i],
            project_home = recount_data$project_home[i],
            organism = recount_data$organism[i],
            annotation = "gencode_v23",
            type = "gene"
        )

        # Keep only the validation samples that are present in this project.
        keep <- intersect(sample_list, colnames(rse_gene))
        temp_rse_gene <- rse_gene[,keep]

        # Join the sheet's annotations onto colData. merge() may reorder the
        # rows, so they are put back in the column order of temp_rse_gene
        # (despite its name, gene_order holds sample IDs, not genes).
        gene_order <- colData(temp_rse_gene)$external_id
        cur_obj <- merge(colData(temp_rse_gene), cleaned_data, by.x="external_id", by.y = "SRR", all.x=TRUE)
        ordered_indices <- match(gene_order, cur_obj$external_id)
        cur_obj <- cur_obj[ordered_indices, ]

        # NOTE(review): the merged table may not carry the original sample
        # rownames -- confirm colnames(temp_rse_gene) survive this assignment.
        colData(temp_rse_gene) <- cur_obj

        if (is.null(final_result)) {
            final_result <- temp_rse_gene
        } else {
            final_result <- cbind(final_result, temp_rse_gene)
        }
    },
    error = function(cond){
        # Report which project failed and why, then continue with the next.
        print("-----------------")
        print(recount_data$n_samples[i])
        print(recount_data$project[i])
        print(i)
        print(cond)
    })

}

if (is.null(final_result)) {
    warning("No project produced any data; final_result is NULL")
}

save(final_result, file = file.path(output_dir, "GEO_model_validation_rse_gene.Rdata"))


# Generate reaction network mapping from Reactome data

## Produce ReactionNetwork_Rel.txt

1. Install Docker

2. Find `reactome/graphdb` on Docker Hub and run:  
   `docker run -p 7474:7474 -p 7687:7687 -e NEO4J_dbms_memory_heap_maxSize=8g reactome/graphdb:latest`

3. The username is "Neo4j" and the password is "admin".

4. Use the following two queries to retrieve the data:  
   `` MATCH (r2:ReactionLikeEvent {speciesName:"Mus musculus"})-[:precedingEvent]->(r1:ReactionLikeEvent {speciesName:"Mus musculus"}) RETURN r1.stId AS `Preceding Reaction`, r2.stId AS `Following Reaction` ``

   `` MATCH (r1:ReactionLikeEvent {speciesName:"Mus musculus"})-[:output]->(PhysicalEntity)<-[:physicalEntity]-(CatalystActivity)<-[:catalystActivity]-(r2:ReactionLikeEvent {speciesName:"Mus musculus"}) RETURN r1.stId AS `Preceding Reaction`, r2.stId AS `Following Reaction` ``

5. Download the two results as `Mouse1.csv` and `Mouse2.csv` and place them in the `inputs` folder.

6. Use the following code to integrate the data together:

In [None]:
# Read the two Reactome query exports, in order, into file1 and file2.
file1, file2 = (
    pd.read_csv(csv_path)
    for csv_path in ("../inputs/Mouse1.csv", "../inputs/Mouse2.csv")
)

def insertCol(file):
    """Return a copy of ``file`` with a constant ``Relationship`` column.

    The new column holds the string ``"Preceding"`` in every row and is placed
    directly after the first column; all other columns keep their order.

    Unlike a plain ``file['Relationship'] = ...`` assignment, the caller's
    DataFrame is left untouched. Any existing ``Relationship`` column is
    replaced, so calling the function on an already processed frame gives the
    same result instead of duplicating the column and losing a data column.

    Parameters
    ----------
    file : pandas.DataFrame
        Table whose first column should be followed by ``Relationship``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``Relationship`` column in second position.
    """
    result = file.copy()

    # Remove a stale column first so that insert() cannot raise on a duplicate.
    if "Relationship" in result.columns:
        result = result.drop(columns="Relationship")

    # Position 1 is "right after the first column"; clamp for a frame that has
    # no columns left to insert after.
    position = min(1, result.shape[1])
    result.insert(position, "Relationship", "Preceding")
    return result

# Tag both query results with the relationship column, then stack them into a
# single table with a fresh 0..n-1 index.
file1, file2 = (insertCol(frame) for frame in (file1, file2))
merged_df = pd.concat((file1, file2), ignore_index=True)

# Write a tab-separated file without the index; QUOTE_NONNUMERIC quotes every
# non-numeric field.
merged_df.to_csv(
    "../inputs/ReactionNetwork_Rel.txt",
    sep="\t",
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)

## Produce ReactionToPathway_Rel.csv file

1. Install Docker

2. Find reactome/graphdb on Docker Hub and run:  
   `docker run -p 7474:7474 -p 7687:7687 -e NEO4J_dbms_memory_heap_maxSize=8g reactome/graphdb:latest`

3. The username is "Neo4j" and the password is "admin".

4. Use the following query to retrieve the relationship between pathways and reactions:  
   `MATCH (p:Pathway {speciesName:"Mus musculus"})-[:hasEvent]->(r:ReactionLikeEvent)`  
   `RETURN p.stId as Pathway, r.stId as ReactionLikeEvent, r.displayName as Title`

5. Download the result as `ReactionToPathway_Rel.csv` and place it in the `inputs` folder.
