In [None]:
%load_ext rpy2.ipython

## Load python packages

In [None]:
import pandas as pd
import os
import glob
import re
import csv

## Load R packages

In [None]:
%%R
library(SummarizedExperiment)
library(recount3)

# Test API from Recount3

In [None]:
%%R
# Smoke test for the recount3 download path: request the gene-level object for
# one project (SRP104670, mouse, SRA) and show the study title stored in its
# sample metadata. Any error raised along the way is printed instead of
# stopping the notebook.
tryCatch({
    rse_gene <- recount3::create_rse_manual(
        project = "SRP104670",
        project_home = "data_sources/sra",
        organism = "mouse",
        annotation = "gencode_v23",
        type = "gene")
    
    # Last expression of the block: the sra.study_title column of the sample metadata
    colData(rse_gene)$sra.study_title
    
    },error=function(cond){
        # Print the condition object so the failure reason is visible
        print(cond)
    })


# Download Data From Recount3

This R code processes a manually curated list of RNA-seq projects from recount3, specified in a CSV file. It iterates through each project, retrieving metadata for those with 600 or fewer samples using create_rse_manual(), and saves the extracted metadata as a text file. Errors are handled gracefully by printing the sample count of failed projects instead of interrupting execution.

In [None]:
%%R

# Download the sample metadata (colData) of every selected recount3 project
# and write it to one tab-separated text file per project.
data <- read.csv("/mnt/home/yuankeji/RanceLab/reticula_new/recount3_selection_2024-04-12.csv")

# seq_len() yields an empty sequence for an empty table, whereas 1:nrow(data)
# would iterate over c(1, 0).
for (i in seq_len(nrow(data))) {

  # Only projects with a known sample count of at most 600 are downloaded.
  # An NA count would make the comparison itself fail, so it is skipped explicitly.
  if (is.na(data$n_samples[i]) || data$n_samples[i] > 600) {
    next
  }

  tryCatch({
      rse_gene <- recount3::create_rse_manual(
        project = data$project[i],
        project_home = data$project_home[i],
        organism = data$organism[i],
        annotation = "gencode_v23",
        type = "gene"
      )

      output_path <- paste0("/mnt/home/yuankeji/RanceLab/reticula_new/downloadFromRecount3/rse_gene_", data$project[i], ".txt")
      write.table(data.frame(colData(rse_gene)), file = output_path, sep = '\t', quote = FALSE)
    },
    error = function(cond){
      # Keep going after a failed project, but report which one failed and why
      # (the sample count is still printed first, as before).
      print(data$n_samples[i])
      print(paste0("Failed project ", data$project[i], ": ", conditionMessage(cond)))
    }
  )
}

# Extract Useful Data (sra.sample_attributes) From Recount3 Into .csv File

In [None]:
def parse_attributes(attr_str):
    """Turn a ``key;;value|key;;value`` attribute string into a dictionary.

    Fields are separated by ``|``. A field without ``;;`` is ignored, only the
    first ``;;`` in a field separates key from value, and when a key occurs
    more than once the last value is kept. Input that ``pd.isna`` reports as
    missing yields an empty dictionary.
    """
    if pd.isna(attr_str):
        return {}
    key_value_pairs = (
        field.split(';;', 1)
        for field in attr_str.split('|')
        if ';;' in field
    )
    return {key: value for key, value in key_value_pairs}

# Directory containing the per-project metadata files, and where the combined table goes
input_directory = '/mnt/home/yuankeji/RanceLab/reticula_new/downloadFromRecount3/'
output_directory = '/mnt/home/yuankeji/RanceLab/reticula_new/'
output_file = 'combined_data_all_duplicate.csv'

# Metadata column holding the "key;;value|key;;value" strings that get expanded
attribute_column = 'sra.sample_attributes'

# List all text files in the directory; sorted so the row order of the output
# does not depend on the order glob happens to return
file_paths = sorted(glob.glob(os.path.join(input_directory, '*.txt')))

# Collect one expanded frame per file and concatenate once at the end;
# concatenating inside the loop re-copies the growing table on every iteration.
attribute_frames = []
failed_files = []

for index, file_path in enumerate(file_paths):
    print(f"Processing file {index}: {file_path}")
    try:
        # First column of each file is the row name written by write.table
        data = pd.read_csv(file_path, sep='\t', index_col=0)

        # Parse every sample's attribute string into a dictionary (duplicates are kept)
        parsed_attributes = data[attribute_column].apply(parse_attributes)

        # One column per attribute key, one row per sample
        attributes_df = pd.DataFrame(parsed_attributes.tolist(), index=parsed_attributes.index)
        attribute_frames.append(attributes_df)
    except Exception as e:
        # A single unreadable file should not silently truncate the combined
        # table: record it, keep going, and report all failures at the end.
        print(f"Error processing {file_path}: {str(e)}")
        failed_files.append(file_path)

# pd.concat raises on an empty list, so fall back to an empty frame
combined_df = pd.concat(attribute_frames) if attribute_frames else pd.DataFrame()

# Tab-separated despite the .csv extension; the index (sample id) is written as the first column
combined_df.to_csv(os.path.join(output_directory, output_file), sep='\t')

print(f"Data combined and saved to {os.path.join(output_directory, output_file)}")

if failed_files:
    print(f"{len(failed_files)} file(s) could not be processed and are missing from the output:")
    for failed_path in failed_files:
        print(f"  {failed_path}")


# If combined_data_all_duplicate.csv is too large to process at once, use the following code to split it into several parts so that the later steps can run without being interrupted

In [None]:
# Step 1 - split the combined table into three consecutive row ranges.
# Everything is read as text so values are written back unchanged.
df = pd.read_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/combined_data_all_duplicate.csv', sep='\t', dtype=str)

part_size = len(df) // 3

# The last slice is open-ended so the remainder rows end up in part 3
split_targets = [
    (slice(None, part_size), '/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/part1.csv'),
    (slice(part_size, 2*part_size), '/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/part2.csv'),
    (slice(2*part_size, None), '/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/part3.csv'),
]
for row_range, part_path in split_targets:
    df.iloc[row_range].to_csv(part_path, sep='\t', index=False)

# Step 2 - after the useless data has been removed from each part, stack the
# three filtered parts back into a single table with a fresh 0..n-1 index.
filtered_part_paths = (
    '/Users/kejiyuan/Desktop/RanceLab/reticula_new/removingFile_part1.6.csv',
    '/Users/kejiyuan/Desktop/RanceLab/reticula_new/removingFile_part2.6.csv',
    '/Users/kejiyuan/Desktop/RanceLab/reticula_new/removingFile_part3.6.csv',
)
df_part1, df_part2, df_part3 = (
    pd.read_csv(part_path, sep='\t', dtype=str) for part_path in filtered_part_paths
)

df_combined = pd.concat([df_part1, df_part2, df_part3], ignore_index=True)

df_combined.to_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/combined_data_all_duplicate_restored.csv', sep='\t', index=False)


# Filter Data Based On Comparison File (You need to manually generate the comparison file)

In [None]:
# Table to be filtered (comma-separated, every value kept as text) and the
# location the filtered result is written to
input_file_path = '/Users/kejiyuan/Desktop/RanceLab/reticula_new/renamed_data3.csv'
output_file_path = '/Users/kejiyuan/Desktop/RanceLab/reticula_new/renamed_data4.csv'
df_input = pd.read_csv(input_file_path, sep=',', dtype=str)

# Comparison file (tab-separated): first column names a column of the table,
# the remaining cells of the row are the terms to delete from that column.
# Alternative source kept from the original workflow:
# df = pd.read_excel('/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/DeleteTheseTerms.xlsx')
df = pd.read_csv(
    '/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/delete_these_smaller1.txt',
    sep='\t',
    encoding='utf-8',
)

def remove_rows_and_columns(df, column_name, keywords):
    """Delete rows whose ``column_name`` value matches ``keywords``, then prune empties.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter. It is modified in place and also returned.
    column_name : str
        Column to test. If it is not present, a message is printed and ``df``
        is returned untouched.
    keywords : sequence
        Terms to delete. If the first term is ``'*'``, every row with a
        non-missing value in the column is deleted. Otherwise a row is deleted
        when its value contains any term as a case-insensitive literal
        substring. Terms are converted to ``str`` (a comparison file read
        without ``dtype=str`` can hand over numbers), and empty terms are
        ignored because an empty pattern would match every row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after dropping the matching rows and then any
        columns and rows that are entirely missing.
    """
    if column_name not in df.columns:
        print(f"Column '{column_name}' does not exist in the DataFrame.")
        return df

    terms = [str(keyword) for keyword in keywords]
    terms = [term for term in terms if term != '']

    if not terms:
        # Nothing to match: keep every row (previously this raised IndexError)
        condition = pd.Series(False, index=df.index)
    elif terms[0] == '*':
        # Wildcard: delete rows where the column has any value at all
        condition = df[column_name].notna()
    else:
        # Escape regex metacharacters so each term is matched literally
        pattern = '|'.join(re.escape(term) for term in terms)
        condition = df[column_name].str.contains(pattern, case=False, na=False, regex=True)

    df.drop(df[condition].index, inplace=True)

    # Removing rows can leave whole columns (and then whole rows) empty
    df.dropna(axis=1, how='all', inplace=True)
    df.dropna(axis=0, how='all', inplace=True)

    return df

# Header of the comparison file as a one-row frame. It is only needed when the
# header line itself should be treated as a deletion rule, by prepending it:
#   df = pd.concat([columns_as_row, df], ignore_index=True)
columns_as_row = pd.DataFrame([df.columns.tolist()], columns=df.columns)

# Each comparison row is one rule: column name first, terms to delete after it
for index, row in df.iterrows():
    rest_of_values = row.iloc[1:].dropna().values

    # A row without any terms carries no rule
    if len(rest_of_values) == 0:
        continue

    first_column_value = row.iloc[0]
    print(index, df_input.shape, first_column_value, rest_of_values)
    df_input = remove_rows_and_columns(df_input, first_column_value, rest_of_values)

# Note: written tab-separated
df_input.to_csv(output_file_path, sep='\t', index=False)

# Export a file based on key-value pairs of column names to facilitate manual deletion of useless data

In [None]:
# Reorder a Series so that every missing entry comes after the present ones
def move_nulls_to_end(series):
    """Return ``series`` with non-missing values first and missing values last.

    The relative order inside each group is kept and the result is re-indexed
    from 0.
    """
    is_missing = series.isnull()
    reordered = pd.concat([series[~is_missing], series[is_missing]])
    return reordered.reset_index(drop=True)

def process_csv(input_file_path, output_file_path):
    """Export, for every column of a table, the distinct values it contains.

    Reads the tab-separated file at ``input_file_path`` as text, discards its
    first column, and writes a tab-separated file to ``output_file_path`` in
    which each row is labelled with a column name of the input and lists that
    column's distinct values, with missing entries moved to the end of the row.
    """
    table = pd.read_csv(input_file_path, sep='\t', dtype=str)
    table = table.drop(columns=table.columns[0])

    # Distinct values per column, re-indexed from 0 so columns line up by position
    distinct_by_column = {
        column: table[column].drop_duplicates().reset_index(drop=True)
        for column in table.columns
    }

    # Transpose: one row per original column, values spread across the row
    values_per_column = pd.DataFrame(distinct_by_column).transpose()

    values_per_column.apply(move_nulls_to_end, axis=1).to_csv(
        output_file_path, sep='\t', index=True
    )

# Run the export on the filtered table. Replace both paths with your own
# locations: the input is read as a tab-separated file and the output is the
# per-column value listing used for manual clean-up.
input_file_path = '/Users/kejiyuan/Desktop/RanceLab/reticula_new/renamed_data4.csv'
output_file_path = '/Users/kejiyuan/Desktop/RanceLab/reticula_new/key_val_pair.txt'
process_csv(input_file_path, output_file_path)


# Produce Project ID File in Bash

In [None]:
%%bash
# Build SRR.csv: one line per line of every downloaded metadata file, holding
# the file path and that line's first whitespace-separated field, separated by
# a tab. The %%bash magic is required for this cell to run in a Python kernel.
input_directory="/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/downloadFromRecount3/"

# Without nullglob an unmatched pattern is handed to awk literally and fails;
# with it, an empty directory simply produces an empty output file.
shopt -s nullglob

# NOTE: input_directory ends with "/" and the pattern adds another one, so every
# emitted path contains "//" before the file name. Code that parses these paths
# must expect that.
for file in "$input_directory"/*.txt; do
    # Print the file path next to the first field of each line
    awk -v fname="$file" '{print fname "\t" $1}' "$file"
done > "/Users/kejiyuan/Desktop/RanceLab/reticula_new/SRR.csv"

# Merge Project ID Into Final File

In [None]:
# Load the CSV files
# SRR.csv has no header: file path and first field of each metadata line, tab-separated
srr_df = pd.read_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/SRR.csv', header=None, names=['path', 'SRR'], sep='\t', quotechar='"')
recount3_selection_df = pd.read_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/recount3_selection_2024-04-12.csv', sep=',', quotechar='"')
renamed_data4_df = pd.read_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/renamed_data4.csv', sep='\t', quotechar='"')

# Extract the project ID from the file name ("rse_gene_<project>.txt").
# Matching on the file name instead of stripping a hard-coded absolute prefix
# keeps this working when the download directory changes, and avoids
# str.replace, whose default regex behaviour differs between pandas versions
# (as a regex, '.txt' would match any character followed by "txt").
srr_df['project_id'] = srr_df['path'].str.extract(r'rse_gene_([^/]+)\.txt$', expand=False)

unparsed_paths = srr_df.loc[srr_df['project_id'].isna(), 'path'].drop_duplicates()
if len(unparsed_paths) > 0:
    print(f"Could not extract a project ID from {len(unparsed_paths)} path(s), e.g. {unparsed_paths.iloc[0]}")

# Remove rows where SRR is 'rail_id' (the header line of each metadata file)
srr_df = srr_df[srr_df['SRR'] != 'rail_id']

# Merge SRR.csv with recount3_selection.csv on the extracted project ID
merged_df = pd.merge(srr_df, recount3_selection_df, left_on='project_id', right_on='project', how='left')

# Merge the result with renamed_data4.csv on the SRR identifier
final_df = pd.merge(merged_df, renamed_data4_df, on='SRR', how='left')

# Select columns to keep from recount3_selection_2024-04-12.csv excluding 'project'
# NOTE(review): this assumes renamed_data4.csv shares no column name other than
# 'SRR' with the selection file; a shared name would get _x/_y suffixes in the
# merge and the selection below would raise KeyError.
columns_to_keep = ['organism', 'project_home', 'n_samples', 'study_title', 'study_abstract'] + list(renamed_data4_df.columns)
final_df = final_df[columns_to_keep]

# Save the result to a new CSV file
final_df.to_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/renamed_data5.csv', index=False)

print("Merging completed. Output saved as 'renamed_data5.csv'.")

# Merge Column Names Based on a Mapping File (If Necessary)

In [None]:
# Load the data as text so identifiers and numbers are not re-typed
# (e.g. "5" becoming 5.0 in a column that also has missing values)
data_df = pd.read_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/combined_data_all_duplicate_restored.csv', sep='\t', dtype=str)

# Load the mapping file
mapping_df = pd.read_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/original_data/ColumnRenaming.txt', sep='\t', dtype=str)
mapping_df.loc[mapping_df['Old Column Name'] == 'Unnamed: 0', 'New Column'] = 'SRR'  # Make sure this mapping is correct

# Apply renaming; several old columns may receive the same new name
mapping_dict = dict(zip(mapping_df['Old Column Name'], mapping_df['New Column']))
data_df = data_df.rename(columns=mapping_dict)

if 'SRR' not in data_df.columns:
    raise KeyError("No column was renamed to 'SRR'; check the 'Unnamed: 0' row of the mapping file")


# Define the function for merging columns
def same_merge(x):
    """Join the non-missing values of ``x`` with commas; return ``pd.NA`` if there are none."""
    present = x.dropna()
    if len(present) == 0:
        return pd.NA
    return ','.join(present.astype(str))


# Target column order: 'SRR' first, then every other name in order of first
# appearance. Names that are missing (an old column mapped to an empty new
# name) are left out, which matches what grouping by column label did before.
unique_names = [name for name in data_df.columns.unique() if pd.notna(name)]
column_order = ['SRR'] + [name for name in unique_names if name != 'SRR']

# Merge columns that share a name. Columns are selected with a boolean mask:
# selecting by a list of labels that contains a duplicated label returns every
# same-named column once per occurrence, which repeated values in the merge.
merged_columns = {}
for name in column_order:
    same_name_block = data_df.loc[:, data_df.columns == name]
    if same_name_block.shape[1] == 1:
        # Nothing to merge for a unique name
        merged_columns[name] = same_name_block.iloc[:, 0]
    else:
        merged_columns[name] = same_name_block.apply(same_merge, axis=1)

data_df = pd.DataFrame(merged_columns)

# Confirm 'SRR' is the first column and print final columns
print("Final DataFrame columns:", data_df.columns)

# Save the results
data_df.to_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/renamed_data.csv', index=False)

# Final Check On Data
# You can drop tissues that have too few samples, as well as specific tissues by name

In [None]:
data = pd.read_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/rse_gene/CleanedSamplesGNN_v1.txt', sep='\t')

# Number of samples per tissue
tissue_counts = data['Major_tissue'].value_counts()

# Tissues represented by 20 samples or fewer
tissues_to_remove = tissue_counts[tissue_counts.le(20)].index

# Drop the rare tissues and all spleen samples in one pass
is_rare_tissue = data['Major_tissue'].isin(tissues_to_remove)
is_spleen = data['Major_tissue'] == 'Spleen'
data = data[~(is_rare_tissue | is_spleen)]

data.to_csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/rse_gene/CleanedSamplesGNN_v2.txt', sep='\t', index=False)


# Produce rse_gene.Rdata File

In [None]:
%%R

# Build one combined gene-level object from every selected project whose study
# title appears in the cleaned sample table, keeping only the cleaned samples
# and attaching the cleaned annotations to their sample metadata.
recount_data <- read.csv("/Users/kejiyuan/Desktop/RanceLab/reticula_new/recount3_selection_2024-04-12.csv")

# NOTE(review): this call used sep='.', which cannot split a comma-separated
# file into the Title/SRR columns used below; a comma is assumed here. Confirm
# the delimiter of CleanedSamplesGNN_v3.csv.
cleaned_data <- read.csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/rse_gene/CleanedSamplesGNN_v3.csv', sep=',')

# Fail loudly if the table was parsed wrongly; otherwise no project would match
# and an empty result would be saved without any warning.
stopifnot(all(c("Title", "SRR") %in% colnames(cleaned_data)))

final_result <- NULL

print(nrow(recount_data))

# seq_len() is safe for an empty table, unlike 1:nrow()
for (i in seq_len(nrow(recount_data))) {
    # Only projects that contributed at least one cleaned sample (matched by title)
    if (!(recount_data[i, 'study_title'] %in% cleaned_data$Title)) {
        next
    }
    print(paste("Current running", i))

    # Same size limit as the metadata download; an NA count is skipped explicitly
    if (is.na(recount_data$n_samples[i]) || recount_data$n_samples[i] > 600) {
        next
    }

    tryCatch({
        rse_gene <- recount3::create_rse_manual(
          project = recount_data$project[i],
          project_home = recount_data$project_home[i],
          organism = recount_data$organism[i],
          annotation = "gencode_v23",
          type = "gene"
        )

        # Keep only the samples that survived cleaning
        sample_list <- cleaned_data[,'SRR']
        keep <- intersect(sample_list, colnames(rse_gene))
        temp_rse_gene <- rse_gene[,keep]

        # merge() reorders rows, so remember the sample order and restore it
        sample_order <- colData(temp_rse_gene)$external_id
        cur_obj <- merge(colData(temp_rse_gene), cleaned_data, by.x="external_id", by.y = "SRR", all.x=TRUE)
        ordered_indices <- match(sample_order, cur_obj$external_id)
        cur_obj <- cur_obj[ordered_indices, ]

        # merge() does not carry the original row names through; put the sample
        # ids back so the object keeps its column names after colData is replaced
        rownames(cur_obj) <- colnames(temp_rse_gene)

        colData(temp_rse_gene) <- cur_obj

        # Combined inside tryCatch so a project that cannot be bound to the
        # others is reported and skipped rather than aborting the run
        if (is.null(final_result)) {
            final_result <- temp_rse_gene
        } else {
            final_result <- cbind(final_result, temp_rse_gene)
        }
    },
    error = function(cond){
        print("-----------------")
        print(recount_data$n_samples[i])
        print(recount_data$project[i])
        print(i)
        # Show why the project failed, not only which one
        print(conditionMessage(cond))
    })
}

save(final_result, file = "/Users/kejiyuan/Desktop/RanceLab/reticula_new/rse_gene/rse_gene.Rdata")

# Produce GEO_model_validation_rse_gene.Rdata

In [None]:
%%R

# Build the validation object from a fixed list of projects, keeping only the
# samples listed in GEO_model_validation.txt and attaching their annotations.
recount_data <- read.csv("/Users/kejiyuan/Desktop/RanceLab/reticula_new/recount3_selection_2024-04-12.csv")
cleaned_data <- read.csv('/Users/kejiyuan/Desktop/RanceLab/reticula_new/rse_gene/GEO_model_validation.txt', sep='\t')

# The sample id column is required for both the subsetting and the merge below
stopifnot("SRR" %in% colnames(cleaned_data))

final_result <- NULL

print(nrow(recount_data))
needed_project_ids <- c('SRP131784', 'SRP090688', 'SRP161461', 'SRP075814', 'SRP049440')

# Report requested projects that the selection file does not contain, since
# they would otherwise be left out silently
missing_project_ids <- setdiff(needed_project_ids, recount_data$project)
if (length(missing_project_ids) > 0) {
    print(paste("Not found in selection file:", paste(missing_project_ids, collapse = ", ")))
}

# seq_len() is safe for an empty table, unlike 1:nrow()
for (i in seq_len(nrow(recount_data))) {
    # Skip every project that is not one of the needed ones
    if (!(recount_data$project[i] %in% needed_project_ids)) {
        next
    }
    print(paste("Current running", i))

    tryCatch({
        rse_gene <- recount3::create_rse_manual(
          project = recount_data$project[i],
          project_home = recount_data$project_home[i],
          organism = recount_data$organism[i],
          annotation = "gencode_v23",
          type = "gene"
        )

        # Keep only the validation samples
        sample_list <- cleaned_data[,'SRR']
        keep <- intersect(sample_list, colnames(rse_gene))
        temp_rse_gene <- rse_gene[,keep]

        # merge() reorders rows, so remember the sample order and restore it
        sample_order <- colData(temp_rse_gene)$external_id
        cur_obj <- merge(colData(temp_rse_gene), cleaned_data, by.x="external_id", by.y = "SRR", all.x=TRUE)
        ordered_indices <- match(sample_order, cur_obj$external_id)
        cur_obj <- cur_obj[ordered_indices, ]

        # merge() does not carry the original row names through; put the sample
        # ids back so the object keeps its column names after colData is replaced
        rownames(cur_obj) <- colnames(temp_rse_gene)

        colData(temp_rse_gene) <- cur_obj

        # Combined inside tryCatch so a project that cannot be bound to the
        # others is reported and skipped rather than aborting the run
        if (is.null(final_result)) {
            final_result <- temp_rse_gene
        } else {
            final_result <- cbind(final_result, temp_rse_gene)
        }
    },
    error = function(cond){
        print("-----------------")
        print(recount_data$n_samples[i])
        print(recount_data$project[i])
        print(i)
        # Show why the project failed, not only which one
        print(conditionMessage(cond))
    })

}

save(final_result, file = "/Users/kejiyuan/Desktop/RanceLab/reticula_new/rse_gene/GEO_model_validation_rse_gene.Rdata")


# Produce ReactionNetwork_Rel.txt

1. Install Docker 
2. Find reactome/graphdb on Docker Hub and run it:
    docker run -p 7474:7474 -p 7687:7687 -e NEO4J_dbms_memory_heap_maxSize=8g reactome/graphdb:latest
3. The username is "Neo4j" and the password is "admin".
4. Use the following two queries to get the data:
    "MATCH (r2:ReactionLikeEvent {speciesName:"Mus musculus"})-[:precedingEvent]->(r1:ReactionLikeEvent {speciesName:"Mus musculus"}) RETURN r1.stId as `Preceding Reaction`, r2.stId as `Following Reaction`"
    
    "MATCH (r1:ReactionLikeEvent {speciesName:"Mus musculus"})-[:output]->(PhysicalEntity)<-[:physicalEntity]-(CatalystActivity)<-[:catalystActivity]-(r2:ReactionLikeEvent {speciesName:"Mus musculus"}) RETURN r1.stId as `Preceding Reaction`,r2.stId as `Following Reaction`"
   
5. Download the results as Mouse1.csv and Mouse2.csv.
6. Use the following code to integrate the data:


In [None]:
# Load both Reactome query exports from the current working directory
file1, file2 = (
    pd.read_csv(os.getcwd() + export_name)
    for export_name in ("/Mouse1.csv", "/Mouse2.csv")
)

def insertCol(file, relationship="Preceding"):
    """Return a copy of ``file`` with a ``Relationship`` column as its second column.

    Parameters
    ----------
    file : pandas.DataFrame
        Table of reaction pairs. It is not modified.
    relationship : str, optional
        Value written into every row of the new column. Defaults to
        ``"Preceding"``.

    Returns
    -------
    pandas.DataFrame
        New frame with the original first column, then ``Relationship``, then
        the remaining original columns in their original order. A
        ``Relationship`` column already present in ``file`` is replaced.
    """
    # drop() returns a new frame, so the caller's object is left untouched
    labelled = file.drop(columns="Relationship", errors="ignore")
    # min() keeps the position valid for a frame without any columns
    labelled.insert(min(1, labelled.shape[1]), "Relationship", relationship)
    return labelled

# Label both exports, stack them with a fresh index, and write the network file
file1, file2 = insertCol(file1), insertCol(file2)

merged_df = pd.concat([file1, file2], ignore_index=True)

# Tab-separated; every non-numeric field is quoted
merged_df.to_csv(
    "ReactionNetwork_Rel.txt",
    sep="\t",
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)

# Produce ReactionToPathway_Rel.csv file

1. Install Docker 
2. Find reactome/graphdb on Docker Hub and run it:
    docker run -p 7474:7474 -p 7687:7687 -e NEO4J_dbms_memory_heap_maxSize=8g reactome/graphdb:latest
3. The username is "Neo4j" and the password is "admin".
4. Use the following query to get the relationship between pathways and reactions:
    "MATCH (p:Pathway {speciesName:"Mus musculus"})-[:hasEvent]->(r:ReactionLikeEvent)
    RETURN p.stId as Pathway, r.stId as ReactionLikeEvent, r.displayName as Title"
5. Download the result as ReactionToPathway_Rel.csv.