# load packages

In [None]:
import pandas as pd

In [None]:
from scipy.stats import zscore

In [None]:
import requests

In [None]:
import sys

# read in input files

## reactome databases

In [None]:
# Load the tab-separated Reactome gene file. It is read without a header row,
# so the columns receive integer labels.
reactome_gene = pd.read_table(
    'raw_databases/ReactomePathways.gene.tsv',
    header=None,
)
reactome_gene.head()

In [None]:
# Load the tab-separated UniProt-to-Reactome file (no header row; integer
# column labels).
reactome_protein = pd.read_table(
    'raw_databases/UniProt2Reactome_All_Levels.txt',
    header=None,
)
reactome_protein.head()

In [None]:
# Load the tab-separated Reactome pathway file (no header row; integer column
# labels).
reactome_path = pd.read_table(
    'raw_databases/ReactomePathways.txt',
    header=None,
)
reactome_path.head()

## gene/protein lists

### gene list

In [None]:
# Load the gene-score gene list with the default (comma) delimiter and report
# how many rows it has.
meta_gene_list = pd.read_csv(
    'merged_outputs/AOU_ALL.UKBB.metasoft.ADSP.all.VEP_v113.gene_by_position.r2_0.1_clump_variants_excluded.RE_pval_threshold_0.05.gene_score_list.txt'
)
print(meta_gene_list.shape[0])
meta_gene_list.head()

### rnaseq gene list

In [None]:
# Load the ROSMAP RNAseq gene list. No header row is present, so the single
# parsed column is labelled 0.
rnaseq_gene = pd.read_csv('rosmap/ROSMAP_RNAseq_genes.txt', header=None)
print(rnaseq_gene.shape[0])
rnaseq_gene.head()

In [None]:
# Load the MSBB RNAseq gene list (first line is used as the header) and report
# the row count.
msbb_rnaseq_gene = pd.read_csv(
    'msbb/MSBB.RNAseq.BA_ALL.19batch.gene_symbol.individualID.mvalue_norm.tpm.log2.coding_genes_only.covar_corrected.gene_list.txt'
)
print(msbb_rnaseq_gene.shape[0])
msbb_rnaseq_gene.head()

### methylation gene list

In [None]:
# Load the ROSMAP methylation gene list and report the row count.
methyl_gene = pd.read_csv(
    'rosmap/ROSMAP_arrayMethylation_imputed.gene_list.txt'
)
print(methyl_gene.shape[0])
methyl_gene.head()

In [None]:
# Load the MSBB methylation gene list and report the row count.
msbb_methyl_gene = pd.read_csv(
    'msbb/MSBB.methylation_array.19batch.gene_symbol.individualID.mvalue_norm.gene_list.txt'
)
print(msbb_methyl_gene.shape[0])
msbb_methyl_gene.head()

### LC-SRM protein output from UniProt

In [None]:
# Load the gzipped, tab-separated ID-mapping table for the SRM proteins and
# report the row count.
srm_protein = pd.read_table(
    'raw_databases/idmapping_reviewed_true_AND_organism_id_2025_04_30.rosmap.proteomics.srm.peptide.round1.tsv.gz'
)
print(srm_protein.shape[0])
srm_protein.head()

### TMT protein list

In [None]:
# Load the gzipped, tab-separated ID-mapping table for the TMT proteins and
# report the row count.
tmt_protein = pd.read_table(
    'raw_databases/idmapping_model_organism_9606_AND_revie_2025_04_30.rosmap.proteomics.tmt.round1.tsv.gz'
)
print(tmt_protein.shape[0])
tmt_protein.head()

In [None]:
# Load the MSBB TMT proteomics gene list and report the row count.
msbb_tmt_gene = pd.read_csv(
    'msbb/MSBB.TMT_proteomics.19batch.normalized.gene_symbol.individualID.log2_transformed.gene_list.txt'
)
print(msbb_tmt_gene.shape[0])
msbb_tmt_gene.head()

### SomaScan

In [None]:
# Load the ROSMAP somoscan gene-symbol list and report the row count.
somoscan_gene = pd.read_csv(
    'rosmap/ROSMAP.proteomics.somoscan.individualID.entrez_gene_symbol.gene_list.txt'
)
print(somoscan_gene.shape[0])
somoscan_gene.head()

In [None]:
# Load the ROSMAP somoscan protein list and report the row count.
somoscan_protein = pd.read_csv(
    'rosmap/ROSMAP.proteomics.somoscan.individualID.uniprot.protein_list.txt'
)
print(somoscan_protein.shape[0])
somoscan_protein.head()

## raw data

### gene scores

In [None]:
# Load the gzipped, tab-separated merged gene-score table and show its
# (rows, columns) shape.
gene_score = pd.read_table(
    'merged_outputs/AOU_ALL.UKBB.metasoft.ADSP.all.VEP_v113.gene_by_position.r2_0.1_clump_variants_excluded.RE_pval_threshold_0.05.average_gene_score.merged.txt.gz'
)
print(gene_score.shape)
gene_score.head()

### rnaseq exp data

In [None]:
# Load the tab-separated ROSMAP RNAseq expression table and show its
# (rows, columns) shape.
rnaseq_exp = pd.read_table(
    'rosmap/ROSMAP_RNAseq_TPM5_log2norm_individualids_codinggenes.tsv'
)
print(rnaseq_exp.shape)
rnaseq_exp.head()

## gene map

In [None]:
# Load the tab-separated gene map. Later cells read its ENS_ID, GENE, CHR,
# START, STOP and +/-500kb window columns.
gene_map = pd.read_table(
    'ensembl_start_stop/Homo_sapiens.GRCh38.113.gene_start_stop.autosomes.500kb_upstream_downstream.gtf.txt'
)
gene_map.head()

# clean pathway files

## gene

In [None]:
# Column 2 holds a '|'-delimited gene string. Expand it to one row per
# (pathway, gene), discard the packed column, and name the remaining two
# integer-labelled columns.
reactome_gene_split = (
    reactome_gene
    .assign(GENE=reactome_gene[2].str.split('|'))
    .explode('GENE')
    .reset_index(drop=True)
    .drop(columns=[2])
    .rename(columns={0: 'PATHWAY_NAME', 1: 'PATHWAY_ID'})
)
reactome_gene_split.head()

In [None]:
# Count, then list, the distinct pathway names in the exploded gene table.
print(reactome_gene_split['PATHWAY_NAME'].unique().size)
reactome_gene_split['PATHWAY_NAME'].unique()

## protein

In [None]:
# Keep rows whose column 5 is 'Homo sapiens', select columns 3, 1, 0 in that
# order, and give them descriptive names.
reactome_protein_human = (
    reactome_protein
    .loc[reactome_protein[5].isin(['Homo sapiens']), [3, 1, 0]]
    .rename(columns={3: 'PATHWAY_NAME', 1: 'PATHWAY_ID', 0: 'PROTEIN'})
)
reactome_protein_human.head()

## pathway

In [None]:
# Keep pathway rows whose column 2 is 'Homo sapiens'.
reactome_path_human = reactome_path.loc[
    reactome_path[2].isin(['Homo sapiens'])
]
reactome_path_human.head()

# map ENSIDs to gene names for gene scores

## map

In [None]:
# Restrict the gene map to ENS_IDs present in the gene-score gene list, then
# compare the row count with the number of distinct ENS_IDs and gene symbols.
meta_gene_map = gene_map.loc[gene_map['ENS_ID'].isin(meta_gene_list['GENE'])]
print(meta_gene_map.shape[0])
print(meta_gene_map['ENS_ID'].unique().size)
print(meta_gene_map['GENE'].unique().size)
meta_gene_map.head()

## investigate dups

In [None]:
# Gene symbols repeated within meta_gene_map, looked up against the full
# gene_map so every ENS_ID sharing such a symbol is shown.
meta_gene_map_dups = gene_map.loc[
    gene_map['GENE'].isin(
        meta_gene_map.loc[meta_gene_map['GENE'].duplicated(), 'GENE']
    )
]
meta_gene_map_dups

## fix duplicates

In [None]:
# Drop every row whose gene symbol was flagged as duplicated, and print the
# row count before and after.
meta_gene_map_no_dup = meta_gene_map.loc[
    ~meta_gene_map['GENE'].isin(meta_gene_map_dups['GENE'])
]
print(meta_gene_map.shape[0])
print(meta_gene_map_no_dup.shape[0])

In [None]:
# no finngen meta fix
# For the duplicated symbols, keep only these hand-picked ENS_IDs.
meta_gene_map_dup_fixed = meta_gene_map_dups.loc[
    meta_gene_map_dups['ENS_ID'].isin([
        'ENSG00000113231',
        'ENSG00000310517',
        'ENSG00000253147',
        'ENSG00000182957',
        'ENSG00000105650',
    ])
]
meta_gene_map_dup_fixed

In [None]:
# Stack the duplicate-free rows with the hand-resolved duplicates.
meta_gene_map_final = pd.concat([meta_gene_map_no_dup, meta_gene_map_dup_fixed])
print(meta_gene_map_final.shape[0])

## subset

In [None]:
# Only the ENS_ID -> GENE pairing is needed from here on.
meta_gene_map_final = meta_gene_map_final.loc[:, ['ENS_ID', 'GENE']]
meta_gene_map_final.head()

# map ENSIDs to gene names for RNAseq genes

## map

In [None]:
# Restrict the gene map to ENS_IDs in the RNAseq gene list (column 0), then
# compare input size, mapped rows, and distinct ENS_IDs / gene symbols.
gene_map_rnaseq = gene_map.loc[gene_map['ENS_ID'].isin(rnaseq_gene[0])]
print(rnaseq_gene.shape[0])
print(gene_map_rnaseq.shape[0])
print(gene_map_rnaseq['ENS_ID'].unique().size)
print(gene_map_rnaseq['GENE'].unique().size)
gene_map_rnaseq.head()

## subset

In [None]:
# Only the ENS_ID -> GENE pairing is needed from here on.
gene_map_rnaseq = gene_map_rnaseq.loc[:, ['ENS_ID', 'GENE']]
gene_map_rnaseq.head()

# split protein genes into the distinct rows

## split srm

In [None]:
# Split 'Gene Names' on space, comma or slash and expand to one row per gene
# token; print row counts after and before the expansion.
srm_protein_split = (
    srm_protein
    .assign(GENE=srm_protein['Gene Names'].str.split(r'[ ,/]'))
    .explode('GENE')
    .reset_index(drop=True)
)
print(srm_protein_split.shape[0])
print(srm_protein.shape[0])
srm_protein_split.head()

## split tmt

In [None]:
# Split 'Gene Names' on single spaces and expand to one row per gene token;
# print row counts after and before the expansion.
tmt_protein_split = (
    tmt_protein
    .assign(GENE=tmt_protein['Gene Names'].str.split(' '))
    .explode('GENE')
    .reset_index(drop=True)
)
print(tmt_protein_split.shape[0])
print(tmt_protein.shape[0])
tmt_protein_split.head()

In [None]:
# Tokens matching 'cl' or 'H4/' are set aside so they are not broken up by
# the later '/' split.
# na=False: rows with a missing GENE (e.g. an empty 'Gene Names' entry) would
# otherwise put NaN into the mask, which cannot be used for boolean indexing;
# they are treated as non-matching.
tmt_slash_exclude = tmt_protein_split[tmt_protein_split['GENE'].str.contains('cl|H4/', na=False)]
tmt_slash_exclude['GENE'].unique()

In [None]:
# Everything not set aside above, selected by index label; the three counts
# should satisfy include + exclude == total.
tmt_slash_include = tmt_protein_split.loc[
    ~tmt_protein_split.index.isin(tmt_slash_exclude.index)
]
print(tmt_slash_include.shape[0])
print(tmt_slash_exclude.shape[0])
print(tmt_protein_split.shape[0])

In [None]:
# Split the remaining tokens on '/' into separate rows; row counts are printed
# before and after the expansion.
print(tmt_slash_include.shape[0])
tmt_slash_include_split = (
    tmt_slash_include
    .assign(GENE=tmt_slash_include['GENE'].str.split('/'))
    .explode('GENE')
    .reset_index(drop=True)
)
print(tmt_slash_include_split.shape[0])
tmt_slash_include_split.head()

In [None]:
# Recombine the '/'-split rows with the rows that were set aside.
# ignore_index=True gives the result a fresh 0..n-1 index: the first frame was
# re-indexed from 0 while the second still carries its original labels, so a
# plain concat would leave repeated index labels in the combined table.
tmt_protein_split = pd.concat([tmt_slash_include_split, tmt_slash_exclude], axis = 0, ignore_index = True)
print(len(tmt_protein_split.index))
tmt_protein_split.head()

## remove extra characters for tmt

In [None]:
# Strip ';' and apostrophes from the gene tokens, then confirm none remain
# (the printed array should be empty).
tmt_protein_split['GENE'] = tmt_protein_split['GENE'].str.replace(r"[;']", "", regex = True)
# na=False: a missing GENE would otherwise put NaN into the mask, which cannot
# be used for boolean indexing.
print(tmt_protein_split[tmt_protein_split['GENE'].str.contains(";|'", na = False)]['GENE'].unique())
print(len(tmt_protein_split.index))

## check for remaining special characters

In [None]:
# Show SRM gene tokens containing anything other than letters, digits or '-'.
# na=False: rows with a missing GENE would otherwise put NaN into the mask,
# which cannot be used for boolean indexing; they are left out of the listing.
mask = srm_protein_split['GENE'].str.contains(r'[^a-zA-Z0-9-]', regex = True, na = False)
srm_protein_split[mask]

In [None]:
# Show TMT gene tokens containing characters outside letters, digits and
# '-', '.', '_', ';', ignoring tokens that match 'UNQ' or 'cl'.
# na=False: rows with a missing GENE would otherwise put NaN into the mask,
# which cannot be used for boolean indexing; they are left out of the listing.
mask = tmt_protein_split['GENE'].str.contains(r'[^a-zA-Z0-9-._;]', regex = True, na = False)
invalid = tmt_protein_split[mask]
invalid[~invalid['GENE'].str.contains('UNQ|cl')]

In [None]:
# List any TMT gene tokens that still contain ';' or an apostrophe.
# na=False keeps a missing GENE from putting NaN into the boolean mask.
tmt_protein_split[tmt_protein_split['GENE'].str.contains(";|'", na = False)]['GENE'].unique()

# map RNAseq exp data ENSIDs to gene names

## transpose df

In [None]:
# Flip the table so the original column labels become rows, exposed as an
# 'ENS_ID' column for the merge with the gene map.
rnaseq_exp_transpose = (
    rnaseq_exp.T
    .reset_index()
    .rename(columns={"index": "ENS_ID"})
)
rnaseq_exp_transpose.head()

## extract ID row

In [None]:
# The first transposed row is set aside so it survives the inner merge with
# the gene map; its GENE label is copied from ENS_ID.
# NOTE(review): presumably this row holds the sample identifiers — confirm.
# .copy() detaches the one-row slice so the GENE column is added to an
# independent frame (avoids pandas' SettingWithCopyWarning).
rnaseq_exp_id = rnaseq_exp_transpose.iloc[:1].copy()
rnaseq_exp_id['GENE'] = rnaseq_exp_id['ENS_ID']
rnaseq_exp_id.head()

## filter to gene map

In [None]:
# Inner-join on ENS_ID to attach gene symbols (rows without a match are
# dropped), then discard the positional columns brought in from the gene map.
rnaseq_exp_gene_map = (
    rnaseq_exp_transpose
    .merge(gene_map, on='ENS_ID', how='inner')
    .drop(columns=['CHR', 'START', 'STOP', 'START_500kb_upstream', 'STOP_500kb_downstream'])
)
print(rnaseq_exp_gene_map.shape[0])
rnaseq_exp_gene_map.head()

## add id row back in

In [None]:
# Put the set-aside first row back on top of the gene-mapped rows.
rnaseq_exp_gene_map_id = pd.concat([rnaseq_exp_id, rnaseq_exp_gene_map])
rnaseq_exp_gene_map_id.head()

## update index

In [None]:
# Index by gene symbol so the symbols become column labels after transposing.
rnaseq_exp_gene_map_id = rnaseq_exp_gene_map_id.set_index('GENE')
rnaseq_exp_gene_map_id.head()

## remove ENSID column

In [None]:
# ENS_ID is no longer needed once rows are keyed by gene symbol.
rnaseq_exp_gene_map_id = rnaseq_exp_gene_map_id.drop(columns=['ENS_ID'])
rnaseq_exp_gene_map_id.head()

## transpose back

In [None]:
# Flip back so gene symbols are the column labels again.
rnaseq_exp_ensembl_gene_map_tranposed = rnaseq_exp_gene_map_id.T
rnaseq_exp_ensembl_gene_map_tranposed.head()

# map gene scores data ENS IDs to gene names

## remove avg label from colnames

In [None]:
# Remove the literal '_AVG' text from every column label (plain substring
# replacement, not a regex).
gene_score.columns = gene_score.columns.str.replace(
    pat='_AVG',
    repl='',
    regex=False,
)
gene_score.columns

## extract pheno + covariate columns

In [None]:
# Keep ID plus the phenotype and covariate columns so they can be merged back
# after the gene columns are relabelled.
gene_score_covar = gene_score.loc[
    :,
    ['ID', 'ALZ_STATUS', 'AGE', 'SEX', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8'],
]

## drop pheno + covariate columns

In [None]:
# Remove the phenotype and covariate columns; ID is kept.
gene_score = gene_score.drop(
    columns=['ALZ_STATUS', 'AGE', 'SEX', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8']
)
gene_score.columns

## transpose

In [None]:
# Flip the table so the original column labels become rows, exposed as an
# 'ENS_ID' column for the merge with the gene map.
gene_score_transpose = (
    gene_score.T
    .reset_index()
    .rename(columns={"index": "ENS_ID"})
)
gene_score_transpose.head()

## extract ID row

In [None]:
# The first transposed row is set aside so it survives the inner merge with
# the gene map; its GENE label is copied from ENS_ID.
# NOTE(review): presumably this is the 'ID' row, since a later cell merges on
# an 'ID' column — confirm that 'ID' is the first column of gene_score.
# .copy() detaches the one-row slice so the GENE column is added to an
# independent frame (avoids pandas' SettingWithCopyWarning).
gene_score_id = gene_score_transpose.iloc[:1].copy()
gene_score_id['GENE'] = gene_score_id['ENS_ID']
gene_score_id.head()

## filter to gene map

In [None]:
# Inner-join on ENS_ID to attach gene symbols (rows without a match are
# dropped), then discard the positional columns brought in from the gene map.
gene_score_gene_map = (
    gene_score_transpose
    .merge(gene_map, on='ENS_ID', how='inner')
    .drop(columns=['CHR', 'START', 'STOP', 'START_500kb_upstream', 'STOP_500kb_downstream'])
)
print(gene_score_gene_map.shape[0])
gene_score_gene_map.head()

## add id row back in

In [None]:
# Put the set-aside first row back on top of the gene-mapped rows.
gene_score_gene_map_id = pd.concat([gene_score_id, gene_score_gene_map])
gene_score_gene_map_id.head()

## update index

In [None]:
# Index by gene symbol so the symbols become column labels after transposing.
gene_score_gene_map_id = gene_score_gene_map_id.set_index('GENE')
gene_score_gene_map_id.head()

## drop ensid column

In [None]:
# ENS_ID is no longer needed once rows are keyed by gene symbol.
gene_score_gene_map_id = gene_score_gene_map_id.drop(columns=['ENS_ID'])
gene_score_gene_map_id.head()

## transpose back

In [None]:
# Flip back so gene symbols are the column labels again.
gene_score_ensembl_gene_map_tranposed = gene_score_gene_map_id.T
gene_score_ensembl_gene_map_tranposed.head()

## add covariates back in

In [None]:
# Re-attach the phenotype and covariate columns by inner-joining on ID, with
# the covariates as the left-hand table.
gene_score_ensembl_gene_map_tranposed = pd.merge(
    gene_score_covar,
    gene_score_ensembl_gene_map_tranposed,
    on='ID',
    how='inner',
)
print(gene_score_ensembl_gene_map_tranposed.shape)
gene_score_ensembl_gene_map_tranposed.head()

# map genes to pathways

## gene scores

In [None]:
# Reactome (pathway, gene) rows whose gene symbol is in the gene-score map.
# .copy() makes the filtered subset an independent frame, so SOURCE is added
# to it directly instead of to a slice of reactome_gene_split (avoids pandas'
# SettingWithCopyWarning).
gs_reactome = reactome_gene_split[reactome_gene_split['GENE'].isin(meta_gene_map_final['GENE'])].copy()
gs_reactome['SOURCE'] = 'ADSP_GS'
print(len(gs_reactome.index))                      # mapped (pathway, gene) rows
print(len(gs_reactome['GENE'].unique()))           # distinct genes that mapped
print(len(reactome_gene_split['GENE'].unique()))   # distinct genes in Reactome
print(len(meta_gene_map_final.index))              # rows in the input gene map
print(len(gs_reactome['PATHWAY_ID'].unique()))     # distinct pathways hit
gs_reactome.head()

## rnaseq

In [None]:
# Reactome (pathway, gene) rows whose gene symbol is in the ROSMAP RNAseq map.
# .copy() makes the filtered subset an independent frame, so SOURCE is added
# to it directly instead of to a slice of reactome_gene_split (avoids pandas'
# SettingWithCopyWarning).
rs_reactome = reactome_gene_split[reactome_gene_split['GENE'].isin(gene_map_rnaseq['GENE'])].copy()
rs_reactome['SOURCE'] = 'ROSMAP_RS'
print(len(rs_reactome.index))                      # mapped (pathway, gene) rows
print(len(rs_reactome['GENE'].unique()))           # distinct genes that mapped
print(len(reactome_gene_split['GENE'].unique()))   # distinct genes in Reactome
print(len(gene_map_rnaseq.index))                  # rows in the input gene map
print(len(rs_reactome['PATHWAY_ID'].unique()))     # distinct pathways hit
rs_reactome.head()

In [None]:
# Reactome (pathway, gene) rows whose gene symbol is in the MSBB RNAseq list.
# .copy() makes the filtered subset an independent frame, so SOURCE is added
# to it directly instead of to a slice of reactome_gene_split (avoids pandas'
# SettingWithCopyWarning).
msbb_rs_reactome = reactome_gene_split[reactome_gene_split['GENE'].isin(msbb_rnaseq_gene['GENE'])].copy()
msbb_rs_reactome['SOURCE'] = 'MSBB_RS'
print(len(msbb_rs_reactome.index))                    # mapped (pathway, gene) rows
print(len(msbb_rs_reactome['GENE'].unique()))         # distinct genes that mapped
print(len(reactome_gene_split['GENE'].unique()))      # distinct genes in Reactome
print(len(msbb_rnaseq_gene.index))                    # rows in the input gene list
print(len(msbb_rs_reactome['PATHWAY_ID'].unique()))   # distinct pathways hit
msbb_rs_reactome.head()

## methylation

In [None]:
# Reactome (pathway, gene) rows whose gene symbol appears in the 'RefGene'
# column of the ROSMAP methylation gene list.
# .copy() makes the filtered subset an independent frame, so SOURCE is added
# to it directly instead of to a slice of reactome_gene_split (avoids pandas'
# SettingWithCopyWarning).
ma_reactome = reactome_gene_split[reactome_gene_split['GENE'].isin(methyl_gene['RefGene'])].copy()
ma_reactome['SOURCE'] = 'ROSMAP_MA'
print(len(ma_reactome.index))                      # mapped (pathway, gene) rows
print(len(ma_reactome['GENE'].unique()))           # distinct genes that mapped
print(len(reactome_gene_split['GENE'].unique()))   # distinct genes in Reactome
print(len(methyl_gene.index))                      # rows in the input gene list
print(len(ma_reactome['PATHWAY_ID'].unique()))     # distinct pathways hit
ma_reactome.head()

In [None]:
# Reactome (pathway, gene) rows whose gene symbol is in the MSBB methylation
# gene list.
# .copy() makes the filtered subset an independent frame, so SOURCE is added
# to it directly instead of to a slice of reactome_gene_split (avoids pandas'
# SettingWithCopyWarning).
msbb_ma_reactome = reactome_gene_split[reactome_gene_split['GENE'].isin(msbb_methyl_gene['GENE'])].copy()
msbb_ma_reactome['SOURCE'] = 'MSBB_MA'
print(len(msbb_ma_reactome.index))                    # mapped (pathway, gene) rows
print(len(msbb_ma_reactome['GENE'].unique()))         # distinct genes that mapped
print(len(reactome_gene_split['GENE'].unique()))      # distinct genes in Reactome
print(len(msbb_methyl_gene.index))                    # rows in the input gene list
print(len(msbb_ma_reactome['PATHWAY_ID'].unique()))   # distinct pathways hit
msbb_ma_reactome.head()

## tmt protein

In [None]:
# Reactome (pathway, gene) rows whose gene symbol is in the MSBB TMT
# proteomics gene list.
# .copy() makes the filtered subset an independent frame, so SOURCE is added
# to it directly instead of to a slice of reactome_gene_split (avoids pandas'
# SettingWithCopyWarning).
msbb_tmt_reactome_gene = reactome_gene_split[reactome_gene_split['GENE'].isin(msbb_tmt_gene['GENE'])].copy()
msbb_tmt_reactome_gene['SOURCE'] = 'MSBB_TP'
print(len(msbb_tmt_reactome_gene.index))                    # mapped (pathway, gene) rows
print(len(msbb_tmt_reactome_gene['GENE'].unique()))         # distinct genes that mapped
print(len(reactome_gene_split['GENE'].unique()))            # distinct genes in Reactome
print(len(msbb_tmt_gene.index))                             # rows in the input gene list
print(len(msbb_tmt_reactome_gene['PATHWAY_ID'].unique()))   # distinct pathways hit
msbb_tmt_reactome_gene.head()

## SomaScan protein

In [None]:
# Reactome (pathway, gene) rows whose gene symbol is in the ROSMAP somoscan
# gene list.
# .copy() makes the filtered subset an independent frame, so SOURCE is added
# to it directly instead of to a slice of reactome_gene_split (avoids pandas'
# SettingWithCopyWarning).
somoscan_reactome_gene = reactome_gene_split[reactome_gene_split['GENE'].isin(somoscan_gene['GENE'])].copy()
somoscan_reactome_gene['SOURCE'] = 'ROSMAP_SP'
print(len(somoscan_reactome_gene.index))                    # mapped (pathway, gene) rows
print(len(somoscan_reactome_gene['GENE'].unique()))         # distinct genes that mapped
print(len(reactome_gene_split['GENE'].unique()))            # distinct genes in Reactome
print(len(somoscan_gene['GENE'].unique()))                  # distinct genes in the input list
#print(len(somoscan_protein.index))
print(len(somoscan_reactome_gene['PATHWAY_ID'].unique()))   # distinct pathways hit
somoscan_reactome_gene.head()

In [None]:
# Distinct pathway counts for the gene-based and protein-based SRM mappings.
# NOTE(review): srm_reactome_gene and srm_reactome_protein are not assigned
# anywhere in the visible part of this notebook, so a top-to-bottom run would
# raise NameError here unless they are built elsewhere — confirm.
print(srm_reactome_gene['PATHWAY_ID'].unique().size)
print(srm_reactome_protein['PATHWAY_ID'].unique().size)

In [None]:
# Number of distinct pathways shared by the two SRM mappings, computed from
# each side.
# NOTE(review): srm_reactome_gene and srm_reactome_protein are not assigned
# anywhere in the visible part of this notebook — confirm where they are built.
print(
    srm_reactome_protein.loc[
        srm_reactome_protein['PATHWAY_ID'].isin(srm_reactome_gene['PATHWAY_ID']),
        'PATHWAY_ID',
    ].unique().size
)
print(
    srm_reactome_gene.loc[
        srm_reactome_gene['PATHWAY_ID'].isin(srm_reactome_protein['PATHWAY_ID']),
        'PATHWAY_ID',
    ].unique().size
)

In [None]:
# Pathways reached through the gene-based SRM mapping but absent from the
# protein-based one, one row per pathway.
# NOTE(review): srm_reactome_gene and srm_reactome_protein are not assigned
# anywhere in the visible part of this notebook — confirm where they are built.
srm_reactome_gene.loc[
    ~srm_reactome_gene['PATHWAY_ID'].isin(srm_reactome_protein['PATHWAY_ID']),
    ['PATHWAY_NAME', 'PATHWAY_ID'],
].drop_duplicates()

# merge

In [None]:
# Stack the per-dataset pathway maps, then collapse to one row per
# (pathway name, pathway id, gene) whose SOURCE is the sorted, de-duplicated
# dataset labels joined by ';'.
all_gene = pd.concat(
    [gs_reactome, rs_reactome, msbb_rs_reactome, ma_reactome, msbb_ma_reactome, msbb_tmt_reactome_gene]
)
all_gene = (
    all_gene
    .groupby(['PATHWAY_NAME', 'PATHWAY_ID', 'GENE'])['SOURCE']
    .agg(lambda labels: ';'.join(sorted(set(labels))))
    .reset_index()
)
print(all_gene.shape[0])
all_gene.head()

In [None]:
# Add the somoscan gene mapping to the combined table and re-collapse, so each
# (pathway name, pathway id, gene) again has one ';'-joined SOURCE string.
all_gene_somoscan_protein_gene = pd.concat([all_gene, somoscan_reactome_gene])
all_gene_somoscan_protein_gene = (
    all_gene_somoscan_protein_gene
    .groupby(['PATHWAY_NAME', 'PATHWAY_ID', 'GENE'])['SOURCE']
    .agg(lambda labels: ';'.join(sorted(set(labels))))
    .reset_index()
)
print(all_gene_somoscan_protein_gene.shape[0])
all_gene_somoscan_protein_gene.head()

In [None]:
# Lift the row display limit (this setting persists for the session), then
# count distinct (pathway, SOURCE) combinations per SOURCE string.
pd.set_option('display.max_rows', None)
(
    all_gene_somoscan_protein_gene
    .drop_duplicates(subset=['PATHWAY_NAME', 'PATHWAY_ID', 'SOURCE'])['SOURCE']
    .value_counts(dropna=False)
)

# export

## rnaseq exp data

In [None]:
# Write the RNAseq expression table as tab-separated text without the index.
# NOTE(review): rnaseq_exp_norm_id is not assigned anywhere in the visible
# part of this notebook (the z-score step implied by the file name is not
# shown) — confirm where it is built before running this cell.
rnaseq_exp_norm_id.to_csv(
    'gene_mapping/ROSMAP.RNAseq.TPM5_log2norm.individualids.codinggenes.VEP_v113_genes.gene_symbol.zscore_norm.txt',
    sep='\t',
    index=None,
)

## gene scores data

In [None]:
# Write the gene-score table as tab-separated text without the index.
# NOTE(review): gene_score_norm_id is not assigned anywhere in the visible
# part of this notebook (the z-score step implied by the file name is not
# shown) — confirm where it is built before running this cell.
gene_score_norm_id.to_csv(
    'merged_outputs/AOU_ALL.UKBB.metasoft.ADSP.all.VEP_v113.gene_by_position.r2_0.1_clump_variants_excluded.RE_pval_threshold_0.05.gene_symbol.zscore_norm.average_gene_score.merged.txt',
    sep='\t',
    index=None,
)

## pathway maps

In [None]:
# Write the gene-score pathway mapping as tab-separated text, index omitted.
gs_reactome.to_csv(
    'reactome/AOU_ALL.UKBB.metasoft.ADSP.gene_score.reactome.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the ROSMAP RNAseq pathway mapping as tab-separated text, index omitted.
rs_reactome.to_csv(
    'reactome/ROSMAP.RNAseq.reactome.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the MSBB RNAseq pathway mapping as tab-separated text, index omitted.
msbb_rs_reactome.to_csv(
    'reactome/MSBB.RNAseq.reactome.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the ROSMAP methylation pathway mapping as tab-separated text, index
# omitted.
ma_reactome.to_csv(
    'reactome/ROSMAP.arrayMethylation_imputed.ref_gene.reactome.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the MSBB methylation pathway mapping as tab-separated text, index
# omitted.
msbb_ma_reactome.to_csv(
    'reactome/MSBB.methylation.reactome.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the SRM protein-to-pathway mapping as tab-separated text, index
# omitted.
# NOTE(review): srm_reactome_protein is not assigned anywhere in the visible
# part of this notebook — confirm where it is built before running this cell.
srm_reactome_protein.to_csv(
    'reactome/ROSMAP.proteomics.LC_SRM.round1.reactome.protein_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the SRM gene-to-pathway mapping as tab-separated text, index omitted.
# NOTE(review): srm_reactome_gene is not assigned anywhere in the visible part
# of this notebook — confirm where it is built before running this cell.
srm_reactome_gene.to_csv(
    'reactome/ROSMAP.proteomics.LC_SRM.round1.reactome.gene_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the TMT protein-to-pathway mapping as tab-separated text, index
# omitted.
# NOTE(review): tmt_reactome_protein is not assigned anywhere in the visible
# part of this notebook — confirm where it is built before running this cell.
tmt_reactome_protein.to_csv(
    'reactome/ROSMAP.proteomics.TMT.round1.reactome.protein_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the TMT gene-to-pathway mapping as tab-separated text, index omitted.
# NOTE(review): tmt_reactome_gene is not assigned anywhere in the visible part
# of this notebook — confirm where it is built before running this cell.
tmt_reactome_gene.to_csv(
    'reactome/ROSMAP.proteomics.TMT.round1.reactome.gene_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the MSBB TMT gene-to-pathway mapping as tab-separated text, index
# omitted.
msbb_tmt_reactome_gene.to_csv(
    'reactome/MSBB.proteomics.TMT.reactome.gene_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the somoscan gene-to-pathway mapping as tab-separated text, index
# omitted.
somoscan_reactome_gene.to_csv(
    'reactome/ROSMAP.proteomics.somoscan.reactome.gene_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the combined all-omics + SRM protein-to-pathway mapping as
# tab-separated text, index omitted.
# NOTE(review): all_gene_srm_protein is not assigned anywhere in the visible
# part of this notebook — confirm where it is built before running this cell.
all_gene_srm_protein.to_csv(
    'reactome/AD_KMI.ADSP.ROSMAP.all_omics.LC_SRM_proteomics_round1.reactome.protein_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the combined all-omics + SRM gene-to-pathway mapping as tab-separated
# text, index omitted.
# NOTE(review): all_gene_srm_protein_gene is not assigned anywhere in the
# visible part of this notebook — confirm where it is built before running.
all_gene_srm_protein_gene.to_csv(
    'reactome/AD_KMI.ADSP.ROSMAP.all_omics.LC_SRM_proteomics_round1.reactome.gene_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the combined all-omics + TMT protein-to-pathway mapping as
# tab-separated text, index omitted.
# NOTE(review): all_gene_tmt_protein is not assigned anywhere in the visible
# part of this notebook — confirm where it is built before running this cell.
all_gene_tmt_protein.to_csv(
    'reactome/AD_KMI.ADSP.ROSMAP.all_omics.TMT_proteomics_round1.reactome.protein_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the combined all-omics + TMT gene-to-pathway mapping as tab-separated
# text, index omitted.
# NOTE(review): all_gene_tmt_protein_gene is not assigned anywhere in the
# visible part of this notebook — confirm where it is built before running.
all_gene_tmt_protein_gene.to_csv(
    'reactome/AD_KMI.ADSP.ROSMAP.all_omics.TMT_proteomics_round1.reactome.gene_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the combined all-omics + somoscan protein-to-pathway mapping as
# tab-separated text, index omitted.
# NOTE(review): all_gene_somoscan_protein is not assigned anywhere in the
# visible part of this notebook — confirm where it is built before running.
all_gene_somoscan_protein.to_csv(
    'reactome/AD_KMI.ADSP.ROSMAP.all_omics.somoscan.reactome.protein_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)

In [None]:
# Write the combined gene-to-pathway mapping (all omics plus somoscan) as
# tab-separated text, index omitted.
all_gene_somoscan_protein_gene.to_csv(
    'reactome/AD_KMI.ADSP.ROSMAP.all_omics.somoscan.MSBB.all_omics.reactome.gene_to_pathway.pathway_mapping.txt',
    sep='\t',
    index=None,
)