# Process Genedex 2024 version

---

This is an example notebook that generates the standard pathway table expected by the first step of the pipeline, using gene lists from Genedex.

#### Table format

The standard table must have exactly two columns, named `set_name` and `genes`, holding the gene set (or pathway) name and the Entrez gene ID, respectively. The table is in long format: if a gene set has 10 genes in it, there should be 10 rows for that gene set, all with the same gene set name. The genes MUST be represented by their Entrez IDs.



In [None]:
suppressPackageStartupMessages({
    library(dplyr)
    library(plyr)
    library(biomaRt)
    library(data.table)
})
options(stringsAsFactors=FALSE)

### Input settings

In [None]:
# Directory holding the unprocessed gene sets (every file in it is read)
indir <- "~/projects/pdxn_2.0/data/gene_sets/genedex_2024/gene_lists"

# Destination of the standard pathway table
outfile <- "../../input/std_gene_tables/genedex_pathway_table.csv"

### Load raw data

In [None]:
# Load gene lists: every file in `indir` is one gene set, named after the file
gsfiles <- list.files(indir, full.names = TRUE)
if (length(gsfiles) == 0) {
  # Fail here rather than later with an empty table and an obscure error
  stop("No gene list files found in ", indir, call. = FALSE)
}
# Anchored pattern: only a trailing ".txt" extension is removed from the name
names(gsfiles) <- sub("\\.txt$", "", basename(gsfiles))

gs <- lapply(names(gsfiles), function(name) {
  df <- fread(gsfiles[[name]], header = TRUE, blank.lines.skip = TRUE, fill = TRUE)
  # The identifier mapping further down relies on this column
  if (!"GENE_SYMBOL" %in% names(df)) {
    stop("Column GENE_SYMBOL not found in ", gsfiles[[name]], call. = FALSE)
  }
  # Tag each row with its gene set. data.table::set() is used instead of a bare
  # mutate(), which is plyr::mutate here because plyr is attached after dplyr;
  # set() also cannot confuse `name` with a column of the same name.
  data.table::set(df, j = "set_name", value = name)
  df
})
# Stack all gene sets, matching columns by name
gs_df <- rbindlist(gs, use.names = TRUE) %>% as.data.frame()
message('Total number of pathways in custom set = ', length(unique(gs_df$set_name)))

### Processing

In [None]:
# Map gene symbols to Entrez IDs through Ensembl BioMart
allgenes <- unique(gs_df$GENE_SYMBOL)
# Missing or blank symbols cannot be mapped; keep them out of the query and
# out of the "out of N" total reported below
allgenes <- allgenes[!is.na(allgenes) & nzchar(allgenes)]
hsmart <- useMart(dataset = "hsapiens_gene_ensembl", biomart = "ensembl")
mapping <- getBM(
  values = allgenes,
  attributes = c("hgnc_symbol", "entrezgene_id"),
  filters = "hgnc_symbol",
  mart = hsmart
) %>%
  dplyr::rename("GENE_SYMBOL" = "hgnc_symbol",
                "genes" = "entrezgene_id") %>%
  # Rows without an Entrez ID carry no mapping; dropping them keeps NA from
  # being treated as one more ID in the grouping and in the count
  dplyr::filter(!is.na(genes)) %>%
  group_by(genes) %>%
  # When several symbols share one Entrez ID, keep the alphabetically first
  slice_min(n = 1, order_by = GENE_SYMBOL) %>%
  # Return an ungrouped table so later steps are not affected by the grouping
  ungroup()
# Count symbols (not Entrez IDs) so the number matches what the message says
message('Uniquely mapped ', length(unique(mapping$GENE_SYMBOL)), ' out of ', length(allgenes), ' gene symbols')

In [None]:
# Convert identifiers: replace each gene symbol with its Entrez ID
gs_df <- gs_df %>%
  left_join(mapping, relationship = "many-to-many", by = join_by(GENE_SYMBOL)) %>%
  dplyr::select(set_name, genes) %>%
  # Symbols with no match come out of the left join with genes = NA. The
  # standard table may only contain Entrez IDs, so those rows are removed.
  dplyr::filter(!is.na(genes)) %>%
  distinct()

### Visualize standard table

In [None]:
# Show the first rows of the table
head(gs_df)

In [None]:
# Show the last rows of the table
tail(gs_df)

In [None]:
# Number of rows in the table
nrow(gs_df)

In [None]:
# Summary counts for the table
message('Total number of pathways = ', length(unique(gs_df$set_name)))
# NA is a missing Entrez ID, not a gene, so it is excluded from the gene count
message('Total number of unique genes = ', length(unique(gs_df$genes[!is.na(gs_df$genes)])))

### Write pathway table

In [None]:
# Write the table as CSV. fwrite quotes a field only when it contains the
# separator, a double quote or a line break, so ordinary rows stay unquoted
# while a set name with a comma no longer shifts the columns.
# na = "NA" keeps missing values written as NA.
data.table::fwrite(gs_df,
                   file = outfile,
                   sep = ",",
                   quote = "auto",
                   na = "NA",
                   row.names = FALSE)