# Layer-Specific Gene GO Enrichment

In [1]:
import joblib
import pandas as pd
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj

## Load marker gene lists

In [2]:
# Gene annotation table, indexed by its 'gene_id' column.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id',
)

# Map the part of each gene_id before the first '.' to the gene name.
gene_id_base_to_name = {}
for full_id, symbol in gene_meta['gene_name'].items():
    gene_id_base_to_name[full_id.split('.')[0]] = symbol

In [3]:
def _read_gene_id_index(path):
    """Read a header-less file and return its first column as a pandas Index."""
    return pd.read_csv(path, header=None, index_col=0).index


# Gene ID lists for the two marker groups.
dec_genes = _read_gene_id_index('../plot/Decreasing_gene_ids.txt')
inc_genes = _read_gene_id_index('../plot/Increasing_gene_ids.txt')

In [4]:
# Forward map: index taken from the third column of the table (index_col=2;
# presumably the Ensembl gene ID - confirm against the file header) -> 'GeneID'.
ensembl_id_to_ncbi = pd.read_csv(
    '/home/hanliu/ref/ncbi/gene2ensembl.mouse.tsv.gz',
    sep='\t',
    index_col=2,
)['GeneID'].to_dict()

# Reverse map: GeneID -> key of the forward map. When several keys share a
# GeneID, the one that appears last in the forward map wins.
ncbi_to_ensembl_id = dict(zip(ensembl_id_to_ncbi.values(), ensembl_id_to_ncbi.keys()))

In [5]:
def _ensembl_to_entrez(gene_ids):
    """Map an Index of gene IDs to integer NCBI GeneIDs, dropping unmapped ones.

    Each ID is truncated at its first '.' before being looked up in
    ``ensembl_id_to_ncbi``; IDs with no entry are removed.
    """
    mapped = gene_ids.map(lambda gid: ensembl_id_to_ncbi.get(gid.split('.')[0]))
    return mapped.dropna().astype(int)


dec_entrez = _ensembl_to_entrez(dec_genes)
inc_entrez = _ensembl_to_entrez(inc_genes)

## Setup GO

In [6]:
# Fetch the GO ontology and the NCBI gene2go association file.
obo_fname = download_go_basic_obo()
fin_gene2go = download_ncbi_associations()
# Parse the ontology from the path the downloader returned instead of a
# hard-coded file name, so the two cannot drift apart.
obodag = GODag(obo_fname)

# Read NCBI's gene2go. Store annotations in a list of namedtuples
objanno = Gene2GoReader(fin_gene2go, taxids=[10090])
# Get namespace2association where:
#    namespace is:
#        BP: biological_process
#        MF: molecular_function
#        CC: cellular_component
#    association is a dict:
#        key: NCBI GeneID
#        value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()

# Report how many genes carry annotations in each namespace.
for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated mouse genes".format(NS=nspc, N=len(id2gos)))

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-01-01) 47,337 GO Terms
HMS:0:01:28.658138 367,364 annotations READ: gene2go 
1 taxids stored: 10090
CC 18,826 annotated mouse genes
BP 17,860 annotated mouse genes
MF 16,723 annotated mouse genes


In [7]:
def _passes_length_filter(nt):
    """Return True when a gene record spans at least 5000 bp.

    The span is end minus start on the genomic accession. Records whose
    coordinates cannot be subtracted (TypeError) are rejected.
    """
    try:
        span = nt.end_position_on_the_genomic_accession - nt.start_position_on_the_genomic_accession
    except TypeError:
        return False
    return not span < 5000


# Background gene set with short genes removed: per Jingtian, short genes
# make false positives look more significant.
bg_genes = [gene_id for gene_id, nt in GeneID2nt_mus.items() if _passes_length_filter(nt)]


In [8]:
# Enrichment study object, one analysis per GO namespace.
#   bg_genes  - background population (the length-filtered gene IDs built above)
#   ns2assoc  - gene ID -> GO term associations, keyed by namespace
#   obodag    - the parsed GO ontology
goeaobj = GOEnrichmentStudyNS(
    bg_genes,
    ns2assoc,
    obodag,
    propagate_counts=False,
    alpha=0.05,           # significance cut-off
    methods=['fdr_bh'],   # multiple-testing correction method
)


Load BP Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 76% 13,391 of 17,572 population items found in association

Load CC Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 82% 14,415 of 17,572 population items found in association

Load MF Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 74% 13,003 of 17,572 population items found in association


In [9]:
# Run the enrichment for one gene list, keep the terms passing FDR < 0.05,
# write them to "<gene_type>.GO.tsv" and plot them, then re-read the table and
# add Ensembl IDs and gene names for the study genes of each term.
#
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.

gene_type = 'decrease'
genes = dec_entrez

goea_results_all = goeaobj.run_study(list(genes))
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
goeaobj.wr_tsv(f"{gene_type}.GO.tsv", goea_results_sig)
# "{NS}" is deliberately left as a literal placeholder in the file-name pattern.
plot_results(gene_type + ".GO{NS}.png", goea_results_sig)


def _split_ids(cell):
    """Split a comma-separated cell into stripped, non-empty ID strings."""
    return [tok.strip() for tok in cell.split(',') if tok.strip()]


# add more gene info
# Read study_items as text so a column holding only single numeric IDs is not
# parsed as integers (which would break the string split below).
sig_go_df = pd.read_csv(f"{gene_type}.GO.tsv", sep='\t', index_col=0,
                        dtype={'study_items': str})
sig_go_df['study_items'] = sig_go_df['study_items'].fillna('')
# Use .get with the raw ID as fallback: the reverse NCBI -> Ensembl map may
# yield an ID that is missing from the gene-name table, and a direct lookup
# would abort the whole cell with a KeyError. The fallback keeps the three
# columns aligned position by position.
sig_go_df['gene_ids'] = sig_go_df['study_items'].apply(
    lambda cell: ','.join(ncbi_to_ensembl_id.get(int(g), g) for g in _split_ids(cell)))
sig_go_df['gene_names'] = sig_go_df['gene_ids'].apply(
    lambda cell: ','.join(gene_id_base_to_name.get(g, g) for g in _split_ids(cell)))
sig_go_df.to_csv(f"{gene_type}.GO.tsv", sep='\t')


Run BP Gene Ontology Analysis: current study set of 104 IDs ...
 92%     90 of     98 study items found in association
 94%     98 of    104 study items found in population(17572)
Calculating 11,804 uncorrected p-values using fisher_scipy_stats
  11,804 GO terms are associated with 13,391 of 17,572 population items
     766 GO terms are associated with     90 of    104 study items
  METHOD fdr_bh:
       0 GO terms found significant (< 0.05=alpha) (  0 enriched +   0 purified): statsmodels fdr_bh
       0 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)

Run CC Gene Ontology Analysis: current study set of 104 IDs ...
 95%     93 of     98 study items found in association
 94%     98 of    104 study items found in population(17572)
Calculating 1,695 uncorrected p-values using fisher_scipy_stats
   1,695 GO terms are associated with 14,415 of 17,572 population items
     220 GO terms are associated with     93 o

In [10]:
# Display the significant GO terms with the added gene_ids / gene_names columns.
sig_go_df

Unnamed: 0_level_0,NS,enrichment,name,ratio_in_study,ratio_in_pop,p_uncorrected,depth,study_count,p_fdr_bh,study_items,gene_ids,gene_names
# GO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GO:0005886,CC,e,plasma membrane,46/98,3492/17572,1.685373e-09,3,46,3e-06,"11732, 11931, 12387, 12561, 13429, 14339, 1458...","ENSMUSG00000022265,ENSMUSG00000026576,ENSMUSG0...","Ank,Atp1b1,Ctnnb1,Cdh4,Dnm1,Aktip,Gfra1,Kcnj4,..."
GO:0045202,CC,e,synapse,18/98,627/17572,9.893095e-09,2,18,8e-06,"12217, 12387, 13429, 16520, 17755, 18011, 1895...","ENSMUSG00000032589,ENSMUSG00000006932,ENSMUSG0...","Bsn,Ctnnb1,Dnm1,Kcnj4,Map1b,Neurl1a,Sept5,Tiam..."
GO:0014069,CC,e,postsynaptic density,11/98,296/17572,7.85985e-07,4,11,0.000444,"12217, 13429, 17755, 18011, 18195, 21844, 5472...","ENSMUSG00000032589,ENSMUSG00000026825,ENSMUSG0...","Bsn,Dnm1,Map1b,Neurl1a,Nsf,Tiam1,Cadm1,Shank3,..."
GO:0098685,CC,e,Schaffer collateral - CA1 synapse,7/98,105/17572,1.968793e-06,3,7,0.000823,"12217, 12387, 17919, 54725, 70530, 109934, 432530","ENSMUSG00000032589,ENSMUSG00000006932,ENSMUSG0...","Bsn,Ctnnb1,Myo5b,Cadm1,Lrfn2,Abr,Adcy1"
GO:0098978,CC,e,glutamatergic synapse,13/98,477/17572,2.426402e-06,3,13,0.000823,"12217, 13429, 17919, 18754, 21844, 54725, 5617...","ENSMUSG00000032589,ENSMUSG00000026825,ENSMUSG0...","Bsn,Dnm1,Myo5b,Prkce,Tiam1,Cadm1,Olfm1,Cacna1h..."
GO:0030054,CC,e,cell junction,16/98,773/17572,5.458115e-06,2,16,0.001542,"12217, 12387, 16520, 17755, 18011, 21844, 5472...","ENSMUSG00000032589,ENSMUSG00000006932,ENSMUSG0...","Bsn,Ctnnb1,Kcnj4,Map1b,Neurl1a,Tiam1,Cadm1,Olf..."
GO:0043196,CC,e,varicosity,3/98,12/17572,3.568069e-05,2,3,0.00864,"13429, 17755, 54725","ENSMUSG00000026825,ENSMUSG00000052727,ENSMUSG0...","Dnm1,Map1b,Cadm1"
GO:0045211,CC,e,postsynaptic membrane,8/98,254/17572,8.754984e-05,5,8,0.01855,"12387, 13429, 16520, 18011, 58234, 70530, 8190...","ENSMUSG00000006932,ENSMUSG00000026825,ENSMUSG0...","Ctnnb1,Dnm1,Kcnj4,Neurl1a,Shank3,Lrfn2,Tmem108..."
GO:0036477,CC,e,somatodendritic compartment,3/98,21/17572,0.000207982,2,3,0.03917,"17755, 21844, 81907","ENSMUSG00000052727,ENSMUSG00000002489,ENSMUSG0...","Map1b,Tiam1,Tmem108"
GO:0043025,CC,e,neuronal cell body,11/98,558/17572,0.0002764749,3,11,0.046862,"12217, 14585, 16520, 17196, 17755, 17919, 2184...","ENSMUSG00000032589,ENSMUSG00000025089,ENSMUSG0...","Bsn,Gfra1,Kcnj4,Mbp,Map1b,Myo5b,Tiam1,Olfm1,Se..."
