# Layer-Specific Gene GO Enrichment

In [1]:
import joblib
import pandas as pd
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj

## Load marker gene lists

In [2]:
# Gene annotation table (tab-separated, gzip-compressed) indexed by its 'gene_id' column.
# Later cells look up the 'gene_name' column through this index.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id',
)

In [3]:
# Marker genes per layer: {layer label -> iterable of gene ids present in gene_meta's index}.
layer_markers = joblib.load('../layer_hypo_genes.obj')

# Build two lookups in a single pass over the markers:
#   layer_markers_names : layer label -> list of gene names (same order as the ids)
#   gene_id_base_to_name: gene id with the '.suffix' removed -> gene name
layer_markers_names = {}
gene_id_base_to_name = {}
for layer_label, marker_ids in layer_markers.items():
    marker_names = []
    for marker_id in marker_ids:
        marker_name = gene_meta.at[marker_id, 'gene_name']
        marker_names.append(marker_name)
        gene_id_base_to_name[marker_id.split('.')[0]] = marker_name
    layer_markers_names[layer_label] = marker_names

# Report how many marker genes each layer has.
for layer_label, marker_ids in layer_markers.items():
    print(layer_label, len(marker_ids))

IT-L5 355
IT-L23 448
IT-L4 222
IT-L6 308


In [4]:
# Mapping table read from the NCBI gene2ensembl dump. The column at position 2 becomes
# the index (presumably the Ensembl gene identifier -- verify against the file header)
# and 'GeneID' supplies the values.
ensembl_id_to_ncbi = (
    pd.read_csv(
        '/home/hanliu/ref/ncbi/gene2ensembl.mouse.tsv.gz',
        sep='\t',
        index_col=2,
    )['GeneID']
    .to_dict()
)

# Reverse lookup. When several keys share one GeneID, the last one encountered wins.
ncbi_to_ensembl_id = dict(zip(ensembl_id_to_ncbi.values(), ensembl_id_to_ncbi.keys()))

In [5]:
# Convert Ensembl ids to Entrez (NCBI) ids. Markers without a mapping are dropped;
# per the original note, those are usually low-confidence genes.
layer_markers_ncbi = {}
for layer_label, marker_ids in layer_markers.items():
    # Strip the '.suffix' once per id, then keep only ids that have a mapping.
    base_ids = (marker_id.split('.')[0] for marker_id in marker_ids)
    ncbi_ids = [ensembl_id_to_ncbi[base_id]
                for base_id in base_ids
                if base_id in ensembl_id_to_ncbi]
    layer_markers_ncbi[layer_label] = ncbi_ids
    print(layer_label, len(ncbi_ids))

IT-L5 310
IT-L23 390
IT-L4 196
IT-L6 274


## Setup GO

In [6]:
# Fetch (or reuse, if already present) the GO ontology and the NCBI gene2go associations.
obo_fname = download_go_basic_obo()
fin_gene2go = download_ncbi_associations()
# Load the ontology from the path returned by the download helper instead of
# repeating a hard-coded file name, so the two cannot drift apart.
obodag = GODag(obo_fname)

# Read NCBI's gene2go, restricted to taxid 10090. Store annotations in a list of namedtuples
objanno = Gene2GoReader(fin_gene2go, taxids=[10090])
# Get namespace2association where:
#    namespace is:
#        BP: biological_process
#        MF: molecular_function
#        CC: cellular_component
#    association is a dict:
#        key: NCBI GeneID
#        value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()

# Summarise how many genes carry at least one annotation in each namespace.
for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated mouse genes".format(NS=nspc, N=len(id2gos)))

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-01-01) 47,337 GO Terms
HMS:0:00:04.050274 367,364 annotations READ: gene2go 
1 taxids stored: 10090
MF 16,723 annotated mouse genes
CC 18,826 annotated mouse genes
BP 17,860 annotated mouse genes


In [7]:
# Enrichment-study object shared by every per-layer run below.
goeaobj = GOEnrichmentStudyNS(
    GeneID2nt_mus.keys(),    # background population: mouse protein-coding genes
    ns2assoc,                # geneid/GO associations, one dict per namespace
    obodag,                  # ontologies
    propagate_counts=False,
    alpha=0.05,              # default significance cut-off
    methods=['fdr_bh'],      # default multiple-test correction method
)


Load BP Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 60% 16,821 of 28,212 population items found in association

Load CC Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 64% 18,172 of 28,212 population items found in association

Load MF Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 58% 16,337 of 28,212 population items found in association


In [9]:
# Run GO enrichment per layer, write the significant terms to "<layer>.GO.tsv",
# plot them, then annotate the TSV with Ensembl ids and gene names.
# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
for layer, genes in layer_markers_ncbi.items():
    print(layer, len(genes))
    goea_results_all = goeaobj.run_study(genes)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
    if not goea_results_sig:
        # Nothing to write, plot or annotate for this layer. Skipping also keeps the
        # read-back below from failing on a missing TSV or silently re-annotating a
        # stale one from an earlier run (NOTE(review): wr_tsv appears not to create a
        # file for an empty result list -- confirm against the goatools version in use).
        print(f"{layer}: no significant GO terms, skipping")
        continue
    goeaobj.wr_tsv(f"{layer}.GO.tsv", goea_results_sig)
    plot_results(layer + ".GO{NS}.png", goea_results_sig)

    # NCBI id -> Ensembl base id, built from this layer's own markers. The study genes
    # were derived from exactly these markers, so every study item resolves to an id
    # that is also a key of gene_id_base_to_name. A globally inverted mapping can
    # instead return a different Ensembl id that shares the same NCBI id.
    ncbi_to_marker_id = {}
    for marker_id in layer_markers[layer]:
        base_id = marker_id.split('.')[0]
        if base_id in ensembl_id_to_ncbi:
            ncbi_to_marker_id[ensembl_id_to_ncbi[base_id]] = base_id

    # add more gene info
    # Force 'study_items' to str: if every row held a single id, pandas would infer a
    # numeric column and the .split() calls below would fail.
    sig_go_df = pd.read_csv(f"{layer}.GO.tsv", sep='\t', index_col=0,
                            dtype={'study_items': str})
    sig_go_df['study_items'] = sig_go_df['study_items'].fillna('')
    sig_go_df['gene_ids'] = sig_go_df['study_items'].apply(lambda i: ','.join(
        [ncbi_to_marker_id[int(g)] for g in i.split(',') if g.strip() != '']))
    sig_go_df['gene_names'] = sig_go_df['gene_ids'].apply(lambda i: ','.join(
        [gene_id_base_to_name[g] for g in i.split(',') if g != '']))
    sig_go_df.to_csv(f"{layer}.GO.tsv", sep='\t')

IT-L5 310

Run BP Gene Ontology Analysis: current study set of 310 IDs ...
 91%    268 of    294 study items found in association
 95%    294 of    310 study items found in population(28212)
Calculating 12,254 uncorrected p-values using fisher_scipy_stats
  12,254 GO terms are associated with 16,821 of 28,212 population items
   1,445 GO terms are associated with    268 of    310 study items
  METHOD fdr_bh:
      13 GO terms found significant (< 0.05=alpha) ( 13 enriched +   0 purified): statsmodels fdr_bh
     112 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)

Run CC Gene Ontology Analysis: current study set of 310 IDs ...
 94%    276 of    294 study items found in association
 95%    294 of    310 study items found in population(28212)
Calculating 1,724 uncorrected p-values using fisher_scipy_stats
   1,724 GO terms are associated with 18,172 of 28,212 population items
     295 GO terms are associated wit

   38 usr  74 GOs  WROTE: IT-L6.GOCC.png
    6 usr  19 GOs  WROTE: IT-L6.GOMF.png
