In [3]:
import loompy
import numpy as np
import logging
import os
import matplotlib.pyplot as plt
from cytograph.preprocessing import Normalizer
from cytograph.enrichment.binary_differential_expression import BinaryDifferentialExpression
from diffxpy.api.test import pairwise
from goatools import obo_parser
import Bio

import logging
logger = logging.getLogger()
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%H:%M:%S')

import pickle as pkl

In [4]:
f_dir = '/proj/GBM/FRESH_20210607'
f = os.path.join(f_dir, 'data' , 'GBM_Tumor.loom')
f_agg = os.path.join(f_dir, 'data' , 'GBM_Tumor.agg.loom')

In [15]:
import sys
from Bio import Entrez

# *Always* tell NCBI who you are
Entrez.email = "john.doe@mail.com"
 
def retrieve_annotation(id_list):
 
    """Annotates Entrez Gene IDs using Bio.Entrez, in particular epost (to
    submit the data to NCBI) and esummary to retrieve the information. 
    Returns a list of dictionaries with the annotations."""
 
    # This below tests for search by gene symbol
    request = Entrez.epost("gene",id=",".join(id_list))
    try:
        result = Entrez.read(request)
    except RuntimeError as e:
        #FIXME: How generate NAs instead of causing an error with invalid IDs?
        print("An error occurred while retrieving the annotations.")
        print("The error returned was %s" % e)
        sys.exit(-1)
 
    webEnv = result["WebEnv"]
    queryKey = result["QueryKey"]
    data = Entrez.esummary(db="gene", webenv=webEnv, query_key =
            queryKey)
    annotations = Entrez.read(data)
 
    print("Retrieved %d annotations for %d genes" % (len(annotations), len(id_list)))
    return annotations

with loompy.connect(f) as ds:
    #Search for Gene ID, then find annotation
    id_list = []
    for x in ds.ra.Gene:
        sterm = x + '[sym] "Homo Sapiens"[orgn]'
        handle = Entrez.esearch(db="gene", retmode = "xml", term = x )
        record = Entrez.read(handle)
        if len(record) > 0:
            IDArray = record["IdList"]
            toString = str(IDArray[0])
            id_list.append(toString)

    annotation = retrieve_annotation(id_list)


.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.


IndexError: list index out of range

In [13]:
?Entrez.esearch

[0;31mSignature:[0m [0mEntrez[0m[0;34m.[0m[0mesearch[0m[0;34m([0m[0mdb[0m[0;34m,[0m [0mterm[0m[0;34m,[0m [0;34m**[0m[0mkeywds[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Run an Entrez search and return a handle to the results.

ESearch searches and retrieves primary IDs (for use in EFetch, ELink
and ESummary) and term translations, and optionally retains results
for future use in the user's environment.

See the online documentation for an explanation of the parameters:
http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch

Return a handle to the results which are always in XML format.

Raises an IOError exception if there's a network error.

Short example:

>>> from Bio import Entrez
>>> Entrez.email = "Your.Name.Here@example.org"
>>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD", idtype="acc")
>>> record = Entrez.read(handle)
>>> handle.close()
>>> int(record["Count"]) >= 2
True
>>> "EF590893.1" in record

In [7]:
with loompy.connect(f) as ds:
    print(ds.ra.Gene[:5])
    print(ds.ra.Accession[:5])
#     with loompy.connect(f_agg) as dsagg:        
#         Tumor_clusters = np.unique(ds.ca.Clusters[np.where(ds.ca.Subset=='Tumor')[0]])
#         print(dsagg.ca.AutoAnnotation[Tumor_clusters])

['CDH19' 'KCNK10' 'SYT6' 'SCN9A' 'U91319.1']
['ENSG00000071991' 'ENSG00000100433' 'ENSG00000134207' 'ENSG00000169432'
 'ENSG00000262801']


In [6]:
# with loompy.connect(f) as ds:
#     Tumor_clusters = set(np.unique(ds.ca.Clusters[np.where(ds.ca.Subset=='Tumor')[0]]))
#     labels = np.array(['Tumor' if x in Tumor_clusters else 'Control' for x in ds.ca.Clusters])
    
#     BDE = BinaryDifferentialExpression()
#     logging.info(f'Fitting data')
#     BDE.fit(ds, labels)
#     logging.info(f'Calculate differential expression')
#     selected = BDE.select('Tumor', 'Control')
    
#     with open('/proj/GBM/FRESH_20210607/misc/differntial.pkl', "wb") as output_file:
#         pkl.dump([selected, BDE], output_file)

In [8]:
obo_path = '/datb/sl/camiel/tumor/ref/go-basic.obo'

go = obo_parser.GODag(obo_path)

/datb/sl/camiel/tumor/ref/go-basic.obo: fmt(1.2) rel(2021-06-16) 47,230 GO Terms


In [18]:
relevant_terms = []
key_word = 'cell surface'

for go_term in go:
    if key_word in go[go_term].name:
        relevant_terms.append(go_term)
        
logging.info(f'{len(relevant_terms)} relevant terms out of {len(go.keys())}')

17:03:19 INFO     32 relevant terms out of 47230


In [21]:
for i in relevant_terms:
    print(f'{i} {go[i].name}')

GO:0002220 innate immune response activating cell surface receptor signaling pathway
GO:0002429 immune response-activating cell surface receptor signaling pathway
GO:0002433 immune response-regulating cell surface receptor signaling pathway involved in phagocytosis
GO:0002752 cell surface pattern recognition receptor signaling pathway
GO:0002767 immune response-inhibiting cell surface receptor signaling pathway
GO:0002768 immune response-regulating cell surface receptor signaling pathway
GO:0007166 cell surface receptor signaling pathway
GO:0009930 longitudinal side of cell surface
GO:0009986 cell surface
GO:0020030 infected host cell surface knob
GO:0033575 protein glycosylation at cell surface
GO:0033580 protein galactosylation at cell surface
GO:0033626 positive regulation of integrin activation by cell surface receptor linked signal transduction
GO:0034394 protein localization to cell surface
GO:0038184 cell surface bile acid receptor signaling pathway
GO:0044228 host cell surface


In [24]:
go['GO:0009928']

GOTerm('GO:0009986'):
  id:GO:0009986
  item_id:GO:0009986
  name:cell surface
  namespace:cellular_component
  _parents: 1 items
    GO:0110165
  parents: 1 items
    GO:0110165	level-01	depth-01	cellular anatomical entity [cellular_component]
  children: 0 items
  level:2
  depth:2
  is_obsolete:False
  alt_ids: 2 items
    GO:0009929
    GO:0009928

In [25]:
go

{'GO:0000001': GOTerm('GO:0000001'):
   id:GO:0000001
   item_id:GO:0000001
   name:mitochondrion inheritance
   namespace:biological_process
   _parents: 2 items
     GO:0048308
     GO:0048311
   parents: 2 items
     GO:0048308	level-05	depth-05	organelle inheritance [biological_process]
     GO:0048311	level-05	depth-06	mitochondrion distribution [biological_process]
   children: 0 items
   level:6
   depth:7
   is_obsolete:False
   alt_ids: 0 items,
 'GO:0000002': GOTerm('GO:0000002'):
   id:GO:0000002
   item_id:GO:0000002
   name:mitochondrial genome maintenance
   namespace:biological_process
   _parents: 1 items
     GO:0007005
   parents: 1 items
     GO:0007005	level-05	depth-05	mitochondrion organization [biological_process]
   children: 0 items
   level:6
   depth:6
   is_obsolete:False
   alt_ids: 0 items,
 'GO:0000003': GOTerm('GO:0000003'):
   id:GO:0000003
   item_id:GO:0000003
   name:reproduction
   namespace:biological_process
   _parents: 1 items
     GO:0008150
  