# How to index GO-CAMs with LinkML-Store





In [1]:
import pandas as pd
import yaml

# Relative path to the multi-document YAML file of GO-CAM models loaded below
path = "input/gocam-models.yaml"

In [2]:
# Parse every YAML document in the file into a list of models.
# The context manager closes the file handle as soon as parsing is done, and
# the explicit encoding avoids depending on the platform's default locale.
with open(path, encoding="utf-8") as models_file:
    # safe_load_all is lazy, so materialise the list while the file is still open
    models = list(yaml.safe_load_all(models_file))

## Creating a client and attaching to a database

First we will create a client as normal:

In [3]:
from linkml_store import Client

# Instantiate a linkml-store client; a database is attached to it in the next step
client = Client()

Next we'll attach to a MongoDB instance. This assumes you already have one running locally on the default port (27017).

We will make a database called "gocams" and recreate it if it already exists.

(note for people running this notebook locally - if you happen to have a database with this name in your current mongo instance it will be deleted!)

In [4]:
# Attach the local MongoDB database under the alias "gocams", recreating it if present
db = client.attach_database(
    "mongodb://localhost:27017/gocams",
    "gocams",
    recreate_if_exists=True,
)

## Creating a collection

We'll create a simple test collection. The concept of collection in linkml-store maps directly to mongodb collections

In [5]:
# Create the "main" collection, recreating it if it already exists so the notebook can be rerun
collection = db.create_collection("main", recreate_if_exists=True)

## Inserting objects into the store

We'll use the standard `insert` method to insert the GO-CAMs into the collection. At this stage there is no explicit schema.

In [6]:
# Bulk-insert all parsed models; no schema has been attached at this point
collection.insert(models)

## Check contents

We can check the number of rows in the collection, to ensure everything was inserted correctly:

In [7]:
# limit=1 keeps the fetched page small; num_rows still reports the total match count
collection.find({}, limit=1).num_rows

793

In [8]:
# Every model parsed from the YAML file should now be present in the collection
assert collection.find({}, limit=1).num_rows == len(models)

In [9]:
# Exact-match filter on the top-level `taxon` field, capped at three rows
qr = collection.find(
    {"taxon": "NCBITaxon:6239"},
    limit=3,
)
qr.rows_dataframe

Unnamed: 0,id,title,taxon,status,comments,activities,objects
0,gomodel:568b0f9600000284,Antibacterial innate immune response in the in...,NCBITaxon:6239,production,[Automated change 2023-03-16: RO:0002212 repla...,[{'id': 'gomodel:568b0f9600000284/57ec3a7e0000...,"[{'id': 'WB:WBGene00006599', 'label': 'tpa-1 C..."
1,gomodel:5b528b1100000489,XBP-1 is a cell-nonautonomous regulator of str...,NCBITaxon:6239,production,[Automated change 2023-03-16: RO:0002213 repla...,[{'id': 'gomodel:5b528b1100000489/5b528b110000...,"[{'id': 'WB:WBGene00006959', 'label': 'xbp-1 C..."
2,gomodel:5b91dbd100002057,Antifungal innate immune response in the hypod...,NCBITaxon:6239,production,,[{'id': 'gomodel:5b91dbd100002057/5b91dbd10000...,"[{'id': 'WB:WBGene00010700', 'label': 'nipi-3 ..."


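Let's view the results as a pandas DataFrame to make sure they look as expected; this time we'll query on a nested field using path notation, finding models that have an activity enabled by a specific gene (`WB:WBGene00006575`):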

In [10]:
# Dotted path reaches into the nested `activities` entries of each model
qr = collection.find(
    {"activities.enabled_by": "WB:WBGene00006575"},
    limit=3,
)
qr.rows_dataframe

Unnamed: 0,id,title,taxon,status,comments,activities,objects
0,gomodel:568b0f9600000284,Antibacterial innate immune response in the in...,NCBITaxon:6239,production,[Automated change 2023-03-16: RO:0002212 repla...,[{'id': 'gomodel:568b0f9600000284/57ec3a7e0000...,"[{'id': 'WB:WBGene00006599', 'label': 'tpa-1 C..."
1,gomodel:5b91dbd100002057,Antifungal innate immune response in the hypod...,NCBITaxon:6239,production,,[{'id': 'gomodel:5b91dbd100002057/5b91dbd10000...,"[{'id': 'WB:WBGene00010700', 'label': 'nipi-3 ..."


The query returns two rows: the models that have an activity enabled by `WB:WBGene00006575`.

## Query faceting

We will now demonstrate faceted queries, allowing us to count the number of instances of different categorical values or categorical value combinations.

First we'll facet on the taxon. Nested fields can also be faceted using path notation, e.g. `activities.molecular_function.term`:

In [11]:
# An empty query ({}) matches every model; count them per taxon
collection.query_facets({}, facet_columns=["taxon"])

{'taxon': [('NCBITaxon:9606', 541),
  ('NCBITaxon:10090', 185),
  ('NCBITaxon:4896', 15),
  ('NCBITaxon:7955', 14),
  ('NCBITaxon:7227', 13),
  ('NCBITaxon:559292', 6),
  ('NCBITaxon:9823', 4),
  ('NCBITaxon:6239', 4),
  ('NCBITaxon:5074', 1),
  ('NCBITaxon:1735992', 1),
  ('NCBITaxon:229533', 1),
  ('NCBITaxon:1403190', 1),
  ('NCBITaxon:8355', 1),
  ('NCBITaxon:425011', 1),
  ('NCBITaxon:28576', 1),
  ('NCBITaxon:602072', 1),
  ('NCBITaxon:8364', 1),
  ('NCBITaxon:227321', 1),
  ('NCBITaxon:99287', 1)]}

In [12]:
# A tuple entry in facet_columns counts combinations of values across those columns
collection.query_facets({}, facet_columns=[("taxon", "status")])

{('taxon',
  'status'): [({'taxon': 'NCBITaxon:9606', 'status': 'production'},
   541), ({'taxon': 'NCBITaxon:10090',
    'status': 'production'}, 185), ({'taxon': 'NCBITaxon:4896', 'status': 'production'},
   15), ({'taxon': 'NCBITaxon:7955', 'status': 'production'},
   14), ({'taxon': 'NCBITaxon:7227',
    'status': 'production'}, 13), ({'taxon': 'NCBITaxon:559292', 'status': 'production'},
   6), ({'taxon': 'NCBITaxon:6239', 'status': 'production'},
   4), ({'taxon': 'NCBITaxon:9823',
    'status': 'production'}, 4), ({'taxon': 'NCBITaxon:227321', 'status': 'production'},
   1), ({'taxon': 'NCBITaxon:8364', 'status': 'production'},
   1), ({'taxon': 'NCBITaxon:1735992',
    'status': 'production'}, 1), ({'taxon': 'NCBITaxon:8355', 'status': 'production'},
   1), ({'taxon': 'NCBITaxon:602072', 'status': 'production'},
   1), ({'taxon': 'NCBITaxon:229533',
    'status': 'production'}, 1), ({'taxon': 'NCBITaxon:1403190', 'status': 'production'},
   1), ({'taxon': 'NCBITaxon:425011', 's

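We can also facet by the molecular function term of each activity, using path notation. We'll restrict this to the top 20. Note that each facet value in the output is the full list of terms for a model.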

In [13]:
# Facet on a nested path, keeping only the top 20 values
collection.query_facets(
    {},
    facet_columns=["activities.molecular_function.term"],
    facet_limit=20,
)


{'activities.molecular_function.term': [(['GO:0004930',
    'GO:0019706',
    'GO:0004674'],
   2),
  (['GO:0048018', 'GO:0038023', 'GO:0004252', 'GO:0004252'], 2),
  (['GO:0008083', 'GO:0016167', 'GO:0004714'], 2),
  (['GO:0004714', 'GO:0004713', 'GO:0048018'], 2),
  (['GO:0005179', 'GO:0016500', 'GO:0005125'], 2),
  (['GO:0061630', 'GO:0004879', 'GO:0030374'], 2),
  (['GO:0140311', 'GO:0003700', 'GO:0140311', 'GO:1990756'], 1),
  (['GO:0004674', 'GO:0061665', 'GO:1990931', 'GO:0070139', 'GO:0004674'], 1),
  (['GO:0060090',
    'GO:0004674',
    'GO:0060090',
    'GO:0060090',
    'GO:0061630',
    'GO:0060090',
    'GO:0061630'],
   1),
  (['GO:0061630', 'GO:0170011', 'GO:0061630', 'GO:0061630'], 1),
  (['GO:0140463', 'GO:0140463', 'GO:0004674', 'GO:0004674', 'GO:0003887'], 1),
  (['GO:0038023', 'GO:0060090', 'GO:0060090', 'GO:0140311'], 1),
  (['GO:0003846', 'GO:0004144', 'GO:0005488'], 1),
  (['GO:0019706', 'GO:0140693', 'GO:0003690'], 1),
  (['GO:0004810', 'GO:0004521', 'GO:000454

## Semantic Search

We will index GO-CAMs using a template that extracts key elements

First we will create a textualization template for a GO-CAM. We will keep it minimal for simplicity - this doesn't include activities, comments, etc.

In [14]:
# Jinja2 template: renders a model's id, title, taxon and status, followed by
# the id and quoted label of every entry in its `objects` list
template = """
id: {{id}}
title: {{title}}
taxon: {{taxon}}
status: {{status}}
objects: {% for o in objects %} {{o.id}} "{{o.label}}"; {% endfor %}
"""

Next we will create an indexer using the template. This will use the Jinja2 syntax for templating.
We will also cache LLM embedding queries, so if we want to incrementally add new GO-CAMs we can avoid re-running the LLM embeddings calls.

In [15]:
from linkml_store.index.implementations.llm_indexer import LLMIndexer

# Indexer that turns each object into text via the Jinja2 template;
# embeddings are cached in a database file so reruns can reuse them
index = LLMIndexer(
    name="gocam",
    text_template=template,
    text_template_syntax="jinja2",
    cached_embeddings_database="tmp/llm_gocam_cache.db",
)

We can test the template on the first row of the previous query result (`qr` still holds the `activities.enabled_by` query from above):

In [16]:
# `qr` still holds the activities.enabled_by query result; render its first row as text
print(index.object_to_text(qr.rows[0]))


id: gomodel:568b0f9600000284
title: Antibacterial innate immune response in the intestine via MAPK cascade (C. elegans)
taxon: NCBITaxon:6239
status: production
objects:  WB:WBGene00006599 "tpa-1 Cele";  GO:0004674 "protein serine/threonine kinase activity";  GO:0002225 "positive regulation of antimicrobial peptide production";  ECO:0000501 "evidence used in automatic assertion";  ECO:0000315 "mutant phenotype evidence used in manual assertion";  ECO:0000318 "biological aspect of ancestor evidence used in manual assertion";  WB:WBGene00006923 "vhp-1 Cele";  GO:0017017 "MAP kinase tyrosine/serine/threonine phosphatase activity";  GO:1900425 "negative regulation of defense response to bacterium";  ECO:0000314 "direct assay evidence used in manual assertion";  ECO:0000316 "genetic interaction evidence used in manual assertion";  WB:WBGene00002187 "kgb-1 Cele";  GO:0005515 "protein binding";  GO:1900181 "negative regulation of protein localization to nucleus";  ECO:0000353 "physical inter

That looks as expected. We can now attach the indexer to the collection and index the collection:

In [17]:
# Register the indexer on the collection; auto_index=True presumably indexes the
# existing rows immediately — confirm against the linkml-store docs
collection.attach_indexer(index, auto_index=True)

## Running semantic search queries

Let's query based on text criteria:

In [18]:
# Rank the collection against a free-text query and show the five best hits
qr = collection.search("pathways involving cell death")
qr.rows_dataframe[:5]

Unnamed: 0,score,id,title,taxon,status,activities,objects,comments
0,0.821816,gomodel:64e7eefa00001233,Extrinsic apoptotic signaling pathway via deat...,NCBITaxon:10090,production,[{'id': 'gomodel:64e7eefa00001233/64e7eefa0000...,"[{'id': 'GO:0035591', 'label': 'signaling adap...",
1,0.814937,gomodel:62b4ffe300000240,Perforin maturation leading to granzyme-mediat...,NCBITaxon:10090,production,[{'id': 'gomodel:62b4ffe300000240/62b4ffe30000...,"[{'id': 'GO:0005509', 'label': 'calcium ion bi...",[Automated change 2022-09-22: GO:0005887 repla...
2,0.810594,gomodel:62b4ffe300000335,Perforin maturation leading to granzyme-mediat...,NCBITaxon:9606,production,[{'id': 'gomodel:62b4ffe300000335/62b4ffe30000...,"[{'id': 'GO:0140375', 'label': 'immune recepto...",[Automated change 2022-09-22: GO:0005887 repla...
3,0.809383,gomodel:663d668500001246,Pyroptotic cell death mediated by GSDMD and NI...,NCBITaxon:9606,production,[{'id': 'gomodel:663d668500001246/663d66850000...,"[{'id': 'GO:0004197', 'label': 'cysteine-type ...",
4,0.805333,gomodel:62b4ffe300001804,Cleavage and inactivation of PARP1 by CASP3 an...,NCBITaxon:9606,production,[{'id': 'gomodel:62b4ffe300001804/62b4ffe30000...,"[{'id': 'GO:0097200', 'label': 'cysteine-type ...",[Automated change 2023-03-16: RO:0002212 repla...


Let's check the first one

In [19]:
# ranked_rows pairs each hit with its score; inspect the top (score, row) tuple
qr.ranked_rows[0]

(0.8218155512474502,
 {'id': 'gomodel:64e7eefa00001233',
  'title': 'Extrinsic apoptotic signaling pathway via death domain receptors 1(Mouse)',
  'taxon': 'NCBITaxon:10090',
  'status': 'production',
  'activities': [{'id': 'gomodel:64e7eefa00001233/64e7eefa00001250',
    'enabled_by': 'MGI:MGI:109200',
    'molecular_function': {'evidence': [{'term': 'ECO:0000266',
       'reference': 'PMID:8565075',
       'with_objects': ['UniProtKB:Q15628'],
       'provenances': [{'contributor': 'https://orcid.org/0000-0001-7476-6306',
         'date': '2023-09-14'}]}],
     'provenances': [],
     'term': 'GO:0035591'},
    'occurs_in': {'evidence': [], 'term': 'GO:0005829'},
    'part_of': {'evidence': [{'term': 'ECO:0000266',
       'reference': 'PMID:8565075',
       'with_objects': ['UniProtKB:Q15628'],
       'provenances': [{'contributor': 'https://orcid.org/0000-0001-7476-6306',
         'date': '2023-09-14'}]}],
     'term': 'GO:1900119'},
    'causal_associations': [{'evidence': [],
   

We can combine semantic search with queries:

In [20]:
# Ranked text search again, this time restricted by a `where` filter on taxon
qr = collection.search(
    "cell death pathways",
    where={"taxon": "NCBITaxon:10090"},
)
qr.rows_dataframe[:5]

Unnamed: 0,score,id,title,taxon,status,activities,objects,comments
0,0.829536,gomodel:64e7eefa00001233,Extrinsic apoptotic signaling pathway via deat...,NCBITaxon:10090,production,[{'id': 'gomodel:64e7eefa00001233/64e7eefa0000...,"[{'id': 'GO:0035591', 'label': 'signaling adap...",
1,0.822032,gomodel:62b4ffe300000240,Perforin maturation leading to granzyme-mediat...,NCBITaxon:10090,production,[{'id': 'gomodel:62b4ffe300000240/62b4ffe30000...,"[{'id': 'GO:0005509', 'label': 'calcium ion bi...",[Automated change 2022-09-22: GO:0005887 repla...
2,0.80532,gomodel:645d887900001077,"Cell type specific, p53-independent mitotic G2...",NCBITaxon:10090,production,[{'id': 'gomodel:645d887900001077/645d88790000...,"[{'id': 'GO:0004674', 'label': 'protein serine...",
3,0.801173,gomodel:5ce58dde00001215,Mouse-Aatf-antiapoptosis,NCBITaxon:10090,production,[{'id': 'gomodel:5ce58dde00001215/5ce58dde0000...,"[{'id': 'MGI:MGI:87986', 'label': 'Akt1 Mmus'}...",[Automated change 2023-03-16: RO:0002213 repla...
4,0.795273,gomodel:6516135700000211,Tumor necrosis factor-mediated signaling pathw...,NCBITaxon:10090,production,[{'id': 'gomodel:6516135700000211/651613570000...,"[{'id': 'GO:0005125', 'label': 'cytokine activ...",


## Validation

Next we will demonstrate validation over a whole collection.

Currently validating depends on a LinkML schema - we have previously copied this schema into the test folder.
We will load the schema into the database object:

In [21]:
# Load the GO-CAM LinkML schema into the database object; it is read back via db.schema_view below
db.load_schema_view("input/gocam-models-schema.yaml")

Quick sanity check to ensure that worked:

In [22]:
# Peek at the first ten class names from the loaded schema
list(db.schema_view.all_classes())[:10]

['Model',
 'Activity',
 'EvidenceItem',
 'Association',
 'CausalAssociation',
 'TermAssociation',
 'MolecularFunctionAssociation',
 'BiologicalProcessAssociation',
 'CellularAnatomicalEntityAssociation',
 'MoleculeAssociation']

In [23]:
# Set the collection's type to the schema class "Model" (first in the class list above);
# presumably this is the class rows are validated against — TODO confirm
collection.metadata.type = "Model"

In [24]:
from linkml_runtime.dumpers import yaml_dumper
# Walk every validation result; anything other than the known integer-typing
# issue is printed and then aborts the run.
for r in db.iter_validate_database():
    # known issue - https://github.com/monarch-initiative/GO-CAM-store/issues/97
    if "is not of type 'integer'" not in r.message:
        print(r.message[:100])
        print(r)
        raise ValueError("Unexpected validation error")

## Command Line Usage

We can also use the command line for all of the above operations.

For example, faceted queries:

In [26]:
# CLI faceted query (`fq`) on the "main" collection, faceting on taxon via -S
!linkml-store -d mongodb://localhost:27017/gocams -c main fq -S taxon

{
  "taxon": {
    "NCBITaxon:9606": 541,
    "NCBITaxon:10090": 185,
    "NCBITaxon:4896": 15,
    "NCBITaxon:7955": 14,
    "NCBITaxon:7227": 13,
    "NCBITaxon:559292": 6,
    "NCBITaxon:6239": 4,
    "NCBITaxon:9823": 4,
    "NCBITaxon:227321": 1,
    "NCBITaxon:8355": 1,
    "NCBITaxon:1403190": 1,
    "NCBITaxon:1735992": 1,
    "NCBITaxon:229533": 1,
    "NCBITaxon:5074": 1,
    "NCBITaxon:602072": 1,
    "NCBITaxon:425011": 1,
    "NCBITaxon:28576": 1,
    "NCBITaxon:99287": 1,
    "NCBITaxon:8364": 1
  }
}


In [29]:
# Comma-separated -S columns appear to be faceted independently (one key per column
# in the output); -O yaml switches the output format to YAML
!linkml-store -d mongodb://localhost:27017/gocams -c main fq -S activities.enabled_by,taxon -O yaml


activities.enabled_by:
  MGI:MGI:109482: 91
  MGI:MGI:98973: 55
  UniProtKB:P42345: 53
  UniProtKB:Q8N884: 43
  UniProtKB:P57764: 37
  UniProtKB:Q9UHD2: 31
  MGI:MGI:109349: 30
  UniProtKB:Q9HB90: 25
  UniProtKB:Q13315: 23
  UniProtKB:Q13501: 23
  MGI:MGI:95294: 23
  UniProtKB:P29466: 22
  UniProtKB:Q7L523: 22
  UniProtKB:Q86WV6: 22
  UniProtKB:P09874: 21
  UniProtKB:P62877: 21
  UniProtKB:Q15382: 19
  UniProtKB:Q7Z434: 17
  UniProtKB:O43318: 17
  UniProtKB:Q9Y4K3: 16
  UniProtKB:P62753: 16
  MGI:MGI:2686159: 16
  UniProtKB:P23443: 16
  UniProtKB:Q9UBS0: 15
  UniProtKB:P23458: 15
  UniProtKB:Q96P20: 15
  UniProtKB:P49662: 14
  UniProtKB:Q13541: 14
  UniProtKB:P31749: 14
  UniProtKB:P49959: 14
  UniProtKB:Q04206: 14
  UniProtKB:Q14653: 14
  MGI:MGI:1916396: 14
  MGI:MGI:3647519: 13
  MGI:MGI:98907: 13
  UniProtKB:Q9C000: 13
  MGI:MGI:1916142: 13
  UniProtKB:Q92993: 13
  MGI:MGI:97365: 13
  UniProtKB:Q9UBF6: 12
  UniProtKB:O60934: 12
  MGI:MGI:95

In [30]:
# Joining columns with "+" yields one combined facet keyed by (taxon, term) pairs
!linkml-store -d mongodb://localhost:27017/gocams -c main fq -S taxon+activities.molecular_function.term

{
  "taxon+activities.molecular_function.term": {
    "('NCBITaxon:9606', 'GO:0004674')": 280,
    "('NCBITaxon:9606', 'GO:0061630')": 167,
    "('NCBITaxon:9606', 'GO:0030674')": 88,
    "('NCBITaxon:9606', 'GO:0003700')": 86,
    "('NCBITaxon:10090', 'GO:0003674')": 82,
    "('NCBITaxon:9606', 'GO:0060090')": 76,
    "('NCBITaxon:9606', 'GO:0005125')": 73,
    "('NCBITaxon:9606', 'GO:0004197')": 68,
    "('NCBITaxon:9606', 'GO:0140311')": 65,
    "('NCBITaxon:9606', 'GO:0035591')": 64,
    "('NCBITaxon:9606', 'GO:1990756')": 54,
    "('NCBITaxon:9606', 'GO:0004713')": 54,
    "('NCBITaxon:9606', 'GO:0043539')": 51,
    "('NCBITaxon:10090', 'GO:0048018')": 48,
    "('NCBITaxon:4896', 'GO:0003674')": 41,
    "('NCBITaxon:9606', 'GO:0003674')": 39,
    "('NCBITaxon:7955', 'GO:0003674')": 38,
    "('NCBITaxon:9606', 'GO:0043495')": 37,
    "('NCBITaxon:9606', 'GO:0022829')": 37,
    "('NCBITaxon:9606', 'GO:0140693')": 37,
    "('NCBITaxon:9606', 'GO:0048018')": 32,