# How to index Phenopackets with LinkML-Store





In [24]:
import pystow
path = pystow.ensure_untar("tmp", "phenopackets", url="https://github.com/monarch-initiative/phenopacket-store/releases/download/0.1.8/all_phenopackets.tgz")

In [25]:
path

PosixPath('/Users/cjm/.data/tmp/phenopackets/all_phenopackets')

In [26]:
# iterate over all *.json files in the phenopackets directory and parse to an object
# we will recursively walk the path using os.walk ( we don't worry about loading yet)
import os
import json
objs = []
print(path)
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith(".json"):
            with open(os.path.join(root, file)) as stream:
                obj = json.load(stream)
                objs.append(obj)


/Users/cjm/.data/tmp/phenopackets/all_phenopackets


In [27]:
len(objs)

4293

## Creating a client and attaching to a database

First we will create a client as normal:

In [28]:
from linkml_store import Client

client = Client()

Next we'll attach to a MongoDB instance. this assumes you have one running already.

In [29]:
db = client.attach_database("mongodb://localhost:27017", "phenopackets", recreate_if_exists=True)

## Creating a collection

We'll create a simple test collection. The concept of collection in linkml-store maps directly to mongodb collections

In [30]:
collection = db.create_collection("main", recreate_if_exists=True)

## Loading

In [31]:
collection.insert(objs)

In [35]:
collection.find({}, limit=1).num_rows

100

In [36]:
qr = collection.find({"diseases.term.id": "OMIM:618499"}, limit=3)
qr.rows_dataframe

Let's check with pandas just to make sure it looks as expected:

## Semantic Search

In [37]:
template = """
subject: {{subject}}
phenotypes: {% for p in phenotypicFeatures %}{{p.type.label}}{% endfor %}
diseases: {% for d in diseases %}{{d.term.label}}{% endfor %}
"""

In [38]:
from linkml_store.index.implementations.llm_indexer import LLMIndexer

index = LLMIndexer(
    name="ppkt", 
    cached_embeddings_database="tmp/llm_pheno_cache.db",
    text_template=template,
    text_template_syntax="jinja2",
)

In [39]:
index.object_to_text(qr.rows[0])

IndexError: list index out of range

In [40]:
collection.attach_indexer(index, auto_index=True)

  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]


## Queries

We can specify key-value constraints:

In [23]:
qr = collection.search("older males with liver disease")
qr.rows_dataframe

## Validation