# How to index the caDSR metadata element registry with LinkML-Store





In [1]:

import os
import json
path = "cadsr/cde-json"
objs = []
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith(".json"):
            with open(os.path.join(root, file)) as stream:
                obj = json.load(stream)
                objs.append(obj)


In [2]:
len(objs)

74229

In [4]:
import yaml
print(yaml.dump(objs[1]))

DataElement:
  AlternateNames: []
  ClassificationSchemes: []
  DataElementConcept:
    ConceptualDomain:
      administrativeNotes: null
      beginDate: '2006-09-28'
      changeDescription: null
      context: CCR
      contextVersion: '1'
      createdBy: REEVESD
      dateCreated: '2006-09-28'
      dateModified: '2008-11-19'
      deletedIndicator: 'No'
      endDate: null
      id: 1E838B40-6636-0A25-E044-0003BA3F9857
      latestVersionIndicator: 'Yes'
      longName: MEASURE/INSTRUMENT TESTING
      modifiedBy: REEVESD
      origin: CCR:Center for Cancer Research
      preferredDefinition: Process and results associated with self-reported measures
        and instruments, surveys, other tools
      preferredName: Person Measure/Instrument Testing
      publicId: '2524082'
      registrationStatus: Application
      unresolvedIssues: null
      version: '1'
      workflowStatus: RELEASED
    ObjectClass:
      Concepts:
      - conceptCode: C15747
        definition: Supportive

## Creating a client and attaching to a database

First we will create a client as normal:

In [5]:
from linkml_store import Client

client = Client()

Next we'll attach to a MongoDB instance. this assumes you have one running already.

In [6]:
db = client.attach_database("mongodb://localhost:27017", "cadsr", recreate_if_exists=True)

## Creating a collection

We'll create a simple test collection. The concept of collection in linkml-store maps directly to mongodb collections

In [7]:
collection = db.create_collection("cdes", recreate_if_exists=True)

## Loading

In [8]:
collection.insert(objs)

In [9]:
collection.find({}, limit=1).num_rows

74229

Let's check with pandas just to make sure it looks as expected:

In [10]:
qr = collection.find({}, limit=3)
qr.rows_dataframe

Unnamed: 0,DataElement
0,"{'publicId': '2869761', 'version': '1', 'prefe..."
1,"{'publicId': '7571389', 'version': '1', 'prefe..."
2,"{'publicId': '2773112', 'version': '1', 'prefe..."
3,"{'publicId': '2971930', 'version': '1', 'prefe..."
4,"{'publicId': '7637945', 'version': '1', 'prefe..."
...,...
74224,"{'publicId': '4561278', 'version': '1', 'prefe..."
74225,"{'publicId': '7787595', 'version': '1', 'prefe..."
74226,"{'publicId': '6703581', 'version': '1', 'prefe..."
74227,"{'publicId': '2220287', 'version': '1', 'prefe..."


In [None]:
qr.rows[1]

## Semantic Search

We will index phenopackets using a template that extracts the subject, phenotypic features and diseases.

In [11]:
template = """
subject: {{subject}}
phenotypes: {% for p in phenotypicFeatures %}{{p.type.label}}{% endfor %}
diseases: {% for d in diseases %}{{d.term.label}}{% endfor %}
"""

In [12]:
from linkml_store.index.implementations.llm_indexer import LLMIndexer

index = LLMIndexer(
    name="ppkt", 
    cached_embeddings_database="tmp/llm_pheno_cache.db",
    text_template=template,
    text_template_syntax="jinja2",
)

In [13]:
index.object_to_text(qr.rows[0])

"\nsubject: {'id': 'Higgins-Patient-1', 'timeAtLastEncounter': {'age': {'iso8601duration': 'P17Y'}}, 'sex': 'FEMALE'}\nphenotypes: Ventricular hypertrophyHeart murmurHypertrophic cardiomyopathyShort statureHypertelorismLow-set earsPosteriorly rotated earsGlobal developmental delayCognitive impairmentCardiac arrest\ndiseases: Noonan syndrome-11"

In [14]:
collection.attach_indexer(index, auto_index=True)

  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]


## Queries

We can specify key-value constraints:

In [16]:
qr = collection.search("older males with liver disease")
qr.rows_dataframe[0:10]

Unnamed: 0,score,id,subject,phenotypicFeatures,interpretations,diseases,metaData
0,0.79436,PMID_30658709_patient,"{'id': 'patient', 'timeAtLastEncounter': {'age...","[{'type': {'id': 'HP:0031956', 'label': 'Eleva...","[{'id': 'patient', 'progressStatus': 'SOLVED',...","[{'term': {'id': 'OMIM:615878', 'label': 'Chol...","{'created': '2024-05-05T09:03:25.388371944Z', ..."
1,0.786465,PMID_37303127_6,"{'id': '6', 'timeAtLastEncounter': {'age': {'i...","[{'type': {'id': 'HP:0001397', 'label': 'Hepat...","[{'id': '6', 'progressStatus': 'SOLVED', 'diag...","[{'term': {'id': 'OMIM:151660', 'label': 'Lipo...","{'created': '2024-03-23T17:41:42.999521017Z', ..."
2,0.785974,PMID_22508010_22508010_P1,"{'id': '22508010_P1', 'timeAtLastEncounter': {...","[{'type': {'id': 'HP:0006562', 'label': 'Viral...","[{'id': '22508010_P1', 'progressStatus': 'SOLV...","[{'term': {'id': 'OMIM:256810', 'label': 'Mito...","{'created': '2024-03-23T19:28:35.860860824Z', ..."
3,0.785179,PMID_27536553_27536553_P3,"{'id': '27536553_P3', 'timeAtLastEncounter': {...","[{'type': {'id': 'HP:0001396', 'label': 'Chole...","[{'id': '27536553_P3', 'progressStatus': 'SOLV...","[{'term': {'id': 'OMIM:256810', 'label': 'Mito...","{'created': '2024-03-23T19:28:35.688389062Z', ..."
4,0.781917,PMID_27536553_27536553_P2,"{'id': '27536553_P2', 'timeAtLastEncounter': {...","[{'type': {'id': 'HP:0001396', 'label': 'Chole...","[{'id': '27536553_P2', 'progressStatus': 'SOLV...","[{'term': {'id': 'OMIM:256810', 'label': 'Mito...","{'created': '2024-03-23T19:28:35.674263954Z', ..."
5,0.77876,PMID_25129007_25129007_P1,"{'id': '25129007_P1', 'timeAtLastEncounter': {...","[{'type': {'id': 'HP:0000952', 'label': 'Jaund...","[{'id': '25129007_P1', 'progressStatus': 'SOLV...","[{'term': {'id': 'OMIM:256810', 'label': 'Mito...","{'created': '2024-03-23T19:28:36.169033050Z', ..."
6,0.776784,PMID_24894789_24894789_P1,"{'id': '24894789_P1', 'timeAtLastEncounter': {...","[{'type': {'id': 'HP:0000952', 'label': 'Jaund...","[{'id': '24894789_P1', 'progressStatus': 'SOLV...","[{'term': {'id': 'OMIM:256810', 'label': 'Mito...","{'created': '2024-03-23T19:28:36.148879051Z', ..."
7,0.776577,PMID_27536553_27536553_P1,"{'id': '27536553_P1', 'timeAtLastEncounter': {...","[{'type': {'id': 'HP:0001396', 'label': 'Chole...","[{'id': '27536553_P1', 'progressStatus': 'SOLV...","[{'term': {'id': 'OMIM:256810', 'label': 'Mito...","{'created': '2024-03-23T19:28:35.665367126Z', ..."
8,0.776096,PMID_34023347_34023347_P1,"{'id': '34023347_P1', 'timeAtLastEncounter': {...","[{'type': {'id': 'HP:0006554', 'label': 'Acute...","[{'id': '34023347_P1', 'progressStatus': 'SOLV...","[{'term': {'id': 'OMIM:256810', 'label': 'Mito...","{'created': '2024-03-23T19:28:35.581506967Z', ..."
9,0.775122,PMID_28209105_28209105_P1,"{'id': '28209105_P1', 'timeAtLastEncounter': {...","[{'type': {'id': 'HP:0001508', 'label': 'Failu...","[{'id': '28209105_P1', 'progressStatus': 'SOLV...","[{'term': {'id': 'OMIM:256810', 'label': 'Mito...","{'created': '2024-03-23T19:28:35.655704975Z', ..."


In [17]:
qr.ranked_rows[0]

(0.7943603537606876,
 {'id': 'PMID_30658709_patient',
  'subject': {'id': 'patient',
   'timeAtLastEncounter': {'age': {'iso8601duration': 'P1Y11M'}},
   'sex': 'FEMALE'},
  'phenotypicFeatures': [{'type': {'id': 'HP:0031956',
     'label': 'Elevated circulating aspartate aminotransferase concentration'},
    'onset': {'age': {'iso8601duration': 'P1Y11M'}}},
   {'type': {'id': 'HP:0031964',
     'label': 'Elevated circulating alanine aminotransferase concentration'},
    'onset': {'age': {'iso8601duration': 'P1Y11M'}}},
   {'type': {'id': 'HP:0003573', 'label': 'Increased total bilirubin'},
    'onset': {'age': {'iso8601duration': 'P6M'}}},
   {'type': {'id': 'HP:0012202',
     'label': 'Increased serum bile acid concentration'},
    'onset': {'age': {'iso8601duration': 'P6M'}}},
   {'type': {'id': 'HP:0002908', 'label': 'Conjugated hyperbilirubinemia'},
    'onset': {'age': {'iso8601duration': 'P6M'}}},
   {'type': {'id': 'HP:0001433', 'label': 'Hepatosplenomegaly'},
    'onset': {'ag

## Validation

__TODO__    