In [2]:
import csv

snomeds = []
# open the CSV file in read mode
with open('SNOMEDCT_CORE_SUBSET_202211/SNOMEDCT_CORE_SUBSET_202211.txt', 'r') as csvfile:
    # create a CSV reader object
    csvreader = csv.reader(csvfile, delimiter="|")
    
    # get the header row and create a dictionary of column indexes
    header = next(csvreader)
    indexes = {header[i]: i for i in range(len(header))}

    # loop over the remaining rows and create a dictionary for each row
    for row in csvreader:
        snomed = {}
        for key, index in indexes.items():
            snomed[key] = row[index]
        snomed["text"] = snomed["SNOMED_FSN"]
        snomeds.append(snomed)

len(snomeds)

6667

In [3]:
import sqlite3
conn = sqlite3.connect("snomed.db")

# Create a cursor object
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS vocab")
cur.execute("CREATE VIRTUAL TABLE IF NOT EXISTS vocab USING fts5(code, display, related, common_test_rank, units, tokenize = porter)")
cur.execute("INSERT into vocab(vocab, rank) values('rank', ' bm25(100.0, 10, 5, 0)')")

for r in snomeds:
    code = r['SNOMED_CID']
    display = r['text']
    related = ""
    common_test_rank = 0
    units = ""
    cur.execute("INSERT INTO vocab (code, display, related, common_test_rank, units) VALUES (?, ?, ?, ?, ?)", (code, display, related, common_test_rank, units));

# Define a function to perform a full-text search on the table
def search_loinc(query, limit = 100, skip = 0):
  cur.execute("select * from (SELECT *, rank FROM vocab WHERE vocab MATCH ? ORDER BY rank limit ? offset ?) order by common_test_rank", (" OR ".join(query.replace("-", " ").split(" ")), limit, skip,))
  results = cur.fetchall()
  print(f"---\nSearch results for '{query}':")
  for row in results:
    print(f"{row[0]}, {row[1]} ctr {row[3]} {row[-1]}")

# Perform some sample searches
search_loinc("post traumatic")
conn.commit()
conn.close()

---
Search results for 'post traumatic':
269406001, Post-traumatic wound infection (disorder) ctr 0 -21.955338918668765
313182004, Chronic post-traumatic stress disorder (disorder) ctr 0 -21.693896069825378
27151001, Post-laminectomy syndrome (disorder) ctr 0 -12.274184592849156
314116003, Post infarct angina (disorder) ctr 0 -12.274184592849156
31097004, Post poliomyelitis syndrome (disorder) ctr 0 -12.274184592849156
239783001, Post-infective arthritis (disorder) ctr 0 -12.274184592849156
238699007, Post-inflammatory hyperpigmentation (disorder) ctr 0 -12.274184592849156
236664000, Post-micturition incontinence (finding) ctr 0 -12.274184592849156
202725007, Lumbar post-laminectomy syndrome (disorder) ctr 0 -12.126262902026289
202723000, Cervical post-laminectomy syndrome (disorder) ctr 0 -12.126262902026289
373108000, Post percutaneous transluminal coronary angioplasty (finding) ctr 0 -11.981864096311027
226007004, Post-surgical wound care (regime/therapy) ctr 0 -11.981864096311027
1

In [9]:
from txtai.embeddings import Embeddings

embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2", "content": True})

embeddings.index([(s["SNOMED_CID"], s, None) for s in snomeds])


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
for r in embeddings.search("SELECT id, text, score FROM txtai WHERE similar('major depressive disorder, recurrent, severe')  limit 100"):
    if r['score'] >= .65:
        print(r['id'], r['text'])

268621008 Recurrent major depressive episodes (disorder)
28475009 Severe recurrent major depression with psychotic features (disorder)
370143000 Major depressive disorder (disorder)
66344007 Recurrent major depression (disorder)
18818009 Moderate recurrent major depression (disorder)
73867007 Severe major depression with psychotic features (disorder)
191604000 Single major depressive episode, severe, with psychosis (disorder)
430852001 Severe major depression, single episode, with psychotic features (disorder)
35489007 Depressive disorder (disorder)
40568001 Recurrent brief depressive disorder (disorder)
191616006 Recurrent depression (disorder)
268620009 Single major depressive episode (disorder)
59617007 Severe depressed bipolar I disorder with psychotic features (disorder)
33135002 Recurrent major depression in partial remission (disorder)
191623007 Bipolar affective disorder, currently manic, severe, with psychosis (disorder)
40379007 Mild recurrent major depression (disorder)
8320

In [44]:
import nltk
token_counts = {}
from nltk.stem import PorterStemmer
ps = PorterStemmer()

for r in snomeds:
    r["textTokens"] = set([ps.stem(v.lower()) for v in nltk.tokenize.word_tokenize(r['text'] )])
    for t in r["textTokens"]:
        if t not in token_counts:
            token_counts[t] = 0
        token_counts[t] += 1
len(token_counts)

def find_hits(q):
    qtokens = set([ps.stem(v.lower()) for v in nltk.tokenize.word_tokenize(q)])
    print(qtokens)
    results = []
    for r in snomeds:
        matchText = r['textTokens'] & qtokens
        #print(r['tokens'] & qtokens, len(r['tokens'] & qtokens))
        if len(matchText) > 0:
            results.append([sum([10 * 60000/token_counts[t] for t in matchText]), r])
    results.sort(key=lambda v: v[0], reverse=True)
    return results[:100]


In [46]:

for r in find_hits("major depressive disorder, recurrent, severe")[:50]:
    print(r[1]['SNOMED_CID'],r[1]['text'])#, f"(units: {r[1]['EXAMPLE_UCUM_UNITS']})")

len(snomeds)
#from txtai.embeddings import Embeddings

{'recurr', 'depress', ',', 'disord', 'major', 'sever'}
36474008 Severe recurrent major depression without psychotic features (disorder)
28475009 Severe recurrent major depression with psychotic features (disorder)
76441001 Severe major depression, single episode, without psychotic features (disorder)
191604000 Single major depressive episode, severe, with psychosis (disorder)
430852001 Severe major depression, single episode, with psychotic features (disorder)
73867007 Severe major depression with psychotic features (disorder)
75084000 Severe major depression without psychotic features (disorder)
18818009 Moderate recurrent major depression (disorder)
268621008 Recurrent major depressive episodes (disorder)
46244001 Recurrent major depression in full remission (disorder)
40379007 Mild recurrent major depression (disorder)
66344007 Recurrent major depression (disorder)
33135002 Recurrent major depression in partial remission (disorder)
68019004 Recurrent major depression in remission (d

6667