In [2]:
!pip install openai

Collecting openai
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0


### Load ontology codes

In [6]:
import os
import csv
import pandas as pd

# LoincTableCore.csv is read from the current working directory. Every column is
# loaded with dtype=object, which is why CLASSTYPE is compared against the string '1'.
# Only active rows of CLASSTYPE 1 are kept, and the columns that are no longer needed
# after filtering are dropped.
df_loinc = (
    pd.read_csv('LoincTableCore.csv', dtype=object)
    .loc[lambda frame: (frame['CLASSTYPE'] == '1') & (frame['STATUS'] == 'ACTIVE')]
    .drop(columns=[
        'CLASSTYPE',
        'STATUS',
        'EXTERNAL_COPYRIGHT_NOTICE',
        'VersionFirstReleased',
        'VersionLastChanged',
    ])
)
print("LOINC codes (CLASSTYPE=1, Laboratory Terms Class) loaded and processed.\n")

df_loinc

LOINC codes (CLASSTYPE=1, Laboratory Terms Class) loaded and processed.



Unnamed: 0,LOINC_NUM,COMPONENT,PROPERTY,TIME_ASPCT,SYSTEM,SCALE_TYP,METHOD_TYP,CLASS,LONG_COMMON_NAME,SHORTNAME
21,100019-9,ALK gene targeted mutation analysis,Prid,Pt,Bld/Tiss,Nom,Molgen,MOLPATH.MUT,ALK gene mutations found [Identifier] in Blood...,ALK gene Mut Anl Bld/T
22,100020-7,GNA11 gene targeted mutation analysis,Prid,Pt,Bld/Tiss,Nom,Molgen,MOLPATH.MUT,GNA11 gene mutations found [Identifier] in Blo...,GNA11 gene Mut Anl Bld/T
23,100021-5,GNAQ gene targeted mutation analysis,Prid,Pt,Bld/Tiss,Nom,Molgen,MOLPATH.MUT,GNAQ gene mutations found [Identifier] in Bloo...,GNAQ gene Mut Anl Bld/T
24,100022-3,IDH1 gene targeted mutation analysis,Prid,Pt,Bld/Tiss,Nom,Molgen,MOLPATH.MUT,IDH1 gene mutations found [Identifier] in Bloo...,IDH1 gene Mut Anl Bld/T
25,100023-1,IDH2 gene targeted mutation analysis,Prid,Pt,Bld/Tiss,Nom,Molgen,MOLPATH.MUT,IDH2 gene mutations found [Identifier] in Bloo...,IDH2 gene Mut Anl Bld/T
...,...,...,...,...,...,...,...,...,...,...
101599,99968-0,Thrombotic microangiopathy multigene analysis,Find,Pt,Bld/Tiss,Doc,Molgen,MOLPATH,Thrombotic microangiopathy multigene analysis ...,TMA multigene analysis Bld/T
101600,99969-8,PLG gene full mutation analysis,Find,Pt,Bld/Tiss,Doc,Sequencing,MOLPATH,PLG gene full mutation analysis in Blood or Ti...,PLG Full Mut Anl Bld/T Seq
101601,99970-6,Hereditary thrombocytopenia multigene analysis,Find,Pt,Bld/Tiss,Doc,Molgen,MOLPATH,Hereditary thrombocytopenia multigene analysis...,IT multigene analysis Bld/T
101602,99971-4,Hemophagocytic lymphohistiocytosis multigene a...,Find,Pt,Bld/Tiss,Doc,Molgen,MOLPATH,Hemophagocytic lymphohistiocytosis multigene a...,HLH multigene analysis Bld/T


### Generate descriptions with OpenAI's model


In [3]:
import openai
import numpy as np
from scipy.spatial import distance
import plotly.express as px
from sklearn.cluster import KMeans
# from umap import UMAP

In [4]:
# The API key is read from the OPENAI_API_KEY environment variable so that no secret
# is stored in the notebook. A key that was ever saved in a shared notebook should be
# treated as leaked and revoked. A missing variable raises KeyError here, before any
# request is attempted.
import os

openai.api_key = os.environ["OPENAI_API_KEY"]

In [8]:
import time

import openai
import pyarrow as pa
import pyarrow.parquet as pq
from tenacity import retry, stop_after_attempt, wait_random_exponential

# Retry transient failures with randomized exponential backoff (1-60 s) and give up
# after six attempts. A bare @retry would retry immediately and forever, which hammers
# the API when rate-limited and hides permanent errors such as an invalid key.
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    reraise=True,
)
def get_response(system_prompt, user_message, model='gpt-3.5-turbo'):
    """Send one system + user message pair to the chat completion API.

    Args:
        system_prompt: Text sent as the "system" message.
        user_message: Text sent as the "user" message.
        model: Name of the chat model to query.

    Returns:
        The content of the first choice in the response.

    Raises:
        The exception raised by the last attempt, once all six attempts have failed
        (reraise=True surfaces the original error instead of tenacity's RetryError).
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": user_message
            }
        ],
        temperature=0,
        max_tokens=2000,
    )

    return response['choices'][0]['message']['content']


# Sent as the system message of every request.
system_prompt = "You are an expert medical annotator with experience using the LOINC ontology."

# Template for the user message. It is filled with str.format(); the placeholders are
# concept, prop, time_aspct, system, scale_typ, method_typ, loinc_class and
# long_common_name. The text starts and ends with a newline.
user_message = (
    '\n'
    'I found the following facts about "{concept}" in LOINC:\n'
    '    - The PROPERTY of "{concept}" is "{prop}"\n'
    '    - The TIME_ASPCT of "{concept}" is "{time_aspct}"\n'
    '    - The SYSTEM of "{concept}" is "{system}"\n'
    '    - The SCALE_TYP of "{concept}" is "{scale_typ}"\n'
    '    - The METHOD_TYP of "{concept}" is "{method_typ}"\n'
    '    - The CLASS of "{concept}" is "{loinc_class}"\n'
    '    - The LONG_COMMON_NAME of "{concept}" is "{long_common_name}"\n'
    '\n'
    'Based on the above, write a short general description of "{concept}".\n'
)

# Resumable generation loop.
#
# Descriptions are written to Parquet in batches of `batch_size` rows, and the index
# label of the next unprocessed row is stored in `checkpoint_file`. Re-running this cell
# therefore continues where the previous run stopped instead of repeating API calls.
current_index = 0
batch_size = 100

# list to store the rows that will be written to the Parquet file
rows_to_write = []
checkpoint_file = 'checkpoint.txt'

# Prompt placeholder -> df_loinc column that fills it.
prompt_columns = {
    'concept': 'COMPONENT',
    'prop': 'PROPERTY',
    'time_aspct': 'TIME_ASPCT',
    'system': 'SYSTEM',
    'scale_typ': 'SCALE_TYP',
    'method_typ': 'METHOD_TYP',
    'loinc_class': 'CLASS',
    'long_common_name': 'LONG_COMMON_NAME',
}


def _save_batch(rows, parquet_path, next_index):
    """Write `rows` to `parquet_path`, then store `next_index` as the resume point."""
    pq.write_table(pa.Table.from_pandas(pd.DataFrame(rows)), parquet_path)
    # The checkpoint is advanced only after the Parquet file has been written, so a
    # failure in between repeats a batch on the next run rather than skipping one.
    with open(checkpoint_file, 'w') as f:
        f.write(str(next_index))


if os.path.exists(checkpoint_file):
    # if it exists, read the index from the file
    with open(checkpoint_file, 'r') as f:
        current_index = int(f.read())

# Label of the row that follows the last successfully described row.
next_index = current_index

try:
    for index, row in df_loinc.iterrows():
        # `index` is the DataFrame row label; rows below the checkpoint are already done.
        if index < current_index:
            continue
        loinc_num = row['LOINC_NUM']

        # Empty CSV cells are loaded as NaN, which str.format() would render as the
        # literal text "nan" inside the prompt. Say that the value is missing instead.
        prompt = user_message.format(**{
            placeholder: ('unspecified' if pd.isna(row[column]) else row[column])
            for placeholder, column in prompt_columns.items()
        })

        description = get_response(system_prompt, prompt)
        print(f"Description for LOINC_NUM {loinc_num}: {description}")

        rows_to_write.append({
            'LOINC_NUM': loinc_num,
            'Description': description
        })
        next_index = index + 1

        if len(rows_to_write) == batch_size:
            _save_batch(rows_to_write, f'batch_{next_index}.parquet', next_index)
            rows_to_write.clear()
            current_index = next_index
except BaseException:
    # Interrupted (KeyboardInterrupt included) or failed in the middle of a batch:
    # persist the descriptions obtained so far as a short batch and move the checkpoint
    # past them, so the next run neither loses nor re-requests them. The error is
    # re-raised unchanged.
    if rows_to_write:
        _save_batch(rows_to_write, f'batch_{next_index}.parquet', next_index)
        rows_to_write.clear()
    raise
else:
    # wrap up save: the last, possibly partial, batch. Advancing the checkpoint as well
    # makes a re-run of the finished cell a no-op.
    if rows_to_write:
        _save_batch(rows_to_write, 'batch_final.parquet', next_index)
        rows_to_write.clear()

Description for LOINC_NUM 100019-9: ALK gene targeted mutation analysis is a molecular genetics method used to identify and analyze mutations in the ALK gene. This analysis can be performed on blood or tissue samples. The purpose of this test is to detect and characterize specific mutations in the ALK gene, which is associated with certain diseases or conditions. The analysis is performed using a nominal scale, and the results provide information about the presence or absence of ALK gene mutations.
Description for LOINC_NUM 100020-7: "GNA11 gene targeted mutation analysis" is a molecular genetics test that identifies mutations in the GNA11 gene. This test is performed on blood or tissue samples. It is used to detect specific genetic alterations in the GNA11 gene, which may be associated with certain medical conditions or diseases. The test utilizes a nominal scale and molecular genetics methods to analyze the targeted mutations in the GNA11 gene.


KeyboardInterrupt: ignored