In [1]:
import pandas as pd
import numpy as np
import json, os, pprint
import matplotlib.pyplot as plt
import plotly.express as px
import random
from langchain_openai import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.tools import tool
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers import JsonOutputToolsParser, JsonOutputKeyToolsParser
from langchain.agents import AgentExecutor, create_openai_tools_agent, create_react_agent, Tool
from langchain.agents.format_scratchpad.openai_tools import format_to_openai_tool_messages
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain_experimental.utilities import PythonREPL
from langchain_experimental.tools import PythonREPLTool
from langchain import hub
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.callbacks import Callbacks
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import Chroma

In [2]:
os.environ["OPENAI_API_KEY"] = ""

In [3]:
llm = ChatOpenAI(model="gpt-4-turbo-preview", temperature=0.1, streaming=True)

In [4]:
# Download files from https://athena.ohdsi.org/
ndc_dir = "/Users/jzamalloa/Documents/PROJECTS/LLM/DBs/033024_ndc"
concept_ndc = pd.read_csv(ndc_dir + "/CONCEPT.csv", sep="\t")

print(concept_ndc.shape)
concept_ndc.head()

(1403710, 10)


  concept_ndc = pd.read_csv(ndc_dir + "/CONCEPT.csv", sep="\t")


Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,36189414,hemorrhoidal cream 10mg/g / 144mg/g / 150mg/g ...,Drug,NDC,9-digit NDC,,3630641,20180325,20991231,
1,1220863,fulvestrant 250mg/5mL INTRAMUSCULAR INJECTION,Drug,NDC,9-digit NDC,,167290436,20210121,20991231,
2,35110579,"kali muriaticum, carbo vegetabilis, lung (suis...",Drug,NDC,11-digit NDC,,43742164901,20200626,20280919,D
3,36321712,"pulsatilla (pratensis), euphorbium officinarum...",Drug,NDC,11-digit NDC,,43742206101,20221201,20280908,D
4,36321592,"influenzinum (2022-2023), herpes simplex 1 nos...",Drug,NDC,11-digit NDC,,43742206201,20221205,20281117,D


In [30]:
(concept_ndc
 .query("standard_concept==standard_concept")
#  .concept_class_id.unique()
 .query("vocabulary_id=='NDC'")
 )

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
2135,45201605,SM BANDAGES FLEXIBLE,Device,NDC,Device,S,10939005233,20130805,20991231,
2136,45304229,SM FABRIC BANDAGES,Device,NDC,Device,S,10939005933,20130805,20991231,
2137,45355381,SM FABRIC BANDAGES,Device,NDC,Device,S,10939008511,20130805,20991231,
2138,44979982,FLEXIBLE EX-LARGE BANDAGE,Device,NDC,Device,S,10939008611,20130805,20991231,
2139,45235976,SUNBLOCK SPF15 LOTION,Device,NDC,Device,S,10939036711,20130805,20991231,
...,...,...,...,...,...,...,...,...,...,...
1403226,37140214,sunscreen spf 30 3g/100g / 5g/100g / 10g/100g ...,Device,NDC,Device,S,80489023201,20240101,20991231,
1403227,37140215,sunscreen spf 30 3g/100g / 5g/100g / 10g/100g ...,Device,NDC,Device,S,80489023202,20240101,20991231,
1403228,37140216,sunscreen spf 50 3g/100g / 5g/100g / 10g/100g ...,Device,NDC,Device,S,80489023501,20240101,20991231,
1403229,37140217,sunscreen spf 50 3g/100g / 5g/100g / 10g/100g ...,Device,NDC,Device,S,80489023502,20240101,20991231,


In [5]:
concept_ndc_filtered = (concept_ndc
 .query("standard_concept!=standard_concept")
 .query("domain_id=='Drug'")
#  .query("vocabulary_id!='NDC'")
 .loc[:,["concept_id", "concept_name", "concept_class_id", "concept_code"]]
 )

concept_ndc_filtered

Unnamed: 0,concept_id,concept_name,concept_class_id,concept_code
0,36189414,hemorrhoidal cream 10mg/g / 144mg/g / 150mg/g ...,9-digit NDC,003630641
1,1220863,fulvestrant 250mg/5mL INTRAMUSCULAR INJECTION,9-digit NDC,167290436
2,35110579,"kali muriaticum, carbo vegetabilis, lung (suis...",11-digit NDC,43742164901
3,36321712,"pulsatilla (pratensis), euphorbium officinarum...",11-digit NDC,43742206101
4,36321592,"influenzinum (2022-2023), herpes simplex 1 nos...",11-digit NDC,43742206201
...,...,...,...,...
1403704,37143425,zolmitriptan 2.5 MG Oral Tablet,11-digit NDC,62332046206
1403705,37143427,zolmitriptan 5 MG Oral Tablet,11-digit NDC,62332046303
1403706,37143429,zolpidem tartrate 10 MG Oral Tablet,11-digit NDC,72789032314
1403707,37143426,zolmitriptan 2.5 MG Disintegrating Oral Tablet,11-digit NDC,62332018116


In [18]:
# Test Vectorizing sample sub-sample first

(
    pd.concat(
        [
            (concept_ndc_filtered
             .sample(10000)
             ),
             ()
        ]
    )
)

(concept_ndc_filtered
 .sample(10000)
 .to_csv(ndc_dir + "/CONCEPT_FILTERED.csv", index=False, sep="\t")
 )


In [19]:
loader = CSVLoader(file_path=ndc_dir + "/CONCEPT_FILTERED.csv", source_column="concept_class_id",
                   csv_args={'delimiter':'\t'})
ndc_loaded = loader.load()

In [20]:
print(len(ndc_loaded))
ndc_loaded[:3]

10000


[Document(page_content='concept_id: 45316949\nconcept_name: choline salicylate 587 MG / Magnesium Salicylate 725 MG Oral Tablet\nconcept_class_id: 11-digit NDC\nconcept_code: 00185099701', metadata={'source': '11-digit NDC', 'row': 0}),
 Document(page_content='concept_id: 45067475\nconcept_name: nettle pollen extract 50 MG/ML Injectable Solution\nconcept_class_id: 11-digit NDC\nconcept_code: 36987329904', metadata={'source': '11-digit NDC', 'row': 1}),
 Document(page_content='concept_id: 45187400\nconcept_name: meclizine hydrochloride 25 MG Oral Tablet\nconcept_class_id: 11-digit NDC\nconcept_code: 47682047999', metadata={'source': '11-digit NDC', 'row': 2})]

In [21]:
ndc_loaded[0].page_content

'concept_id: 45316949\nconcept_name: choline salicylate 587 MG / Magnesium Salicylate 725 MG Oral Tablet\nconcept_class_id: 11-digit NDC\nconcept_code: 00185099701'

### Embed NDC Docs into VectoStore

In [22]:
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [23]:
ndc_db = Chroma.from_documents(ndc_loaded, embedding=embeddings_model, 
                               persist_directory="/Users/jzamalloa/Documents/PROJECTS/LLM/DBs/033024_ndc")

In [11]:
# 100 samples - Instantaneous
# 1000 samples - 5.8s
# 10000 samples - 50.5s

<langchain_community.vectorstores.chroma.Chroma at 0x280573a30>