# Import required packages

In [2]:
!pip install langchain openai chromadb langchainhub pypdf tiktoken pymupdf PyPDF2 streamlit unstructured langchain-community

Collecting langchain
  Using cached langchain-0.3.1-py3-none-any.whl.metadata (7.1 kB)
Collecting openai
  Downloading openai-1.50.2-py3-none-any.whl.metadata (24 kB)
Collecting chromadb
  Downloading chromadb-0.5.11-py3-none-any.whl.metadata (6.8 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting pypdf
  Downloading pypdf-5.0.1-py3-none-any.whl.metadata (7.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting pymupdf
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting streamlit
  Downloading streamlit-1.38.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting unstructured
  Downloading unstructured-0.15.13-py3-none-any.whl.metadata (29 kB)
Collecting langchain-community
  Using cached langchain_community-0.3.1-py3-none-

In [1]:
import os
from dotenv import load_dotenv
import openai
from openai import OpenAI
import pandas as pd
import warnings
import langchain
import argparse
from dataclasses import dataclass
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from google.colab import userdata
from langchain import hub
from langchain.schema.runnable import RunnablePassthrough


# Suppress FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load GPT

In [2]:
from google.colab import userdata

with open('/content/.env', 'w') as file:
    gpt_key = userdata.get('gpt_key')
    if gpt_key:
        file.write(f"OPENAI_API_KEY={gpt_key}\n")
    else:
        print("gpt_key not found in userdata.")

# Load the environment variables from the .env file
load_dotenv('/content/.env')

# Access the OpenAI API key and org id
api_key = os.getenv('OPENAI_API_KEY')
org_id = userdata.get('org_id')

if org_id and api_key:
    client = OpenAI(api_key=api_key, organization=org_id)
    print(f"OpenAI client initialized with org_id: {org_id}")
else:
    print("Failed to initialize OpenAI client. Check org_id and API key.")

OpenAI client initialized with org_id: org-sD1mhBLcKZViamsY2rWXNqzB


# Load Dataset

In [3]:
def excel_to_markdown(file_path):
    # Load the Excel file
    xls = pd.ExcelFile(file_path, engine='openpyxl')

    # Markdown string to hold all sheets data
    all_sheets_markdown = ""

    # Process each sheet
    for sheet_name in xls.sheet_names:
        df = pd.read_excel(xls, sheet_name=sheet_name)

        # Convert DataFrame to Markdown
        markdown = df.to_markdown(index=False)

        # Add a header for the sheet and append the Markdown
        all_sheets_markdown += f"## {sheet_name}\n{markdown}\n\n"

    return all_sheets_markdown

In [4]:
# Specify the path to your Excel file
markdown_output = excel_to_markdown("/content/Tabel-ICD-10-English-Indonesia-Lengkap.xlsx") # Please adjust this path to the location of your Excel file

# Print or save the Markdown output
print(markdown_output)

# Save to a file
with open("output.md", "w") as file:
    file.write(markdown_output)

## GOL A
| Category   |   Subcategory | English_name                                                     | Indonesian_name                                                                |
|:-----------|--------------:|:-----------------------------------------------------------------|:-------------------------------------------------------------------------------|
| A00        |           nan | Cholera                                                          | Kolera                                                                         |
| A00        |             0 | Cholera due to Vibrio cholerae 01, biovar cholerae               | Kolera yang disebabkan oleh Vibrio cholerae 01, biovar cholerae                |
| A00        |             1 | Cholera due to Vibrio cholerae 01, biovar el tor                 | Kolera yang disebabkan oleh Vibrio cholerae 01, biovar el tor                  |
| A00        |             9 | Cholera, unspecified                                             

# Split Reference

In [5]:
# Load and splitting files
loader = DirectoryLoader('/content', glob="*.md")
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 300, chunk_overlap = 100)
all_splits = text_splitter.split_documents(data)

all_splits[100]

Document(metadata={'source': '/content/output.md'}, page_content='Gonokokus pada saluran genital-kemih bawah d A54 2 Gonococcal pelviperitonitis and other gonococcal genitou rinar Peritonitis panggul dan infeksi genital-kemih lain yang d A54 3 Gonococcal infection of eye Infeksi Gonokokus pada mata A54 4 Gonococcal infection of musculoskeletal system Infeksi')

# Store Reference

In [6]:
# Storing embeddings for text retrieval
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

  vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())


In [7]:
# Retrieving context from the vectorstore
rag_prompt = hub.pull("rlm/rag-prompt")



In [8]:
# Define ChatGPT model
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

  llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


In [9]:
# Define RAG chain setup
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
)

In [10]:
# Try invoke the chain
rag_chain.invoke("What is the ICD code for meningitis")

AIMessage(content='The ICD code for meningitis is G00.', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 11, 'prompt_tokens': 475, 'total_tokens': 486, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-cce817f8-b87e-4354-9984-2debd8665127-0')

In [11]:
# Check reference pulled out
question = "What is the ICD code for meningitis"
docs = vectorstore.similarity_search(question)
len(docs)

4

In [12]:
# Check reference pulled out
docs

[Document(metadata={'source': '/content/output.md'}, page_content='Category Subcategory English_name Indonesian_name G00 0 Hemophilus meningitis Meningitis hemofilus G00 1 Pneumococcal meningitis Meningitis pneumokokkus G00 2 Streptococcal meningitis Meningitis streptokokkus G00 3 Staphylococcal meningitis Meningitis stafilokokkus G00 8 Other bacterial meningitis'),
 Document(metadata={'source': '/content/output.md'}, page_content='infection Infeksi meningokokus A39 0 Meningococcal meningitis Meningitis meningokokus A39 1 Waterhouse-Friderichsen syndrome Sindrom Waterhouse-Friderichsen A39 2 Acute meningococcemia Meningokoksemia akut A39 3 Chronic meningococcemia Meningokoksemia kronik A39 4 Meningococcemia, unspecified'),
 Document(metadata={'source': '/content/output.md'}, page_content='G00 3 Staphylococcal meningitis Meningitis stafilokokkus G00 8 Other bacterial meningitis Meningitis bakterial lainnya G00 9 Bacterial meningitis, unspecified Meningitis bakterial, tidak terspesifikas

# Generate Response

In [13]:
# Define response pipeline
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever()
)

## Simple Prompt

In [14]:
# Try with simple prompt
question = "What is the ICD code for meningitis"
result = qa_chain({"query": question})
result["result"]

  result = qa_chain({"query": question})


'The ICD code for meningitis is G00.'

## Template Prompt

In [17]:
# Try with template prompt in English
question = input("Tell us the symptoms: ")

PROMPT_TEMPLATE = """
Answer this question with context provided below. If the question cannot be answered using the information provided answer with "I don't know".

---

{context}

---
Give ICD code recommendation based on the question in english and indonesia according to the context above: {question}, DO NOT CREATE OR ASSUME NEW CODE! STRICLY USE THE CODE PROVIDED!

---

Generate response with this template:

ICD Code: <ICD Code>
Description (English): <Description>
Deskripsi (Indonesia): <Deskripsi>

"""

# Prepare the DB.
embedding_function = OpenAIEmbeddings()

# Search the DB.
results = vectorstore.similarity_search_with_relevance_scores(question, k=3)
if len(results) == 0 or results[0][1] < 0.6:
    print(f"Unable to find matching results.")

context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=question)
# print(prompt)

model = ChatOpenAI()
response_text = model.predict(prompt)

sources = [doc.metadata.get("source", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Tell us the symptoms: i have a fever for almost 2 days
Response: ICD Code: R50 1
Description (English): Persistent fever
Deskripsi (Indonesia): Demam menetap
Sources: ['/content/output.md', '/content/output.md', '/content/output.md']


In [20]:
# Try with template prompt in Indonesia
question = input("Deskripsikan gejala yang dirasakan: ")

PROMPT_TEMPLATE = """
Answer this question with context provided below. If the question cannot be answered using the information provided answer with "I don't know".

---

{context}

---
Give ICD code recommendation based on the question in english and indonesia according to the context above: {question}, DO NOT CREATE OR ASSUME NEW CODE! STRICLY USE THE CODE PROVIDED!

---

Generate response with this template:

ICD Code: <ICD Code>
Description (English): <Description>
Deskripsi (Indonesia): <Deskripsi>

"""

# Prepare the DB.
embedding_function = OpenAIEmbeddings()

# Search the DB.
results = vectorstore.similarity_search_with_relevance_scores(question, k=3)
if len(results) == 0 or results[0][1] < 0.6:
    print(f"Unable to find matching results.")

context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=question)
# print(prompt)

model = ChatOpenAI()
response_text = model.predict(prompt)

sources = [doc.metadata.get("source", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Deskripsikan gejala yang dirasakan: saya merasakan demam terus menerus selama 2 hari
Response: ICD Code: R50 9
Description (English): Fever, unspecified
Deskripsi (Indonesia): Demam, tidak terspesifikasi
Sources: ['/content/output.md', '/content/output.md', '/content/output.md']
