In [2]:
!ls

README.md                        mimic_simple_rag_synthetic.ipynb
admissions_data_generator.py     synthetic_admissions.csv


In [3]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Import LangChain components
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.messages import SystemMessage, HumanMessage

In [5]:
# Import FAISS for vector database
import faiss

In [6]:
# Load environment variables (for OpenAI API key)
from dotenv import load_dotenv
load_dotenv()

True

In [7]:
# Surface a missing OpenAI credential with a two-line warning; nothing is
# printed when the variable is present in the process environment.
if os.environ.get("OPENAI_API_KEY") is None:
    print(
        "Warning: OPENAI_API_KEY not found in environment variables.",
        "Please set your OpenAI API key in .env file or directly in this notebook.",
        sep="\n",
    )

In [8]:
# Load synthetic admissions data
# The path is relative, so the kernel's working directory must be the folder
# that holds synthetic_admissions.csv (it appears in the `!ls` listing above).
synthetic_data = pd.read_csv("synthetic_admissions.csv")

In [9]:
# Convert each admission row into a one-line text record plus a metadata dict.
# Records come from to_dict("records") rather than iterrows(): iterrows() packs
# every row into a single Series, which does not preserve per-column dtypes
# (integer IDs can be upcast when the row is coerced to one common dtype).
admission_columns = ["subject_id", "hadm_id", "admittime"]
admission_records = synthetic_data[admission_columns].to_dict("records")

# data_texts[i] and data_metadata[i] describe the same admission row.
data_texts = [
    f"Subject {rec['subject_id']}, HADM {rec['hadm_id']}, admitted on {rec['admittime']}"
    for rec in admission_records
]
data_metadata = [dict(rec) for rec in admission_records]
print(f"Loaded {len(data_texts)} synthetic admission records")

Loaded 20 synthetic admission records


In [10]:
# Define improved chunking with RecursiveCharacterTextSplitter
def chunk_texts(texts, metadata, chunk_size=100, chunk_overlap=20):
    """Split texts into chunks and pair every chunk with its source metadata.

    Args:
        texts: Sequence of strings to split.
        metadata: Sequence aligned with ``texts``; ``metadata[i]`` describes
            ``texts[i]``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        ``(all_chunks, chunk_metadata)``: two lists of equal length where
        ``chunk_metadata[j]`` is a shallow copy of the metadata of the text
        that ``all_chunks[j]`` was cut from.

    Raises:
        IndexError: If a text that produced at least one chunk has no entry
            in ``metadata``.
    """
    # Local import keeps this cell self-contained.
    from copy import copy

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", ", ", " ", ""],
    )

    all_chunks = []
    chunk_metadata = []

    for i, txt in enumerate(texts):
        pieces = splitter.split_text(txt)
        all_chunks.extend(pieces)
        # Give every chunk its own metadata object. Sharing one dict between
        # sibling chunks would let an edit to one chunk's metadata leak into
        # the others and into the caller's original list.
        chunk_metadata.extend(copy(metadata[i]) for _ in pieces)

    return all_chunks, chunk_metadata

In [11]:
# Apply chunking to our admission texts
# Runs with chunk_texts' defaults (chunk_size=100, chunk_overlap=20).
# chunk_metadata[i] is the metadata of the record that chunks[i] was cut from.
chunks, chunk_metadata = chunk_texts(data_texts, data_metadata)

In [12]:
# Report the chunk count against the record count, then list every chunk
# with a 1-based label.
print(f"Generated {len(chunks)} chunks from {len(data_texts)} admission records")
print("\nThe chunks:")
for chunk_number, chunk in enumerate(chunks, start=1):
    print(f"Chunk {chunk_number}: {chunk}")

Generated 20 chunks from 20 admission records

The chunks:
Chunk 1: Subject 2454, HADM 872390, admitted on 2024-06-10 14:13:22
Chunk 2: Subject 5319, HADM 256217, admitted on 2024-11-12 14:13:22
Chunk 3: Subject 4866, HADM 581271, admitted on 2024-08-08 14:13:22
Chunk 4: Subject 1657, HADM 122498, admitted on 2024-11-21 14:13:22
Chunk 5: Subject 3222, HADM 191836, admitted on 2024-05-29 14:13:22
Chunk 6: Subject 3343, HADM 103309, admitted on 2024-12-10 14:13:22
Chunk 7: Subject 4943, HADM 984392, admitted on 2024-10-29 14:13:22
Chunk 8: Subject 3761, HADM 682863, admitted on 2025-01-28 14:13:22
Chunk 9: Subject 3330, HADM 343504, admitted on 2025-02-10 14:13:22
Chunk 10: Subject 2411, HADM 397528, admitted on 2024-06-09 14:13:22
Chunk 11: Subject 7523, HADM 384362, admitted on 2024-03-14 14:13:22
Chunk 12: Subject 4993, HADM 811660, admitted on 2024-06-03 14:13:22
Chunk 13: Subject 1316, HADM 366414, admitted on 2024-09-02 14:13:22
Chunk 14: Subject 7484, HADM 120955, admitted on 2024

In [13]:
# Initialize the embedding model
# The same `embedder` object embeds the document chunks (embed_documents) and,
# inside retrieve_with_faiss, the user query (embed_query), so both sides of
# the similarity search live in one vector space.
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [14]:
# Check for cached embeddings or generate new ones
import pickle

In [15]:
# Reuse embeddings written by an earlier run when the cache file exists;
# cached_embeddings stays None when there is no cache file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load a cache that this notebook wrote itself.
cached_embeddings = None
if os.path.exists("embeddings_cache.pkl"):
    with open("embeddings_cache.pkl", "rb") as f:
        cached_embeddings = pickle.load(f)

In [16]:
# Use the cached embeddings only when they still line up with the current
# chunks. The cache file holds nothing but the vectors, so a cache written for
# a different number of chunks would otherwise be indexed against the wrong
# texts. A length match is a necessary check, not a full content check.
cache_is_usable = cached_embeddings is not None and len(cached_embeddings) == len(chunks)

if cache_is_usable:
    print("Loading embeddings from cache...")
    chunk_embeddings = cached_embeddings
    print(f"Loaded {len(chunk_embeddings)} embeddings from cache")
else:
    if cached_embeddings is not None:
        print(
            f"Ignoring stale cache: {len(cached_embeddings)} embeddings "
            f"for {len(chunks)} chunks"
        )
    print("Generating embeddings...")
    chunk_embeddings = embedder.embed_documents(chunks)
    # Persist (or overwrite) the cache so the next run can skip embedding.
    with open("embeddings_cache.pkl", "wb") as f:
        pickle.dump(chunk_embeddings, f)

Generating embeddings...


In [17]:
# Convert to numpy array for FAISS
# Stack the per-chunk vectors into one 2-D array and cast to float32 before
# adding them to the index; row order follows `chunk_embeddings`.
embeddings_array = np.array(chunk_embeddings).astype('float32')

In [18]:
# Create a FAISS index
# The index is sized from the embedding width and loaded with every chunk
# vector in order; retrieve_with_faiss maps a returned position i back to
# chunks[i].
# NOTE(review): FAISS documents IndexFlatL2 results as squared L2 distances;
# confirm before applying any distance threshold.
dimension = embeddings_array.shape[1]  # Get the embedding dimension
index = faiss.IndexFlatL2(dimension)   # Using L2 distance for similarity
index.add(embeddings_array)            # Add vectors to the index

In [19]:
# Sanity check: index.ntotal is the number of vectors stored in the index.
print(f"Created FAISS index with {index.ntotal} vectors of dimension {dimension}")

Created FAISS index with 20 vectors of dimension 384


In [20]:
# Show the first five components of the first chunk's embedding vector
# (a quick look at the raw values, not the whole vector).
print(f"\nSample embedding values: {chunk_embeddings[0][:5]}...")


Sample embedding values: [-0.0009614374139346182, 0.008348588831722736, 0.03792175278067589, 0.019212789833545685, 0.03785982355475426]...


In [21]:
# Define retrieval and generation functions
def retrieve_with_faiss(query, index, chunks, metadata, top_k=3):
    """Retrieve the chunks nearest to ``query`` from a FAISS index.

    The query is embedded with the notebook-level ``embedder`` and searched
    against ``index``; each returned position is mapped back to ``chunks`` and
    ``metadata``.

    Args:
        query: Text to search for.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)`` with one row per query vector.
        chunks: Sequence of chunk texts in the order they were added to
            ``index``.
        metadata: Sequence aligned with ``chunks``; positions without an entry
            yield ``{}``.
        top_k: Number of neighbours requested from the index.

    Returns:
        List of dicts with keys ``"chunk"``, ``"distance"`` (a Python float)
        and ``"metadata"``, in the order the index returned them. It may hold
        fewer than ``top_k`` items.
    """
    query_vector = np.array([embedder.embed_query(query)]).astype('float32')

    # Search the index
    distances, indices = index.search(query_vector, top_k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        idx = int(idx)
        # FAISS fills missing hits with label -1 when fewer than top_k vectors
        # are available. A negative label must be rejected explicitly, because
        # Python would otherwise treat it as "last element" and return a
        # bogus hit.
        if not 0 <= idx < len(chunks):
            continue
        results.append({
            "chunk": chunks[idx],
            "distance": float(distance),
            "metadata": metadata[idx] if idx < len(metadata) else {},
        })

    return results

In [22]:
def answer_query(user_query, chunks, index, chunk_meta, chat_model, top_k=3):
    """Run one retrieve-then-generate round for ``user_query``.

    The ``top_k`` nearest chunks are fetched with ``retrieve_with_faiss`` and
    rendered as a bulleted context block. That block and the query are then
    sent to ``chat_model`` after a fixed system instruction.

    Returns:
        Dict with ``"query"``, ``"retrieved_contexts"`` (chunk texts),
        ``"relevance_scores"`` (the distances reported for those chunks) and
        ``"answer"`` (the ``content`` of the model's reply).
    """
    hits = retrieve_with_faiss(user_query, index, chunks, chunk_meta, top_k)

    # One "- <chunk>" bullet per retrieved chunk, in retrieval order.
    retrieved_context = "\n".join(f"- {hit['chunk']}" for hit in hits)

    system_prompt = SystemMessage(content="You are a helpful medical assistant. Answer based only on the context provided.")
    user_prompt = HumanMessage(content=f"Context:\n{retrieved_context}\n\nUser query: {user_query}\nAnswer in a concise way:")
    reply = chat_model.invoke([system_prompt, user_prompt])

    contexts = []
    scores = []
    for hit in hits:
        contexts.append(hit["chunk"])
    for hit in hits:
        scores.append(hit["distance"])

    return {
        "query": user_query,
        "retrieved_contexts": contexts,
        "relevance_scores": scores,
        "answer": reply.content,
    }

In [23]:
# Initialize chat model
# This object is what the query loop passes to answer_query as `chat_model`.
# NOTE(review): temperature=0.7 allows varied wording between runs, so the
# printed answers are not expected to reproduce exactly. Confirm this is
# intended for a "answer only from context" prompt.
# NOTE(review): the captured output for this cell echoes this line, which looks
# like a deprecation warning for the langchain_community ChatOpenAI import.
# Confirm and consider the replacement package.
chat = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)

  chat = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)


In [24]:
# Example queries
# Each string is sent verbatim to answer_query by the loop in the next cell.
# NOTE(review): these questions refer to years 2109-2180s and to Subject 10348,
# while every chunk printed earlier in this notebook is dated 2024-2025 and has
# a 4-digit subject ID. As written, most queries cannot match the loaded data.
# Confirm whether they were carried over from a different dataset.
# NOTE(review): the counting and "most recent" questions need the whole table,
# but answer_query only supplies the top_k retrieved chunks to the model.
example_queries = [
    "Who was admitted in February 2139?",
    "When was Subject 10348 admitted?",
    "Are there any admissions from the 2180s?",
    "Which patient has the most recent admission date?",
    "How many subjects were admitted after 2150?",
    "List all patients admitted in the year 2109",
    "Was anyone admitted at night after 10 PM?",
    "What's the earliest admission date in the dataset?",
    "How many subjects have ID numbers below 5000?",
    "Are there more admissions before or after 2150?"
]

In [25]:
# Run every example query through the RAG pipeline and print, per query, the
# retrieved chunks with their distances followed by the generated answer.
for query in example_queries:
    print(f"\nQuery: {query}")
    result = answer_query(query, chunks, index, chunk_metadata, chat)
    print("\nRetrieved contexts:")
    ranked_hits = zip(result["retrieved_contexts"], result["relevance_scores"])
    for rank, (context, score) in enumerate(ranked_hits, start=1):
        print(f"{rank}. {context} (distance: {score:.4f})")
    print(f"\nAnswer: {result['answer']}")
    print("-" * 80)


Query: Who was admitted in February 2139?

Retrieved contexts:
1. Subject 9278, HADM 420114, admitted on 2024-06-12 14:13:22 (distance: 0.7842)
2. Subject 2454, HADM 872390, admitted on 2024-06-10 14:13:22 (distance: 0.8034)
3. Subject 4993, HADM 811660, admitted on 2024-06-03 14:13:22 (distance: 0.8124)

Answer: There is no information provided about any admissions in February 2139.
--------------------------------------------------------------------------------

Query: When was Subject 10348 admitted?

Retrieved contexts:
1. Subject 8602, HADM 829266, admitted on 2024-07-30 14:13:22 (distance: 0.5706)
2. Subject 2454, HADM 872390, admitted on 2024-06-10 14:13:22 (distance: 0.5904)
3. Subject 3343, HADM 103309, admitted on 2024-12-10 14:13:22 (distance: 0.5912)

Answer: There is no information provided about Subject 10348's admission date.
--------------------------------------------------------------------------------

Query: Are there any admissions from the 2180s?

Retrieved conte