In [1]:
import os
import shutil
import pandas as pd
import logging
import json
import getpass
from dotenv import load_dotenv
from datetime import datetime
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.prompts.chat import ChatPromptTemplate
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

True

In [3]:
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Provide your Google API key here: ")

In [4]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [5]:
MERGED_DATA_PATH = os.getenv("MERGED_DATA_PATH", "hospital_data/merged_data/merged_calculated_hospital_data.csv")
CHROMA_PATH = os.getenv("CHROMA_PATH", "chroma_db")
DATA_DICTIONARY_PATH = os.getenv("DATA_DICTIONARY_PATH", "hospital_data/dictionary/data_dictionary.csv")

In [26]:
# Load the merged CSV file into a DataFrame with validation and logging
def load_merged_document():
    try:
        df = pd.read_csv(MERGED_DATA_PATH, dtype={'ZIP': str})  # Ensure ZIP codes are loaded as strings
        # Handle missing values by filling appropriately based on dtype
        for column in df.columns:
            if df[column].dtype == 'float64' or df[column].dtype == 'int64':
                df[column] = df[column].fillna(0)  # Fill numeric columns with 0 or another default value
            else:
                df[column] = df[column].fillna("N/A")  # Fill non-numeric columns with "N/A"
        logging.info(f"Loaded merged CSV successfully with {len(df)} rows.")
        return df
    except Exception as e:
        logging.error(f"Error loading merged CSV: {e}")
        return None

In [27]:
merged_data_df = load_merged_document()

2024-11-07 20:03:55,530 - INFO - Loaded merged CSV successfully with 60922 rows.


In [28]:
merged_data_df.head()

Unnamed: 0,Id_encounter,START,STOP,PATIENT,ORGANIZATION,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,...,DESCRIPTION_procedure,BASE_COST,REASONCODE_procedure,REASONDESCRIPTION_procedure,Age,DURATION_HOURS,Age Group,MONTH,HOUR,OUT_OF_POCKET
0,32c84703-2481-49cd-d571-3899d5820253,2011-01-02 09:26:36,2011-01-02 12:58:36,3de74169-7f67-9304-91d4-757e0f3a14d2,d78e84ec-30aa-3bba-a33a-f29a3a454662,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,ambulatory,185347001,Encounter for problem (procedure),85.55,...,Renal dialysis (procedure),903.0,0.0,,96,3.533333,66+,2011-01,9,1018.02
1,c98059da-320a-c0a6-fced-c8815f3e3f39,2011-01-03 05:44:39,2011-01-03 06:01:42,d9ec2e44-32e9-9148-179a-1653348cc4e2,d78e84ec-30aa-3bba-a33a-f29a3a454662,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,outpatient,308335008,Patient encounter procedure,142.58,...,Intramuscular injection,2477.0,0.0,,60,0.284167,51-65,2011-01,5,2619.36
2,4ad28a3a-2479-782b-f29c-d5b3f41a001e,2011-01-03 14:32:11,2011-01-03 14:47:11,73babadf-5b2b-fee7-189e-6f41ff213e01,d78e84ec-30aa-3bba-a33a-f29a3a454662,7caa7254-5050-3b5e-9eae-bd5ea30e809c,outpatient,185349003,Encounter for check up (procedure),85.55,...,,0.0,0.0,,100,0.25,66+,2011-01,14,156.32
3,c3f4da61-e4b4-21d5-587a-fbc89943bc19,2011-01-03 16:24:45,2011-01-03 16:39:45,3b46a0b7-0f34-9b9a-c319-ace4a1f58c0b,d78e84ec-30aa-3bba-a33a-f29a3a454662,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,162673000,General examination of patient (procedure),136.8,...,,0.0,0.0,,101,0.25,,2011-01,16,1784.24
4,a9183b4f-2572-72ea-54c2-b3cd038b4be7,2011-01-03 17:36:53,2011-01-03 17:51:53,fa006887-d93c-d302-8b89-f3c25f88c0e1,d78e84ec-30aa-3bba-a33a-f29a3a454662,42c4fca7-f8a9-3cd1-982a-dd9751bf3e2a,ambulatory,390906007,Follow-up encounter,85.55,...,,0.0,0.0,,72,0.25,66+,2011-01,17,234.72


In [29]:
# Delete unwanted columns (nominal data)
merged_data_df = merged_data_df.drop(
    columns=['Id_encounter', 'PATIENT', 'ORGANIZATION', 'PAYER', 'CODE', 
             'REASONCODE', 'Id_patient', 'LAT', 'LON', 'Id', 'LAT_organization', 'LON_organization', 'Id_payer', 'ENCOUNTER', 
             'CODE_procedure', 'REASONCODE_procedure', 'REASONDESCRIPTION_procedure', 'MONTH', 'HOUR']
)

In [30]:
merged_data_df.head()

Unnamed: 0,START,STOP,ENCOUNTERCLASS,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONDESCRIPTION,BIRTHDATE,DEATHDATE,...,ZIP_payer,PHONE,START_procedure,STOP_procedure,DESCRIPTION_procedure,BASE_COST,Age,DURATION_HOURS,Age Group,OUT_OF_POCKET
0,2011-01-02 09:26:36,2011-01-02 12:58:36,ambulatory,Encounter for problem (procedure),85.55,1018.02,0.0,,1928-08-25,2017-02-04,...,0.0,,2011-01-02T09:26:36Z,2011-01-02T12:58:36Z,Renal dialysis (procedure),903.0,96,3.533333,66+,1018.02
1,2011-01-03 05:44:39,2011-01-03 06:01:42,outpatient,Patient encounter procedure,142.58,2619.36,0.0,,1964-01-05,2020-06-02,...,0.0,,2011-01-03T05:44:39Z,2011-01-03T06:01:42Z,Intramuscular injection,2477.0,60,0.284167,51-65,2619.36
2,2011-01-03 14:32:11,2011-01-03 14:47:11,outpatient,Encounter for check up (procedure),85.55,461.59,305.27,,1924-06-30,,...,21244.0,1-800-633-4227,,,,0.0,100,0.25,66+,156.32
3,2011-01-03 16:24:45,2011-01-03 16:39:45,wellness,General examination of patient (procedure),136.8,1784.24,0.0,,1923-05-21,2021-01-04,...,0.0,,,,,0.0,101,0.25,,1784.24
4,2011-01-03 17:36:53,2011-01-03 17:51:53,ambulatory,Follow-up encounter,85.55,234.72,0.0,Hyperlipidemia,1952-11-02,,...,46204.0,1-800-331-1476,,,,0.0,72,0.25,66+,234.72


In [31]:
# Renaming columns for better context
merged_data_df = merged_data_df.rename(
    columns={
        'START': 'encounter_start_time',
        'STOP': 'encounter_stop_time',
        'ENCOUNTERCLASS': 'encounter_class',
        'DESCRIPTION': 'encounter_description',
        'BASE_ENCOUNTER_COST': 'base_encounter_cost',
        'TOTAL_CLAIM_COST': 'total_claim_cost',
        'PAYER_COVERAGE': 'payer_coverage_amount',
        'REASONDESCRIPTION': 'diagnosis_description',
        'BIRTHDATE': 'patient_birth_date',
        'DEATHDATE': 'patient_death_date',
        'PREFIX': 'patient_prefix',
        'FIRST': 'patient_first_name',
        'LAST': 'patient_last_name',
        'SUFFIX': 'patient_suffix',
        'MAIDEN': 'patient_maiden_name',
        'MARITAL': 'patient_marital_status',
        'RACE': 'patient_race',
        'ETHNICITY': 'patient_ethnicity',
        'GENDER': 'patient_gender',
        'BIRTHPLACE': 'patient_birth_place',
        'ADDRESS': 'patient_address',
        'CITY': 'patient_city',
        'STATE': 'patient_state',
        'COUNTY': 'patient_county',
        'ZIP': 'patient_zip',
        'NAME': 'hospital_name',
        'ADDRESS_organization': 'hospital_address',
        'CITY_organization': 'hospital_city',
        'STATE_organization': 'hospital_state',
        'ZIP_organization': 'hospital_zip',
        'NAME_payer': 'insurer_name',
        'ADDRESS_payer': 'insurer_address',
        'CITY_payer': 'insurer_city',
        'STATE_HEADQUARTERED': 'insurer_headquartered_state',
        'ZIP_payer': 'insurer_zip',
        'PHONE': 'insurer_phone',
        'START_procedure': 'procedure_start_time',
        'STOP_procedure': 'procedure_stop_time',
        'DESCRIPTION_procedure': 'procedure_description',
        'BASE_COST': 'procedure_base_cost',
        'Age': 'patient_age',
        'DURATION_HOURS': 'encounter_duration_hours',
        'Age Group': 'patient_age_group',
        'OUT_OF_POCKET': 'patient_out_of_pocket_amount'
    }
)

In [32]:
merged_data_df.head()

Unnamed: 0,encounter_start_time,encounter_stop_time,encounter_class,encounter_description,base_encounter_cost,total_claim_cost,payer_coverage_amount,diagnosis_description,patient_birth_date,patient_death_date,...,insurer_zip,insurer_phone,procedure_start_time,procedure_stop_time,procedure_description,procedure_base_cost,patient_age,encounter_duration_hours,patient_age_group,patient_out_of_pocket_amount
0,2011-01-02 09:26:36,2011-01-02 12:58:36,ambulatory,Encounter for problem (procedure),85.55,1018.02,0.0,,1928-08-25,2017-02-04,...,0.0,,2011-01-02T09:26:36Z,2011-01-02T12:58:36Z,Renal dialysis (procedure),903.0,96,3.533333,66+,1018.02
1,2011-01-03 05:44:39,2011-01-03 06:01:42,outpatient,Patient encounter procedure,142.58,2619.36,0.0,,1964-01-05,2020-06-02,...,0.0,,2011-01-03T05:44:39Z,2011-01-03T06:01:42Z,Intramuscular injection,2477.0,60,0.284167,51-65,2619.36
2,2011-01-03 14:32:11,2011-01-03 14:47:11,outpatient,Encounter for check up (procedure),85.55,461.59,305.27,,1924-06-30,,...,21244.0,1-800-633-4227,,,,0.0,100,0.25,66+,156.32
3,2011-01-03 16:24:45,2011-01-03 16:39:45,wellness,General examination of patient (procedure),136.8,1784.24,0.0,,1923-05-21,2021-01-04,...,0.0,,,,,0.0,101,0.25,,1784.24
4,2011-01-03 17:36:53,2011-01-03 17:51:53,ambulatory,Follow-up encounter,85.55,234.72,0.0,Hyperlipidemia,1952-11-02,,...,46204.0,1-800-331-1476,,,,0.0,72,0.25,66+,234.72


In [33]:
# Load Data Dictionary
def load_data_dictionary():
    if os.path.exists(DATA_DICTIONARY_PATH):
        try:
            data_dict = pd.read_csv(DATA_DICTIONARY_PATH)
            logging.info("Data dictionary loaded successfully.")
            return data_dict
        except Exception as e:
            logging.error(f"Error loading data dictionary: {e}")
    return None

In [34]:
data_dictionary_df = load_data_dictionary()

2024-11-07 20:05:00,483 - INFO - Data dictionary loaded successfully.


In [35]:
data_dictionary_df

Unnamed: 0,Table,Field,Description
0,encounters,,Patient encounter data
1,encounters,Id,Primary Key. Unique Identifier of the encounter.
2,encounters,Start,The date and time (iso8601 UTC Date (yyyy-MM-d...
3,encounters,Stop,The date and time (iso8601 UTC Date (yyyy-MM-d...
4,encounters,Patient,Foreign key to the Patient.
...,...,...,...
60,procedures,Code,Procedure code from SNOMED-CT
61,procedures,Description,Description of the procedure.
62,procedures,Base_Cost,The line item cost of the procedure.
63,procedures,ReasonCode,Diagnosis code from SNOMED-CT specifying why t...


In [36]:
# Convert DataFrame into Documents with enriched metadata
def convert_to_documents(df: pd.DataFrame, data_dictionary: pd.DataFrame = None):
    documents = []
    for _, row in df.iterrows():
        text = "\n".join([f"{col}: {row[col]}" for col in df.columns])  # Use newline for better readability
        metadata = row.to_dict()
        metadata.update({
            "document_type": "organization_info" if "NAME" in df.columns else "unknown",
            "load_timestamp": datetime.now().isoformat(),
            "data_source": "hospital_data"
        })
        # Add field descriptions if data dictionary is available
        if data_dictionary is not None:
            field_descriptions = {}
            for col in df.columns:
                desc_row = data_dictionary[data_dictionary['Field'] == col]
                if not desc_row.empty:
                    field_descriptions[col] = desc_row['Description'].values[0]
            # Serialize field_descriptions to a JSON string
            metadata["field_descriptions"] = json.dumps(field_descriptions)

        document = Document(page_content=text, metadata=metadata)
        documents.append(document)

    if data_dictionary is not None:
        for _, row in data_dictionary.iterrows():
            text = "\n".join([f"{col}: {row[col]}" for col in data_dictionary.columns])
            document = Document(page_content=text, metadata={
                "source": "data_dictionary",
                "document_type": "data_dictionary",
                "load_timestamp": datetime.now().isoformat(),
                "data_source": "data_dictionary"
            })
            documents.append(document)

    return documents

In [37]:
data = convert_to_documents(merged_data_df)

In [38]:
data

[Document(metadata={'encounter_start_time': '2011-01-02 09:26:36', 'encounter_stop_time': '2011-01-02 12:58:36', 'encounter_class': 'ambulatory', 'encounter_description': 'Encounter for problem (procedure)', 'base_encounter_cost': 85.55, 'total_claim_cost': 1018.02, 'payer_coverage_amount': 0.0, 'diagnosis_description': 'N/A', 'patient_birth_date': '1928-08-25', 'patient_death_date': '2017-02-04', 'patient_prefix': 'Mr.', 'patient_first_name': 'Mariano761', 'patient_last_name': "O'Kon634", 'patient_suffix': 'N/A', 'patient_maiden_name': 'N/A', 'patient_marital_status': 'M', 'patient_race': 'white', 'patient_ethnicity': 'nonhispanic', 'patient_gender': 'M', 'patient_birth_place': 'Palermo  Sicily  IT', 'patient_address': '531 Little Crossing', 'patient_city': 'Boston', 'patient_state': 'Massachusetts', 'patient_county': 'Suffolk County', 'patient_zip': '2132.0', 'hospital_name': 'MASSACHUSETTS GENERAL HOSPITAL', 'hospital_address': '55 FRUIT STREET', 'hospital_city': 'BOSTON', 'hospital

In [39]:
# Split documents into chunks with dynamic chunk sizing
def split_text(documents: list):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=50,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    logging.info(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    if len(chunks) > 10:
        document = chunks[10]
        logging.info(f"Sample chunk content: {document.page_content}")
        logging.info(f"Sample chunk metadata: {document.metadata}")

    return chunks

In [40]:
chunks = split_text(data)

2024-11-07 20:05:47,699 - INFO - Split 60922 documents into 182766 chunks.
2024-11-07 20:05:47,700 - INFO - Sample chunk content: patient_ethnicity: nonhispanic
patient_gender: M
patient_birth_place: Lowell  Massachusetts  US
patient_address: 475 Wunsch Overpass
patient_city: Boston
patient_state: Massachusetts
patient_county: Suffolk County
patient_zip: 2121.0
hospital_name: MASSACHUSETTS GENERAL HOSPITAL
hospital_address: 55 FRUIT STREET
hospital_city: BOSTON
hospital_state: MA
hospital_zip: 2114
insurer_name: NO_INSURANCE
insurer_address: N/A
insurer_city: N/A
insurer_headquartered_state: N/A
insurer_zip: 0.0
insurer_phone: N/A
procedure_start_time: N/A
procedure_stop_time: N/A
procedure_description: N/A
2024-11-07 20:05:47,700 - INFO - Sample chunk metadata: {'encounter_start_time': '2011-01-03 16:24:45', 'encounter_stop_time': '2011-01-03 16:39:45', 'encounter_class': 'wellness', 'encounter_description': 'General examination of patient (procedure)', 'base_encounter_cost': 136.8, '

In [41]:
chunks

[Document(metadata={'encounter_start_time': '2011-01-02 09:26:36', 'encounter_stop_time': '2011-01-02 12:58:36', 'encounter_class': 'ambulatory', 'encounter_description': 'Encounter for problem (procedure)', 'base_encounter_cost': 85.55, 'total_claim_cost': 1018.02, 'payer_coverage_amount': 0.0, 'diagnosis_description': 'N/A', 'patient_birth_date': '1928-08-25', 'patient_death_date': '2017-02-04', 'patient_prefix': 'Mr.', 'patient_first_name': 'Mariano761', 'patient_last_name': "O'Kon634", 'patient_suffix': 'N/A', 'patient_maiden_name': 'N/A', 'patient_marital_status': 'M', 'patient_race': 'white', 'patient_ethnicity': 'nonhispanic', 'patient_gender': 'M', 'patient_birth_place': 'Palermo  Sicily  IT', 'patient_address': '531 Little Crossing', 'patient_city': 'Boston', 'patient_state': 'Massachusetts', 'patient_county': 'Suffolk County', 'patient_zip': '2132.0', 'hospital_name': 'MASSACHUSETTS GENERAL HOSPITAL', 'hospital_address': '55 FRUIT STREET', 'hospital_city': 'BOSTON', 'hospital

In [43]:
# Save chunks to Chroma vector store with batch saving and logging
def save_to_chroma(chunks: list):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
        logging.info(f"Cleared existing database at {CHROMA_PATH}.")

    # Use Google Generative AI Embeddings with API Key
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # Create a new DB from the documents with batch processing
    batch_size = 1000  # Save in batches to handle large datasets
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        db = Chroma.from_documents(
            batch, embeddings, persist_directory=CHROMA_PATH
        )
        logging.info(f"Saved batch {i // batch_size + 1} to {CHROMA_PATH}.")

    logging.info(f"Saved all chunks to {CHROMA_PATH}.")

In [44]:
save_to_chroma(chunks)

2024-11-07 20:07:47,675 - INFO - Cleared existing database at chroma_db.
2024-11-07 20:07:48,729 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-11-07 20:08:03,210 - INFO - Saved batch 1 to chroma_db.
2024-11-07 20:08:16,952 - INFO - Saved batch 2 to chroma_db.
2024-11-07 20:08:29,057 - INFO - Saved batch 3 to chroma_db.
2024-11-07 20:08:41,229 - INFO - Saved batch 4 to chroma_db.
2024-11-07 20:08:53,502 - INFO - Saved batch 5 to chroma_db.
2024-11-07 20:09:05,461 - INFO - Saved batch 6 to chroma_db.
2024-11-07 20:09:17,248 - INFO - Saved batch 7 to chroma_db.
2024-11-07 20:09:28,572 - INFO - Saved batch 8 to chroma_db.
2024-11-07 20:09:39,926 - INFO - Saved batch 9 to chroma_db.
2024-11-07 20:09:51,848 - INFO - Saved batch 10 to chroma_db.
2024-11-07 20:10:04,589 - INFO - Saved batch 11 to chroma_db.
2024-11-07 20:10:17,102 - INFO - Saved batch 12 to chroma_db.
2024-11-07 20:10:28,803 - INFO - Saved batch 1

In [56]:
def chat(query_text):
    # Prepare the DB
    embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB with similarity filtering
    results = db.similarity_search_with_relevance_scores(query_text, k=5)
    print(f"Retrieved results: {results}")

    # Adjusted Similarity Threshold
    if len(results) == 0:
        print("Unable to find matching results.")
        return

    # Check if any results exceed a set threshold or if the best available should be used
    threshold = 0.7  # change as necessary 
    relevant_results = [result for result in results if result[1] >= threshold]

    if len(relevant_results) == 0:
        print("No results exceed the similarity threshold. Returning the best available match.")
        relevant_results = [results[0]]  # Return the best available match if no results exceed threshold

    # Prepare context from the results
    context_texts = []
    document_types = []
    data_sources = []
    field_descriptions_list = []
    patient_ids = []

    for doc, _score in relevant_results:
        context_texts.append(doc.page_content)
        document_types.append(doc.metadata.get("document_type", "unknown"))
        data_sources.append(doc.metadata.get("data_source", "unknown"))
        if "field_descriptions" in doc.metadata:
            field_descriptions_list.append(json.loads(doc.metadata["field_descriptions"]))
        if "PATIENT" in doc.metadata:
            patient_ids.append(doc.metadata["PATIENT"])

    # Compile context with separators
    context_text = "\n\n---\n\n".join(context_texts)
    document_type_context = ", ".join(set(document_types))
    data_source_context = ", ".join(set(data_sources))
    field_descriptions_context = json.dumps(field_descriptions_list, indent=2) if field_descriptions_list else "None"
    patient_ids_context = ", ".join(set(patient_ids)) if patient_ids else "None"

    # Update the prompt template with new metadata
    PROMPT_TEMPLATE = """
        You are an expert data analyst that has access to a hospital dataset that has been denormalized to a single table, 
        and contains encounter information, patient details, payment amount information, insurer details, 
        hospital details and procedures performed on patients.
         
        Answer user question based on the following context:
        
        {context}
        
        ---
        
        Metadata Information:
        Document Type: {document_type}
        Data Source: {data_source}
        Field Descriptions: {field_descriptions}
        Patient Identifiers: {patient_ids}
        
        ---
        
        Answer this question based on the above context: {question}
        """

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(
        context=context_text,
        question=query_text,
        document_type=document_type_context,
        data_source=data_source_context,
        field_descriptions=field_descriptions_context,
        patient_ids=patient_ids_context
    )

    # Query the model
    model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
    response_text = model.predict(prompt)

    # Compile and print response with sources
    sources = [doc.metadata.get("source", "Unknown") for doc, _score in relevant_results if doc.metadata]
    formatted_response = f"Response: {response_text}\n\n\nSources: {sources}"
    print(prompt)
    print(formatted_response)


In [57]:
query_text = "what is the name of the hospital in question?"
chat(query_text)

Retrieved results: [(Document(metadata={'base_encounter_cost': 85.55, 'data_source': 'hospital_data', 'diagnosis_description': 'N/A', 'document_type': 'unknown', 'encounter_class': 'outpatient', 'encounter_description': 'Encounter for check up (procedure)', 'encounter_duration_hours': 0.25, 'encounter_start_time': '2015-11-19 02:26:41', 'encounter_stop_time': '2015-11-19 02:41:41', 'hospital_address': '55 FRUIT STREET', 'hospital_city': 'BOSTON', 'hospital_name': 'MASSACHUSETTS GENERAL HOSPITAL', 'hospital_state': 'MA', 'hospital_zip': 2114, 'insurer_address': '151 Farmington Ave', 'insurer_city': 'Hartford', 'insurer_headquartered_state': 'CT', 'insurer_name': 'Aetna', 'insurer_phone': '1-800-872-3862', 'insurer_zip': 6156.0, 'load_timestamp': '2024-11-07T20:05:16.540560', 'patient_address': '542 Jenkins Walk', 'patient_age': 57, 'patient_age_group': '51-65', 'patient_birth_date': '1967-09-06', 'patient_birth_place': 'Northbridge  Massachusetts  US', 'patient_city': 'North Scituate', 