In [57]:
import pdfplumber
import pandas as pd
import re

In [58]:
# Location of the lab-report PDF that the next cell opens with pdfplumber.
# NOTE(review): the file name appears to embed a patient name and accession
# number — consider a neutral, configurable path before sharing this notebook.
pdf_path = "20231021_PreetpalBloodReport_Detailed_0202WJ007460202_228745k.pdf"

In [59]:
# Pull the text layer of every page and concatenate it, one "\n" after each
# page, into a single string that the regex cell scans.
with pdfplumber.open(pdf_path) as pdf:
    # extract_text() can come back as None/empty for a page without a text
    # layer (e.g. a scanned image); treat that as an empty page instead of
    # letting `None + "\n"` raise TypeError.
    page_texts = [page.extract_text() or "" for page in pdf.pages]

full_text = "".join(page_text + "\n" for page_text in page_texts)

print(full_text)

DIAGNOSTIC REPORT
PATIENT NAME : PREETPAL SINGH S/O S.AMRIK SINGH REF. DOCTOR : SELF
ACCESSION NO :0202WJ007460 AGE/SEX :44 Years Male
PREETPAL SINGH S/O S.AMRIK SINGH PATIENT ID : PREEM19027978A DRAWN :21/10/2023 12:36:27
AMRITSAR
CLIENT PATIENT ID: RECEIVED :21/10/2023 12:39:49
Amritsar 143001
ABHA NO : REPORTED :21/10/2023 14:56:46
Test Report Status Final Results Biological Reference Interval Units
HAEMATOLOGY - CBC
COMPLETE CARE TOTAL WITH SMART REPORT
BLOOD COUNTS,EDTA WHOLE BLOOD
HEMOGLOBIN (HB) 14.5 13.0 - 17.0 g/dL
METHOD : CYANMETHEMOGLOBIN METHOD
RED BLOOD CELL (RBC) COUNT 5.01 4.5 - 5.5 mil/µL
METHOD : ELECTRICAL IMPEDANCE
WHITE BLOOD CELL (WBC) COUNT 14.10 High 4.0 - 10.0 thou/µL
METHOD : ELECTRICAL IMPEDANCE
PLATELET COUNT 296 150 - 410 thou/µL
METHOD : ELECTRONIC IMPEDANCE/CALCULATION
RBC AND PLATELET INDICES
HEMATOCRIT (PCV) 43.9 40 - 50 %
METHOD : ELECTRICAL IMPEDANCE
MEAN CORPUSCULAR VOLUME (MCV) 88.0 83 - 101 fL
METHOD : CALCULATED PARAMETER
MEAN CORPUSCULAR HEMOGLOB

In [60]:
# Matcher for one result row of the extracted report text, shaped like
#   <TEST NAME> <VALUE> [Low|High] <LOW> - <HIGH> <UNIT>
# Capture groups: 1 = test name, 2 = numeric value, 3 = optional Low/High flag,
# 4 = optional lower bound, 5 = optional upper bound, 6 = unit.
# Compiled with re.IGNORECASE, so the letter classes also match lower case.
# NOTE(review): the unit class accepts any run of letters, so a plain word that
# follows the value can be captured as the "unit" — the lab summary printed
# further down shows units such as "Normal" / "Optimal" / "Deficiency".
# Confirm against the raw text before trusting the Unit column.
pattern = re.compile(
    r"([A-Z \-/\(\)%]+?)(?:\s*01)?\s+([0-9.]+)\s+(Low|High)?\s*([<>=\-0-9.]+)?\s*[-–]?\s*([<>=\-0-9.]+)?\s+([a-zA-Z/%μgµULdlmg]+)",
    re.IGNORECASE
)
# Alternative row layout: optional leading date/time, test name, value,
# optional unit, "<low>-<high>" range, optional trailing H/L flag.
# Capture groups: 1 = test name, 2 = value, 3 = unit, 4 = low, 5 = high, 6 = flag.
# It is anchored with ^ ... $ and compiled with re.VERBOSE only (no
# re.MULTILINE), so it is meant to be applied to a single line at a time.
# NOTE(review): no cell visible in this notebook uses pattern1 — presumably
# kept for a different report format; confirm or remove.
pattern1 = re.compile(
    r"""^
    (?:[A-Za-z]{3,9}\s+\d{1,2},\s+\d{4}\s+\d{1,2}:\d{2}\s+)?   # Optional date/time at start
    ([A-Za-z0-9 \-/\(\)%]+?)                                    # Test Name (non-greedy)
    (?:\s*01)?                                                  # Optional '01' after test name
    \s+([0-9.]+)                                                # Value
    (?:\s+([a-zA-Z/%μgµULdlmg]+))?                              # Optional Unit
    \s+([0-9.<>=-]+)                                            # Low (start of reference range)
    [-–]([0-9.<>=-]+)                                           # High (end of reference range)
    \s*([HL]?)                                                  # Optional Flag (H/L)
    $""",
    re.VERBOSE
)

In [62]:
# One list per regex hit, in capture-group order:
# [test name, value, flag, low, high, unit].
# Name and value are stripped of surrounding whitespace; the remaining groups
# are kept exactly as captured (None when the optional group did not match).
results = [
    [hit.group(1).strip(), hit.group(2).strip(), *hit.group(3, 4, 5, 6)]
    for hit in pattern.finditer(full_text)
]

print(results)

[['HEMOGLOBIN (HB)', '14.5', None, '13.0', '17.0', 'g/dL'], ['RED BLOOD CELL (RBC) COUNT', '5.01', None, '4.5', '5.5', 'mil/µL'], ['WHITE BLOOD CELL (WBC) COUNT', '14.10', 'High', '4.0', '10.0', 'thou/µL'], ['PLATELET COUNT', '296', None, '150', '410', 'thou/µL'], ['HEMATOCRIT (PCV)', '43.9', None, '40', '50', '%'], ['MEAN CORPUSCULAR VOLUME (MCV)', '88.0', None, '83', '101', 'fL'], ['MEAN CORPUSCULAR HEMOGLOBIN (MCH)', '28.9', None, '27.0', '32.0', 'pg'], ['MEAN CORPUSCULAR HEMOGLOBIN', '32.9', None, '31.5', '34.5', 'g/dL'], ['RED CELL DISTRIBUTION WIDTH (RDW)', '14.3', 'High', '11.6', '14.0', '%'], ['MEAN PLATELET VOLUME (MPV)', '8.1', None, '6.8', '10.9', 'fL'], ['NEUTROPHILS', '62', None, '40', '80', '%'], ['LYMPHOCYTES', '32', None, '20', '40', '%'], ['MONOCYTES', '4', None, '2', '10', '%'], ['EOSINOPHILS', '2', None, '1', '6', '%'], ['BASOPHILS', '0', None, '0', '2', '%'], ['ABSOLUTE NEUTROPHIL COUNT', '8.74', 'High', '2.0', '7.0', 'thou/µL'], ['ABSOLUTE LYMPHOCYTE COUNT', '4.51'

In [63]:
# Tabulate the parsed rows; the labels follow the order of the regex groups.
column_labels = ["Test Name", "Value", "Flag", "Low", "High", "Unit"]
df_results = pd.DataFrame(data=results, columns=column_labels)
print(df_results.iloc[:10])

                           Test Name  Value  Flag   Low  High     Unit
0                    HEMOGLOBIN (HB)   14.5  None  13.0  17.0     g/dL
1         RED BLOOD CELL (RBC) COUNT   5.01  None   4.5   5.5   mil/µL
2       WHITE BLOOD CELL (WBC) COUNT  14.10  High   4.0  10.0  thou/µL
3                     PLATELET COUNT    296  None   150   410  thou/µL
4                   HEMATOCRIT (PCV)   43.9  None    40    50        %
5      MEAN CORPUSCULAR VOLUME (MCV)   88.0  None    83   101       fL
6  MEAN CORPUSCULAR HEMOGLOBIN (MCH)   28.9  None  27.0  32.0       pg
7        MEAN CORPUSCULAR HEMOGLOBIN   32.9  None  31.5  34.5     g/dL
8  RED CELL DISTRIBUTION WIDTH (RDW)   14.3  High  11.6  14.0        %
9         MEAN PLATELET VOLUME (MPV)    8.1  None   6.8  10.9       fL


In [64]:
# Report the size of the parsed table: number of result rows and of fields.
row, column = len(df_results.index), len(df_results.columns)
print(f"Rows: {row}", f"Columns: {column}", sep="\n")

Rows: 71
Columns: 6


In [65]:
# Lookup table: unit string as captured from the report -> canonical ASCII unit.
#
# PDF text uses two visually identical characters for the "micro" prefix:
# U+00B5 (micro sign) and U+03BC (Greek small letter mu). Every micro-prefixed
# unit is registered under both, so normalisation does not depend on which
# code point the PDF happened to contain.
_MICRO_UNIT_TEMPLATES = {
    "mil/{mu}L": "mil/uL",
    "thou/{mu}L": "10^3/uL",
    "{mu}g/dL": "ug/dL",
    "{mu}IU/mL": "uIU/mL",
    "{mu}mol/L": "umol/L",
}

# Units that are already in canonical form and map to themselves.
_PASSTHROUGH_UNITS = ("mg/dL", "g/dL", "%", "pg", "fL", "U/L", "ng/mL", "mmol/L")

unit_normalization = {
    template.format(mu=mu_char): ascii_unit
    for template, ascii_unit in _MICRO_UNIT_TEMPLATES.items()
    for mu_char in ("\u00b5", "\u03bc")
}
unit_normalization.update({unit: unit for unit in _PASSTHROUGH_UNITS})

In [66]:
def _to_canonical_unit(raw_unit):
    """Map one captured unit to its canonical form; missing values pass through.

    Units absent from `unit_normalization` are returned stripped but otherwise
    unchanged.
    """
    if pd.isna(raw_unit):
        return raw_unit
    cleaned = raw_unit.strip()
    return unit_normalization.get(cleaned, cleaned)


df_results["Normalized Unit"] = df_results["Unit"].map(_to_canonical_unit)

# Tidy stray whitespace in the column labels and in the test names.
df_results.columns = list(map(str.strip, df_results.columns))
df_results["Test Name"] = df_results["Test Name"].str.strip()

print(df_results.head(10))

                           Test Name  Value  Flag   Low  High     Unit  \
0                    HEMOGLOBIN (HB)   14.5  None  13.0  17.0     g/dL   
1         RED BLOOD CELL (RBC) COUNT   5.01  None   4.5   5.5   mil/µL   
2       WHITE BLOOD CELL (WBC) COUNT  14.10  High   4.0  10.0  thou/µL   
3                     PLATELET COUNT    296  None   150   410  thou/µL   
4                   HEMATOCRIT (PCV)   43.9  None    40    50        %   
5      MEAN CORPUSCULAR VOLUME (MCV)   88.0  None    83   101       fL   
6  MEAN CORPUSCULAR HEMOGLOBIN (MCH)   28.9  None  27.0  32.0       pg   
7        MEAN CORPUSCULAR HEMOGLOBIN   32.9  None  31.5  34.5     g/dL   
8  RED CELL DISTRIBUTION WIDTH (RDW)   14.3  High  11.6  14.0        %   
9         MEAN PLATELET VOLUME (MPV)    8.1  None   6.8  10.9       fL   

  Normalized Unit  
0            g/dL  
1          mil/uL  
2         10^3/uL  
3         10^3/uL  
4               %  
5              fL  
6              pg  
7            g/dL  
8    

In [67]:
def determine_status(value, low, high, flag):
    """Classify a lab result as "High", "Low", "Normal" or "Unknown".

    Parameters
    ----------
    value : str or number
        The measured result; must be convertible with ``float()``.
    low, high : str, number or None
        Bounds of the reference interval. Both must be present and numeric
        for a range comparison to happen.
    flag : str or None
        Flag printed on the report (e.g. "High"/"Low"). When present it takes
        precedence over the range comparison and is returned capitalised.

    Returns
    -------
    str
        The flag (capitalised) if one is given; otherwise "Low"/"High"/"Normal"
        from comparing ``value`` with ``[low, high]`` (bounds inclusive);
        "Unknown" when the value or bounds are missing, NaN or not numeric.
    """
    try:
        value = float(value)
        if pd.notna(flag):
            return flag.capitalize()
        if pd.notna(low) and pd.notna(high):
            low = float(low)
            high = float(high)
            # NaN compares False against everything, which would otherwise
            # fall through to "Normal" — report it as undeterminable instead.
            if pd.isna(value) or pd.isna(low) or pd.isna(high):
                return "Unknown"
            if value < low:
                return "Low"
            if value > high:
                return "High"
            return "Normal"
    except (TypeError, ValueError, AttributeError):
        # Non-numeric value/bounds (e.g. "<150") or a non-string flag: the
        # status cannot be derived. Only conversion-type errors are handled so
        # that KeyboardInterrupt and genuine bugs still surface.
        return "Unknown"
    return "Unknown"


In [68]:
def _status_for_record(record):
    """Feed one table row's Value/Low/High/Flag fields to determine_status."""
    return determine_status(
        record["Value"], record["Low"], record["High"], record["Flag"]
    )


# Row-wise classification; the result becomes the "Status" column.
df_results["Status"] = df_results.apply(_status_for_record, axis=1)

In [69]:
# Test name -> "<value> <normalized unit> (<status>)". If a test name occurs
# more than once, the last occurrence wins.
test_value_dict = dict(
    zip(
        df_results["Test Name"],
        (
            f"{value} {unit} ({status})"
            for value, unit, status in zip(
                df_results["Value"],
                df_results["Normalized Unit"],
                df_results["Status"],
            )
        ),
    )
)

In [70]:
# Preview: first ten dictionary entries, then the first ten table rows.
preview_names = list(test_value_dict)[:10]
print({name: test_value_dict[name] for name in preview_names})
print(df_results.loc[:, ["Test Name", "Value", "Low", "High", "Status"]].head(10))

{'HEMOGLOBIN (HB)': '14.5 g/dL (Normal)', 'RED BLOOD CELL (RBC) COUNT': '5.01 mil/uL (Normal)', 'WHITE BLOOD CELL (WBC) COUNT': '14.10 10^3/uL (High)', 'PLATELET COUNT': '296 10^3/uL (Normal)', 'HEMATOCRIT (PCV)': '43.9 % (Normal)', 'MEAN CORPUSCULAR VOLUME (MCV)': '88.0 fL (Normal)', 'MEAN CORPUSCULAR HEMOGLOBIN (MCH)': '28.9 pg (Normal)', 'MEAN CORPUSCULAR HEMOGLOBIN': '32.9 g/dL (Normal)', 'RED CELL DISTRIBUTION WIDTH (RDW)': '14.3 % (High)', 'MEAN PLATELET VOLUME (MPV)': '8.1 fL (Normal)'}
                           Test Name  Value   Low  High  Status
0                    HEMOGLOBIN (HB)   14.5  13.0  17.0  Normal
1         RED BLOOD CELL (RBC) COUNT   5.01   4.5   5.5  Normal
2       WHITE BLOOD CELL (WBC) COUNT  14.10   4.0  10.0    High
3                     PLATELET COUNT    296   150   410  Normal
4                   HEMATOCRIT (PCV)   43.9    40    50  Normal
5      MEAN CORPUSCULAR VOLUME (MCV)   88.0    83   101  Normal
6  MEAN CORPUSCULAR HEMOGLOBIN (MCH)   28.9  27.0  32

In [71]:
# Abbreviated / alternate labels produced by the parser -> canonical test name.
test_name_aliases = {
    "D": "25 - HYDROXYVITAMIN D",
    "LDL": "LDL CHOLESTEROL",
    "HDL": "HDL CHOLESTEROL",
    "CHOLESTEROL LDL": "LDL CHOLESTEROL",
    "CHOL/HDL RATIO": "TOTAL CHOLESTEROL : HDL RATIO"
}

# Normalise case and whitespace BEFORE the alias lookup: with regex=False the
# replacement is an exact whole-cell match, so a label such as "Ldl" or " LDL"
# would otherwise slip past the alias table and only be upper-cased afterwards.
normalized_names = df_results["Test Name"].str.upper().str.strip()
df_results["Test Name"] = normalized_names.replace(test_name_aliases, regex=False)



In [72]:
# Abnormal results only: test name -> "High" / "Low".
first_order_findings = {
    row["Test Name"]: row["Status"]
    for _, row in df_results.iterrows()
    if row["Status"] in ["High", "Low"]
}

second_order_insights = []
causal_hypotheses = []


def _has_finding(keywords, status, ignore=None):
    """Return True if an abnormal test matching `keywords` has the given status.

    keywords : a substring, or an iterable of substrings, looked up in the
               test name (any one match is enough).
    status   : "High" or "Low".
    ignore   : optional predicate on the test name; tests for which it returns
               True are skipped.
    """
    if isinstance(keywords, str):
        keywords = (keywords,)
    return any(
        found_status == status
        and any(key in test for key in keywords)
        and not (ignore is not None and ignore(test))
        for test, found_status in first_order_findings.items()
    )


def _is_plain_hdl(test):
    """True for the HDL analyte itself (not NON HDL, not an HDL ratio)."""
    return "HDL" in test and "NON HDL" not in test and "RATIO" not in test


# Rule: high glucose together with high triglycerides.
if _has_finding("GLUCOSE", "High") and _has_finding("TRIGLYCERIDES", "High"):
    second_order_insights.append("Possible insulin resistance")
    causal_hypotheses.append("High triglycerides and glucose suggest poor glucose metabolism")


# Rule: any elevated atherogenic lipid marker. The "CHOLESTEROL" keyword also
# matches "HDL CHOLESTEROL"; a *high* plain-HDL result must not trigger this
# rule (the hypothesis below speaks of low HDL), so it is excluded explicitly.
lipid_keywords = ["LDL", "VLDL", "NON HDL", "CHOLESTEROL", "HDL RATIO"]
if _has_finding(lipid_keywords, "High", ignore=_is_plain_hdl):
    second_order_insights.append("Increased cardiovascular risk")
    causal_hypotheses.append("High LDL, low HDL, and poor cholesterol ratio suggest atherogenic lipid profile")

# Rule: elevated red-cell distribution width.
if _has_finding("RDW", "High"):
    second_order_insights.append("Red cell size variation possible nutritional deficiency or anemia risk")
    causal_hypotheses.append("High RDW could indicate early signs of iron, B12, or folate deficiency")

# Rule: low vitamin D (adds a hypothesis only, no second-order insight).
if _has_finding("VITAMIN D", "Low"):
    causal_hypotheses.append("Low Vitamin D may reduce insulin sensitivity")

# Rule: any elevated white-cell marker.
if _has_finding(["WBC", "NEUTROPHIL", "LYMPHOCYTE"], "High"):
    second_order_insights.append("Signs of immune response or systemic inflammation")
    causal_hypotheses.append("Elevated WBC, neutrophils, or lymphocytes suggest possible infection or inflammation")



print("First-Order Findings:\n")
for test, status in first_order_findings.items():
    print(f"- {test}: {status}")

print("\nSecond-Order Insights:\n")
for insight in second_order_insights:
    print(f"- {insight}")

print("\nCausal Hypotheses:\n")
for cause in causal_hypotheses:
    print(f"- {cause}")

First-Order Findings:

- WHITE BLOOD CELL (WBC) COUNT: High
- RED CELL DISTRIBUTION WIDTH (RDW): High
- ABSOLUTE NEUTROPHIL COUNT: High
- ABSOLUTE LYMPHOCYTE COUNT: High
- ABSOLUTE BASOPHIL COUNT: Low
- ESTIMATED AVERAGE GLUCOSE(EAG): High
- ALANINE AMINOTRANSFERASE (ALT/SGPT): High
- TRIGLYCERIDES: High
- HDL CHOLESTEROL: Low
- LDL CHOLESTEROL: High
- NON HDL CHOLESTEROL: High
- TOTAL CHOLESTEROL : HDL RATIO: High
- 25 - HYDROXYVITAMIN D: Low

Second-Order Insights:

- Possible insulin resistance
- Increased cardiovascular risk
- Red cell size variation possible nutritional deficiency or anemia risk
- Signs of immune response or systemic inflammation

Causal Hypotheses:

- High triglycerides and glucose suggest poor glucose metabolism
- High LDL, low HDL, and poor cholesterol ratio suggest atherogenic lipid profile
- High RDW could indicate early signs of iron, B12, or folate deficiency
- Low Vitamin D may reduce insulin sensitivity
- Elevated WBC, neutrophils, or lymphocytes suggest 

In [73]:
import json

# Placeholder text; the narrative is meant to be produced by the AI step.
narrative_explanation = "Need AI to generate a comprehensive health report based on the blood test results. "

# Bundle the rule-based analysis into one JSON-serialisable mapping.
result_json = dict(
    FirstOrderFindings=first_order_findings,
    SecondOrderInsights=second_order_insights,
    CausalHypotheses=causal_hypotheses,
    NarrativeExplanation=narrative_explanation,
)

print(json.dumps(result_json, indent=2))

# Persist the same mapping to disk (4-space indent in the file).
with open("health_insights_report.json", "w") as report_file:
    json.dump(result_json, report_file, indent=4)
print(" Report saved to health_insights_report.json")


{
  "FirstOrderFindings": {
    "WHITE BLOOD CELL (WBC) COUNT": "High",
    "RED CELL DISTRIBUTION WIDTH (RDW)": "High",
    "ABSOLUTE NEUTROPHIL COUNT": "High",
    "ABSOLUTE LYMPHOCYTE COUNT": "High",
    "ABSOLUTE BASOPHIL COUNT": "Low",
    "ESTIMATED AVERAGE GLUCOSE(EAG)": "High",
    "ALANINE AMINOTRANSFERASE (ALT/SGPT)": "High",
    "TRIGLYCERIDES": "High",
    "HDL CHOLESTEROL": "Low",
    "LDL CHOLESTEROL": "High",
    "NON HDL CHOLESTEROL": "High",
    "TOTAL CHOLESTEROL : HDL RATIO": "High",
    "25 - HYDROXYVITAMIN D": "Low"
  },
  "SecondOrderInsights": [
    "Possible insulin resistance",
    "Increased cardiovascular risk",
    "Red cell size variation possible nutritional deficiency or anemia risk",
    "Signs of immune response or systemic inflammation"
  ],
  "CausalHypotheses": [
    "High triglycerides and glucose suggest poor glucose metabolism",
    "High LDL, low HDL, and poor cholesterol ratio suggest atherogenic lipid profile",
    "High RDW could indicate earl

In [74]:
import pandas as pd
import json
import re



# Collect the rows whose status is abnormal once, then derive both views.
abnormal_records = [
    record
    for _, record in df_results.iterrows()
    if record.get("Status") in ("High", "Low")
]

# Test name -> status, abnormal results only.
first_order = {
    record["Test Name"]: record["Status"] for record in abnormal_records
}

# One "<name>: <value> <unit> (<status>)" line per abnormal result.
lab_summary = "\n".join(
    f"{record['Test Name']}: {record['Value']} {record['Normalized Unit']} ({record['Status']})"
    for record in abnormal_records
)
print("Lab Summary:\n", lab_summary)

first_order_str = json.dumps(first_order, indent=2)

Lab Summary:
 WHITE BLOOD CELL (WBC) COUNT: 14.10 10^3/uL (High)
RED CELL DISTRIBUTION WIDTH (RDW): 14.3 % (High)
ABSOLUTE NEUTROPHIL COUNT: 8.74 10^3/uL (High)
ABSOLUTE LYMPHOCYTE COUNT: 4.51 10^3/uL (High)
ABSOLUTE BASOPHIL COUNT: 0.00 10^3/uL (Low)
ESTIMATED AVERAGE GLUCOSE(EAG): 116.9 mg/dL (High)
ALANINE AMINOTRANSFERASE (ALT/SGPT): 46 U/L (High)
TRIGLYCERIDES: 228 Normal (High)
HDL CHOLESTEROL: 36 Low (Low)
LDL CHOLESTEROL: 107 Optimal (High)
NON HDL CHOLESTEROL: 153 Desirable (High)
TOTAL CHOLESTEROL : HDL RATIO: 5.3 Low (High)
25 - HYDROXYVITAMIN D: 25.25 Deficiency (Low)


In [None]:
import os

import openai

# Read the credential from the environment so that no key is ever stored in
# the notebook source or its saved outputs.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running the OpenAI cells."
    )

client = openai.OpenAI(api_key=openai_api_key)

In [76]:

# Serialise the abnormal findings once and splice them into the instruction.
findings_block = json.dumps(first_order_findings, indent=2)

prompt = f"""
You are a medical assistant AI. Analyze the following first-order lab findings:

{findings_block}

Generate a structured output with the following keys:
- FirstOrderFindings (as provided)
- SecondOrderInsights (list of patterns)
- CausalHypotheses (list of causal links)
- NarrativeExplanation (detailed health summary in natural language)
Respond only in JSON.
"""

# System message sets the role, user message carries the prompt above.
chat_messages = [
    {"role": "system", "content": "You are a helpful medical assistant."},
    {"role": "user", "content": prompt},
]

response = client.chat.completions.create(
    model="gpt-4",
    messages=chat_messages,
    temperature=0.3,
)

# Text of the first (only requested) completion.
ai_content = response.choices[0].message.content
print(ai_content)

{
  "FirstOrderFindings": {
    "WHITE BLOOD CELL (WBC) COUNT": "High",
    "RED CELL DISTRIBUTION WIDTH (RDW)": "High",
    "ABSOLUTE NEUTROPHIL COUNT": "High",
    "ABSOLUTE LYMPHOCYTE COUNT": "High",
    "ABSOLUTE BASOPHIL COUNT": "Low",
    "ESTIMATED AVERAGE GLUCOSE(EAG)": "High",
    "ALANINE AMINOTRANSFERASE (ALT/SGPT)": "High",
    "TRIGLYCERIDES": "High",
    "HDL CHOLESTEROL": "Low",
    "LDL CHOLESTEROL": "High",
    "NON HDL CHOLESTEROL": "High",
    "TOTAL CHOLESTEROL : HDL RATIO": "High",
    "25 - HYDROXYVITAMIN D": "Low"
  },
  "SecondOrderInsights": [
    "Increased Inflammation",
    "Possible Infection",
    "Potential Liver Dysfunction",
    "Potential Diabetes",
    "Potential Dyslipidemia",
    "Potential Vitamin D Deficiency"
  ],
  "CausalHypotheses": [
    "The high WBC, neutrophil, and lymphocyte counts may indicate an ongoing infection or inflammation.",
    "The high EAG and ALT levels suggest potential diabetes and liver dysfunction respectively.",
    "The

In [77]:
def _parse_json_reply(reply_text):
    """Best-effort conversion of a model reply into a Python object.

    Tries the whole reply as JSON first; if that fails, falls back to the
    outermost ``{...}`` span found in the text (covers replies wrapped in prose
    or code fences). Returns ``{"error": "Parsing failed"}`` when neither
    attempt yields valid JSON.
    """
    reply_text = reply_text or ""  # the API may return no content at all
    try:
        return json.loads(reply_text)
    except json.JSONDecodeError:
        pass

    embedded = re.search(r"\{[\s\S]+\}", reply_text)
    if embedded:
        try:
            return json.loads(embedded.group())
        except json.JSONDecodeError:
            pass
    return {"error": "Parsing failed"}


# The fallback result is bound to OpenAI_insights as well, so the code below
# never hits an undefined (or stale) name when the first parse fails.
OpenAI_insights = _parse_json_reply(ai_content)

# Print structured insights
print(json.dumps(OpenAI_insights, indent=2))
with open("OpenAI_health_insights_output.json", "w") as f:
    json.dump(OpenAI_insights, f, indent=2)


{
  "FirstOrderFindings": {
    "WHITE BLOOD CELL (WBC) COUNT": "High",
    "RED CELL DISTRIBUTION WIDTH (RDW)": "High",
    "ABSOLUTE NEUTROPHIL COUNT": "High",
    "ABSOLUTE LYMPHOCYTE COUNT": "High",
    "ABSOLUTE BASOPHIL COUNT": "Low",
    "ESTIMATED AVERAGE GLUCOSE(EAG)": "High",
    "ALANINE AMINOTRANSFERASE (ALT/SGPT)": "High",
    "TRIGLYCERIDES": "High",
    "HDL CHOLESTEROL": "Low",
    "LDL CHOLESTEROL": "High",
    "NON HDL CHOLESTEROL": "High",
    "TOTAL CHOLESTEROL : HDL RATIO": "High",
    "25 - HYDROXYVITAMIN D": "Low"
  },
  "SecondOrderInsights": [
    "Increased Inflammation",
    "Possible Infection",
    "Potential Liver Dysfunction",
    "Potential Diabetes",
    "Potential Dyslipidemia",
    "Potential Vitamin D Deficiency"
  ],
  "CausalHypotheses": [
    "The high WBC, neutrophil, and lymphocyte counts may indicate an ongoing infection or inflammation.",
    "The high EAG and ALT levels suggest potential diabetes and liver dysfunction respectively.",
    "The

In [None]:
# Ask the model to score how closely the rule-based report (result_json)
# agrees with the AI-generated one (OpenAI_insights).
similarity_prompt = f"""
Compare the following two JSON objects representing health report findings. 
Return a matching percentage (0-100) for how similar the findings are, 
and a short explanation of the main similarities and differences.

result_json:
{json.dumps(result_json, indent=2)}

OpenAI_insights:
{json.dumps(OpenAI_insights, indent=2)}

Respond in this JSON format:
{{
  "matching_percentage": <number>,
  "explanation": "<your explanation>"
}}
"""

# Reuse the client created earlier in the notebook rather than constructing a
# second one with its own hardcoded credential.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are a helpful medical data analyst."},
        {"role": "user", "content": similarity_prompt}
    ],
    temperature=0
)

# Parse and print the result
similarity_result = response.choices[0].message.content
print(similarity_result)

{
  "matching_percentage": 85,
  "explanation": "The two JSON objects are largely similar, with a matching percentage of 85%. Both objects have identical 'FirstOrderFindings', indicating the same health parameters and their respective levels. However, there are differences in the 'SecondOrderInsights' and 'CausalHypotheses'. The 'result_json' object provides more specific insights and hypotheses, such as 'Possible insulin resistance', 'Increased cardiovascular risk', and 'Red cell size variation possible nutritional deficiency or anemia risk', while the 'OpenAI_insights' object provides more general insights like 'Increased Inflammation', 'Potential Liver Dysfunction', and 'Potential Diabetes'. The 'NarrativeExplanation' in both objects is different but they convey similar overall health concerns based on the lab results."
}
