In [None]:
###     DOWNLOAD DATA FROM THE API AND COMPILE IT IN JSON FORMAT        ###

import requests
import json

BASE_URL = "https://api.hackupm2025.workers.dev/api/v1/patients"
LIMIT = 100

def fetch_all_patients(endpoint: str, timeout: float = 30.0) -> list:
    """Fetch all patients from a given API endpoint ('test' or 'train').

    Pages through ``{BASE_URL}/{endpoint}`` with ``LIMIT`` records per
    request, following the ``pagination.hasNextPage`` flag in each response
    until the API reports that there is no further page.

    Args:
        endpoint: Path segment appended to ``BASE_URL``.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The concatenation of the ``data`` lists of every fetched page.

    Raises:
        requests.HTTPError: If any page answers with an error status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
    """
    # The URL does not depend on the page number, so build it only once.
    url = f"{BASE_URL}/{endpoint}"
    page = 1
    all_patients = []

    while True:
        response = requests.get(
            url,
            params={"page": page, "limit": LIMIT},
            timeout=timeout,
        )
        response.raise_for_status()

        data = response.json()
        # `or` also covers an explicit JSON null, which .get() would pass through.
        patients = data.get("data") or []
        all_patients.extend(patients)

        pagination = data.get("pagination") or {}

        print(f"[{endpoint.upper()}] Fetched page {page} ‚Üí {len(patients)} records")

        if not pagination.get("hasNextPage", False):
            break
        page += 1

    print(f"\n‚úÖ Done fetching {endpoint} data! Total patients: {len(all_patients)}\n")
    return all_patients


if __name__ == "__main__":
    # Download each dataset split in turn and dump it to its own JSON file.
    for endpoint in ("test", "train"):
        filename = f"patients_data_{endpoint}.json"
        all_data = fetch_all_patients(endpoint)

        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        with open(filename, mode="w", encoding="utf-8") as out_file:
            json.dump(all_data, out_file, indent=2, ensure_ascii=False)

        print(f"üíæ Saved {len(all_data)} records to {filename}\n")


: 

In [None]:
# EXTRACT FEATURES USING A DOCKERIZED LLM EXPOSING AN OpenAI-compatible API


import json
import pandas as pd
import requests
import os

# ---------------------------------------------------------------
# Local LLM server (Docker container with an OpenAI-compatible API)
# ---------------------------------------------------------------
# Point this at the chat-completions endpoint of your own container.
LLM_API_URL = "http://localhost:8000/v1/chat/completions"
# Value sent in the "model" field of each request; placeholder, can be anything.
MODEL_NAME = "local-llm"

# ---------------------------------------------------------------
# System prompt: the fields the model must extract from each note
# ---------------------------------------------------------------
instructions = """You are a clinical data extractor.
Given a patient description, return ONLY a JSON with these fields:

{
 "Age": integer,
 "Gender": "Male" or "Female",
 "Hypertension": 0 or 1,
 "Heart Disease": 0 or 1,
 "Smoking History": "never" | "past" | "current" | "not known",
 "BMI": float,
 "HbA1c": "Low" | "Normal" | "High" | "Very High",
 "Random Glucose": "Low" | "Normal" | "High" | "Very High"
}

Do not add explanations or text outside the JSON.
"""

# ---------------------------------------------------------------
# Input: the patient records saved by the download step
# ---------------------------------------------------------------
with open("patients_data_test.json", mode="r", encoding="utf-8") as dataset_file:
    patients = json.load(dataset_file)

# ---------------------------------------------------------------
# Output: CSV that accumulates one row per processed patient
# ---------------------------------------------------------------
OUTPUT_FILE = "extracted_patients_data.csv"

if not os.path.exists(OUTPUT_FILE):
    # Fresh run: nothing has been processed yet.
    processed_indices = set()
    results = []
else:
    # A CSV from an earlier run exists: reload its rows so that patients
    # whose index is already recorded are not processed again.
    previous_run = pd.read_csv(OUTPUT_FILE)
    already_done = previous_run["index"].tolist()
    processed_indices = set(already_done)
    results = previous_run.to_dict(orient="records")
    print(f"üìÑ Resuming from previous run ({len(processed_indices)} patients already processed)")

# ---------------------------------------------------------------
# Send every unprocessed patient note to the LLM and store the result
# ---------------------------------------------------------------
def _parse_llm_json(content):
    """Return the JSON object found in an LLM reply, or None if there is none.

    The reply is first parsed as-is. If that does not yield an object, the
    text between the first "{" and the last "}" is tried, which recovers
    replies where the model wrapped the JSON in Markdown code fences or
    added text around it despite the instructions.
    """
    candidates = [content]
    start, end = content.find("{"), content.rfind("}")
    if start != -1 and end > start:
        candidates.append(content[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Only an object can receive the "index" key and become a CSV row.
        if isinstance(parsed, dict):
            return parsed
    return None


def _save_progress(rows):
    """Rewrite OUTPUT_FILE with every row collected so far."""
    pd.DataFrame(rows).to_csv(OUTPUT_FILE, index=False, encoding="utf-8")


# Indices are 1-based and are the keys used to skip work on a resumed run.
for i, patient in enumerate(patients, start=1):
    if i in processed_indices:
        print(f"‚è© Skipping patient {i} (already processed)")
        continue

    medtext = patient["medical_note"]

    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": medtext},
    ]

    try:
        response = requests.post(
            LLM_API_URL,
            headers={"Content-Type": "application/json"},
            json={
                "model": MODEL_NAME,
                "messages": messages,
                "temperature": 0.1,
                "max_tokens": 512,
            },
            timeout=120,  # give the local model some time
        )

        response.raise_for_status()
        output = response.json()
        content = output["choices"][0]["message"]["content"].strip()

        data = _parse_llm_json(content)
        if data is None:
            # Keep a row holding only the index so the patient is still recorded.
            print(f"‚ö†Ô∏è Could not parse JSON for patient {i}")
            data = {}

        data["index"] = i

        # Save incrementally so an interrupted run loses at most one patient.
        print(f"‚úÖ Extracted data for patient {i}: {data}")
        results.append(data)
        _save_progress(results)

        print(f"üíæ Saved progress up to patient {i}")

    except Exception as e:
        # Deliberately broad: one failing patient (network error, unexpected
        # response shape, ...) must not abort the whole batch. The failure is
        # recorded as a row with an "error" column instead.
        print(f"‚ùå Error on patient {i}: {e}")
        results.append({"index": i, "error": str(e)})
        _save_progress(results)

# ---------------------------------------------------------------
# Done
# ---------------------------------------------------------------
print("üèÅ Extraction complete! Data saved to", OUTPUT_FILE)


In [None]:
###     DATA CLEANUP AND PREPARATION FOR MODELING        ###
import pandas as pd

# Load the CSV file produced by the extraction step
df = pd.read_csv("extracted_patients_data.csv")

# Failed extractions are stored as rows that carry only "index" and "error".
# If the "error" column were left in place it would be NaN for every
# successful row, so the dropna() below would discard the entire dataset.
# Remove the failed rows first, then the column itself.
if "error" in df.columns:
    df = df[df["error"].isna()].drop(columns=["error"])

# Unwanted extra columns that may be present in the extraction output
columns_to_remove = ["Glucose Level", "Low", "Low Risk", "Blood Pressure", "Glucose Levels"]

# Remove those columns; errors="ignore" skips any that are not present
df = df.drop(columns=columns_to_remove, errors="ignore")

# Drop rows with any null (missing) values
df = df.dropna()

# Save the cleaned CSV
df.to_csv("cleaned_patients_data.csv", index=False)

print("‚úÖ Columns removed, null rows deleted, and cleaned file saved as 'cleaned_patients_data.csv'")
