In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

# Fix NumPy's legacy global seed so every run simulates the same hospitals.
np.random.seed(42)

# Number of simulated hospitals.
N_HOSPITALS = 1000


def _random_location():
    """Draw one (latitude, longitude) tuple; latitude is drawn first."""
    return (np.random.uniform(43.6, 43.9), np.random.uniform(-79.6, -79.4))


# All draws below consume the same seeded global stream, so their order
# (capacities, then loads, then locations) determines the generated values.
hospital_ids = range(1, N_HOSPITALS + 1)  # unique IDs 1..1000
capacities = np.random.randint(50, 100, N_HOSPITALS)  # upper bound exclusive: 50..99
current_loads = np.random.randint(10, 50, N_HOSPITALS)  # upper bound exclusive: 10..49
locations = [_random_location() for _ in range(N_HOSPITALS)]

# Simulated large-scale hospital data: one row per hospital.
large_hospital_data = pd.DataFrame({
    "HospitalID": hospital_ids,
    "Capacity": capacities,
    "CurrentLoad": current_loads,
    "Location": locations,
})

# Wrap the in-memory pandas frame in a Dask DataFrame split into 10 partitions.
ddf = dd.from_pandas(large_hospital_data, npartitions=10)

# Example 1: mean and standard deviation of the current load.
# Building each reduction is lazy; every .compute() call runs its own graph.
load_column = ddf["CurrentLoad"]
mean_load = load_column.mean().compute()
std_load = load_column.std().compute()
print(
    "\n--- Load Statistics ---",
    f"Mean Load: {mean_load:.2f}",
    f"Standard Deviation of Load: {std_load:.2f}",
    sep="\n",
)

# Example 2: Filter hospitals with high utilization (CurrentLoad / Capacity > 0.75)
UTILIZATION_THRESHOLD = 0.75  # fraction of capacity above which a hospital counts as "high"

# Column assignment and boolean filtering are lazy: nothing is computed yet.
ddf["Utilization"] = ddf["CurrentLoad"] / ddf["Capacity"]
high_utilization_hospitals = ddf[ddf["Utilization"] > UTILIZATION_THRESHOLD]
print("\n--- High Utilization Hospitals ---")
# Preview the first five matching rows. npartitions=-1 makes head() look at
# every partition (the first one alone may hold fewer than five matches) while
# gathering at most five rows from each, instead of pulling the whole filtered
# frame into pandas with .compute() just to display a few rows.
print(high_utilization_hospitals.head(5, npartitions=-1))

# Example 3: Aggregations - total capacity and load per location cluster
def _cluster_label(loc):
    """Return the cluster name for a (latitude, longitude) tuple.

    Only the latitude is used: it is multiplied by 10 and truncated to an
    integer, e.g. a latitude of 43.71 yields "Cluster_437".
    """
    latitude = loc[0]
    return f"Cluster_{int(latitude * 10)}"


# meta declares the name and dtype of the lazily built series up front.
ddf["Cluster"] = ddf["Location"].apply(_cluster_label, meta=("Location", str))

# Sum both columns within each cluster, then materialise the small result.
cluster_totals = {"Capacity": "sum", "CurrentLoad": "sum"}
aggregated_stats = ddf.groupby("Cluster").agg(cluster_totals).compute()
print("\n--- Aggregated Statistics by Location Clusters ---")
print(aggregated_stats)

# Example 4: Export filtered data
# Save the high-utilization hospitals to ONE CSV file. With single_file=True
# Dask uses the path as a literal file name (the "*" placeholder only applies
# to one-file-per-partition output) and appends the partitions to that file
# one after another, so the name must not contain "*".
OUTPUT_CSV = "high_utilization_hospitals.csv"
high_utilization_hospitals.to_csv(OUTPUT_CSV, index=False, single_file=True)
# Report the same path that was actually written.
print(f"\nHigh-utilization hospitals saved to '{OUTPUT_CSV}'.")

# Example 5: Lazy evaluation
# Building the reduction only records the work to do; nothing is summed yet.
load_series = ddf["CurrentLoad"]
lazy_computation = load_series.sum()
# Printing the lazy object shows its Dask representation, not the numeric total.
print("\nLazy computation object:", lazy_computation)
# .compute() executes the pending work and returns the concrete value.
total_load = lazy_computation.compute()
print("Total Load:", total_load)



--- Load Statistics ---
Mean Load: 30.00
Standard Deviation of Load: 11.66

--- High Utilization Hospitals ---
     HospitalID  Capacity  CurrentLoad  \
15           16        52           46   
64           65        53           46   
106         107        60           49   
119         120        57           49   
120         121        61           49   

                                    Location  Utilization  
15   (43.70993846046142, -79.52076546167815)     0.884615  
64   (43.74903005765775, -79.40244289595702)     0.867925  
106   (43.87917856439317, -79.5767267204229)     0.816667  
119  (43.643912197648675, -79.5264931155417)     0.859649  
120   (43.62045169281565, -79.5948376186101)     0.803279  

--- Aggregated Statistics by Location Clusters ---
             Capacity  CurrentLoad
Cluster                           
Cluster_438     24168        10023
Cluster_437     26605        10007
Cluster_436     24057         9968

High-utilization hospitals saved to 'high_utilization_hospitals.csv'.

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

