In [1]:
import pandas as pd
import numpy as np

# --- Configuration -----------------------------------------------------------
SEED = 42  # fixed seed so Restart & Run All reproduces the same simulated data
N_HOSPITALS = 5
UTILIZATION_RANGE = (0.0, 1.5)  # utilization outside this band is treated as an outlier

rng = np.random.default_rng(SEED)


def min_max_normalize(values):
    """Scale a numeric Series to the [0, 1] range.

    Parameters
    ----------
    values : pd.Series
        Numeric values to scale. NaN entries stay NaN.

    Returns
    -------
    pd.Series
        ``(values - min) / (max - min)`` as floats. When every non-null value
        is identical (including the single-row case) the result is 0.0 for
        each of them instead of the NaN a 0/0 division would produce.
    """
    values = values.astype(float)
    low = values.min()
    span = values.max() - low
    shifted = values - low
    # span is NaN for empty / all-NaN input and 0 for constant input; in both
    # cases dividing would be meaningless, so return the shifted values as-is.
    if pd.isna(span) or span == 0:
        return shifted
    return shifted / span


def replace_out_of_range(values, lower, upper):
    """Replace entries outside ``[lower, upper]`` (bounds inclusive) with the median.

    NaN entries also fail the range test and are replaced. The median is taken
    over the whole Series (pandas skips NaN when computing it).
    """
    return values.where(values.between(lower, upper), values.median())


def integrate_hospital_data(real_time, historical, utilization_range=UTILIZATION_RANGE):
    """Merge the real-time and historical tables and derive the comparison features.

    Parameters
    ----------
    real_time : pd.DataFrame
        Needs the columns ``HospitalID``, ``CurrentLoad`` and ``Capacity``.
    historical : pd.DataFrame
        Needs the columns ``HospitalID``, ``AverageWaitTime`` and
        ``HistoricalCapacityUtilization``.
    utilization_range : tuple of float
        Inclusive ``(lower, upper)`` bounds for a plausible utilization ratio.

    Returns
    -------
    pd.DataFrame
        One row per row of ``real_time`` with the added columns
        ``CurrentCapacityUtilization``, ``LoadDifference``,
        ``WeightedWaitTime`` and ``NormalizedWaitTime``.

    Raises
    ------
    pandas.errors.MergeError
        If ``HospitalID`` is duplicated in either input.
    """
    # Left join keeps every hospital in the live feed; a hospital without
    # history shows up as NaN (and is reported by the validation step below)
    # instead of silently disappearing as it would with an inner join.
    merged = pd.merge(
        real_time, historical, on="HospitalID", how="left", validate="one_to_one"
    )

    lower, upper = utilization_range
    raw_utilization = merged["CurrentLoad"] / merged["Capacity"]
    # Clean the utilization BEFORE anything is derived from it, so the wait-time
    # columns agree with the CurrentCapacityUtilization column that is shown.
    merged["CurrentCapacityUtilization"] = replace_out_of_range(raw_utilization, lower, upper)

    # Current load minus the load implied by the historical utilization ratio.
    merged["LoadDifference"] = merged["CurrentLoad"] - (
        merged["HistoricalCapacityUtilization"] * merged["Capacity"]
    )
    # Historical wait time scaled up by the current utilization.
    merged["WeightedWaitTime"] = merged["AverageWaitTime"] * (
        1 + merged["CurrentCapacityUtilization"]
    )
    merged["NormalizedWaitTime"] = min_max_normalize(merged["WeightedWaitTime"])
    return merged


# Simulating real-time hospital data feed
real_time_data = pd.DataFrame({
    "HospitalID": range(1, N_HOSPITALS + 1),
    "CurrentLoad": rng.integers(10, 50, N_HOSPITALS),  # Current number of patients
    "Capacity": rng.integers(50, 100, N_HOSPITALS),  # Maximum hospital capacity
    "ScannerAvailability": rng.choice([0, 1], size=N_HOSPITALS, p=[0.2, 0.8]),  # Whether scanners are available
})

# Simulating historical wait time data
historical_data = pd.DataFrame({
    "HospitalID": range(1, N_HOSPITALS + 1),
    "AverageWaitTime": rng.integers(20, 60, N_HOSPITALS),  # Average historical wait times (in minutes)
    "HistoricalCapacityUtilization": rng.uniform(0.5, 0.9, N_HOSPITALS),  # Historical utilization ratios
})

# Merging real-time and historical data and deriving features
combined_data = integrate_hospital_data(real_time_data, historical_data)

# Data validation (runs on the final frame, so gaps left by the merge are reported)
if combined_data.isnull().values.any():
    print("Data contains missing values. Please investigate:")
    print(combined_data.isnull().sum())
else:
    print("No missing values detected.")

# Display integrated and processed data
print("\nIntegrated and Processed Hospital Data:")
print(combined_data)


No missing values detected.

Integrated and Processed Hospital Data:
   HospitalID  CurrentLoad  Capacity  ScannerAvailability  AverageWaitTime  \
0           1           37        84                    1               52   
1           2           13        76                    1               49   
2           3           31        74                    1               36   
3           4           40        64                    1               42   
4           5           35        94                    1               28   

   HistoricalCapacityUtilization  CurrentCapacityUtilization  LoadDifference  \
0                       0.752534                    0.440476      -26.212819   
1                       0.641180                    0.171053      -35.729693   
2                       0.790991                    0.418919      -27.533309   
3                       0.841313                    0.625000      -13.844062   
4                       0.813849                    0.372340  