In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Example hospital data. Two capacities and one name are left as None so the
# cleaning steps below have missing values to repair.
hospitals = pd.DataFrame(
    {
        "HospitalID": [1, 2, 3, 4, 5],
        "Capacity": [100, None, 50, None, 75],
        "Latitude": [43.7, 43.8, 43.9, 44.0, 44.1],
        "Longitude": [-79.4, -79.5, -79.3, -79.6, -79.2],
        "Name": ["Hospital A", "Hospital B", "Hospital C", None, "Hospital E"],
    }
)

# 1. Impute missing capacities with the mean of the observed capacities.
# The double brackets select a one-column DataFrame (2-D) rather than a
# Series, which is the shape handed to the imputer for both fit and transform.
imputer = SimpleImputer(strategy="mean").fit(hospitals[["Capacity"]])
hospitals["Capacity"] = imputer.transform(hospitals[["Capacity"]])

# 2. Handle missing hospital names: substitute the placeholder "Unknown".
hospitals["Name"] = hospitals["Name"].fillna(value="Unknown")

# 3. Validate latitude and longitude values against the geographic limits
def validate_coordinates(lat, lon):
    """Return ``(lat, lon)`` if both are in range, otherwise ``(None, None)``.

    Latitude must lie in [-90, 90] and longitude in [-180, 180], bounds
    included. If either value is out of range the whole pair is rejected.
    Longitude is only compared once latitude has passed its check.
    """
    if not (-90 <= lat <= 90):
        return None, None
    if not (-180 <= lon <= 180):
        return None, None
    return lat, lon

# Run each row through validate_coordinates. result_type="expand" spreads the
# returned (lat, lon) tuple across two columns, which overwrite the originals.
hospitals[["Latitude", "Longitude"]] = hospitals.apply(
    lambda record: validate_coordinates(record["Latitude"], record["Longitude"]),
    axis=1,
    result_type="expand",
)

# 4. Remove duplicates (if any) based on HospitalID, keeping the first
# occurrence of each ID.
hospitals = hospitals.drop_duplicates(subset=["HospitalID"], keep="first")

# 5. Normalize the "Name" column by title-casing it (capitalize each word).
hospitals["Name"] = hospitals["Name"].str.title()

# 6. Add a derived column for regions based on latitude
def determine_region(lat):
    """Classify a latitude into a coarse region label.

    Args:
        lat: Latitude value, or a missing value (``None`` / NaN).

    Returns:
        ``"South"`` for latitudes below 44.0, ``"Central"`` for latitudes in
        [44.0, 44.5), ``"North"`` for 44.5 and above, and ``"Unknown"`` when
        the latitude is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # latitude would fall through to the last branch and be labelled "North"
    # (and None would raise TypeError on the first comparison).
    if pd.isna(lat):
        return "Unknown"
    if lat < 44.0:
        return "South"
    if lat < 44.5:
        return "Central"
    return "North"

# Label every hospital with the region derived from its latitude.
hospitals["Region"] = hospitals["Latitude"].map(determine_region)

# 7. Sort by HospitalID and renumber the index from 0. This step only sorts;
# duplicate IDs were already dropped in step 4.
hospitals = hospitals.sort_values(by="HospitalID", ignore_index=True)

# Final cleaned dataset
print("Cleaned Hospital Data:", hospitals, sep="\n")


Cleaned Hospital Data:
   HospitalID  Capacity  Latitude  Longitude        Name   Region
0           1     100.0      43.7      -79.4  Hospital A    South
1           2      75.0      43.8      -79.5  Hospital B    South
2           3      50.0      43.9      -79.3  Hospital C    South
3           4      75.0      44.0      -79.6     Unknown  Central
4           5      75.0      44.1      -79.2  Hospital E  Central
