**Aim 1: Classify countries by endemicity status.**

*CPSC 581: Machine Learning*

*Yale University*

*Instructor: Alex Wong*

*Student: Hailey Robertson*

Import packages

In [41]:
import numpy as np
import sklearn
import matplotlib
import pandas as pd
import geopandas as gpd
from shapely.geometry import MultiPolygon 
from shapely.geometry.base import BaseGeometry 
import country_converter as coco
import warnings
import os

# Silence every warning category for the rest of the session so library
# deprecation/CRS notices do not clutter cell output.
warnings.simplefilter('ignore')
# Seed NumPy's legacy global RNG so stochastic steps are repeatable.
np.random.seed(seed=42)

Load data function

Clean Open Dengue data

In [None]:
# --- Load dengue data ---
# Read the OpenDengue extract from the project's data directory.
open_dengue = pd.read_csv('../data/open_dengue_v1_2.csv')

# Standardize country names to ISO3.
# Names the converter cannot resolve are tagged with the sentinel "missing"
# so they can be reported below.
iso3_codes = coco.convert(names=open_dengue["adm_0_name"], to='ISO3', not_found="missing")
open_dengue["adm_0_iso3"] = iso3_codes

# Parse the period start/end columns into datetimes, one column at a time.
date_cols = ['calendar_start_date', 'calendar_end_date']
for date_col in date_cols:
    open_dengue[date_col] = pd.to_datetime(open_dengue[date_col])

# Report any rows whose country name could not be mapped to an ISO3 code.
missing_iso3 = open_dengue.loc[open_dengue["adm_0_iso3"].eq("missing")]
if not missing_iso3.empty:
    print("Missing ISO3 codes for the following entries:")
    print(missing_iso3[["adm_0_name"]].drop_duplicates())
else:
    print("No missing ISO3 codes!")

# --- Load world geometry ---
# Country polygons, one row per admin-0 unit in the source layer.
world = gpd.read_file("../data/ne_110m_admin_0_countries")

# Rename and select necessary columns.
# The geometry column is renamed with rename_geometry() instead of a plain
# rename(columns=...): this keeps 'adm_0_geometry' registered as the frame's
# active geometry, so `world` remains a proper GeoDataFrame and keeps its CRS.
world = world.rename(columns={
    "ADM0_A3": "adm_0_iso3",
    "ADMIN": "adm_0_name",
    "REGION_UN": "region_un",
}).rename_geometry("adm_0_geometry")
world = world[["adm_0_iso3", "adm_0_name", "region_un", "adm_0_geometry"]]

# Sort for consistency
world = world.sort_values(by="adm_0_name")

# --- Fix known issues ---
# Split out French Guiana from France
france_idx = world['adm_0_name'] == 'France'
france_geom = world.loc[france_idx, 'adm_0_geometry'].values[0]

if isinstance(france_geom, MultiPolygon):
    polygons = list(france_geom.geoms)
    # bounds is (minx, miny, maxx, maxy): take the first part whose longitude
    # extent overlaps the -54..-50 band (presumably the French Guiana part).
    french_guiana_polygon = next(
        (poly for poly in polygons if poly.bounds[0] < -50 and poly.bounds[2] > -54),
        None,
    )

    # Test against None explicitly rather than relying on geometry truthiness.
    if french_guiana_polygon is not None:
        # Remove French Guiana from France
        remaining_polygons = [poly for poly in polygons if poly != french_guiana_polygon]
        world.loc[france_idx, 'adm_0_geometry'] = MultiPolygon(remaining_polygons)

        # Add French Guiana as separate entry
        french_guiana_row = {
            'adm_0_iso3': 'GUF',
            'adm_0_name': 'French Guiana',
            'region_un': 'Americas',
            'adm_0_geometry': french_guiana_polygon
        }
        # Give the one-row frame the same CRS as `world` so the concatenation
        # does not mix a CRS-less geometry column with a projected one.
        french_guiana_gdf = gpd.GeoDataFrame(
            [french_guiana_row],
            geometry='adm_0_geometry',
            crs=world.crs,
        )
        world = pd.concat([world, french_guiana_gdf], ignore_index=True)

# Patch ISO3 codes for special cases
# NOTE(review): Somaliland is given Somalia's code, so 'SOM' may now appear on
# two rows of `world` if the layer also has a Somalia row; any merge keyed on
# adm_0_iso3 would then repeat every Somalia record. Confirm this is intended.
# NOTE(review): 'RKS' only matches dengue rows if the name-to-ISO3 converter
# emits the same code for Kosovo - verify.
iso3_patches = {
    'Norway': 'NOR',
    'Somaliland': 'SOM',
    'Kosovo': 'RKS',
    'South Sudan': 'SSD',
}
for patched_name, patched_iso3 in iso3_patches.items():
    world.loc[world['adm_0_name'] == patched_name, 'adm_0_iso3'] = patched_iso3

# --- Merge dengue data with geometry ---
# Outer join on ISO3: keeps dengue records without a polygon as well as
# countries that have a polygon but no dengue records.
dengue = pd.merge(open_dengue, world, on='adm_0_iso3', how='outer', suffixes=('', '_world'))

# Coalesce the columns that both inputs may provide: where the dengue-side
# value is missing, fall back to the value from world, then drop the helper
# '_world' column.
# pd.merge only appends the '_world' suffix to a column whose name exists in
# BOTH frames. A column that only world supplies arrives under its plain name
# and has no '_world' twin, so each fallback is applied only when the suffixed
# column is actually present (unconditional access would raise KeyError).
for shared_col in ('adm_0_name', 'region_un', 'adm_0_geometry'):
    world_col = shared_col + '_world'
    if world_col in dengue.columns:
        dengue[shared_col] = dengue[shared_col].fillna(dengue[world_col])
        dengue = dengue.drop(columns=world_col)

# Reorder columns nicely: identifiers first, geometry last, everything else
# in between in its existing order.
front_cols = ['adm_0_name', 'adm_0_iso3']
geometry_col = ['adm_0_geometry']
other_cols = [col for col in dengue.columns if col not in front_cols + geometry_col]
dengue = dengue[front_cols + other_cols + geometry_col]


# --- Final summary ---
print("Number of unique ISO3 codes:", dengue['adm_0_iso3'].nunique())
print("UN Regions:", dengue['region_un'].dropna().unique())
print("Columns:", dengue.columns.tolist())


No missing ISO3 codes!
Number of unique ISO3 codes: 47
UN Regions: ['Other' 'Asia' 'Europe' 'Africa']
