# Data extraction and cleaning for OpenDengue
### Author: Hailey Robertson
### Date: 2024-09-24
### Citations:
- Clarke J, Lim A, Gupte P, Pigott DM, van Panhuis WG, Brady OJ. A global dataset of publicly available dengue case count data. Sci Data. 2024 Mar 14;11(1):296.
- Clarke J, Lim A, Gupte P, Pigott DM, van Panhuis WG, Brady OJ. OpenDengue: data from the OpenDengue database. Version 1.2. figshare; 2023. Available from: https://doi.org/10.6084/m9.figshare.24259573.

In [52]:
# Imports
import requests
import pandas as pd
from datetime import datetime
import os
import numpy as np
import geopandas as gpd
import country_converter as coco
from geopy.geocoders import Nominatim
import pickle
import time
from dotenv import load_dotenv

load_dotenv()

True

In [38]:
# Get data from master repo, reset index to access adm_0_name
url = 'https://raw.githubusercontent.com/OpenDengue/master-repo/refs/heads/main/data/raw_data/masterDB_V1.2.csv'
df = pd.read_csv(url, index_col=0, encoding='latin-1').reset_index()

# Convert unstandardized country names into iso3 codes.
# Each distinct name is resolved once and the result is mapped back onto the rows,
# rather than pushing every row's name through the converter.
converter = coco.CountryConverter()
iso3_by_name = {
    name: converter.convert(names=name, to='ISO3', not_found="missing")
    for name in df["adm_0_name"].dropna().unique()
}
# Rows without a country name receive the same "missing" sentinel as unmatched names,
# so the check for "missing" codes still catches them.
df["adm_0_iso3"] = df["adm_0_name"].map(iso3_by_name).fillna("missing")

# Convert all date columns to datetime
date_cols = ['calendar_start_date', 'calendar_end_date']
df[date_cols] = df[date_cols].apply(pd.to_datetime)

df.head()

Unnamed: 0,adm_0_name,adm_0_code,adm_1_name,adm_1_code,adm_2_name,adm_2_code,calendar_start_date,calendar_end_date,dengue_total,case_definition_standardised,UUID,adm_0_iso3
0,Argentina,12.0,Salta,445.0,Oran,4781.0,2019-07-21,2019-07-27,2.0,Total,MOH-ARG-2019-Y01-00,ARG
1,Argentina,12.0,Misiones,442.0,Iguazu,4728.0,2019-09-01,2019-09-07,1.0,Total,MOH-ARG-2019-Y01-00,ARG
2,Argentina,12.0,Salta,445.0,Grl. Jose De San Martin,4773.0,2019-07-14,2019-07-20,1.0,Total,MOH-ARG-2019-Y01-00,ARG
3,Argentina,12.0,Jujuy,438.0,Ledesma,4649.0,2019-07-14,2019-07-20,1.0,Total,MOH-ARG-2019-Y01-00,ARG
4,Argentina,12.0,Salta,445.0,Grl. Jose De San Martin,4773.0,2019-06-30,2019-07-06,3.0,Total,MOH-ARG-2019-Y01-00,ARG


In [40]:
# Sanity check: list every row whose country could not be resolved to an iso3 code
# (the printed frame is expected to be empty)
missing_rows = df.loc[df["adm_0_iso3"].eq("missing")]
print(missing_rows)

Empty DataFrame
Columns: [adm_0_name, adm_0_code, adm_1_name, adm_1_code, adm_2_name, adm_2_code, calendar_start_date, calendar_end_date, dengue_total, case_definition_standardised, UUID, adm_0_iso3]
Index: []


In [65]:
# World geometry shapefile, read from a local copy because
# get_path("naturalearth_lowres") was deprecated in geopandas.
world = (
    gpd.read_file("../data/ne_110m_admin_0_countries")
    .rename(columns={"ADM0_A3": "adm_0_iso3", "ADMIN": "adm_0_name"})
    # rename_geometry (instead of a plain column rename) keeps the renamed column
    # registered as the frame's active geometry.
    .rename_geometry("adm_0_geometry")
)
cols = ['adm_0_iso3', 'adm_0_name', 'adm_0_geometry']
world = world[cols].sort_values(by="adm_0_name")

# Fix known issues in geopandas (country-code overrides)
world.loc[world['adm_0_name'] == 'France', 'adm_0_iso3'] = 'FRA'
world.loc[world['adm_0_name'] == 'Norway', 'adm_0_iso3'] = 'NOR'
world.loc[world['adm_0_name'] == 'Somaliland', 'adm_0_iso3'] = 'SOM'
# NOTE(review): 'RKS' only matches if the dengue table uses the same code for
# Kosovo; confirm what country_converter emits for it.
world.loc[world['adm_0_name'] == 'Kosovo', 'adm_0_iso3'] = 'RKS'

# An override can give two rows the same code (e.g. Somaliland set to 'SOM' when the
# shapefile presumably already has a Somalia row with that code). Merge rows that
# share a code into one shape so adm_0_iso3 is unique; otherwise a join on
# adm_0_iso3 would repeat each matching record once per duplicate row.
# Rows are sorted by name, so aggfunc="first" keeps the alphabetically first name.
world = (
    world.dissolve(by="adm_0_iso3", aggfunc="first", as_index=False)[cols]
    .sort_values(by="adm_0_name")
)

In [66]:
# Attach country geometry to each dengue record, matching on the iso3 code.
# NOTE(review): no join type was given originally, so this has always been pandas'
# default inner join: records whose code has no row in `world`, and shapes with no
# records, are dropped. The earlier comment said "keeping nulls"; confirm whether
# how="left", "right" or "outer" was intended.
#
# Columns that exist in both frames (other than the key) are taken from `world`.
# Dropping them from `df` before merging avoids the `_x`/`_y` suffixes entirely;
# the previous clean-up matched '_x' / '_y' anywhere in a column name and could
# have renamed or dropped unrelated columns.
shared_cols = [col for col in df.columns if col in world.columns and col != 'adm_0_iso3']
open_dengue = pd.merge(world, df.drop(columns=shared_cols), on='adm_0_iso3', how='inner')


In [67]:
# Reporting periods differ by source (yearly, monthly, weekly), so store the number of
# whole days between each record's start and end date (end minus start).
open_dengue["date_diff"] = (
    open_dengue["calendar_end_date"]
    .sub(open_dengue["calendar_start_date"])
    .dt.days
)

# Find geo resolution and combine places
def highest_geo_resolution(row):
    """Return the finest administrative level that has a name in ``row``.

    ``row`` is any mapping-like object (e.g. a DataFrame row) with the keys
    ``'adm_2_name'`` and ``'adm_1_name'``. Levels are checked from finest to
    coarsest; a name counts as present when it is neither null nor the empty
    string. Returns ``'adm_2'``, ``'adm_1'``, or ``'adm_0'`` as the fallback.
    """
    for level in ('adm_2', 'adm_1'):
        name = row[f'{level}_name']
        if pd.notna(name) and name != '':
            return level
    return 'adm_0'

# Tag every record with the finest admin level it reports at
open_dengue['geo_resolution'] = open_dengue.apply(highest_geo_resolution, axis=1)

# Build one "adm2, adm1, adm0" label per record, skipping levels that are null or empty
place_columns = ['adm_2_name', 'adm_1_name', 'adm_0_name']
open_dengue['combined_place'] = open_dengue[place_columns].apply(
    lambda names: ', '.join(str(part) for part in names if pd.notna(part) and part != ''),
    axis=1,
)


In [72]:
# # GEOCODE places (if needed)
# NOTE: this whole cell is disabled (every line is commented out). The output stored
# below it comes from an earlier run that was stopped with KeyboardInterrupt.
# user_agent = os.getenv("USER_AGENT")
# geolocator = Nominatim(user_agent=user_agent, timeout=10)

# # Load previously saved geocoded locations to avoid duplicates
# try:
#     with open('../data/cached_locations.pkl', 'rb') as f:
#         cached_locations = pickle.load(f)
# except (FileNotFoundError, EOFError):
#     cached_locations = {}

# def geocode_location(loc):
#     """Geocode the address if not already cached.
#
#     Returns a dict with 'latitude', 'longitude' and 'lat_long', or None when the
#     place was not found or the lookup raised. Lookups that raise are not cached.
#     """
#     if loc in cached_locations:
#         return cached_locations[loc]
#     else:
#         try:
#             location = geolocator.geocode(loc)
#             if location:
#                 # Only return latitude and longitude
#                 lat = location.latitude
#                 long = location.longitude
#                 lat_long = lat, long
#                 cached_locations[loc] = {
#                                     'latitude': lat,
#                                     'longitude': long,
#                                     'lat_long': lat_long
#                 }
#             else:
#                 cached_locations[loc] = None
#             # Save the cache after every new entry
#             # (written to the same file the cache is loaded from above; this line
#             # previously used 'cached_locations.pkl' without the ../data/ prefix)
#             with open('../data/cached_locations.pkl', 'wb') as f:
#                 pickle.dump(cached_locations, f)
#             return cached_locations[loc]
#         except Exception as e:
#             print(f"Error geocoding {loc}: {e}")
#             return None

# locs = open_dengue["combined_place"]

# open_dengue["latitude"] = None
# open_dengue["longitude"] = None
# open_dengue["lat_long"] = None


# # Delay between requests in seconds to avoid hitting the rate limit
# # NOTE(review): 0 means no pause at all; check the geocoding service's rate limit
# # before re-enabling this cell.
# REQUEST_DELAY = 0

# # Geocode locations with rate limiting and caching
# # NOTE(review): this loops over every row, so a place that appears many times is
# # looked up once and then served from cached_locations; iterating over the unique
# # places would avoid the repeated work and the repeated print lines.
# for idx, loc in enumerate(locs):
#     coords = geocode_location(loc)
    
#     if coords:
#         open_dengue.at[idx, "latitude"] = coords['latitude']
#         open_dengue.at[idx, "longitude"] = coords['longitude']
#         open_dengue.at[idx, "lat_long"] = coords['lat_long']
#         print(f"Address: {loc}, Coordinates: {coords}")
#     else:
#         print(f"Address: {loc}, Coordinates: Not found")
    
#     # Introduce delay to avoid API rate limits
#     time.sleep(REQUEST_DELAY)

Address: Afghanistan, Coordinates: {'latitude': 33.7680065, 'longitude': 66.2385139, 'lat_long': (33.7680065, 66.2385139)}
Address: Afghanistan, Coordinates: {'latitude': 33.7680065, 'longitude': 66.2385139, 'lat_long': (33.7680065, 66.2385139)}
Address: Afghanistan, Coordinates: {'latitude': 33.7680065, 'longitude': 66.2385139, 'lat_long': (33.7680065, 66.2385139)}
Address: Afghanistan, Coordinates: {'latitude': 33.7680065, 'longitude': 66.2385139, 'lat_long': (33.7680065, 66.2385139)}
Address: Afghanistan, Coordinates: {'latitude': 33.7680065, 'longitude': 66.2385139, 'lat_long': (33.7680065, 66.2385139)}
Address: Afghanistan, Coordinates: {'latitude': 33.7680065, 'longitude': 66.2385139, 'lat_long': (33.7680065, 66.2385139)}
Address: Afghanistan, Coordinates: {'latitude': 33.7680065, 'longitude': 66.2385139, 'lat_long': (33.7680065, 66.2385139)}
Address: Afghanistan, Coordinates: {'latitude': 33.7680065, 'longitude': 66.2385139, 'lat_long': (33.7680065, 66.2385139)}
Address: Afghani

KeyboardInterrupt: 

In [None]:
# Preview the merged, annotated dataset (first rows only, like the df.head() preview above)
open_dengue.head()

# TODO


# DONE and NOTES
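- Found NAs at adm2 and adm1 compared to adm0 (8% of records are missing adm1, 44% are missing adm2)
- Zero-filling – zeroes were most likely NAs originally, but not necessarily
- Converted date columns to datetime
- floats to floats
- Matched country geometry on iso3
- Observed that reporting periods differ in length (`date_diff`)
- Added a combined place column (`combined_place`)
- Geocode places – the code exists but is currently commented out, and the last run was interrupted before finishing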


