# 1_DataProcessing
In this notebook, we first use R to read in and explore the data from Brazil. The goal is to decide which parts of the data to use, and to format everything for easy reading into Python (for the creation of Darts TimeSeries objects). Part of this is also deciding on the training-validation-testing regimen to use.

There are three steps to this process:
1. Compute state-level dengue case data (provided values are at the municipality-level and need to be aggregated).
2. Compute state-level static covariates (we can use the proportion of each state living in each climate zone / biome).
3. Compute state-level meteorological values (municipality values provided but we can aggregate to state-level through population weighting).

In [None]:
library(tidyverse)

# 1. Reading Input

In [None]:
#Every folder is resolved relative to the directory the notebook is run from
base_dir = getwd()

#Input folders
raw_data_dir   = file.path(base_dir, "data_sprint_2025") #Files read in the "Reading Input" section
api_data_dir   = file.path(base_dir, "API Downloaded")   #Climate data downloaded from the API
other_data_dir = file.path(base_dir, "Other Data")       #Additional datasets (COVID-19 stringency index)

#Output folder for the files written at the end of the notebook
output_dir = file.path(base_dir, "ModelInput")

In [None]:
#--- Files from the raw data folder ---
#Dengue data
dengue_df = read_csv(file.path(raw_data_dir, "dengue.csv.gz"))
#Information on geocodes and admin hierarchy
geo_df = read_csv(file.path(raw_data_dir, "map_regional_health.csv"))
#Population data
pop_df = read_csv(file.path(raw_data_dir, "datasus_population_2001_2024.csv.gz"))
#Environmental variables (used for static covariates)
environ_df = read_csv(file.path(raw_data_dir, "environ_vars.csv.gz"))
#Municipality-level climate covariates
climate_df = read_csv(file.path(raw_data_dir, "climate.csv.gz"))

#--- API-downloaded data ---
#Municipality-level API-downloaded climate data (different format from climate_df)
api_dl_clim_df = read_csv(file.path(api_data_dir, "climate_2025.csv"))

#--- Other data ---
#Oxford COVID-19 Government Response Tracker file (sub-national)
covid_index_df = read_csv(file.path(other_data_dir, "OxCGRT_compact_subnational_v1.csv"))

#Ocean climate oscillation indices; the date column is renamed to Date
sst_df = read_csv(file.path(raw_data_dir, "ocean_climate_oscillations.csv.gz")) %>%
    rename(Date = date)

# 2. Processing Case Data
Here, we aggregate case data to the state-level and output a long format file that can easily be used to create Darts TimeSeries objects.

In [None]:
#Aggregate the dengue records to one row per date / epiweek / state (uf)
cases_df = dengue_df %>%
    #Characters 1-4 of epiweek give the Year, characters 5-6 give the Week
    mutate(
        Year = as.integer(substr(epiweek, 1, 4)),
        Week = as.integer(substr(epiweek, 5, 6))
    ) %>%
    select(date, epiweek, Year, Week, uf, casos,
           train_1, target_1, train_2, target_2, train_3, target_3) %>%
    group_by(date, epiweek, Year, Week, uf) %>%
    summarise(
        casos = sum(casos), #Total cases over all rows of the group
        #A train/target flag is TRUE for the group only if it is TRUE for every row in it
        across(c(train_1, target_1, train_2, target_2, train_3, target_3), all)
    ) %>%
    ungroup()

#Attach the state name to each state code
state_mapper = geo_df %>%
    select(uf, uf_name) %>%
    distinct()
cases_df = left_join(cases_df, state_mapper, by = "uf")



# 3. State-level Static Covariates
We can compute state-level static covariates by computing the percentage of each state population living in each Koppen climate class/biome. We can base this on the mid-study period population to keep this as a static covariate. 

In [None]:
#Lookup table from municipality geocode to state code and state name
mun_to_state = select(geo_df, geocode, uf, uf_name)

In [None]:
#Attach the state to every municipality population record
mun_pop_df = left_join(pop_df, mun_to_state, by = "geocode")

#Total population per state and year
state_pop_df = mun_pop_df %>%
    select(uf, uf_name, year, population) %>%
    group_by(uf, uf_name, year) %>%
    summarise(uf_population = sum(population))

#Share of the state population living in each municipality, per year
mun_pop_df = mun_pop_df %>%
    left_join(state_pop_df, by = c("year", "uf", "uf_name")) %>%
    mutate(mun_uf_prop = population / uf_population)

In [None]:
#Distinct state codes present in the population data
uf_list = unique(pull(mun_pop_df, uf))

In [None]:
#Format the environmental static covariates
#Population shares are taken from the year halfway between 2010 and 2024
mun_pop_year = as.integer((2010 + 2024) / 2)
mun_pop_filt = filter(mun_pop_df, year == mun_pop_year)

#Attach the population shares to the environmental variables and replace the
#accented biome names with unaccented versions
stat_cov_df = environ_df %>%
    left_join(mun_pop_filt, by = "geocode") %>%
    mutate(biome = case_when(
        biome == "Mata Atlântica" ~ "Mata Atlantica",
        biome == "Amazônia" ~ "Amazonia",
        TRUE ~ biome
    ))

#Share of each state's population per Koppen class, one column per class
koppen_df = stat_cov_df %>%
    select(uf, uf_name, koppen, mun_uf_prop) %>%
    group_by(uf, uf_name, koppen) %>%
    summarise(mun_uf_prop = sum(mun_uf_prop)) %>%
    mutate(koppen = paste0("koppen_", koppen)) %>%
    pivot_wider(id_cols = c(uf, uf_name), names_from = koppen, values_from = mun_uf_prop) %>%
    mutate(across(everything(), ~ replace_na(.x, 0))) #Classes absent from a state get a share of 0

#Share of each state's population per biome, one column per biome
biome_df = stat_cov_df %>%
    select(uf, uf_name, biome, mun_uf_prop) %>%
    group_by(uf, uf_name, biome) %>%
    summarise(mun_uf_prop = sum(mun_uf_prop)) %>%
    mutate(biome = paste0("biome_", biome)) %>%
    pivot_wider(id_cols = c(uf, uf_name), names_from = biome, values_from = mun_uf_prop) %>%
    mutate(across(everything(), ~ replace_na(.x, 0))) #Biomes absent from a state get a share of 0

#One row per state with both sets of static covariates
merged_stats_df = left_join(koppen_df, biome_df, by = c("uf", "uf_name"))
merged_stats_df

# 4. Processing Meteorological Values
We generate state-level meteorological values coming from the municipality-level values. To do this, we apply a population weighting scheme - ensuring each value is weighted by the proportion of the state population living in the municipality in the given year. We assume that the population proportions for 2025 are the same as in 2024. 

In [None]:
mun_pop_2024 = filter(mun_pop_df, year == 2024)

#Municipality weights per year; the 2024 rows are re-used for 2025
#(assume that the proportions and populations in 2025 are the same as in 2024)
mun_weights = mun_pop_df %>%
    rbind(mutate(mun_pop_2024, year = 2025)) %>%
    rename(Year = year) %>%
    select(Year, geocode, uf, uf_name, mun_uf_prop)
mun_weights

In [None]:
api_dl_clim_df_cleaned = api_dl_clim_df %>%
    select(-`...1`) %>% #Drop the `...1` column (presumably an unnamed index column - confirm)
    #Rename columns to match original climate data provided by organisers
    rename(
        pressure_min = pressao_min, pressure_med = pressao_med, pressure_max = pressao_max,
        rel_humid_min = umid_min, rel_humid_med = umid_med, rel_humid_max = umid_max,
        geocode = geocodigo
    ) %>%
    mutate(
        rainy_days = ifelse(precip_tot > 0.03, 1, 0), #Rainy days definition based on information from organisers
        num_days = 1 #Each input row counts as one day in the weekly aggregation
    ) %>%
    filter(epiweek != 202501)

#Aggregate weekly values: temperature, pressure and humidity are averaged,
#precipitation columns and the day counters are summed
api_dl_clim_df_cleaned_weekly = api_dl_clim_df_cleaned %>%
    group_by(geocode, epiweek) %>%
    summarise(
        date = min(date), #Earliest date within the epiweek
        across(c(temp_min, temp_med, temp_max), mean),
        across(c(precip_min, precip_med, precip_max), sum),
        across(c(pressure_min, pressure_med, pressure_max), mean),
        across(c(rel_humid_min, rel_humid_med, rel_humid_max), mean),
        rainy_days = sum(rainy_days),
        num_days = sum(num_days)
    ) %>%
    mutate(thermal_range = temp_max - temp_min)
api_dl_clim_df_cleaned_weekly

In [None]:
#Check that the API downloaded climate data has been processed in the same way as the original provided by organisers.
#Both sides are restricted to epiweek 202515, sorted by geocode, and have their columns put in alphabetical order.
temp1 = api_dl_clim_df_cleaned_weekly %>%
    ungroup() %>% #Drop the grouping left by summarise so it cannot make the comparison fail
    filter(epiweek == 202515) %>%
    arrange(geocode) %>%
    select(-num_days)
temp1 = temp1[, sort(names(temp1))]
temp2 = climate_df %>% filter(epiweek == 202515) %>% arrange(geocode)
temp2 = temp2[, sort(names(temp2))] #Must subset temp2 itself; subsetting temp1 here would compare the API data with itself
#Compare as plain data frames with all.equal so that tibble-class attributes and
#floating point rounding in the weekly means do not produce a spurious FALSE
isTRUE(all.equal(as.data.frame(temp1), as.data.frame(temp2)))

In [None]:
#Extend the climate data given by the organisers with the later epiweeks downloaded from the API
climate_cleaned_df = climate_df

orig_end = climate_cleaned_df %>% pull(epiweek) %>% max #Last epiweek given by the organisers
add_from_api = api_dl_clim_df_cleaned_weekly %>% filter(epiweek > orig_end) %>% select(-num_days) #Get epiweeks from the API that come after the data provided by organisers

climate_cleaned_df = climate_cleaned_df %>% rbind(add_from_api) #Merge the data from the API into the climate DataFrame

#We get a collection of the municipalities where climate data is not available. These are primarily islands.
clim_avail_codes = climate_cleaned_df %>% pull(geocode) %>% unique
missing_codes = mun_weights %>% filter(!(geocode %in% clim_avail_codes)) %>% pull(geocode) %>% unique

#The missing geocodes do not have any climate data. To solve this, we take climate data for some nearby municipality, change the geocode, then add it to our climate DataFrame

#2916104 Itaparica is assigned 2933208 Vera Cruz
#2919926 Madre de Deus is assigned to 2929206 Sao Francisco do Conde
#2605459 Fernando de Noronha = 2407500 Maxaranguape
missing_filler = list("2916104" = "2933208", "2919926" = "2929206", "2605459" = "2407500")

#The mapping above is hard-coded, so warn when it no longer matches the data:
#a missing municipality without a donor would silently drop out of the state-level weighted sums
unfilled_codes = setdiff(as.character(missing_codes), names(missing_filler))
if (length(unfilled_codes) > 0) {
    warning("No replacement climate data defined for geocode(s): ",
            paste(unfilled_codes, collapse = ", "), call. = FALSE)
}
#Filling a municipality that already has climate data would duplicate its rows
already_present = intersect(names(missing_filler), as.character(clim_avail_codes))
if (length(already_present) > 0) {
    warning("Climate data already available for geocode(s): ",
            paste(already_present, collapse = ", "),
            "; filling them will duplicate rows", call. = FALSE)
}

#Use the climate in another municipality to fill in the values for those that are missing
missing_list = names(missing_filler)
builder = lapply(missing_list, function(curr_missing) {
    climate_cleaned_df %>%
        filter(geocode == as.integer(missing_filler[[curr_missing]])) %>% #Retrieve values based on missing_filler
        mutate(geocode = as.integer(curr_missing)) #Change the geocode to what is missing
})
names(builder) = missing_list
clim_addition = do.call(rbind, builder)
rownames(clim_addition) = NULL

climate_cleaned_df = climate_cleaned_df %>% rbind(clim_addition)

#Below is info on the missing geocodes
#geo_df %>% filter(geocode %in% missing_codes)

In [None]:
#Multiply every climate variable by the municipality's weight for the given year
clim_weighted_df = climate_cleaned_df %>%
    #Split epiweek into Year Week
    mutate(
        Year = as.integer(substr(epiweek, 1, 4)),
        Week = as.integer(substr(epiweek, 5, 6))
    ) %>%
    #We only get the values from 2010 since those are the years we have dengue data
    filter(Year >= 2010) %>%
    left_join(mun_weights, by = c("Year", "geocode")) %>%
    rename(Date = date, weight = mun_uf_prop) %>%
    #Every column that is not an identifier or the weight itself gets scaled by the weight
    mutate(across(
        -c(Date, epiweek, geocode, Year, Week, uf, uf_name, weight),
        ~ weight * .x
    ))
clim_weighted_df

In [None]:
#Sum the weighted municipality values within each state and week
state_clim_df = clim_weighted_df %>%
    #Identifier columns first, municipality codes removed
    select(Date, epiweek, Year, Week, uf, uf_name, everything(), -geocode) %>%
    group_by(Date, epiweek, Year, Week, uf, uf_name) %>%
    #Sum every remaining column; this includes the weight column, whose total is checked in a later cell
    summarise(across(everything(), sum)) %>%
    #Recompute thermal_range to be sure
    mutate(thermal_range = temp_max - temp_min)
state_clim_df

In [None]:
#Check if there is a state where the sum of weights is not 1 for any date. Both counts below should be 0.
#The comparison is two-sided: a sum above 1 (e.g. from duplicated municipality rows) is flagged as well as a sum below 1.
#We use a small epsilon due to some precision issues.
#Rows whose summed weight is NA are dropped by filter() and therefore are not counted here.
checker1 = state_clim_df %>%
    filter((weight + 0.000001) < 1 | (weight - 0.000001) > 1) %>%
    nrow

#Same check directly on the weights table, independent of the climate data
weights_check = mun_weights %>%
    filter(Year >= 2010) %>%
    select(-geocode) %>%
    group_by(Year, uf, uf_name) %>%
    summarise(mun_uf_prop = sum(mun_uf_prop))
checker2 = weights_check %>%
    filter((mun_uf_prop + 0.00001) < 1 | (mun_uf_prop - 0.00001) > 1) %>%
    nrow

#TRUE only when neither check found an offending row
(checker1 == 0) && (checker2 == 0)

# 5. Processing COVID-19 Stringency Index
We use the sub-national level stringency index from the Oxford COVID-19 Government Response Tracker. 

In [None]:
#Generate a DataFrame with the WeekStart, WeekMid, WeekEnd information.
#NOTE(review): WeekMid is WeekStart + 4 (the fifth day); the midpoint of a 7-day week would be WeekStart + 3 - confirm which is intended
week_cal = climate_cleaned_df %>%
    select(date, epiweek) %>%
    distinct() %>%
    rename(WeekStart = date) %>%
    mutate(WeekMid = WeekStart + 4, WeekEnd = WeekStart + 6) %>%
    select(epiweek, WeekStart, WeekMid, WeekEnd)
week_cal

covid_brazil = filter(covid_index_df, CountryName == "Brazil")

#Keep the state-level rows, parse the date, strip the "BR_" prefix from the region code,
#then attach the epiweek whose WeekStart-WeekEnd interval contains the date
covid_state_df = covid_brazil %>%
    filter(Jurisdiction == "STATE_TOTAL") %>%
    select(RegionName, RegionCode, Date, StringencyIndex = StringencyIndex_Average) %>%
    mutate(
        Date = as.Date(as.character(Date), format = "%Y%m%d"),
        RegionCode = str_replace(RegionCode, "BR_", "")
    ) %>%
    left_join(week_cal, by = join_by(between(Date, WeekStart, WeekEnd)))

covid_state_df

covid_state_df %>% filter(RegionCode == "SP")

#Get weekly average stringency index
weekly_covid_state_df = covid_state_df %>%
    mutate(num_days = 1) %>% #One row per date, so the sum below counts the days in each week
    select(-Date) %>%
    group_by(RegionName, RegionCode, epiweek, WeekStart, WeekMid, WeekEnd) %>%
    summarise(num_days = sum(num_days), StringencyIndex = mean(StringencyIndex)) %>%
    ungroup() %>%
    rename(uf = RegionCode, uf_name = RegionName) %>%
    mutate(
        Year = as.integer(str_sub(epiweek, 1, 4)),
        Week = as.integer(str_sub(epiweek, 5, 6))
    )
weekly_covid_state_df

# 6. Processing SST Indices
We also process the SST indices so that we can look into forecasting them and using them as covariates in the model.

In [None]:
#Size of the plots rendered in the notebook
options(repr.plot.width = 12, repr.plot.height = 5)

#One panel per SST index, plotting the raw values against their date
sst_df %>%
    pivot_longer(cols = -Date, names_to = "IndexName", values_to = "Value") %>%
    ggplot(mapping = aes(x = Date, y = Value)) +
    geom_point() +
    facet_wrap(~IndexName) +
    theme(text = element_text(size = 14))

In [None]:
#First and last dates with SST index values
interp_date_start = min(sst_df$Date)
interp_date_end = max(sst_df$Date)

#Date range to generate daily values for
to_interp_dates = seq(from = interp_date_start, to = interp_date_end, by = "day")

#Use linear interpolation to fill in gaps in the SST indices
enso_approx = approx(x = sst_df$Date, y = sst_df$enso, xout = to_interp_dates)
iod_approx = approx(x = sst_df$Date, y = sst_df$iod, xout = to_interp_dates)
pdo_approx = approx(x = sst_df$Date, y = sst_df$pdo, xout = to_interp_dates)

#Daily table of interpolated values; num_days = 1 lets the weekly sum count days
sst_filled_df = data.frame(
    Date = to_interp_dates,
    enso = enso_approx$y,
    iod = iod_approx$y,
    pdo = pdo_approx$y,
    num_days = 1
)

weekly_sst_df = sst_filled_df %>%
    #Join to calendar
    left_join(week_cal, by = join_by(between(Date, WeekStart, WeekEnd))) %>%
    #Remove those with no epiweek (outside study period) and remove date column
    filter(!is.na(epiweek)) %>%
    select(-Date) %>%
    group_by(epiweek, WeekStart, WeekMid, WeekEnd) %>%
    #Get average values
    summarise(enso = mean(enso), iod = mean(iod), pdo = mean(pdo), num_days = sum(num_days)) %>%
    #Split the Year and Week values
    mutate(
        Year = as.integer(str_sub(epiweek, 1, 4)),
        Week = as.integer(str_sub(epiweek, 5, 6))
    ) %>%
    select(Year, Week, epiweek, WeekStart, WeekMid, WeekEnd, enso, iod, pdo, num_days)
weekly_sst_df

# 7. Putting Things Together and Outputting Data

In [None]:
#Attach the COVID Stringency Index to the climate data to form the time-varying covariates dataframe
covid_index_attach = select(weekly_covid_state_df, uf, Year, Week, StringencyIndex)

time_varying_covs_df = state_clim_df %>%
    left_join(covid_index_attach, by = join_by(uf, Year, Week)) %>%
    #State-weeks without a stringency value get an index of 0
    replace_na(list(StringencyIndex = 0))

In [None]:
#Display the time-varying covariates (state-level climate values plus StringencyIndex) for inspection
time_varying_covs_df

In [None]:
#Final cases table: rename date -> Date and casos -> Cases while fixing the column order
main_cases_df = cases_df %>%
    select(
        Date = date, epiweek, Year, Week, uf, uf_name, Cases = casos,
        train_1, target_1, train_2, target_2, train_3, target_3
    )
main_cases_df

In [None]:
#Epiweek calendar with integer Year and Week columns split out of epiweek
calendar = week_cal %>%
    mutate(
        Year = as.integer(str_sub(epiweek, 1, 4)),
        Week = as.integer(str_sub(epiweek, 5, 6))
    )

#Distinct combinations of week and train/target flags found in the cases table
set_info = main_cases_df %>%
    distinct(Year, Week, train_1, target_1, train_2, target_2, train_3, target_3)

#Weeks that do not appear in the cases table get FALSE for every flag
calendar = calendar %>%
    left_join(set_info, by = c("Year", "Week")) %>%
    replace_na(list(
        train_1 = FALSE, target_1 = FALSE,
        train_2 = FALSE, target_2 = FALSE,
        train_3 = FALSE, target_3 = FALSE
    ))
calendar

In [None]:
# #Write values to csv files
# main_cases_df %>% write.csv(file.path(output_dir, "DengueCases.csv"), row.names = FALSE) #CSV file with cases
# time_varying_covs_df %>% write.csv(file.path(output_dir, "TimeVaryingCovs.csv"), row.names = FALSE) #CSV file with time-varying covariates (climate + COVID-19 stringency index)
# merged_stats_df %>% write.csv(file.path(output_dir, "StaticCovs.csv"), row.names = FALSE) #CSV file with static covariate information - Koppen climate and Brazilian biomes
# weekly_sst_df%>% write.csv(file.path(output_dir, "SSTIndices.csv"), row.names = FALSE) #CSV file containing interpolated - weekly aggregated SST indices
# calendar %>% write.csv(file.path(output_dir, "Calendar.csv"), row.names = FALSE) #CSV file containing epiweek calendar information