# 1. Data preprocessing

In this notebook, we preprocess the data, the procedure of which includes the following:

* Remove duplicate entries.

* Check data size -> Data size is too small.
    * Use other cities' data as proxy cities.
    * Proxy cities: Have high correlation in bloom_doy to Kyoto.

* Identify the nearest NOAA weather stations for the proxy cities.

* Download the weather data.

In [1]:
# Load necessary packages 
library(tidyverse)
library(yaml)
library(rnoaa)
library(mice)

configs <- read_yaml("./_config.yaml") 
comp_data_dir <- configs$competition_data   # competition data
data_dir <- configs$data_dir                # output data

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.1     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘mice’


The following object is masked from ‘package:stats’:

    filter


The following objects are masked from ‘package:base’:

    cbind, rbind



## 1a. Is the data large enough?

To apply ML methods, we must have enough data size. 

Here, we check whether we have enough data for Kyoto.

In [2]:
kyoto <- read.csv(paste0(comp_data_dir, "/kyoto.csv")) %>%
    rename(c("city" = location)) %>%
    mutate(bloom_date = as.Date(bloom_date, format = "%Y-%m-%d")) %>%
    mutate(city = "Kyoto")
head(kyoto)
print(nrow(kyoto))

Unnamed: 0_level_0,city,lat,long,alt,year,bloom_date,bloom_doy
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<int>,<int>,<date>,<int>
1,Kyoto,35.01198,135.6761,44,812,812-04-01,92
2,Kyoto,35.01198,135.6761,44,815,815-04-15,105
3,Kyoto,35.01198,135.6761,44,831,831-04-06,96
4,Kyoto,35.01198,135.6761,44,851,851-04-18,108
5,Kyoto,35.01198,135.6761,44,853,853-04-14,104
6,Kyoto,35.01198,135.6761,44,864,864-04-09,100


[1] 835


For temperature-based analyses, we need historical temperature data, and to obtain them we need to find out where the closest NOAA weather station is to Kyoto.

In [3]:
# Pull the list of weather stations.
weather_stations <- ghcnd_stations() %>%
    filter(last_year %in% c(2022,2023)) %>%
    distinct(id, .keep_all = TRUE) %>%
    filter(str_sub(id, 1, 2) %in% c("JA"))
head(weather_stations)

using cached file: ~/.cache/R/noaa_ghcnd/ghcnd-stations.rds

date created (size, mb): 2023-02-06 22:22:18 (2.228)

using cached file: ~/.cache/R/noaa_ghcnd/ghcnd-inventory.rds

date created (size, mb): 2023-02-07 23:34:57 (2.757)



id,latitude,longitude,elevation,state,name,gsn_flag,wmo_id,element,first_year,last_year
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>
JA000047401,45.417,141.683,12,,WAKKANAI,GSN,47401,TMAX,1951,2022
JA000047402,44.933,142.583,8,,KITAMIESASHI,,47402,TMAX,1951,2023
JA000047404,44.367,141.7,10,,HABORO,,47404,TMAX,1951,2023
JA000047405,44.583,142.967,15,,OMU,,47405,TMAX,1951,2023
JA000047406,43.95,141.633,28,,RUMOI,,47406,TMAX,1951,2023
JA000047407,43.767,142.367,116,,ASAHIKAWA,,47407,TMAX,1951,2022


In [4]:
temp_station <- weather_stations %>%
    mutate(lat = latitude) %>%
    mutate(long = longitude) %>%
    mutate(alt = elevation) %>%
    rename_with(~"city", id) %>%
    dplyr::select(city, lat, long, alt) %>%
    data.frame()

kyoto_stations <- rbind(kyoto[1, c("city", "lat", "long", "alt")], temp_station)
head(kyoto_stations)

Unnamed: 0_level_0,city,lat,long,alt
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>
1,Kyoto,35.01198,135.6761,44
2,JA000047401,45.417,141.683,12
3,JA000047402,44.933,142.583,8
4,JA000047404,44.367,141.7,10
5,JA000047405,44.583,142.967,15
6,JA000047406,43.95,141.633,28


In [5]:
# Compute the Euclidean distances between Kyoto and the stations.
kyoto_stations$dist <- apply(kyoto_stations, MARGIN = 1, FUN = function(row){
    geo = c("lat", "long")
    x = kyoto_stations[kyoto_stations$city == "Kyoto", geo]
    y = row[geo]
    dist = as.numeric(dist(rbind(x, y)))
    return(dist)
    })

kyoto_stations %>% arrange(dist) %>% head()

Unnamed: 0_level_0,city,lat,long,alt,dist
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,Kyoto,35.01198,135.6761,44,1.385135e-05
2,JA000047759,35.017,135.733,46,0.0571073
3,JA000047780,34.7,135.833,106,0.3492089
4,JAM00047772,34.683,135.517,83,0.3654408
5,JA000047649,34.767,136.15,161,0.5334652
6,JA000047770,34.7,135.217,30,0.5550844


The station id that is closest to Kyoto is "JA000047759".

Now, pull the weather data from NOAA.

In [6]:
kyoto_id <- kyoto_stations[2, "city"]    # JA000047759

kyoto_temp <- ghcnd_search(stationid = kyoto_id, var = c("TMAX", "TMIN", "PRCP"), date_min = "1900-01-01", date_max = "2024-05-30") %>%
    purrr::reduce(left_join, by = "date") %>%
    dplyr::select(id.x, date, tmax, tmin, prcp) %>%
    dplyr::rename_with(~ "id", id.x) %>%
    mutate(tmax = tmax/10) %>%      # in C
    mutate(tmin = tmin/10) %>%      # in C
    mutate(prcp = prcp/10) %>%      # in mm
    mutate(year = format(date, "%Y")) %>%
    mutate(month = as.integer(strftime(date, '%m'))) %>%
    mutate(day = as.integer(strftime(date, '%d')))
head(kyoto_temp)

using cached file: ~/.cache/R/noaa_ghcnd/JA000047401.dly

date created (size, mb): 2023-02-18 19:46:08 (1.738)



file min/max dates: 1951-01-01 / 2022-10-31



id,date,tmax,tmin,prcp,year,month,day
<chr>,<date>,<dbl>,<dbl>,<dbl>,<chr>,<int>,<int>
JA000047401,1951-01-01,-2.7,-7.1,2.5,1951,1,1
JA000047401,1951-01-02,-6.3,-7.9,2.5,1951,1,2
JA000047401,1951-01-03,-6.4,-8.9,1.6,1951,1,3
JA000047401,1951-01-04,-8.1,-10.1,0.9,1951,1,4
JA000047401,1951-01-05,-6.2,-9.6,3.0,1951,1,5
JA000047401,1951-01-06,-7.3,-9.1,0.2,1951,1,6


The earliest possible year for analyses is 1951. Hence, trim the bloom data accordingly.

In [7]:
kyoto <- kyoto %>%
    filter(year >= 1951) %>%
    mutate(bloom_date = as.Date(bloom_date, format = "%Y-%m-%d"))
print(dim(kyoto))
head(kyoto)

[1] 73  7


Unnamed: 0_level_0,city,lat,long,alt,year,bloom_date,bloom_doy
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<int>,<int>,<date>,<int>
1,Kyoto,35.01198,135.6761,44,1951,1951-04-08,98
2,Kyoto,35.01198,135.6761,44,1952,1952-04-14,105
3,Kyoto,35.01198,135.6761,44,1953,1953-04-11,101
4,Kyoto,35.01198,135.6761,44,1954,1954-04-08,98
5,Kyoto,35.01198,135.6761,44,1955,1955-04-07,97
6,Kyoto,35.01198,135.6761,44,1956,1956-04-08,99


Now we have only 73 observed cherry blossom bloom days to work with, which is too small to train ML models.

One way to increase data size is to bring data from other cities that are similar to Kyoto, namely the 'proxy' cities.

Here, we define the proxies as the cities with high correlations to Kyoto in bloom_doy.

In [8]:
# Load data
japan <- read.csv(paste0(comp_data_dir, "/japan.csv")) %>%
    mutate(city = str_split(location, pattern = "/", simplify = TRUE)[, 2]) %>%
    filter(city != "Kyoto") %>%
    dplyr::select(-location) %>%
    relocate(city) %>%
    mutate(bloom_date = as.Date(bloom_date, format = "%Y-%m-%d")) %>%
    rbind(., kyoto)

## 1b. Remove duplicated entries

We noticed that some entries in the competition data are duplicated.

Here, we identify them and eliminate them.

In [9]:
# Find cities with duplicate entries
japan_dups <- japan %>% group_by(city, year) %>%
    summarise(n = n()) %>%
    filter(n >= 2) %>%
    distinct(city, n)
japan_dups

[1m[22m`summarise()` has grouped output by 'city'. You can override using the
`.groups` argument.


city,n
<chr>,<int>
Akita,2
Izuhara,2
Kochi,2
Kushiro,2
Muroran,2
Nagoya,2
Naze,2
Sendai,2
Shionomisaki,2
Tottori,2


In [10]:
# Remove the duplicated entries from the japan data.

# - Assign identifiers
japan$identifier <- paste0(as.character(japan$city), as.character(japan$lat), as.character(japan$long), as.character(japan$alt))

# - Create a unique list of cities, along with their identifiers.
japan_cities <- japan %>%
    dplyr::select(city, lat, long, alt) %>%
    group_by(city, .drop = FALSE) %>%
    filter(row_number(lat) == 1)
japan_cities$identifier <- paste0(as.character(japan_cities$city), as.character(japan_cities$lat), as.character(japan_cities$long), as.character(japan_cities$alt))

# - Extract distinct rows using the identifiers
japan <- japan %>%
    filter(identifier %in% japan_cities$identifier) %>%
    dplyr::select(-identifier) %>%
    distinct()

# - Check dups again
japan %>% group_by(city, year) %>%
    summarise(n = n()) %>%
    filter(n >= 2) %>%
    distinct(city, n)
head(japan)

[1m[22m`summarise()` has grouped output by 'city'. You can override using the
`.groups` argument.


city,n
<chr>,<int>


Unnamed: 0_level_0,city,lat,long,alt,year,bloom_date,bloom_doy
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<int>,<date>,<int>
1,Wakkanai,45.415,141.6789,2.85,1953,1953-05-30,150
2,Wakkanai,45.415,141.6789,2.85,1954,1954-05-27,147
3,Wakkanai,45.415,141.6789,2.85,1955,1955-05-23,143
4,Wakkanai,45.415,141.6789,2.85,1956,1956-05-14,135
5,Wakkanai,45.415,141.6789,2.85,1957,1957-05-22,142
6,Wakkanai,45.415,141.6789,2.85,1958,1958-05-25,145


## 1c. Find proxy cities

One way to increase data size is by incorporating data from other cities that are similar to Kyoto.

Here, we define the proxy cities as where their bloom_doy are highly correlated ($R^2$) with Kyoto's.

In [11]:
japan_cities$corr <- apply(
    japan_cities, MARGIN = 1
    , FUN = function(row) {
        if (row["city"] == "Kyoto") {
            return(1)
        }

        row_df <- japan[japan$city %in% c("Kyoto", as.character(row["city"])), c("year", "city", "bloom_doy")] %>% 
            pivot_wider(names_from = "city", values_from = "bloom_doy") %>%
            drop_na()
        
        x = row_df[, "Kyoto"]
        y = row_df[, as.character(row["city"])]
        
        city_cor = as.numeric(cor(x, y))
        return(city_cor)
    })

# Pull those with R^2 > 0.8 to Kyoto.
japan_tops <- japan_cities %>%
    arrange(desc(corr)) %>%
    dplyr::select(-identifier) %>%
    filter(corr > 0.80) %>%
    as.data.frame()

# display(japan_tops)
print(nrow(japan_tops))
head(japan_tops)

[1] 43


Unnamed: 0_level_0,city,lat,long,alt,corr
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,Kyoto,35.01198,135.6761,44.0,1.0
2,Nara,34.69389,135.8278,104.4,0.9269058
3,Okayama,34.68583,133.9253,5.32,0.9249229
4,Hikone,35.27611,136.2439,87.3,0.9191706
5,Tottori,35.48778,134.2383,7.1,0.9026952
6,Maebashi,36.40528,139.0606,112.1,0.8982915


bloom_doy of 42 cities are highly correlated with that of Kyoto.

Next, we find their closest weather stations, and pull their historical weather data.

In [12]:
cherry_sub = japan %>%
    filter(city %in% japan_tops$city) %>%
    mutate(bloom_date = as.Date(bloom_date, format = "%Y-%m-%d"))

temp_df <- cherry_sub %>%
    dplyr::select(city, lat, long, alt) %>%
    data.frame()

temp_station <- weather_stations %>%
    mutate(lat = latitude) %>%
    mutate(long = longitude) %>%
    mutate(alt = elevation) %>%
    rename_with(~"city", id) %>%
    dplyr::select(city, lat, long, alt) %>%
    data.frame()

# Placeholder for the resulting pairs
city_station_pair <- data.frame(
    matrix(NA, nrow = 0, ncol = 3
        , dimnames = list(NULL, c("city", "id", "dist"))))

target_cities <- unique(cherry_sub$city)

for (c in seq_len(length(target_cities))) {
    
    skip_to_next <- 0
    ct <- target_cities[c]
    
    # Replace any special characters in the city names
    ct_converted <- str_replace(str_replace(str_replace(str_replace(ct, "-", "."), " ", "."), ",", "."), "'",".")
    tryCatch(
        {
            # rbind the city's geographical features and the weather stations'.
            temp_merged <- temp_df %>% 
                filter(city == ct) %>%
                dplyr::select(city, lat, long, alt) %>%
                distinct() %>%
                rbind(., temp_station) %>%
                data.frame(.)
        
            # Compute the Euclidean distance between the city and the stations.
            temp_merged$dist <- apply(temp_merged, MARGIN = 1, FUN = function(row){
                geo = c("lat", "long")
                x = temp_merged[temp_merged$city == ct, geo]
                y = row[geo]
                dist = as.numeric(dist(rbind(x, y)))
                return(dist)
                })
            
            # Sort the rows by ascending dist.
            temp_merged <- temp_merged %>%
                arrange(dist)

            # The first row is the city itself. Select the second row as the closest weather station.        
            station_id <- temp_merged[2, "city"]
            station_dist <- temp_merged[2, "dist"]
        }
        
        , error = function(e) skip_to_next <<-1
    )
    if (skip_to_next == 1) {
        # If error, skip to the next city.
        next
    }

    city_station_pair[nrow(city_station_pair) + 1, ] <- c(ct, station_id, station_dist)
}
city_station_pairs <- city_station_pair %>% 
    mutate(dist = as.numeric(dist)) %>%
    filter(dist < 2) %>%   # Only include the pairs that are close enough.
    arrange(dist) %>%
    group_by(id) %>%
    filter(row_number(id) == 1) %>%  # Remove any duplicated rows.
    as.data.frame(.)

head(city_station_pairs)

Unnamed: 0_level_0,city,id,dist
Unnamed: 0_level_1,<chr>,<chr>,<dbl>
1,Utsunomiya,JA000047615,0.001361961
2,Nagoya,JA000047636,0.001640536
3,Mito,JA000047629,0.002163816
4,Osaka,JAM00047772,0.002302039
5,Shirakawa,JA000047597,0.002403701
6,Tsu,JA000047651,0.00296169


In [13]:
print(length(unique(city_station_pairs$city)))
print(length(unique(city_station_pairs$id)))

[1] 42
[1] 42


In the weather data, there could some missing values.

We impute the missing values using the 'pmm' method, provided in the mice package.

In [14]:
# Define functions to pull (imputed if missing) temperature data
F01_get_temperature <- function(stationid, date_min = "1950-01-01", date_max = "2023-05-31") {

    dat <- ghcnd_search(stationid = stationid, var = c("TMAX", "TMIN", "PRCP"), 
               date_min = date_min, date_max = date_max) %>%
               purrr::reduce(left_join, by = "date") %>%
               dplyr::select(id.x, date, tmax, tmin, prcp) %>%
               dplyr::rename_with(~ "id", id.x) %>%
               mutate(tmax = tmax/10) %>%      # in C
               mutate(tmin = tmin/10) %>%      # in C
               mutate(prcp = prcp/10) %>%      # in mm
               mutate(year = format(date, "%Y")) %>%
               mutate(month = as.integer(strftime(date, '%m'))) %>%
               mutate(day = as.integer(strftime(date, '%d')))
    
    return(dat)
}

F01_get_imp_temperature <- function(city_station_pair, date_min = "1950-01-01", date_max = "2023-05-31", imp_method = "pmm") {

    station_ids <- city_station_pair$id
    city_temp_list <- list()

    for (c in seq_len(length(station_ids))) {

        skip_to_next <- 0
        
        temp_df <- tryCatch(
            {F01_get_temperature(station_ids[c]
            , date_min = date_min
            , date_max = date_max)
            }
        , error = function(x) skip_to_next <<-1 )
        
        if (skip_to_next == 1 ){
            next
        }
        # Impute missing data
        # - check missing data
        n_missing <- sum(is.na(temp_df[, c("tmax", "tmin", "prcp")]))

        if (n_missing > 0) {
            tempData <- mice(temp_df, m = 3, method = imp_method)

            # complete set
            imputed_temp <- complete(tempData, 3)
        
        } else {
            imputed_temp <- temp_df
        }
        city_temp_list[[c]] <- imputed_temp
    }
    out <- city_temp_list %>% bind_rows()
    return(out)
}


In [15]:
japan_temp_file = paste0(data_dir, "/A11_japan_temperatures.csv")

if (file.exists(japan_temp_file)) {
    cherry_temp_raw <- data.frame(data.table::fread(japan_temp_file))
} else {
    cherry_temp_raw <- F01_get_imp_temperature(
    # pull weather data for the cities listed in city_station_pairs. Impute missing tmax, tmin, prcp
    city_station_pairs
    , date_max = "2024-05-31"
    )
    write.csv(cherry_temp_raw, paste0(data_dir, "/A11_japan_temperatures.csv"), row.names=FALSE)
}

head(cherry_temp_raw)

Unnamed: 0_level_0,id,date,tmax,tmin,prcp,year,month,day
Unnamed: 0_level_1,<chr>,<IDate>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>
1,JA000047615,1951-01-01,6.4,-6.4,0,1951,1,1
2,JA000047615,1951-01-02,5.0,-9.3,0,1951,1,2
3,JA000047615,1951-01-03,5.8,-9.1,0,1951,1,3
4,JA000047615,1951-01-04,6.9,-7.4,0,1951,1,4
5,JA000047615,1951-01-05,8.2,-8.0,0,1951,1,5
6,JA000047615,1951-01-06,5.6,-7.5,0,1951,1,6


In [16]:
# Exclude years which do not have temperature data for the entire year.
cherry_temp_n <- cherry_temp_raw %>%
    group_by(id, year) %>%
    summarise(n = n()) %>%
    filter(n >=365) %>%
    merge(y = city_station_pairs[, c("id", "city")], by = "id", how = "inner") %>%
    merge(y = cherry_sub[, c("city", "year", "bloom_doy", "bloom_date", "lat", "long", "alt")], by = c("city", "year"), how = "inner") %>%
    mutate(bloom_date = as.Date(bloom_date, format = "%Y-%m-%d"))

head(cherry_temp_n)

[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.


Unnamed: 0_level_0,city,year,id,n,bloom_doy,bloom_date,lat,long,alt
Unnamed: 0_level_1,<chr>,<int>,<chr>,<int>,<int>,<date>,<dbl>,<dbl>,<dbl>
1,Fukui,1953,JA000047616,365,102,1953-04-12,36.05556,136.2225,8.8
2,Fukui,1954,JA000047616,365,96,1954-04-06,36.05556,136.2225,8.8
3,Fukui,1955,JA000047616,365,99,1955-04-09,36.05556,136.2225,8.8
4,Fukui,1956,JA000047616,366,105,1956-04-14,36.05556,136.2225,8.8
5,Fukui,1957,JA000047616,365,107,1957-04-17,36.05556,136.2225,8.8
6,Fukui,1958,JA000047616,365,97,1958-04-07,36.05556,136.2225,8.8


In [17]:
# Extract id and year pairs from cherry_temp_raw that are included in cherry_temp_n
cherry_temp <- cherry_temp_raw %>%
    merge(y = cherry_temp_n[, c("id", "year", "city", "bloom_doy", "bloom_date", "lat", "long", "alt")], by = c("id", "year"), all.x = TRUE) %>%
    drop_na(city) %>%
    mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%
    mutate(bloom_date = as.Date(bloom_date, format = "%Y-%m-%d")) %>%
    arrange(id, date)
    
head(cherry_temp)

Unnamed: 0_level_0,id,year,date,tmax,tmin,prcp,month,day,city,bloom_doy,bloom_date,lat,long,alt
Unnamed: 0_level_1,<chr>,<int>,<date>,<dbl>,<dbl>,<dbl>,<int>,<int>,<chr>,<int>,<date>,<dbl>,<dbl>,<dbl>
1,JA000047587,1971,1971-01-01,5.0,0.7,10.5,1,1,Sakata,114,1971-04-24,38.90861,139.8433,3.14
2,JA000047587,1971,1971-01-02,4.1,-0.9,1.0,1,2,Sakata,114,1971-04-24,38.90861,139.8433,3.14
3,JA000047587,1971,1971-01-03,1.9,-2.7,0.0,1,3,Sakata,114,1971-04-24,38.90861,139.8433,3.14
4,JA000047587,1971,1971-01-04,5.3,0.1,6.5,1,4,Sakata,114,1971-04-24,38.90861,139.8433,3.14
5,JA000047587,1971,1971-01-05,2.6,-3.5,12.5,1,5,Sakata,114,1971-04-24,38.90861,139.8433,3.14
6,JA000047587,1971,1971-01-06,2.3,-4.1,2.5,1,6,Sakata,114,1971-04-24,38.90861,139.8433,3.14


In [18]:
# Save the japan temperature data
write.csv(cherry_temp, paste0(data_dir, "/A11_japan_temperatures2.csv"), row.names = FALSE)

## 1d. Repeat the procedure for Liestal

In [19]:
Liestal <- read.csv(paste0(comp_data_dir, "/liestal.csv")) %>%
    rename(c("city" = location)) %>%
    mutate(bloom_date = as.Date(bloom_date, format = "%Y-%m-%d")) %>%
    mutate(city = "Liestal")
min_year <- min(Liestal$year)
# head(liestal)
# print(nrow(liestal))

# Pull the list of weather stations.
# - Switzerland is close to Germany, so pull both SZ, GM, and FR data
weather_stations <- ghcnd_stations() %>%
    filter(last_year %in% c(2022,2023, 2024)) %>%
    distinct(id, .keep_all = TRUE) %>%
    filter(str_sub(id, 1, 2) %in% c("SZ", "GM", "FR"))
# head(weather_stations)

temp_station <- weather_stations %>%
    mutate(lat = latitude) %>%
    mutate(long = longitude) %>%
    mutate(alt = elevation) %>%
    rename_with(~"city", id) %>%
    dplyr::select(city, lat, long, alt) %>%
    data.frame()

Liestal_stations <- rbind(Liestal[1, c("city", "lat", "long", "alt")], temp_station)
# head(Liestal_stations)

# Compute the Euclidean distances between Liestal and the stations.
Liestal_stations$dist <- apply(Liestal_stations, MARGIN = 1, FUN = function(row){
    geo = c("lat", "long", "alt")
    x = Liestal_stations[Liestal_stations$city == "Liestal", geo]
    y = row[geo]
    dist = as.numeric(dist(rbind(x, y)))
    return(dist)
    })


using cached file: ~/.cache/R/noaa_ghcnd/ghcnd-stations.rds

date created (size, mb): 2023-02-06 22:22:18 (2.228)



using cached file: ~/.cache/R/noaa_ghcnd/ghcnd-inventory.rds

date created (size, mb): 2023-02-07 23:34:57 (2.757)



In [20]:
# Liestal_stations %>% arrange(dist) %>% head()
Liestal_id <- "GME00127786"
Liestal_stations[Liestal_stations$city == "Liestal", "id"] <- Liestal_id
Liestal_temp <- ghcnd_search(stationid = Liestal_id, var = c("TMAX", "TMIN", "PRCP"), date_min = min_year, date_max = "2024-05-30") %>%
    purrr::reduce(left_join, by = "date") %>%
    dplyr::select(id.x, date, tmax, tmin, prcp) %>%
    dplyr::rename_with(~ "id", id.x) %>%
    mutate(tmax = tmax/10) %>%      # in C
    mutate(tmin = tmin/10) %>%      # in C
    mutate(prcp = prcp/10) %>%      # in mm
    mutate(year = format(date, "%Y")) %>%
    mutate(month = as.integer(strftime(date, '%m'))) %>%
    mutate(day = as.integer(strftime(date, '%d')))

# head(Liestal_temp)
Liestal <- Liestal %>%
    filter(year >= min_year)
# print(dim(Liestal))

using cached file: ~/.cache/R/noaa_ghcnd/GME00127786.dly

date created (size, mb): 2023-02-06 22:23:42 (1.573)



file min/max dates: 1953-09-01 / 2022-10-31



In [21]:
# Load data
meteoswiss <- read.csv(paste0(comp_data_dir, "/meteoswiss.csv")) %>%
    mutate(city = str_split(location, pattern = "/", simplify = TRUE)[, 2]) %>%
    filter(city != "Liestal") %>%
    dplyr::select(-location) %>%
    relocate(city) %>%
    mutate(bloom_date = as.Date(bloom_date, format = "%Y-%m-%d")) %>%
    rbind(., Liestal)
tail(meteoswiss)

Unnamed: 0_level_0,city,lat,long,alt,year,bloom_date,bloom_doy
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<int>,<int>,<date>,<int>
6728,Liestal,47.4814,7.730519,350,2018,2018-04-08,98
6729,Liestal,47.4814,7.730519,350,2019,2019-03-27,86
6730,Liestal,47.4814,7.730519,350,2020,2020-03-17,77
6731,Liestal,47.4814,7.730519,350,2021,2021-03-28,87
6732,Liestal,47.4814,7.730519,350,2022,2022-03-26,85
6733,Liestal,47.4814,7.730519,350,2023,2023-03-29,88


In [22]:
# Find cities with duplicate entries
meteoswiss_dups <- meteoswiss %>% group_by(city, year) %>%
    summarise(n = n()) %>%
    filter(n >= 2) %>%
    distinct(city, n)
# meteoswiss_dups


# Remove the duplicated entries from the meteoswiss data.
# - Assign identifiers
meteoswiss$identifier <- paste0(as.character(meteoswiss$city), as.character(meteoswiss$lat), as.character(meteoswiss$long), as.character(meteoswiss$alt))

# - Create a unique list of cities, along with their identifiers.
meteoswiss_cities <- meteoswiss %>%
    dplyr::select(city, lat, long, alt) %>%
    group_by(city, .drop = FALSE) %>%
    filter(row_number(lat) == 1)
meteoswiss_cities$identifier <- paste0(as.character(meteoswiss_cities$city), as.character(meteoswiss_cities$lat), as.character(meteoswiss_cities$long), as.character(meteoswiss_cities$alt))

# - Extract distinct rows using the identifiers
meteoswiss <- meteoswiss %>%
    filter(identifier %in% meteoswiss_cities$identifier) %>%
    dplyr::select(-identifier) %>%
    distinct()

# - Check dups again
meteoswiss %>% group_by(city, year) %>%
    summarise(n = n()) %>%
    filter(n >= 2) %>%
    distinct(city, n)

[1m[22m`summarise()` has grouped output by 'city'. You can override using the
`.groups` argument.
[1m[22m`summarise()` has grouped output by 'city'. You can override using the
`.groups` argument.


city,n
<chr>,<int>


In [23]:
tail(meteoswiss)

Unnamed: 0_level_0,city,lat,long,alt,year,bloom_date,bloom_doy
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<int>,<int>,<date>,<int>
6728,Liestal,47.4814,7.730519,350,2018,2018-04-08,98
6729,Liestal,47.4814,7.730519,350,2019,2019-03-27,86
6730,Liestal,47.4814,7.730519,350,2020,2020-03-17,77
6731,Liestal,47.4814,7.730519,350,2021,2021-03-28,87
6732,Liestal,47.4814,7.730519,350,2022,2022-03-26,85
6733,Liestal,47.4814,7.730519,350,2023,2023-03-29,88


In [24]:
# Compute correlations
meteoswiss_cities$corr <- apply(
    meteoswiss_cities, MARGIN = 1
    , FUN = function(row) {
        if (row["city"] == "Liestal") {
            return(1)
        }

        row_df <- meteoswiss[meteoswiss$city %in% c("Liestal", as.character(row["city"])), c("year", "city", "bloom_doy")] %>% 
            pivot_wider(names_from = "city", values_from = "bloom_doy") %>%
            drop_na()
        
        x = row_df[, "Liestal"]
        y = row_df[, as.character(row["city"])]
        
        city_cor = as.numeric(cor(x, y))
        return(city_cor)
    })


In [25]:
# Pull those with R^2 > 0.6 to Liestal.
meteoswiss_tops <- meteoswiss_cities %>%
    arrange(desc(corr)) %>%
    dplyr::select(-identifier) %>%
    filter(corr > 0.6) %>%
    as.data.frame()
dim(meteoswiss_tops)

row_liestal <- which(meteoswiss_tops$city == "Liestal")
if (row_liestal != 1){
    temp_row <- meteoswiss_tops[1, ]
    meteoswiss_tops[1, ] <- meteoswiss_tops[row_liestal, ]
    meteoswiss_tops[row_liestal, ] <- temp_row
}
head(meteoswiss_tops)

“input string 1 is invalid in this locale”
“input string 1 is invalid in this locale”
“input string 1 is invalid in this locale”
ERROR while rich displaying an object: Error in gsub(chr, html_specials[[chr]], text, fixed = TRUE): input string 1 is invalid in this locale

Traceback:
1. tryCatch(withCallingHandlers({
 .     if (!mime %in% names(repr::mime2repr)) 
 .         stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
 .     rpr <- repr::mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         return(NULL)
 .     prepare_content(is.raw(rpr), rpr)
 . }, error = error_handler), error = outer_handler)
2. tryCatchList(expr, classes, parentenv, handlers)
3. tryCatchOne(expr, names, parentenv, handlers[[1L]])
4. doTryCatch(return(expr), name, parentenv, handler)
5. withCallingHandlers({
 .     if (!mime %in% names(repr::mime2repr)) 
 .         stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
 .     rpr <- repr::mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 . 

Unnamed: 0_level_0,city,lat,long,alt,corr
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<int>,<dbl>
1,Liestal,47.4814,7.730519,350,1.0
2,Mellingen,47.42098,8.273286,356,1.0
3,Faido,46.47788,8.800878,715,1.0
4,Dornach,47.48164,7.611114,300,0.9403232
5,Basel-Binningen,47.54859,7.582372,315,0.9323295
6,Zrich-MeteoSchweiz,47.37814,8.565853,555,0.9317979


In [26]:
# Download temperature data for the proxy cities.
cherry_sub = meteoswiss %>%
    filter(city %in% meteoswiss_tops$city) %>%
    mutate(bloom_date = as.Date(bloom_date, format = "%Y-%m-%d"))

temp_df <- cherry_sub %>%
    dplyr::select(city, lat, long, alt) %>%
    data.frame()

temp_station <- weather_stations %>%
    mutate(lat = latitude) %>%
    mutate(long = longitude) %>%
    mutate(alt = elevation) %>%
    rename_with(~"city", id) %>%
    dplyr::select(city, lat, long, alt) %>%
    data.frame()


# Placeholder for the resulting pairs
city_station_pair <- data.frame(
    matrix(NA, nrow = 0, ncol = 3
        , dimnames = list(NULL, c("city", "id", "dist"))))

target_cities <- unique(cherry_sub$city)

c_liestal <- which(target_cities == "Liestal")
if (c_liestal != 1){
    temp_city <- target_cities[1]
    target_cities[1] <- target_cities[c_liestal]
    target_cities[c_liestal] <- temp_city
}
target_cities

In [27]:
for (c in seq_len(length(target_cities))) {
    
    skip_to_next <- 0
    ct <- target_cities[c]
    
    # Replace any special characters in the city names
    ct_converted <- str_replace(str_replace(str_replace(str_replace(ct, "-", "."), " ", "."), ",", "."), "'",".")
    tryCatch(
        {
            # rbind the city's geographical features and the weather stations'.
            temp_merged <- temp_df %>% 
                filter(city == ct) %>%
                dplyr::select(city, lat, long, alt) %>%
                distinct() %>%
                rbind(., temp_station) %>%
                data.frame(.)
        
            # Compute the Euclidean distance between the city and the stations.
            temp_merged$dist <- apply(temp_merged, MARGIN = 1, FUN = function(row){
                geo = c("lat", "long")
                x = temp_merged[temp_merged$city == ct, geo]
                y = row[geo]
                dist = as.numeric(dist(rbind(x, y)))
                return(dist)
                })
            
            # Sort the rows by ascending dist.
            temp_merged <- temp_merged %>%
                arrange(dist)

            # The first row is the city itself. Select the second row as the closest weather station.        
            
            station_id <- temp_merged[2, "city"]
            station_dist <- temp_merged[2, "dist"]
            idx = 2
            while (station_id %in% city_station_pair$id) {
                
                idx = idx + 1
                station_id <- temp_merged[idx, "city"]
                station_dist <- temp_merged[idx, "dist"]

            }
        }
        
        , error = function(e) skip_to_next <<-1
    )
    if (skip_to_next == 1) {
        # If error, skip to the next city.
        next
    }

    city_station_pair[nrow(city_station_pair) + 1, ] <- c(ct, station_id, station_dist)
}


In [28]:
city_station_pairs <- city_station_pair %>% 
    mutate(dist = as.numeric(dist)) %>%
    filter(dist < 1) %>%   # Only include the pairs that are close enough.
    arrange(dist) %>%
    group_by(id) %>%
    as.data.frame(.)

row_liestal = which(city_station_pairs$city == "Liestal")
if (row_liestal != 1){
    temp_row <- city_station_pairs[1, ]
    city_station_pairs[1, ] <- city_station_pairs[row_liestal, ]
    city_station_pairs[row_liestal, ] <- temp_row
}
dim(city_station_pairs)

In [29]:
meteoswiss_temp_file = paste0(data_dir, "/A21_meteoswiss_temperatures.csv")

if (file.exists(meteoswiss_temp_file)) {
    cherry_temp_raw <- data.frame(data.table::fread(meteoswiss_temp_file))
} else {
    cherry_temp_raw <- F01_get_imp_temperature(
    # pull weather data for the cities listed in city_station_pairs. Impute missing tmax, tmin, prcp
    city_station_pairs
    , date_max = "2024-05-31"
    )
    write.csv(cherry_temp_raw, paste0(data_dir, "/A21_meteoswiss_temperatures.csv"), row.names=FALSE)
}
# head(cherry_temp_raw)

In [30]:
# Exclude years which do not have temperature data for the entire year.
cherry_temp_n <- cherry_temp_raw %>%
    group_by(id, year) %>%
    summarise(n = n()) %>%
    filter(n >=365) %>%
    merge(y = city_station_pairs[, c("id", "city")], by = "id", how = "inner") %>%
    merge(y = cherry_sub[, c("city", "year", "bloom_doy", "bloom_date", "lat", "long", "alt")], by = c("city", "year"), how = "inner")
head(cherry_temp_n)

[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.


Unnamed: 0_level_0,city,year,id,n,bloom_doy,bloom_date,lat,long,alt
Unnamed: 0_level_1,<chr>,<int>,<chr>,<int>,<int>,<date>,<dbl>,<dbl>,<int>
1,Aurigeno,1956,SZ000009480,366,104,1956-04-13,46.23608,8.725292,350
2,Aurigeno,1958,SZ000009480,365,115,1958-04-25,46.23608,8.725292,350
3,Aurigeno,1959,SZ000009480,365,94,1959-04-04,46.23608,8.725292,350
4,Aurigeno,1961,SZ000009480,365,87,1961-03-28,46.23608,8.725292,350
5,Aurigeno,1963,SZ000009480,365,92,1963-04-02,46.23608,8.725292,350
6,Aurigeno,1964,SZ000009480,366,81,1964-03-21,46.23608,8.725292,350


In [31]:
# Extract id and year pairs from cherry_temp_raw that are included in cherry_temp_n
cherry_temp <- cherry_temp_raw %>%
    merge(y = cherry_temp_n[, c("id", "year", "city", "bloom_doy", "bloom_date", "lat", "long", "alt")], by = c("id", "year"), all.x = TRUE) %>%
    drop_na(city) %>%
    mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%
    mutate(bloom_date = as.Date(bloom_date, format = "%Y-%m-%d")) %>%
    arrange(id, date)

# Remove non-UTF8 characters
cherry_temp <- cherry_temp %>% mutate(city = iconv(city, to = "UTF-8", sub = ""))

# Save the japan temperature data
write.csv(cherry_temp, paste0(data_dir, "/A21_meteoswiss_temperatures2.csv"), row.names = FALSE)

tail(cherry_temp)

Unnamed: 0_level_0,id,year,date,tmax,tmin,prcp,month,day,city,bloom_doy,bloom_date,lat,long,alt
Unnamed: 0_level_1,<chr>,<int>,<date>,<dbl>,<dbl>,<dbl>,<int>,<int>,<chr>,<int>,<date>,<dbl>,<dbl>,<int>
211832,SZE00116090,2017,2017-12-26,6.4,-2.5,2.5,12,26,Orvin,99,2017-04-09,47.15775,7.214464,700
211833,SZE00116090,2017,2017-12-27,4.4,-0.5,3.1,12,27,Orvin,99,2017-04-09,47.15775,7.214464,700
211834,SZE00116090,2017,2017-12-28,2.2,-1.3,0.0,12,28,Orvin,99,2017-04-09,47.15775,7.214464,700
211835,SZE00116090,2017,2017-12-29,3.5,-2.5,5.1,12,29,Orvin,99,2017-04-09,47.15775,7.214464,700
211836,SZE00116090,2017,2017-12-30,10.4,2.4,3.5,12,30,Orvin,99,2017-04-09,47.15775,7.214464,700
211837,SZE00116090,2017,2017-12-31,12.1,3.3,1.7,12,31,Orvin,99,2017-04-09,47.15775,7.214464,700
