# Version 3: Sequential Chill-Forcing Model
Cherry phenology follows a two-stage domrancy process:
1. Winter chilling requirment
2. Spring heat accumulation 

This model accumulates chill daily starting Nov 1st of the previous year. When the chill threshold is reached, that date is marked and GDD accumulate for the season begins. Bloom is predicted to occur when GDD threshold is reached. 

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [37]:
temps = pd.read_csv("Data/full_temps_dc.csv")
temps['date'] = pd.to_datetime(temps['date'])
temps["month"] = temps["date"].dt.month

temps["season_year"] = np.where(
    temps["month"] >= 10,
    temps["year"] + 1,
    temps["year"]
)

bloom = pd.read_csv("Data/blooms_dc.csv")
bloom = bloom[bloom['year'] > 1942]

temps['date'] = pd.to_datetime(temps['date'])
temps['tavg'] = (temps['tmin'] + temps['tmax']) / 2
temps['year'] = temps['date'].dt.year
temps['doy'] = temps['date'].dt.dayofyear

## Simulation Function
This function first accumulates chill daily. When chill threshold is met it starts accumulating GDD. Accumulation stops when GDD reaches forcing requirment. The day the function stops is the predicted bloom DOY.

In [38]:
def simulate_bloom_year(temps, bloom_year,
                        chill_threshold,
                        forcing_requirement,
                        chill_base=7.2,
                        gdd_base=4.4):

    df = temps[temps["season_year"] == bloom_year].copy()
    df = df.sort_values("date")

    df["is_chill"] = df["tavg"] < chill_base
    df["gdd"] = np.maximum(df["tavg"] - gdd_base, 0)

    chill_cum = 0
    forcing_cum = 0
    chill_met = False

    for _, row in df.iterrows():

        if not chill_met:
            chill_cum += row["is_chill"]
            if chill_cum >= chill_threshold:
                chill_met = True

        else:
            forcing_cum += row["gdd"]
            if forcing_cum >= forcing_requirement:
                return row["doy"]

    return None

## Estimating Forcing Requirment From Historical Bloom Dates

This function finds the chill release date for each historical year, and calculates the total GDD from actual bloom. It averages those GDD values and that average becomes the forcing requirement. 

In [None]:
def estimate_forcing_requirement(
        temps, bloom_df,
        chill_threshold,
        chill_base=7.2,
        gdd_base=4.4):

    forcing_values = []

    for year in bloom_df["year"]:

        df = temps[temps["season_year"] == year].copy()
        df = df.sort_values("date")

        bloom_doy = bloom_df.loc[
            bloom_df["year"] == year, "bloom_doy"
        ].values[0]

        df["is_chill"] = df["tavg"] < chill_base
        df["gdd"] = np.maximum(df["tavg"] - gdd_base, 0)

        chill_cum = 0
        chill_met = False
        forcing_cum = 0

        for _, row in df.iterrows():

            if not chill_met:
                chill_cum += row["is_chill"]
                if chill_cum >= chill_threshold:
                    chill_met = True

            elif row["doy"] <= bloom_doy:
                forcing_cum += row["gdd"]

            else:
                break

        if chill_met:
            forcing_values.append(forcing_cum)

    if len(forcing_values) == 0:
        return None

    return np.mean(forcing_values)

## Threshold Model Evaluation

In [61]:
def evaluate_threshold(temps, bloom_df, chill_threshold):

    forcing_req = estimate_forcing_requirement(
        temps, bloom_df, chill_threshold
    )

    if forcing_req is None:
        return None, None, None

    predictions = []
    actuals = []

    for year in bloom_df["year"]:

        pred = simulate_bloom_year(
            temps,
            year,
            chill_threshold,
            forcing_req
        )

        if pred is not None:
            predictions.append(pred)
            actuals.append(
                bloom_df.loc[
                    bloom_df["year"] == year, "bloom_doy"
                ].values[0]
            )

    if len(predictions) == 0:
        return None, None, None

    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    r2 = r2_score(actuals, predictions)

    return rmse, r2, forcing_req

## Grid Search for Optimal Chill Threshold
Estimating chill threshold using forcing multiplier gird. Without grid search, the forcing requirment would just be the mean GDD, which makes the forcing requirment fixed. 

In [62]:
results = []

for t in range(40, 120):   # October–Feb now included
    rmse, r2, forcing_req = evaluate_threshold(temps, bloom, t)

    if rmse is None:
        continue

    results.append((t, rmse, r2, forcing_req))

results_df = pd.DataFrame(
    results,
    columns=["chill_threshold", "rmse", "r2", "forcing_requirement"]
)

best_row = results_df.loc[results_df["rmse"].idxmin()]

print(best_row)

chill_threshold         54.000000
rmse                     4.420021
r2                       0.627053
forcing_requirement    204.453049
Name: 14, dtype: float64


This optimizer says cherry tress in DC require  approximately 54 chill days (<7.2°C). Then approximately 204 growing degree days (4.4°C base), for peak bloom to occur. 

A huge jump in $R^2$ compared to previous model. 


## Optimized Model

In [44]:
best_threshold = int(best_row["chill_threshold"])
best_forcing = best_row["forcing_requirement"]

predictions = []
actuals = []

for year in bloom["year"]:
    
    pred = simulate_bloom_year(
        temps,              
        year,             
        best_threshold,
        best_forcing
    )
    
    if pred is not None:
        predictions.append(pred)
        actuals.append(
            bloom.loc[
                bloom["year"] == year, "bloom_doy"
            ].values[0]
        )

print("Final RMSE:", np.sqrt(mean_squared_error(actuals, predictions)))
print("Final R²:", r2_score(actuals, predictions))

Final RMSE: 4.420020968938231
Final R²: 0.627052624406641


## 2026 Predictions

In [49]:
temps_2026 = pd.read_csv("Data/dc_Temps_2026.csv")
temps_2026['date'] = pd.to_datetime(temps_2026['date'])
temps_2026["month"] = temps_2026["date"].dt.month

temps_2026["season_year"] = np.where(
    temps_2026["month"] >= 10,
    temps_2026["year"] + 1,
    temps_2026["year"]
)


In [50]:
prediction_2026 = simulate_bloom_year(
    temps_2026,
    2026,
    best_threshold,
    best_forcing
)

print("Predicted 2026 Bloom DOY:", prediction_2026)

Predicted 2026 Bloom DOY: 93


April 3rd is the predicted peak bloom date for 2026. 