In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")


In [2]:
full_temps_dc = pd.read_csv('Data/full_temps_dc.csv')
full_temps_dc['tavg'] = (full_temps_dc['tmin'] + full_temps_dc['tmax']) / 2
full_temps_dc['date'] = pd.to_datetime(full_temps_dc['date'])

temps_2026_dc = pd.read_csv("Data/dc_Temps_2026.csv")
temps_2026_dc['date'] = pd.to_datetime(temps_2026_dc['date'])
temps_2026_dc['tavg'] = (temps_2026_dc['tmin'] + temps_2026_dc['tmax']) / 2
temps_2026_dc['year'] = 2026
temps_2026_dc['doy'] = temps_2026_dc['date'].dt.dayofyear

cherry_dc = pd.read_csv("Q_blooms_dc.csv")

## Creating Chill and GDD Features

Chill hours is defined as total hours per day where temperature < 7.2°C using daily min/max temperatures and a triangular approximation.

Growing Degree Days (GDD) are a cumulative total of average temperatures being over the 4°C threshold.

#### Chill Hours

In [3]:
def calculate_chill_hours(df, base_temp=7.2):
    df = df.copy()
    df['is_chill'] = df['tavg'] < base_temp
    
    chill = (
        df.groupby(df['date'].dt.year)['is_chill']
        .sum()
        .reset_index()
        .rename(columns={'date': 'year', 'is_chill': 'chill_hours'})
    )
    
    chill.columns = ['year', 'chill_hours']
    return chill

#### Cumulative GDD

In [4]:
def calculate_cumulative_gdd(df, base_temp=4.4):
    df = df.copy()
    df['gdd'] = np.maximum(df['tavg'] - base_temp, 0)
    
    gdd = (
        df.groupby(df['date'].dt.year)['gdd']
        .sum()
        .reset_index()
    )
    
    gdd.columns = ['year', 'cumulative_gdd']
    return gdd

#### Threshold GDD

In [5]:
def calculate_threshold_gdd(df, chill_df, chill_threshold=800, base_temp=4.4):
    df = df.copy()
    df['gdd'] = np.maximum(df['tavg'] - base_temp, 0)
    df['year'] = df['date'].dt.year
    
    merged = df.merge(chill_df, on='year')
    
    threshold_results = []
    
    for year in merged['year'].unique():
        year_data = merged[merged['year'] == year]
        
        if year_data['chill_hours'].iloc[0] >= chill_threshold:
            gdd_total = year_data['gdd'].sum()
        else:
            gdd_total = 0
        
        threshold_results.append({
            'year': year,
            'threshold_gdd': gdd_total
        })
    
    return pd.DataFrame(threshold_results)

### Severe Frost Feature

Binary shock variable for temperatures under -4.4°C or 24°F, which can damage up to 90% of explosed blossoms affected by such low temperatures. [Source](
https://www.nps.gov/nama/learn/news/cherry-blossom-update-cold-temperatures-causing-damage-to-advanced-stage-blossoms.htm#:~:text=Cherry%20blossoms%20start%20to%20sustain,exposed%20blossoms%20can%20be%20affected.
)

 Since we are concerned about the effect of already exposed blossoms, this feature only tracks low temperatures after Febuary 15th.

In [6]:
def calculate_severe_frost(df):
    df = df.copy()
    df['year'] = df['date'].dt.year
    
    frost_list = []
    
    for year in df['year'].unique():
        year_data = df[
            (df['year'] == year) &
            (df['date'] >= pd.Timestamp(year=year, month=2, day=15))
        ]
        
        severe_frost = int((year_data['tmin'] <= -4.4).any())
        
        frost_list.append({
            'year': year,
            'severe_frost': severe_frost
        })
        
    return pd.DataFrame(frost_list)

### Base Dataframe

In [7]:
chill_df = calculate_chill_hours(full_temps_dc)
cum_gdd_df = calculate_cumulative_gdd(full_temps_dc)
frost_df = calculate_severe_frost(full_temps_dc)

base_df = (
    cherry_dc
    .merge(chill_df, on='year')
    .merge(cum_gdd_df, on='year')
    .merge(frost_df, on='year')
)

Biologcial machanism is working because gdd is negatively correlated with bloom_doy, and chill_hours has weak, positive correlation with bloom_doy. 

## Optimizing GDD Threshold

In [8]:
candidate_thresholds = [3, 4, 4.5, 5, 6, 7]

best_thresh = None
best_rmse = np.inf

results = []

for thresh in candidate_thresholds:

    # Compute historical GDD at bloom
    def calc_gdd(year, bloom_doy):
        gdd_data = full_temps_dc[
            (full_temps_dc['year'] == year) &
            (full_temps_dc['doy'] <= bloom_doy)
        ]
        return np.sum(np.maximum(gdd_data['tavg'] - thresh, 0))

    temp_df = base_df.copy()
    temp_df['gdd'] = temp_df.apply(
        lambda r: calc_gdd(r['year'], r['bloom_doy']),
        axis=1
    )

    # Fit chill → required GDD model
    X = temp_df[['chill_hours']]
    y = temp_df['gdd']

    reg = LinearRegression()
    reg.fit(X, y)

    # Predict bloom DOY historically using threshold simulation 
    predicted_doys = []

    for _, row in temp_df.iterrows():

        year = row['year']
        chill = row['chill_hours']

        required_gdd = reg.predict([[chill]])[0]

        year_temps = full_temps_dc[
            full_temps_dc['year'] == year
        ].copy()

        year_temps['gdd'] = np.maximum(
            year_temps['tavg'] - thresh, 0
        )
        year_temps['cum_gdd'] = year_temps['gdd'].cumsum()

        pred_doy = year_temps.loc[
            year_temps['cum_gdd'] >= required_gdd
        ]['doy'].iloc[0]

        predicted_doys.append(pred_doy)

    # Evaluate DOY accuracy
    rmse = np.sqrt(
        mean_squared_error(temp_df['bloom_doy'], predicted_doys)
    )

    results.append((thresh, rmse))
    print(f"Base {thresh}°C → Bloom DOY RMSE: {rmse:.2f}")

    if rmse < best_rmse:
        best_rmse = rmse
        best_thresh = thresh


print("\nBest base temperature:")
print(f"{best_thresh}°C (RMSE = {best_rmse:.2f} days)")

Base 3°C → Bloom DOY RMSE: 5.09
Base 4°C → Bloom DOY RMSE: 5.05
Base 4.5°C → Bloom DOY RMSE: 5.10
Base 5°C → Bloom DOY RMSE: 5.08
Base 6°C → Bloom DOY RMSE: 5.38
Base 7°C → Bloom DOY RMSE: 5.60

Best base temperature:
4°C (RMSE = 5.05 days)


Since 4°C was deemed the most optimal threshold, which is close to what the literature says is typical for Yoshino trees at 5°C.

# Cumulative GDD Model Testing

## Model 1 - Linear

In [9]:
model_linear = base_df.copy()

features = ['chill_hours', 'cumulative_gdd']
X = model_linear[features]
y = model_linear['bloom_doy']

lin_model = LinearRegression()
lin_model.fit(X, y)

preds = lin_model.predict(X)

print("R2:", r2_score(y, preds))
print("Coefficients:", dict(zip(features, lin_model.coef_)))

R2: 0.3913719783919145
Coefficients: {'chill_hours': 0.36230701385770164, 'cumulative_gdd': 0.0006243537025314793}


* year: -0.87, blooming trends slightly earlier over time
    * Warming signal
* chill hours: 0.0057, more chill forces later bloom 
    * counterintuitive
* gdd: -0.0033, more gdd the earlier the bloom
    * biologially correct

Linear model is strugling to capture the uniquenss of the features


## Model 2 - Polynomial

In [10]:
model_poly = base_df.copy()

features = ['chill_hours', 'cumulative_gdd']

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(model_poly[features])

poly_model = LinearRegression()
poly_model.fit(X_poly, y)

preds_poly = poly_model.predict(X_poly)

print("Polynomial R2:", r2_score(y, preds_poly))

Polynomial R2: 0.39549437059224335


Nonlinear adds almost nothing to the model.

### Model 3 - Linear with Frost

In [11]:
model_frost = base_df.copy()

features = ['chill_hours', 'cumulative_gdd', 'severe_frost']

X = model_frost[features]
y = model_frost['bloom_doy']

frost_model = LinearRegression()
frost_model.fit(X, y)

print("Frost R2:", r2_score(y, frost_model.predict(X)))

Frost R2: 0.3914645197241955


## 2026 Predictions

In [12]:
chill_2026 = calculate_chill_hours(temps_2026_dc)
gdd_2026 = calculate_cumulative_gdd(temps_2026_dc)
frost_2026 = calculate_severe_frost(temps_2026_dc)

df_2026 = (
    chill_2026
    .merge(gdd_2026, on='year')
    .merge(frost_2026, on='year')
)

In [17]:
def predict_year(model, df, feature_cols, poly=None):
    
    X = df[feature_cols]
    
    if poly is not None:
        X = poly.transform(X)
    
    return model.predict(X)

features_poly = list(poly.feature_names_in_)
bloom_2026 = predict_year(poly_model, df_2026.drop(columns=['year']), features, poly=poly)
print("Predicted Bloom DOY 2026:", bloom_2026)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- severe_frost


### Comparison of 2026 Prediction to Historical Distribution

In [None]:
historical_bloom = base_df['bloom_doy']

plt.hist(historical_bloom, bins=15, color='skyblue', edgecolor='black')
plt.axvline(105, color='red', linestyle='--', label='Predicted 2026')
plt.title("Historical Peak Bloom DOY Distribution (Washington, DC)")
plt.xlabel("Day of Year (DOY)")
plt.ylabel("Frequency")
plt.legend()
plt.show()

In [None]:
percentile = np.mean(historical_bloom <= 105) * 100
print("2026 prediction percentile:", percentile)

## Modeling Change: GDD Threshold

Initial modeling with GDD was using cumulative GDD to predict bloom date, but that caused circular modeling, especially when trying to predict future bloom dates. Now GDD is modeled as a threshold:
$GDD_{crit} = a + b * Chill$ 

In [None]:
def calculate_threshold_gdd(df, chill_df, chill_threshold=800, base_temp=4.4):
    df = df.copy()
    df['gdd'] = np.maximum(df['tavg'] - base_temp, 0)
    df['year'] = df['date'].dt.year
    
    merged = df.merge(chill_df, on='year')
    
    threshold_results = []
    
    for year in merged['year'].unique():
        year_data = merged[merged['year'] == year]
        
        if year_data['chill_hours'].iloc[0] >= chill_threshold:
            gdd_total = year_data['gdd'].sum()
        else:
            gdd_total = 0
        
        threshold_results.append({
            'year': year,
            'threshold_gdd': gdd_total
        })
    
    return pd.DataFrame(threshold_results)

threshold_gdd_df = calculate_threshold_gdd(
    full_temps_dc,
    chill_df,
    chill_threshold=800,
    base_temp=4.4
)

### Model 1 - Linear

In [None]:
model_threshold = (
    base_df[['year', 'bloom_doy', 'chill_hours']]  # keep what you want
    .merge(threshold_gdd_df, on='year')
)

In [None]:
features = ['chill_hours', 'threshold_gdd']

X = model_threshold[features]
y = model_threshold['bloom_doy']

threshold_model = LinearRegression()
threshold_model.fit(X, y)

preds = threshold_model.predict(X)

print("Threshold Model R2:", r2_score(y, preds))
print("Coefficients:", dict(zip(features, threshold_model.coef_)))

#### Threshold-Only Model

In [None]:
features = ['threshold_gdd']

X = model_threshold[features]
y = model_threshold['bloom_doy']

threshold_only_model = LinearRegression()
threshold_only_model.fit(X, y)

print("Threshold Only R2:",
      r2_score(y, threshold_only_model.predict(X)))