In [16]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

## Random Forest with Cross-Val

In [17]:
# Load the modelling table and split it into predictors and target.
model_df = pd.read_csv("model_df.csv")
feature_cols = ['year', 'chill_hours', 'gdd']
X = model_df[feature_cols]
y = model_df['bloom_doy']

# Shallow forest with a fixed seed so repeated runs give the same scores.
rf_params = dict(n_estimators=500, max_depth=4, random_state=42)
rf = RandomForestRegressor(**rf_params)

# One R^2 value per fold of a 10-fold cross-validation.
scores = cross_val_score(rf, X, y, scoring='r2', cv=10)

print("CV R^2 scores:", scores)
print("Mean CV R^2:", scores.mean())

CV R^2 scores: [-0.30952823  0.06626288  0.20708805  0.25519639 -0.59613754 -0.52067277
  0.19398089 -0.88046908  0.00801942 -0.97397243]
Mean CV R^2: -0.255023241815252


Five of the ten folds have a negative $R^2$, and the mean CV $R^2$ is also negative (about -0.26), meaning the random forest performs worse than simply predicting the mean. A linear or polynomial regression is probably a better choice.

## Polynomial Linear Model for 2026 Predictions

### Updating DC 2026 Temperatures Data set

In [None]:
# DC temperature records (tmin / tmax / tavg per row) used to build the 2026 features.
# `date` is parsed to datetime at load time because the chill-hours cell compares it
# against pd.Timestamp bounds; left as plain strings, that comparison raises TypeError
# on a fresh kernel.
temps_2026_dc = pd.read_csv("Data/dc_Temps_2026.csv", parse_dates=["date"])

In [23]:
# Accumulate chill hours over the window 1 Oct 2025 - 28 Feb 2026 (inclusive).
chill_thresh = 7.2  # threshold compared against tmin / tmax (same units as the CSV)

chill_start = pd.Timestamp(year=2025, month=10, day=1)
chill_end   = pd.Timestamp(year=2026, month=2, day=28)

# Coerce to datetime here so the window filter also works when `date` was loaded as
# plain strings (a no-op if the column is already datetime64).
obs_dates = pd.to_datetime(temps_2026_dc['date'])
chill_data = temps_2026_dc[obs_dates.between(chill_start, chill_end)]

day_min = chill_data['tmin'].to_numpy(dtype=float)
day_max = chill_data['tmax'].to_numpy(dtype=float)

# Per-row estimate of hours below the threshold:
#   * whole day below the threshold -> 24 h
#   * whole day at/above it         -> 0 h
#   * otherwise, the share of the tmin..tmax range lying below the threshold, times 24.
# The interpolation is evaluated for every row but only *used* where neither of the
# first two conditions holds, so divide-by-zero results (tmax == tmin) are discarded.
with np.errstate(divide='ignore', invalid='ignore'):
    partial_day = 24 * (chill_thresh - day_min) / (day_max - day_min)

daily_chill = np.select(
    [day_max < chill_thresh, day_min >= chill_thresh],
    [24.0, 0.0],
    default=partial_day,
)

# np.sum propagates NaN, so a missing tmin/tmax shows up in the total instead of
# being silently skipped (same as the accumulating loop this replaces).
chill_hours_tot = float(np.sum(daily_chill))

print("2026 Chill Hours:", chill_hours_tot)

2026 Chill Hours: 1183.2473653639247


In [24]:
# Growing degree days for 2026: sum of (tavg - base) over rows through 30 April,
# counting only the rows where tavg exceeds the base.
gdd_thresh = 10  # base value subtracted from tavg (same units as the CSV)

gdd_data = temps_2026_dc[
    (temps_2026_dc['year'] == 2026) &
    (temps_2026_dc['date'] <= "2026-04-30")
]

# Vectorised form of max(tavg - base, 0) per row.
daily_gdd = (gdd_data['tavg'] - gdd_thresh).clip(lower=0)

# skipna=False keeps a missing tavg visible as a NaN total rather than silently
# dropping that row from the sum.
gdd_tot = float(daily_gdd.sum(skipna=False))

print("2026 GDD:", gdd_tot)

2026 GDD: 164.165


In [25]:
# Single-row feature frame for the 2026 prediction.
feat_2026 = pd.DataFrame({
    'year': [2026],
    'chill_hours': [chill_hours_tot],
    'gdd': [gdd_tot],
})

# Squared terms are added in the same order the regression is fit with
# (year, chill_hours, gdd, gdd_sq, chill_sq), so the frame can be passed to
# `predict` without the columns being misaligned against the coefficients.
feat_2026['gdd_sq'] = feat_2026['gdd'] ** 2
feat_2026['chill_sq'] = feat_2026['chill_hours'] ** 2

### Rebuilding model for 2026 Predictions

In [27]:
# Re-read the modelling table (same file the random-forest cell loads) so the
# polynomial model is built from the columns as stored on disk.
model_df = pd.read_csv("model_df.csv")

In [None]:
# NOTE(review): `critical_gdd` is not assigned in any cell visible above this one, and
# this cell has no execution count - presumably a leftover inspection cell. On a fresh
# kernel it would raise NameError unless the name is defined elsewhere; confirm, then
# either define it earlier or remove this cell.
critical_gdd

In [20]:
# Add squared terms so the linear regression can fit a quadratic (polynomial)
# relationship in GDD and chill hours.
model_df['gdd_sq'] = model_df['gdd'] ** 2
model_df['chill_sq'] = model_df['chill_hours'] ** 2

poly_features = ['year', 'chill_hours', 'gdd', 'gdd_sq', 'chill_sq']
X = model_df[poly_features]
# Take the target from the frame used in this cell instead of relying on a `y`
# left over from the random-forest cell, so X and y always come from the same load.
y = model_df['bloom_doy']

model = LinearRegression()
model.fit(X, y)

print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
# In-sample R^2: scored on the same rows the model was fit on.
print("R^2:", model.score(X, y))

Intercept: 221.71628982179564
Coefficients: [-9.32323864e-02  7.45336978e-02 -4.75083252e-01  3.01644756e-03
 -1.88701342e-05]
R^2: 0.32747520472360836


### 2026 Prediction