In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

## Random Forest with Cross-Val

In [6]:
# Load the modelling table and separate the predictors from the target.
model_df = pd.read_csv("model_df.csv")

feature_cols = ['year', 'chill_hours', 'gdd']
X = model_df[feature_cols]
y = model_df['bloom_doy']

# Shallow forest with a fixed seed so repeated runs give the same scores.
rf_params = {
    "n_estimators": 500,
    "max_depth": 4,
    "random_state": 42,
}
rf = RandomForestRegressor(**rf_params)

# 10-fold cross-validation; each fold is scored with R^2.
scores = cross_val_score(rf, X, y, scoring='r2', cv=10)

print("CV R^2 scores:", scores)
print("Mean CV R^2:", scores.mean())

CV R^2 scores: [-0.30952823  0.06626288  0.20708805  0.25519639 -0.59613754 -0.52067277
  0.19398089 -0.88046908  0.00801942 -0.97397243]
Mean CV R^2: -0.255023241815252


The mean CV $R^2$ is negative (≈ −0.26), and five of the ten folds have a negative $R^2$, meaning the RandomForest model performs worse on held-out data than simply predicting the mean. A linear or polynomial regression is probably a better choice.

## Polynomial Linear Model for 2026 Predictions

In [8]:
# Quadratic linear regression: add squared GDD and chill-hour terms so the
# linear model can represent curvature in those two predictors.
# The columns are written onto model_df itself (as before) so they remain
# available to any other cell that reads them.
model_df['gdd_sq'] = model_df['gdd'] ** 2
model_df['chill_sq'] = model_df['chill_hours'] ** 2

X = model_df[['year', 'chill_hours', 'gdd', 
              'gdd_sq', 'chill_sq'
              ]]
# Re-derive the target here rather than relying on `y` left over from
# another cell, so this cell still works when cells are run out of order.
y = model_df['bloom_doy']

model = LinearRegression()
model.fit(X, y)

print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
# NOTE: this is the in-sample (training) R^2, which is optimistic and is not
# comparable to a cross-validated R^2.
print("R^2:", model.score(X, y))

# Out-of-sample check using the same protocol as the random forest
# (10-fold CV, R^2 scoring) so the two models are compared on equal terms.
# A fresh estimator is passed so the fitted `model` above is left untouched.
cv_scores = cross_val_score(LinearRegression(), X, y, cv=10, scoring='r2')
print("CV R^2 scores:", cv_scores)
print("Mean CV R^2:", np.mean(cv_scores))

Intercept: 221.71628982179564
Coefficients: [-9.32323864e-02  7.45336978e-02 -4.75083252e-01  3.01644756e-03
 -1.88701342e-05]
R^2: 0.32747520472360836
