In [2]:
import pandas as pd
import numpy as numpy

from sklearn.linear_model import LinearRegression

## Creating Chill Hours feature

Defined as total hours per day where temperature < 45°F (7.2°C, the `chill_thresh` default), estimated from daily min/max temperatures with a triangular approximation.

In [3]:
# Daily temperature record: add the daily mean (midpoint of tmin and tmax)
# and parse the date column so it can be compared against Timestamps.
full_temps_dc = pd.read_csv('full_temps_dc.csv').assign(
    tavg=lambda temps: (temps['tmin'] + temps['tmax']) / 2,
    date=lambda temps: pd.to_datetime(temps['date']),
)

# Observed bloom dates, one row per year.
cherry_dc = pd.read_csv('Q_blooms_dc.csv')
print(cherry_dc.head())

   year  bloom_date  bloom_doy
0  1942  1942-04-05         95
1  1943  1943-04-04         94
2  1944  1944-04-09        100
3  1945  1945-03-20         79
4  1946  1946-03-23         82


In [4]:
def chill_and_gdd(temp_df, bloom_df, chill_thresh = 7.2, gdd_thresh=10):
    """Compute per-year chill-hour and growing-degree-day (GDD) totals.

    For every distinct year in ``bloom_df``:

    * chill hours are summed over the rows of ``temp_df`` dated from Oct 1 of
      the previous year through Feb 28 of that year (both inclusive; later
      dates, including Feb 29 in leap years, are outside the window);
    * GDD is summed over the rows of ``temp_df`` whose ``year`` equals that
      year and whose ``doy`` is at or before that year's ``bloom_doy``.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Daily temperatures with columns ``date`` (comparable to
        ``pd.Timestamp``), ``year``, ``doy``, ``tmin``, ``tmax`` and ``tavg``.
    bloom_df : pandas.DataFrame
        Bloom records with columns ``year`` and ``bloom_doy``. If a year
        occurs more than once, its first ``bloom_doy`` is used.
    chill_thresh : float, default 7.2
        Threshold compared directly against ``tmin``/``tmax`` (so it must be
        in the same units; presumably degrees Celsius -- confirm against the
        data source).
    gdd_thresh : float, default 10
        Base value subtracted from ``tavg``; negative differences count as 0.

    Returns
    -------
    pandas.DataFrame
        One row per distinct year, in order of first appearance, with columns
        ``year``, ``chill_hours`` and ``gdd``.

    Notes
    -----
    Totals are summed with ``skipna=False``: a day whose chill hours or GDD
    cannot be computed because of a missing temperature makes that year's
    total NaN instead of being silently dropped from the sum.
    """
    results = []

    for year in bloom_df['year'].unique():
        bloom_doy = bloom_df.loc[bloom_df['year'] == year, 'bloom_doy'].values[0]

        # Chill period: Oct 1 (prev year) -> Feb 28 (current year).
        # The Timestamp is built from a plain Python int rather than the
        # numpy scalar produced by .unique().
        chill_start = pd.Timestamp(year=int(year) - 1, month=10, day=1)
        chill_end = pd.Timestamp(year=int(year), month=2, day=28)

        in_chill = (temp_df['date'] >= chill_start) & (temp_df['date'] <= chill_end)
        tmin = temp_df.loc[in_chill, 'tmin']
        tmax = temp_df.loc[in_chill, 'tmax']

        # Partial day below threshold: fraction of the tmin -> tmax range that
        # lies under chill_thresh, scaled to 24 hours. Evaluated for every day
        # and then overridden for the two whole-day cases below.
        partial_hours = 24 * (chill_thresh - tmin) / (tmax - tmin)

        # Later masks win, so "tmax < threshold -> 24 h" keeps priority over
        # "tmin >= threshold -> 0 h". Days with tmax == tmin always fall into
        # one of these two cases, so the zero-width division above never
        # reaches the total.
        chill_hours = (
            partial_hours
            .mask(tmin >= chill_thresh, 0)
            .mask(tmax < chill_thresh, 24)
        )
        chill_hours_tot = chill_hours.sum(skipna=False)

        # GDD: positive part of (tavg - base), accumulated from the start of
        # the calendar year through the bloom day.
        in_gdd = (temp_df['year'] == year) & (temp_df['doy'] <= bloom_doy)
        gdd_tot = (
            (temp_df.loc[in_gdd, 'tavg'] - gdd_thresh)
            .clip(lower=0)
            .sum(skipna=False)
        )

        results.append({
            'year': year,
            'chill_hours': chill_hours_tot,
            'gdd': gdd_tot
        })
    return pd.DataFrame(results)

# Per-year chill-hour / GDD features for every year with a recorded bloom,
# using the default thresholds.
model_df = chill_and_gdd(temp_df=full_temps_dc, bloom_df=cherry_dc)

# Sanity check: first rows, then summary statistics.
print(model_df.head())
print(model_df.describe())

   year  chill_hours    gdd
0  1942  1239.408517  41.05
1  1943  2186.390113  70.20
2  1944  2111.196179  57.30
3  1945  2312.851436  62.85
4  1946  2022.637663  69.05
              year  chill_hours         gdd
count    83.000000    83.000000   83.000000
mean   1983.000000  1921.259383   60.843976
std      24.103942   252.012593   19.905269
min    1942.000000  1239.408517   12.700000
25%    1962.500000  1773.585214   44.875000
50%    1983.000000  1961.704061   59.950000
75%    2003.500000  2117.483690   74.975000
max    2024.000000  2448.101020  103.450000


In [5]:
# Attach the observed bloom day-of-year to the feature table.
# A bloom_doy column left over from a previous run of this cell is dropped
# first: merging again with it present would produce bloom_doy_x / bloom_doy_y
# and break later cells that read model_df['bloom_doy'].
model_df = (
    model_df
    .drop(columns='bloom_doy', errors='ignore')
    .merge(cherry_dc[['year', 'bloom_doy']], on='year')
)
model_df.corr()

Unnamed: 0,year,chill_hours,gdd,bloom_doy
year,1.0,-0.378548,0.104073,-0.371851
chill_hours,-0.378548,1.0,-0.349791,0.323713
gdd,0.104073,-0.349791,1.0,-0.366077
bloom_doy,-0.371851,0.323713,-0.366077,1.0


The biological mechanism appears to hold: gdd is negatively correlated with bloom_doy (-0.37), and chill_hours has a weak positive correlation with bloom_doy (0.32).

## Basic Model for Feature Investigation

In [6]:
# Baseline linear model: bloom day-of-year regressed on calendar year plus
# the two biological features.
y = model_df['bloom_doy']
X = model_df.loc[:, ['year', 'chill_hours', 'gdd']]

# fit() returns the estimator itself, so construction and fitting can chain.
model = LinearRegression().fit(X, y)

# In-sample summary: R^2 is scored on the same rows the model was fitted to.
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
print("R^2:", model.score(X, y))


Intercept: 272.6605887889021
Coefficients: [-0.09039968  0.00303623 -0.10835694]
R^2: 0.2550980940307255


Bloom advances ~0.09 days per year, i.e. ~0.9 days per decade, or approximately 7-8 days earlier since 1942.
This shows an independent long-term advancement trend.

+100 chill hours -> bloom ~0.3 days later (coefficient ~0.003 days per chill hour).

+10 GDD -> bloom ~ 1.08 days earlier

Heat is driving bloom timing much more than chilling. Spring heat accumulation matters most.


### With Interaction

In [10]:
# Chill x heat interaction feature: elementwise product of the two totals.
model_df['interaction'] = model_df['chill_hours'].mul(model_df['gdd'])

X = model_df.loc[:, ['year', 'chill_hours', 'gdd', 'interaction']]

# Refit the existing estimator on the widened feature set; `y` is the
# bloom_doy target defined in the baseline-model cell.
model.fit(X, y)

print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
print("R^2:", model.score(X, y))

Intercept: 273.2204569267005
Coefficients: [-8.78776463e-02  1.64213222e-04 -1.95019588e-01  4.54114381e-05]
R^2: 0.2560535350019685


Chill X Heat interaction is not adding much explanatory power. Barely any improvement in R^2 from the previous model. 

### Polynomial Terms

In [11]:
# Quadratic terms for both biological features.
model_df['gdd_sq'] = model_df['gdd'].pow(2)
model_df['chill_sq'] = model_df['chill_hours'].pow(2)

X = model_df.loc[:, ['year', 'chill_hours', 'gdd', 'gdd_sq', 'chill_sq']]

# Refit the existing estimator on linear + squared terms; `y` is the
# bloom_doy target defined in the baseline-model cell.
model.fit(X, y)

print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
print("R^2:", model.score(X, y))

Intercept: 221.7162898212925
Coefficients: [-9.32323864e-02  7.45336978e-02 -4.75083252e-01  3.01644756e-03
 -1.88701342e-05]
R^2: 0.32747520472360925


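Bloom response to heat is nonlinear: the gdd coefficient is negative (-0.475) while the gdd_sq coefficient is positive (+0.0030), so each additional GDD advances bloom by less as GDD rises (a weakening rather than accelerating effect). Whether this matches biological forcing models should be checked against that saturating shape.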

Models so far (based on R^2):
* Biological only: 0.178
* +Year: 0.255
* +Interaction: 0.256
* +Polynomial: 0.327