In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\clean_labeled_climate_data_with_phz.pkl"
df = pd.read_pickle(path)
df.shape

(470320, 129)

In [11]:
months = ["jan","feb","mar","apr","may","jun","jul","aug","sep","oct","nov","dec"]

def cols(var):
    """Return monthly column names for a given variable name pattern (e.g. 'tmin')."""
    return [f"{m}_{var}" for m in months]

# Seasonal precipitation

In [9]:
df['winter_precip'] = df.dec_precip + df.dec_precip + df.feb_precip
df['spring_precip'] = df.mar_precip + df.apr_precip + df.may_precip
df['summer_precip'] = df.jun_precip + df.jul_precip + df.aug_precip
df['fall_precip'] = df.sep_precip + df.oct_precip + df.nov_precip

In [28]:
df["precip_seasonality"] = df[cols("precip")].std(axis=1) / df[cols("precip")].mean(axis=1)

In [30]:
df["precip_wettest"] = df[cols("precip")].max(axis=1)

In [31]:
df["precip_driest"] = df[cols("precip")].min(axis=1)

# Temperature features

In [17]:
df["temp_annual_range"] = df[cols("maxt")].max(axis=1) - df[cols("tmin")].min(axis=1)

In [18]:
df["tmax_warmest"] = df[cols("maxt")].max(axis=1)

In [19]:
df["tmin_coldest"] = df[cols("tmin")].min(axis=1)

In [95]:
df["mean_diurnal_range"] = np.mean([df[f"{m}_maxt"] - df[f"{m}_tmin"] for m in months], axis=0)

In [27]:
base_temp = 10
df["gdd_base10"] = ((df[cols("meant")].clip(lower=base_temp) - base_temp).sum(axis=1) * 30)

# Vapor Pressure & Humidity Features

In [38]:
df["vpd_range"] = df["annual_vpdmax"] - df["annual_vpdmin"]

In [39]:
df["dewpoint_depression"] = df["annual_meant"] - df["annual_dptmean"]

In [40]:
df["dryness_index"] = (df[cols("vpdmax")].mean(axis=1) / (df[cols("precip")].mean(axis=1) + 1e-6))

In [92]:
df["hydrothermal_coeff"] = (df[cols("precip")].sum(axis=1) / (df[cols("meant")].where(df[cols("meant")] >= 10, np.nan).sum(axis=1) + 1e-6))

In [47]:
df["temp_ppt_ratio"] = df["annual_meant"] / (df["annual_precip"] + 1e-6)

In [48]:
df["aridity_index"] = df["annual_precip"] / (df["annual_meant"] + 1e-6)

In [50]:
df["moisture_stress"] = (df["annual_maxt"] - df["annual_tmin"]) / (df["annual_precip"] + 1e-6)

# Max min months

In [72]:
df["month_max_temp"] = df[cols("maxt")].idxmax(axis=1).str.split('_', expand=True)[0]

month_dict = {'jan' : 1, 'feb' : 2, 'mar' : 3, 
              'apr' : 4, 'may' : 5, 'jun' : 6, 
              'jul' : 7, 'aug' : 8, 'sep' : 9, 
              'oct' : 10, 'nov' : 11, 'dec' : 12}

df["month_max_temp"] = df["month_max_temp"].replace(month_dict)

  df["month_max_temp"] = df["month_max_temp"].replace(month_dict)


In [79]:
df["month_min_temp"] = df[cols("tmin")].idxmin(axis=1).str.split('_', expand=True)[0]

df["month_min_temp"] = df["month_min_temp"].replace(month_dict)

  df["month_min_temp"] = df["month_min_temp"].replace(month_dict)


In [75]:
df["month_max_precip"] = df[cols("precip")].idxmax(axis=1).str.split('_', expand=True)[0]

df["month_max_precip"] = df["month_max_precip"].replace(month_dict)

  df["month_max_precip"] = df["month_max_precip"].replace(month_dict)


In [77]:
df["month_min_precip"] = df[cols("precip")].idxmin(axis=1).str.split('_', expand=True)[0]

df["month_min_precip"] = df["month_min_precip"].replace(month_dict)

  df["month_min_precip"] = df["month_min_precip"].replace(month_dict)


# Bio climate variables

In [None]:
months = ["jan","feb","mar","apr","may","jun","jul","aug","sep","oct","nov","dec"]

def cols(var):
    return [f"{m}_{var}" for m in months]

# Monthly mean temperature
tmean = df[cols("meant")]

# Monthly min and max temperature
tmin = df[cols("tmin")]
tmax = df[cols("maxt")]

# Monthly precipitation
ppt = df[cols("precip")]

# Bioclimatic variable calculations
bio = pd.DataFrame(index=df.index)

# BIO1: Annual Mean Temperature
bio["BIO1_AnnualMeanTemp"] = tmean.mean(axis=1)

# BIO2: Mean Diurnal Range (mean of monthly (Tmax - Tmin))
bio["BIO2_MeanDiurnalRange"] = (tmax - tmin).mean(axis=1)

# BIO3: Isothermality (BIO2/BIO7 * 100)
# (computed later after BIO7 is known)

# BIO4: Temperature Seasonality (standard deviation * 100)
bio["BIO4_TempSeasonality"] = tmean.std(axis=1) * 100

# BIO5: Max Temperature of Warmest Month
bio["BIO5_MaxTempWarmestMonth"] = tmax.max(axis=1)

# BIO6: Min Temperature of Coldest Month
bio["BIO6_MinTempColdestMonth"] = tmin.min(axis=1)

# BIO7: Temperature Annual Range (BIO5 - BIO6)
bio["BIO7_TempAnnualRange"] = bio["BIO5_MaxTempWarmestMonth"] - bio["BIO6_MinTempColdestMonth"]

# BIO3: Isothermality (BIO2 / BIO7 * 100)
bio["BIO3_Isothermality"] = (bio["BIO2_MeanDiurnalRange"] / bio["BIO7_TempAnnualRange"]) * 100

# BIO8: Mean Temperature of Wettest Quarter
# BIO9: Mean Temperature of Driest Quarter
# BIO10: Mean Temperature of Warmest Quarter
# BIO11: Mean Temperature of Coldest Quarter
# -> Use rolling 3-month windows

# function to get rolling quarterly means
def rolling_quarterly_means(df_monthly):
    vals = df_monthly.values
    rolled = np.stack([np.roll(vals, -i, axis=1)[:, :3] for i in range(12)], axis=2)
    quarterly = rolled.mean(axis=1)
    return quarterly

# Rolling quarterly means
tmean_quarters = rolling_quarterly_means(tmean)
ppt_quarters = rolling_quarterly_means(ppt)

# Identify wettest, driest, warmest, coldest quarters
wettest_q = ppt_quarters.argmax(axis=1)
driest_q = ppt_quarters.argmin(axis=1)
warmest_q = tmean_quarters.argmax(axis=1)
coldest_q = tmean_quarters.argmin(axis=1)

# Extract values for those quarters
bio["BIO8_MeanTempWettestQuarter"] = [tmean_quarters[i, wettest_q[i]] for i in range(len(df))]
bio["BIO9_MeanTempDriestQuarter"] = [tmean_quarters[i, driest_q[i]] for i in range(len(df))]
bio["BIO10_MeanTempWarmestQuarter"] = [tmean_quarters[i, warmest_q[i]] for i in range(len(df))]
bio["BIO11_MeanTempColdestQuarter"] = [tmean_quarters[i, coldest_q[i]] for i in range(len(df))]

# BIO12: Annual Precipitation
bio["BIO12_AnnualPrecip"] = ppt.sum(axis=1)

# BIO13: Precipitation of Wettest Month
bio["BIO13_PrecipWettestMonth"] = ppt.max(axis=1)

# BIO14: Precipitation of Driest Month
bio["BIO14_PrecipDriestMonth"] = ppt.min(axis=1)

# BIO15: Precipitation Seasonality (Coefficient of Variation)
bio["BIO15_PrecipSeasonality"] = (ppt.std(axis=1) / ppt.mean(axis=1)) * 100

# BIO16: Precipitation of Wettest Quarter
bio["BIO16_PrecipWettestQuarter"] = [ppt_quarters[i, wettest_q[i]] * 3 for i in range(len(df))]

# BIO17: Precipitation of Driest Quarter
bio["BIO17_PrecipDriestQuarter"] = [ppt_quarters[i, driest_q[i]] * 3 for i in range(len(df))]

# BIO18: Precipitation of Warmest Quarter
bio["BIO18_PrecipWarmestQuarter"] = [ppt_quarters[i, warmest_q[i]] * 3 for i in range(len(df))]

# BIO19: Precipitation of Coldest Quarter
bio["BIO19_PrecipColdestQuarter"] = [ppt_quarters[i, coldest_q[i]] * 3 for i in range(len(df))]

# ============================================================
# Final clean-up
# ============================================================
bio.replace([np.inf, -np.inf], np.nan, inplace=True)
bio = bio.add_prefix("")  # optional: remove prefix if you don’t want BIO_
bio_df = pd.concat([df, bio], axis=1)

# ============================================================
# Save
# ============================================================
bio_df.to_csv("climate_data_with_bioclim.csv", index=False)
print("✅ BIOCLIM feature computation complete. Added 19 variables.")
