In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\clean_labeled_climate_data_with_phz.pkl"
df = pd.read_pickle(path)
df.shape

(470320, 129)

In [3]:
months = ["jan","feb","mar","apr","may","jun","jul","aug","sep","oct","nov","dec"]

def cols(var):
    """Return monthly column names for a given variable name pattern (e.g. 'tmin')."""
    return [f"{m}_{var}" for m in months]

# Seasonal precipitation

In [4]:
df['winter_precip'] = df.dec_precip + df.dec_precip + df.feb_precip
df['spring_precip'] = df.mar_precip + df.apr_precip + df.may_precip
df['summer_precip'] = df.jun_precip + df.jul_precip + df.aug_precip
df['fall_precip'] = df.sep_precip + df.oct_precip + df.nov_precip

In [5]:
df["precip_seasonality"] = df[cols("precip")].std(axis=1) / df[cols("precip")].mean(axis=1)

In [6]:
df["precip_wettest"] = df[cols("precip")].max(axis=1)

In [7]:
df["precip_driest"] = df[cols("precip")].min(axis=1)

# Temperature features

In [8]:
df["temp_annual_range"] = df[cols("maxt")].max(axis=1) - df[cols("tmin")].min(axis=1)

In [9]:
df["tmax_warmest"] = df[cols("maxt")].max(axis=1)

In [10]:
df["tmin_coldest"] = df[cols("tmin")].min(axis=1)

In [11]:
df["mean_diurnal_range"] = np.mean([df[f"{m}_maxt"] - df[f"{m}_tmin"] for m in months], axis=0)

In [12]:
base_temp = 10
df["gdd_base10"] = ((df[cols("meant")].clip(lower=base_temp) - base_temp).sum(axis=1) * 30)

# Vapor Pressure & Humidity Features

In [13]:
df["vpd_range"] = df["annual_vpdmax"] - df["annual_vpdmin"]

In [14]:
df["dewpoint_depression"] = df["annual_meant"] - df["annual_dptmean"]

In [15]:
df["dryness_index"] = (df[cols("vpdmax")].mean(axis=1) / (df[cols("precip")].mean(axis=1) + 1e-6))

In [16]:
df["hydrothermal_coeff"] = (df[cols("precip")].sum(axis=1) / (df[cols("meant")].where(df[cols("meant")] >= 10, np.nan).sum(axis=1) + 1e-6))

In [17]:
df["temp_ppt_ratio"] = df["annual_meant"] / (df["annual_precip"] + 1e-6)

In [18]:
df["aridity_index"] = df["annual_precip"] / (df["annual_meant"] + 1e-6)

In [19]:
df["moisture_stress"] = (df["annual_maxt"] - df["annual_tmin"]) / (df["annual_precip"] + 1e-6)

# Max min months

In [20]:
df["month_max_temp"] = df[cols("maxt")].idxmax(axis=1).str.split('_', expand=True)[0]

month_dict = {'jan' : 1, 'feb' : 2, 'mar' : 3, 
              'apr' : 4, 'may' : 5, 'jun' : 6, 
              'jul' : 7, 'aug' : 8, 'sep' : 9, 
              'oct' : 10, 'nov' : 11, 'dec' : 12}

df["month_max_temp"] = df["month_max_temp"].replace(month_dict)

  df["month_max_temp"] = df["month_max_temp"].replace(month_dict)


In [21]:
df["month_min_temp"] = df[cols("tmin")].idxmin(axis=1).str.split('_', expand=True)[0]

df["month_min_temp"] = df["month_min_temp"].replace(month_dict)

  df["month_min_temp"] = df["month_min_temp"].replace(month_dict)


In [22]:
df["month_max_precip"] = df[cols("precip")].idxmax(axis=1).str.split('_', expand=True)[0]

df["month_max_precip"] = df["month_max_precip"].replace(month_dict)

  df["month_max_precip"] = df["month_max_precip"].replace(month_dict)


In [23]:
df["month_min_precip"] = df[cols("precip")].idxmin(axis=1).str.split('_', expand=True)[0]

df["month_min_precip"] = df["month_min_precip"].replace(month_dict)

  df["month_min_precip"] = df["month_min_precip"].replace(month_dict)


# Bio climate variables

In [24]:
months = ["jan","feb","mar","apr","may","jun","jul","aug","sep","oct","nov","dec"]

def monthly_cols(var):
    """Return expected monthly column names for a given var prefix."""
    return [f"{m}_{var}" for m in months]

def ensure_numeric_monthly(df, var):
    """Coerce monthly columns for var to numeric and return DataFrame (n x 12)."""
    cols = monthly_cols(var)
    present = [c for c in cols if c in df.columns]
    if len(present) != 12:
        raise KeyError(f"Expected 12 monthly columns for '{var}', found {len(present)}: missing {set(cols)-set(present)}")
    mat = df[present].apply(pd.to_numeric, errors="coerce")
    # return as numpy (n,12) and keep column order
    return mat, present

def rolling_quarter_array(arr):
    """
    Given arr shape (n,12) returns (n,12) where each column i is the mean (or sum)
    of months i, i+1, i+2 (wrapping around).
    """
    nrows = arr.shape[0]
    quarters = np.empty((nrows, 12), dtype=float)
    for i in range(12):
        inds = [(i + k) % 12 for k in range(3)]
        # mean across the 3 months, handle NaNs via nanmean
        quarters[:, i] = np.nanmean(arr[:, inds], axis=1)
    return quarters

def rolling_quarter_sum_array(arr):
    nrows = arr.shape[0]
    quarters = np.empty((nrows, 12), dtype=float)
    for i in range(12):
        inds = [(i + k) % 12 for k in range(3)]
        quarters[:, i] = np.nansum(arr[:, inds], axis=1)
    return quarters

def compute_bioclim(df):
    # --- load monthly matrices (as DataFrames and numpy arrays) ---
    tmin_df, tmin_cols = ensure_numeric_monthly(df, "tmin")
    tmax_df, tmax_cols = ensure_numeric_monthly(df, "maxt")
    tmean_df, tmean_cols = ensure_numeric_monthly(df, "meant")
    ppt_df, ppt_cols = ensure_numeric_monthly(df, "precip")
    
    tmin = tmin_df.to_numpy(dtype=float)   # shape (n,12)
    tmax = tmax_df.to_numpy(dtype=float)
    tmean = tmean_df.to_numpy(dtype=float)
    ppt = ppt_df.to_numpy(dtype=float)
    
    n = tmin.shape[0]
    bio = pd.DataFrame(index=df.index)
    
    # BIO1 - Annual mean temperature (Â°C)
    bio["BIO1_AnnualMeanTemp"] = np.nanmean(tmean, axis=1)
    
    # BIO2 - Mean diurnal range = mean monthly (tmax - tmin)
    diurnal = tmax - tmin
    bio["BIO2_MeanDiurnalRange"] = np.nanmean(diurnal, axis=1)
    
    # BIO5 and BIO6 needed for BIO7
    # BIO5 - Max temperature of warmest month (monthly tmax max)
    bio["BIO5_MaxTempWarmestMonth"] = np.nanmax(tmax, axis=1)
    # BIO6 - Min temperature of coldest month (monthly tmin min)
    bio["BIO6_MinTempColdestMonth"] = np.nanmin(tmin, axis=1)
    
    # BIO7 - Temperature annual range = BIO5 - BIO6
    bio["BIO7_TemperatureAnnualRange"] = bio["BIO5_MaxTempWarmestMonth"] - bio["BIO6_MinTempColdestMonth"]
    
    # BIO3 - Isothermality = (BIO2 / BIO7) * 100 (guard divide-by-zero)
    with np.errstate(divide='ignore', invalid='ignore'):
        bio3 = (bio["BIO2_MeanDiurnalRange"].to_numpy() / bio["BIO7_TemperatureAnnualRange"].to_numpy()) * 100.0
    bio["BIO3_Isothermality"] = pd.Series(np.where(np.isfinite(bio3), bio3, np.nan), index=df.index)
    
    # BIO4 - Temperature seasonality (std dev of monthly mean * 100)
    bio["BIO4_TemperatureSeasonality"] = np.nanstd(tmean, axis=1) * 100.0
    
    # QUARTER calculations:
    # mean temp quarters (n x 12) ; each column = mean temp of that 3-month window
    mean_quarters = rolling_quarter_array(tmean)
    # precipitation quarter sums (n x 12)
    sum_quarters = rolling_quarter_sum_array(ppt)
    
    # for indexing: for each row choose argmax/argmin over the 12 quarters
    wet_q_idx = np.nanargmax(sum_quarters, axis=1)       # index 0..11 of wettest quarter
    dry_q_idx = np.nanargmin(sum_quarters, axis=1)
    warm_q_idx = np.nanargmax(mean_quarters, axis=1)
    cold_q_idx = np.nanargmin(mean_quarters, axis=1)
    
    # resolve rows where all-NaN may cause argmax to return 0 incorrectly:
    # mark positions where all quarters are NaN for sums/means
    sum_quarters_allnan = np.isnan(sum_quarters).all(axis=1)
    mean_quarters_allnan = np.isnan(mean_quarters).all(axis=1)
    
    # BIO8 - Mean temp of wettest quarter
    bio8 = np.full(n, np.nan, dtype=float)
    for i in range(n):
        if not sum_quarters_allnan[i]:
            idx = int(wet_q_idx[i])
            bio8[i] = mean_quarters[i, idx]
    bio["BIO8_MeanTempWettestQuarter"] = bio8
    
    # BIO9 - Mean temp of driest quarter
    bio9 = np.full(n, np.nan, dtype=float)
    for i in range(n):
        if not sum_quarters_allnan[i]:
            idx = int(dry_q_idx[i])
            bio9[i] = mean_quarters[i, idx]
    bio["BIO9_MeanTempDriestQuarter"] = bio9
    
    # BIO10 - Mean temp of warmest quarter
    bio10 = np.full(n, np.nan, dtype=float)
    for i in range(n):
        if not mean_quarters_allnan[i]:
            idx = int(warm_q_idx[i])
            bio10[i] = mean_quarters[i, idx]
    bio["BIO10_MeanTempWarmestQuarter"] = bio10
    
    # BIO11 - Mean temp of coldest quarter
    bio11 = np.full(n, np.nan, dtype=float)
    for i in range(n):
        if not mean_quarters_allnan[i]:
            idx = int(cold_q_idx[i])
            bio11[i] = mean_quarters[i, idx]
    bio["BIO11_MeanTempColdestQuarter"] = bio11
    
    # BIO12 - Annual precipitation (sum of monthly ppt)
    bio["BIO12_AnnualPrecip"] = np.nansum(ppt, axis=1)
    
    # BIO13 - Precipitation of wettest month
    bio["BIO13_PrecipWettestMonth"] = np.nanmax(ppt, axis=1)
    
    # BIO14 - Precipitation of driest month
    bio["BIO14_PrecipDriestMonth"] = np.nanmin(ppt, axis=1)
    
    # BIO15 - Precipitation seasonality (coefficient of variation *100)
    # be careful with division by zero: where mean==0 produce NaN
    ppt_mean = np.nanmean(ppt, axis=1)
    ppt_std = np.nanstd(ppt, axis=1)
    with np.errstate(divide='ignore', invalid='ignore'):
        bio15 = (ppt_std / ppt_mean) * 100.0
    bio["BIO15_PrecipSeasonality"] = pd.Series(np.where(np.isfinite(bio15), bio15, np.nan), index=df.index)
    
    # BIO16 - Precip of wettest quarter (sum of 3 months) -> use sum_quarters.max
    bio["BIO16_PrecipWettestQuarter"] = np.nanmax(rolling_quarter_sum_array(ppt), axis=1)
    
    # BIO17 - Precip of driest quarter
    bio["BIO17_PrecipDriestQuarter"] = np.nanmin(rolling_quarter_sum_array(ppt), axis=1)
    
    # BIO18 - Precip of warmest quarter (sum of ppt in warmest quarter)
    bio18 = np.full(n, np.nan, dtype=float)
    sum_quarters_full = rolling_quarter_sum_array(ppt)
    for i in range(n):
        if not mean_quarters_allnan[i]:
            idx = int(warm_q_idx[i])
            bio18[i] = sum_quarters_full[i, idx]
    bio["BIO18_PrecipWarmestQuarter"] = bio18
    
    # BIO19 - Precip of coldest quarter
    bio19 = np.full(n, np.nan, dtype=float)
    for i in range(n):
        if not mean_quarters_allnan[i]:
            idx = int(cold_q_idx[i])
            bio19[i] = sum_quarters_full[i, idx]
    bio["BIO19_PrecipColdestQuarter"] = bio19
    
    # Final: replace inf with NaN
    bio.replace([np.inf, -np.inf], np.nan, inplace=True)
    return bio

In [25]:
bio_df = compute_bioclim(df)

In [26]:
bio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470320 entries, 0 to 470319
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   BIO1_AnnualMeanTemp           470320 non-null  float64
 1   BIO2_MeanDiurnalRange         470320 non-null  float64
 2   BIO5_MaxTempWarmestMonth      470320 non-null  float64
 3   BIO6_MinTempColdestMonth      470320 non-null  float64
 4   BIO7_TemperatureAnnualRange   470320 non-null  float64
 5   BIO3_Isothermality            470320 non-null  float64
 6   BIO4_TemperatureSeasonality   470320 non-null  float64
 7   BIO8_MeanTempWettestQuarter   470320 non-null  float64
 8   BIO9_MeanTempDriestQuarter    470320 non-null  float64
 9   BIO10_MeanTempWarmestQuarter  470320 non-null  float64
 10  BIO11_MeanTempColdestQuarter  470320 non-null  float64
 11  BIO12_AnnualPrecip            470320 non-null  float64
 12  BIO13_PrecipWettestMonth      470320 non-nul

In [27]:
df = pd.concat([df, bio_df], axis=1)
df.shape

(470320, 171)

# Saving 

In [30]:
output_path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\climate_data_complete.parquet"
df.to_parquet(output_path, index=False)