In [None]:
# Instead of the original training dataframe, we need to build a table with
# climate data as columns and individual months for individual glaciers
# as rows. In training we need to batch the months to match the summer, winter and
# annual mass balance, aggregate the monthly results within each batch and evaluate the
# aggregate against the summer, winter and annual mass balance using a custom loss function.
# We can shuffle the batches, but the rows inside each batch must stay in the correct order.

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb

In [8]:
# Create dummy feature dataset
def create_feature_dataset(n_rows, seed=0):
    """
    Build a dummy feature table of uniformly distributed climate variables.

    Args:
        n_rows: Number of rows to generate.
        seed: Value passed to ``np.random.seed`` before sampling. Defaults to
            0, which reproduces the dataset this function always produced
            before the seed became configurable.

    Returns:
        pandas.DataFrame with the float columns ``t2m``, ``tp``, ``fal`` and
        ``ssr`` (in that order) and ``n_rows`` rows.

    Note:
        The seed is applied to NumPy's *global* random state, so any later
        ``np.random`` call continues from the state left behind here.
    """
    np.random.seed(seed)

    # The columns are sampled in this exact order; reordering them would
    # change the values drawn for a given seed.
    data = {
        "t2m": np.random.uniform(-5, 5, n_rows),   # Temperature between -5 and 5
        "tp": np.random.uniform(0, 100, n_rows),   # Total precipitation between 0 and 100
        "fal": np.random.uniform(0, 1, n_rows),    # Albedo between 0 and 1
        "ssr": np.random.uniform(0, 100, n_rows)   # Solar radiation between 0 and 100
    }

    return pd.DataFrame(data)

# Number of rows in the dummy feature table. The custom objectives in this
# notebook reshape the rows into groups of 6, so this must be a multiple of 6.
num_rows = 600

# Using the function to create the dataset
df_features = create_feature_dataset(num_rows)

In [7]:
# Feature matrix as a plain NumPy array; rows keep the dataframe's order.
# to_numpy() is the accessor pandas recommends over the older .values attribute.
X_values = df_features.to_numpy()

In [23]:
# Number of consecutive rows that share one seasonal target
months = 6

# Integer division keeps the count exact; the assertion stops a row count that
# is not a whole number of seasons from being truncated silently.
assert num_rows % months == 0, "num_rows must be a multiple of months"
num_rows_y = num_rows // months

# One dummy target per season, drawn uniformly from [-5, 5)
y_values = np.random.uniform(-5, 5, num_rows_y)
print(y_values)
print(y_values.shape)

[-3.5405211   1.32765641  2.92555167  2.7333222  -4.07720402  1.85512719
  2.16023921  3.62166267  0.08044339 -0.38906     4.65116325  2.96512256
  0.58730991 -1.69382929  3.45237996 -0.44563611 -4.07314808 -0.45095727
  3.71968396 -0.51717853 -4.85650852  1.11485324  4.95830003  3.17248584
  1.17237947  4.14398887  3.13581682 -0.01360647  0.91220285  2.31292809
  3.46426164 -4.41823598  2.74803857 -0.5249656   1.6079856   2.64633011
  0.39501925 -3.41485149 -0.90473168 -4.23127006  1.89300691  4.53706182
  2.95252812 -1.10213577  0.84291496 -1.95799324  0.410456   -1.99273094
  2.38324757 -2.40658492  4.6402039   1.61948732 -4.31222152 -4.89222236
  4.97046344 -2.79494422 -1.87437252 -2.91585986  4.28101772  1.5201286
  0.04386324  0.57650814 -1.82127153  1.14175002 -0.80399637 -3.00771854
  1.566051   -1.26159431  2.85066379 -0.84605042  0.08271375  2.87466479
 -0.52121907  4.66221135  2.93028667  3.26430405  4.53920467 -1.11149371
  0.80080264  1.93591741  1.73458484  4.52936795 -2.

In [24]:
# Give every row the target of the season it belongs to. np.repeat keeps the
# copies of one target contiguous (y0 x months, y1 x months, ...), which is the
# layout the reshape(-1, 6) in the custom objectives relies on. np.tile would
# instead cycle through all targets and break that grouping.
y_values_rep = np.repeat(y_values, months)

In [14]:
#dtrain = xgb.DMatrix(X_values, label=Y_values)


In [22]:
#y_true = dtrain.get_label()
#y_true_seasonal = y_true.reshape(-1,6)
#print(y_true_seasonal)
#y_true_agg = np.mean(y_true_seasonal, axis=1)
#print(y_true_agg)
#print(y_true_agg.shape)

In [60]:
# A custom objective handed to xgb.train through `obj=` is called with the
# current predictions and the training DMatrix, in that order.
def seasonal_mse(preds, dtrain, season_length=6):
    """
    Custom squared-error objective evaluated on season totals.

    Consecutive groups of ``season_length`` rows are treated as one season.
    The predictions of a season are summed and compared with that season's
    target. The labels are expected to hold the seasonal target repeated on
    every row of the season, so taking their mean recovers the target.

    For one season the loss is ``0.5 * (sum(preds) - target) ** 2``. Its
    derivative with respect to each prediction in the season is
    ``sum(preds) - target`` and the matching second derivative is 1, so every
    row of a season receives the same gradient and a hessian of 1.

    Args:
        preds: Array of current predictions, one per row.
        dtrain: Object exposing ``get_label()`` (the ``xgb.DMatrix`` used for
            training).
        season_length: Number of consecutive rows per season. Defaults to 6.

    Returns:
        Tuple ``(gradient, hessian)`` of 1-D arrays with one entry per row.

    Raises:
        ValueError: If ``season_length`` is smaller than 1, if predictions and
            labels differ in size, or if the number of rows is not a multiple
            of ``season_length``.
    """
    if season_length < 1:
        raise ValueError("season_length must be a positive integer")

    preds = np.asarray(preds)
    # Get the true values
    y_true = np.asarray(dtrain.get_label())

    # Without this check a size mismatch could broadcast silently further down.
    if preds.size != y_true.size:
        raise ValueError(
            f"preds has {preds.size} entries but labels have {y_true.size}"
        )
    if preds.size % season_length != 0:
        raise ValueError(
            f"number of rows ({preds.size}) is not a multiple of "
            f"season_length ({season_length})"
        )

    # One row per season, one column per month within the season
    preds_seasonal = preds.reshape(-1, season_length)
    y_true_seasonal = y_true.reshape(-1, season_length)

    # Predictions add up to the seasonal value; labels are repeated copies of
    # the seasonal target, hence sum for one and mean for the other.
    preds_agg = np.sum(preds_seasonal, axis=1)
    y_true_agg = np.mean(y_true_seasonal, axis=1)

    # Gradient (first derivative) and hessian (second derivative) per season
    gradient = preds_agg - y_true_agg
    hessian = np.ones_like(gradient)

    # Expand back to one entry per row, as the booster expects
    gradient = np.repeat(gradient, season_length)
    hessian = np.repeat(hessian, season_length)

    return gradient, hessian

# Placeholder for a real train/test split (this notebook trains on dummy data only)
#X_train, X_test, y_train, y_test = # load or create your dataset here

# Wrap features and per-row labels in a DMatrix, the data container used by
# XGBoost's native interface. The labels are the seasonal targets repeated on
# every row of the season (y_values_rep).
dtrain = xgb.DMatrix(X_values, label=y_values_rep)

# Set up parameters (these are example parameters, tune for your specific case)
params = {'max_depth': 6, 'eta': 0.3}

# Train the model with the custom seasonal objective
model = xgb.train(params, dtrain, obj=seasonal_mse, num_boost_round=100)
# With the native Booster interface, num_boost_round plays the role that
# n_estimators plays in XGBRegressor.

# Template for predicting on held-out data (aggregate per season afterwards for evaluation)
#dpred = xgb.DMatrix(X_test)
#preds = model.predict(dpred)

In [26]:
# Display the trained Booster object returned by xgb.train
model

<xgboost.core.Booster at 0x19a98811900>

In [28]:
# Inspect the attributes stored on the trained Booster
model.attributes()

{'best_iteration': '99', 'best_ntree_limit': '100'}

In [31]:
# Per-feature scores keyed f0..f3.
# NOTE(review): presumably split counts, with keys following the column order of X_values — confirm.
model.get_fscore()

{'f0': 212.0, 'f1': 124.0, 'f2': 121.0, 'f3': 186.0}

In [32]:
# Same query through get_score(); its recorded output is identical to get_fscore()'s
model.get_score()

{'f0': 212.0, 'f1': 124.0, 'f2': 121.0, 'f3': 186.0}

In [61]:
# Per-row predictions on the training data
preds = model.predict(dtrain)

In [62]:
# Sanity check: the first six per-row predictions form the first season, so
# their sum should be close to the first seasonal target y_values[0]
print(preds[:6].sum())

-3.5404155


In [63]:
# First seasonal target, to compare with the summed predictions of the first season
print(y_values[0])

-3.540521098184869


## Using Scikit-learn XGBRegressor interface

In [55]:

# Custom objective for the scikit-learn interface: squared error on season totals
def custom_seasonal_obj(y_true, y_pred):
    """
    Gradient and hessian of a squared-error loss on 6-row season totals.

    Rows are grouped six at a time. The predictions in a group are summed,
    the labels in a group are averaged, and the difference of the two is
    assigned as gradient to each of the six rows. The hessian is 1 everywhere.

    Args:
        y_true: Array of per-row labels; its length must be a multiple of 6.
        y_pred: Array of per-row predictions of the same length.

    Returns:
        Tuple ``(grad, hess)`` of 1-D arrays with one entry per row.
    """
    # Fold the flat arrays into (n_seasons, 6) and collapse each season
    season_totals = y_pred.reshape(-1, 6).sum(axis=1)
    season_targets = y_true.reshape(-1, 6).mean(axis=1)

    # The season residual is the derivative of 0.5 * (total - target) ** 2 with
    # respect to every prediction in that season, so it is copied to all six
    # rows unscaled.
    grad = np.repeat(season_totals - season_targets, 6)

    # The second derivative of that loss is constant
    hess = np.ones_like(grad)

    return grad, hess

# Initialize XGBRegressor with the custom seasonal objective; all other
# hyperparameters are left at their defaults
model_reg = xgb.XGBRegressor(objective=custom_seasonal_obj)

# Train the model on the per-row features and the repeated seasonal targets.
# NOTE(review): fit() presumably returns the estimator itself, so best_model
# and model_reg would be the same object — confirm.
best_model = model_reg.fit(X_values, y_values_rep)

# Template for predicting on held-out data (no X_test exists in this notebook)
#predictions = model.predict(X_test)

# Further code for evaluation, etc.
#predictions[:10]  # Display the first few predictions

In [56]:
# Feature importances reported by the fitted regressor (one value per input column)
best_model.feature_importances_

array([0.19910096, 0.25051194, 0.29249424, 0.25789285], dtype=float32)

In [57]:
# Per-row predictions on the training features
predictions = best_model.predict(X_values)

In [58]:
# Peek at the first ten per-row predictions
predictions[:10]

array([-0.9803081 ,  0.07559904, -0.41468713, -1.4404745 , -0.20725927,
       -0.57328564, -0.03200315,  0.6866808 ,  0.21791591,  0.12155277],
      dtype=float32)

In [59]:
# Sanity check: the summed predictions of the first season (first six rows)
# should be close to that season's target
print(predictions[:6].sum())
print(y_values[0])


-3.5404155
-3.540521098184869
