In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import optuna
import numpy as np

# Set the Optuna logger to output only WARNING and higher levels
# (INFO-level messages are therefore not shown in the cell output)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Use seaborn's default theme with the "notebook" plotting context
sns.set_theme()
sns.set_context("notebook")
# Load the IPython autoreload extension and enable mode 2
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Columns of the merged yield CSV, grouped by the dtype they are read with
_text_columns = ('Date', 'FarmName_Pseudo', 'SE_Number', 'BreedName', 'YearSeason')
_nullable_int_columns = (
    'Age', 'DaysInMilk', 'LactationNumber',
    'HeatStress', 'Temp15Threshold', 'HW', 'cum_HW',
)
_float_columns = (
    'DailyYield', 'PreviousDailyYield', 'DailyYieldChange',
    'ExpectedYield', 'NormalizedDailyYield', 'NormalizedDailyYieldChange',
    'MeanTemperature', 'MeanTHI_adj',
)

# Column name -> dtype mapping handed to pd.read_csv
dtype_dict = {
    **dict.fromkeys(_text_columns, 'str'),
    **dict.fromkeys(_nullable_int_columns, 'Int64'),
    **dict.fromkeys(_float_columns, 'float'),
}

# 'Date' is read as text first and then parsed with an explicit format
milk_data = pd.read_csv('../Data/MergedData/MilkApproachYieldData.csv', dtype=dtype_dict)
milk_data['Date'] = pd.to_datetime(milk_data['Date'], format='%Y-%m-%d')

# With a negative n, head(-5) returns every row except the last five
milk_data.head(-5)

Unnamed: 0,Date,FarmName_Pseudo,SE_Number,Age,BreedName,LactationNumber,DaysInMilk,YearSeason,DailyYield,PreviousDailyYield,DailyYieldChange,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatStress,Temp15Threshold,HW,cum_HW,MeanTemperature,MeanTHI_adj
0,2022-01-01,a624fb9a,SE-064c0cec-1189,3095,02 SLB,7,191,2022-1,30.77,0.000000,0.000000,35.914865,0.856748,0.000000,0,0,0,0,-3.025000,28.012944
1,2022-01-02,a624fb9a,SE-064c0cec-1189,3096,02 SLB,7,192,2022-1,48.22,30.770000,8.725000,35.799613,1.103224,0.243718,0,0,0,0,-0.279167,32.898193
2,2022-01-03,a624fb9a,SE-064c0cec-1189,3097,02 SLB,7,193,2022-1,30.53,39.495000,-2.988333,35.684360,1.023044,-0.083744,0,0,0,0,2.033333,36.760487
3,2022-01-04,a624fb9a,SE-064c0cec-1189,3098,02 SLB,7,194,2022-1,42.26,36.506667,1.438333,35.569108,1.066796,0.040438,0,0,0,0,0.066667,31.939524
4,2022-01-05,a624fb9a,SE-064c0cec-1189,3099,02 SLB,7,195,2022-1,38.49,37.945000,0.109000,35.453856,1.073339,0.003074,0,0,0,0,-3.700000,26.498206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483097,2023-06-03,f454e660,SE-fcdf259d-0044-0,4150,41 Fjällko,10,347,2023-3,12.67,14.652000,-0.622000,13.608593,1.030966,-0.045706,0,1,0,0,12.666667,53.132530
483098,2023-06-04,f454e660,SE-fcdf259d-0044-0,4151,41 Fjällko,10,348,2023-3,22.31,14.030000,0.954000,13.516773,1.108549,0.070579,0,1,0,0,13.079167,56.726870
483099,2023-06-05,f454e660,SE-fcdf259d-0044-0,4152,41 Fjällko,10,349,2023-3,12.84,14.984000,-0.092000,13.424952,1.109278,-0.006853,0,1,0,0,14.237500,58.482418
483100,2023-06-06,f454e660,SE-fcdf259d-0044-0,4153,41 Fjällko,10,350,2023-3,9.47,14.892000,-0.284000,13.333131,1.095617,-0.021300,0,1,0,0,15.345833,60.546358


In [3]:
# Specify the farm ID for analysis
farm_id = 'a624fb9a'

# Filter data for the specific farm
farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]

# Calculate the baseline (mean of NormalizedDailyYield for all days at the farm,
# i.e. days with and without heat stress)
baseline_mean = farm_data['NormalizedDailyYield'].mean()
print(f"Baseline mean of NormalizedDailyYield for all days: {baseline_mean:.4f}")

# Define the feature combinations (one temperature measure + one heatwave measure each)
feature_combinations = [
    ['MeanTemperature', 'HW'],
    ['MeanTemperature', 'cum_HW'],
    ['MeanTHI_adj', 'HW'],
    ['MeanTHI_adj', 'cum_HW']
]

# Define target variable
target = 'NormalizedDailyYield'

# Filter data for heat stress conditions
data_heatstress = farm_data[farm_data['HeatStress'] == 1]

# Split the data into train and test sets
train_heatstress, test_heatstress = train_test_split(data_heatstress, test_size=0.3, random_state=42)

# Take explicit copies before assigning columns: both frames are derived from a
# filtered slice of milk_data, and writing into them directly can raise pandas'
# SettingWithCopyWarning.
train_heatstress = train_heatstress.copy()
test_heatstress = test_heatstress.copy()

# Scale the continuous features; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows
scaled_columns = ['MeanTemperature', 'MeanTHI_adj']
scaler = StandardScaler()
train_heatstress[scaled_columns] = scaler.fit_transform(train_heatstress[scaled_columns])
test_heatstress[scaled_columns] = scaler.transform(test_heatstress[scaled_columns])

# Booster settings shared by every tuning trial and by the final refit
_FIXED_XGB_PARAMS = {
    'verbosity': 0,
    'objective': 'reg:squarederror',
}


# Function to optimize hyperparameters using Optuna
def objective(trial, train_data=None, feature_cols=None):
    """Return the mean 5-fold cross-validated MSE of one sampled XGBoost configuration.

    Parameters
    ----------
    trial : optuna trial
        Trial object used to sample the hyperparameters.
    train_data : pandas.DataFrame, optional
        Training rows holding the feature columns and the module-level
        ``target`` column. When omitted, the module-level ``train_heatstress``
        is used, which is what this function always did before.
    feature_cols : list of str, optional
        Feature column names. When omitted, the module-level ``features`` is
        used, which is what this function always did before.

    Returns
    -------
    float
        Mean squared error averaged over the folds (lower is better).
    """
    # Fall back to the notebook globals so objective(trial) keeps working
    if train_data is None:
        train_data = train_heatstress
    if feature_cols is None:
        feature_cols = features

    param = {
        **_FIXED_XGB_PARAMS,
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
    }

    # Cross-validation to estimate performance; the scorer returns negated MSE
    model = xgb.XGBRegressor(**param)
    scores = cross_val_score(model, train_data[feature_cols], train_data[target], cv=5, scoring='neg_mean_squared_error')
    return -scores.mean()


# Function to fit the model and print results
def fit_xgboost_and_print_results(train_data, test_data, features, baseline_mean, random_state=42):
    """Tune, fit and evaluate an XGBoost regressor for one feature combination.

    The hyperparameter search is run on ``train_data`` with the given
    ``features`` (passed explicitly to ``objective`` instead of relying on
    globals). The best configuration is refitted on ``train_data``,
    cross-validated, and evaluated on ``test_data``. The target column name is
    read from the module-level ``target``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows (features and target).
    test_data : pandas.DataFrame
        Held-out rows (features and target).
    features : list of str
        Feature column names to use.
    baseline_mean : float
        Reference mean of the target against which the reduction is computed.
    random_state : int or None, default 42
        Seed for Optuna's TPE sampler. NOTE(review): the search is also bounded
        by ``timeout``, so the number of completed trials may still vary
        between runs.

    Returns
    -------
    tuple of (float, float)
        Mean prediction on ``test_data`` and its reduction relative to
        ``baseline_mean`` in percent (positive means lower than baseline).
    """
    print(f"\nSelected features: {features}")

    # Optimize hyperparameters using Optuna with a seeded sampler
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(
        lambda trial: objective(trial, train_data, features),
        n_trials=100,
        timeout=600,
    )
    best_params = study.best_params
    print(f"Best parameters: {best_params}")

    # Train final model using the best parameters plus the fixed settings used while tuning
    best_model = xgb.XGBRegressor(**_FIXED_XGB_PARAMS, **best_params)
    best_model.fit(train_data[features], train_data[target])

    # K-Fold Cross-Validation for final model evaluation (training data only)
    final_scores = cross_val_score(best_model, train_data[features], train_data[target], cv=5, scoring='neg_mean_squared_error')
    final_mse = -final_scores.mean()
    final_std = final_scores.std()
    print(f"Final model performance: MSE = {final_mse:.4f} ± {final_std:.4f}")

    # Predictions and evaluation on the test set
    y_pred = best_model.predict(test_data[features])
    mse = mean_squared_error(test_data[target], y_pred)
    print(f"Mean Squared Error on test set: {mse:.4f}")

    # Estimation of average milk production during heat stress (mean of the predictions)
    average_production_heat_stress = y_pred.mean()
    print(f"Estimated average milk production during heat stress: {average_production_heat_stress:.4f}")

    # Calculate the reduction compared to baseline
    reduction_percentage = ((baseline_mean - average_production_heat_stress) / baseline_mean) * 100
    print(f"Reduction in milk production during heat stress: {reduction_percentage:.2f}%")

    return average_production_heat_stress, reduction_percentage

# Train the model with the feature combinations on the training set and print results.
# The loop variable is a module-level name and must stay called `features`:
# objective() refers to the module-level `features` and `train_heatstress`.
# NOTE(review): `avg_prod_heatstress` and `reduction` are overwritten on every pass,
# so after the loop they hold the values of the LAST feature combination only.
for features in feature_combinations:
    avg_prod_heatstress, reduction = fit_xgboost_and_print_results(train_heatstress, test_heatstress, features, baseline_mean)

Baseline mean of NormalizedDailyYield for all days: 1.0002

Selected features: ['MeanTemperature', 'HW']
Best parameters: {'lambda': 0.0004481574143079313, 'alpha': 0.6173102077760996, 'subsample': 0.9860651620899856, 'colsample_bytree': 0.670473021702612, 'learning_rate': 0.08779046356804424, 'n_estimators': 696, 'max_depth': 4}
Final model performance: MSE = 0.0107 ± 0.0029
Mean Squared Error on test set: 0.0074
Estimated average milk production during heat stress: 0.9825
Reduction in milk production during heat stress: 1.77%

Selected features: ['MeanTemperature', 'cum_HW']
Best parameters: {'lambda': 1.852183385268695e-05, 'alpha': 0.003204572046942967, 'subsample': 0.526542119828887, 'colsample_bytree': 0.5962659642410891, 'learning_rate': 0.008608121330926555, 'n_estimators': 848, 'max_depth': 6}
Final model performance: MSE = 0.0107 ± 0.0029
Mean Squared Error on test set: 0.0074
Estimated average milk production during heat stress: 0.9825
Reduction in milk production during hea

[W 2024-08-02 13:57:53,421] Trial 26 failed with parameters: {'lambda': 4.2630399017226016e-07, 'alpha': 9.433043435560652e-05, 'subsample': 0.7604092029079305, 'colsample_bytree': 0.8687179766262431, 'learning_rate': 0.017384580393649124, 'n_estimators': 507, 'max_depth': 8} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/user/anaconda3/envs/GIGACOW/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/n0/fphw_xw93vv749r_ntt01qd80000gn/T/ipykernel_15187/3643896788.py", line 49, in objective
    scores = cross_val_score(model, train_heatstress[features], train_heatstress[target], cv=5, scoring='neg_mean_squared_error')
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/user/anaconda3/envs/GIGACOW/lib/python3.11/site-packages/

KeyboardInterrupt: 

In [None]:
# Create the row for the current farm. `reduction` is the value left behind by the
# feature-combination loop of the preceding farm cell.
new_row = pd.DataFrame({
    'FarmName_Pseudo': [farm_id],
    'Relative Change HeatStress = 1 (%)': [reduction],
})

# Start the results DataFrame from that row instead of concatenating it onto an
# empty typed frame (concatenation with empty frames is deprecated in pandas).
# The dtypes are pinned to the ones the empty frame used to declare.
results_df = new_row.astype({
    'FarmName_Pseudo': 'object',
    'Relative Change HeatStress = 1 (%)': 'float64',
})

results_df

Unnamed: 0,FarmName_Pseudo,Relative Change HeatStress = 1 (%)
0,a624fb9a,7.164465


In [None]:
# Specify the farm ID for analysis
farm_id = '5c06d92d'

# Filter data for the specific farm
farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]

# Calculate the baseline (mean of NormalizedDailyYield for all days at the farm)
baseline_mean = farm_data['NormalizedDailyYield'].mean()
print(f"Baseline mean of NormalizedDailyYield for all days: {baseline_mean:.4f}")

# `feature_combinations`, `target`, `objective` and `fit_xgboost_and_print_results`
# are defined once in the first farm cell (farm 'a624fb9a') and reused here rather
# than being re-declared for every farm.

# Filter data for heat stress conditions
data_heatstress = farm_data[farm_data['HeatStress'] == 1]

# Split the data into train and test sets
train_heatstress, test_heatstress = train_test_split(data_heatstress, test_size=0.3, random_state=42)

# Take explicit copies before assigning columns: both frames are derived from a
# filtered slice of milk_data, and writing into them directly can raise pandas'
# SettingWithCopyWarning.
train_heatstress = train_heatstress.copy()
test_heatstress = test_heatstress.copy()

# Scale the continuous features; the scaler is fitted on the training rows only
scaled_columns = ['MeanTemperature', 'MeanTHI_adj']
scaler = StandardScaler()
train_heatstress[scaled_columns] = scaler.fit_transform(train_heatstress[scaled_columns])
test_heatstress[scaled_columns] = scaler.transform(test_heatstress[scaled_columns])

# Train the model with the feature combinations on the training set and print results.
# The names `features` and `train_heatstress` must be kept: objective() refers to them
# as module-level names. `reduction` ends up holding the last combination's value.
for features in feature_combinations:
    avg_prod_heatstress, reduction = fit_xgboost_and_print_results(train_heatstress, test_heatstress, features, baseline_mean)

Baseline mean of NormalizedDailyYield for all days: 0.9960

Selected features: ['MeanTemperature', 'HW']
Best parameters: {'lambda': 2.8128333149473076e-06, 'alpha': 0.046464638549997245, 'subsample': 0.6383000195343301, 'colsample_bytree': 0.5006066419952546, 'learning_rate': 0.08375819308322532, 'n_estimators': 423, 'max_depth': 8}
Final model performance: MSE = 0.0364 ± 0.0014
Mean Squared Error on test set: 0.0342
Estimated average milk production during heat stress: 0.7857
Reduction in milk production during heat stress: 21.11%

Selected features: ['MeanTemperature', 'cum_HW']
Best parameters: {'lambda': 5.761831914549366e-05, 'alpha': 6.780267615584031e-06, 'subsample': 0.8635616113669007, 'colsample_bytree': 0.6182321028951115, 'learning_rate': 0.0944408367530121, 'n_estimators': 536, 'max_depth': 5}
Final model performance: MSE = 0.0364 ± 0.0014
Mean Squared Error on test set: 0.0341
Estimated average milk production during heat stress: 0.7862
Reduction in milk production durin

In [None]:
# Create a new DataFrame row for the current farm
new_row = pd.DataFrame({
    'FarmName_Pseudo': [farm_id],
    'Relative Change HeatStress = 1 (%)': [reduction],
})

# Drop any earlier row for this farm before appending, so that re-running the
# cell replaces the farm's entry instead of adding a duplicate
other_farms = results_df[results_df['FarmName_Pseudo'] != farm_id]
results_df = pd.concat([other_farms, new_row], ignore_index=True)

results_df

Unnamed: 0,FarmName_Pseudo,Relative Change HeatStress = 1 (%)
0,a624fb9a,7.164465
1,5c06d92d,21.085694


In [None]:
# Specify the farm ID for analysis
farm_id = '752efd72'

# Filter data for the specific farm
farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]

# Calculate the baseline (mean of NormalizedDailyYield for all days at the farm)
baseline_mean = farm_data['NormalizedDailyYield'].mean()
print(f"Baseline mean of NormalizedDailyYield for all days: {baseline_mean:.4f}")

# `feature_combinations`, `target`, `objective` and `fit_xgboost_and_print_results`
# are defined once in the first farm cell (farm 'a624fb9a') and reused here rather
# than being re-declared for every farm.

# Filter data for heat stress conditions
data_heatstress = farm_data[farm_data['HeatStress'] == 1]

# Split the data into train and test sets
train_heatstress, test_heatstress = train_test_split(data_heatstress, test_size=0.3, random_state=42)

# Take explicit copies before assigning columns: both frames are derived from a
# filtered slice of milk_data, and writing into them directly can raise pandas'
# SettingWithCopyWarning.
train_heatstress = train_heatstress.copy()
test_heatstress = test_heatstress.copy()

# Scale the continuous features; the scaler is fitted on the training rows only
scaled_columns = ['MeanTemperature', 'MeanTHI_adj']
scaler = StandardScaler()
train_heatstress[scaled_columns] = scaler.fit_transform(train_heatstress[scaled_columns])
test_heatstress[scaled_columns] = scaler.transform(test_heatstress[scaled_columns])

# Train the model with the feature combinations on the training set and print results.
# The names `features` and `train_heatstress` must be kept: objective() refers to them
# as module-level names. `reduction` ends up holding the last combination's value.
for features in feature_combinations:
    avg_prod_heatstress, reduction = fit_xgboost_and_print_results(train_heatstress, test_heatstress, features, baseline_mean)

Baseline mean of NormalizedDailyYield for all days: 0.9936

Selected features: ['MeanTemperature', 'HW']
Best parameters: {'lambda': 2.5378409786854936e-07, 'alpha': 4.115993120233171e-07, 'subsample': 0.9016072329543205, 'colsample_bytree': 0.7883549212734269, 'learning_rate': 0.009465744237737056, 'n_estimators': 915, 'max_depth': 5}
Final model performance: MSE = 0.0234 ± 0.0014
Mean Squared Error on test set: 0.0276
Estimated average milk production during heat stress: 0.8941
Reduction in milk production during heat stress: 10.02%

Selected features: ['MeanTemperature', 'cum_HW']
Best parameters: {'lambda': 0.19430503014330436, 'alpha': 1.1827326300125545e-07, 'subsample': 0.706473570487271, 'colsample_bytree': 0.9749193029899594, 'learning_rate': 0.038890617901634845, 'n_estimators': 151, 'max_depth': 7}
Final model performance: MSE = 0.0234 ± 0.0015
Mean Squared Error on test set: 0.0276
Estimated average milk production during heat stress: 0.8940
Reduction in milk production dur

In [None]:
# Create a new DataFrame row for the current farm
new_row = pd.DataFrame({
    'FarmName_Pseudo': [farm_id],
    'Relative Change HeatStress = 1 (%)': [reduction],
})

# Drop any earlier row for this farm before appending, so that re-running the
# cell replaces the farm's entry instead of adding a duplicate
other_farms = results_df[results_df['FarmName_Pseudo'] != farm_id]
results_df = pd.concat([other_farms, new_row], ignore_index=True)

results_df

Unnamed: 0,FarmName_Pseudo,Relative Change HeatStress = 1 (%)
0,a624fb9a,7.164465
1,5c06d92d,21.085694
2,752efd72,10.030104


In [None]:
# Specify the farm ID for analysis
farm_id = 'f454e660'

# Filter data for the specific farm
farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]

# Calculate the baseline (mean of NormalizedDailyYield for all days at the farm)
baseline_mean = farm_data['NormalizedDailyYield'].mean()
print(f"Baseline mean of NormalizedDailyYield for all days: {baseline_mean:.4f}")

# `feature_combinations`, `target`, `objective` and `fit_xgboost_and_print_results`
# are defined once in the first farm cell (farm 'a624fb9a') and reused here rather
# than being re-declared for every farm.

# Filter data for heat stress conditions
data_heatstress = farm_data[farm_data['HeatStress'] == 1]

# Split the data into train and test sets
train_heatstress, test_heatstress = train_test_split(data_heatstress, test_size=0.3, random_state=42)

# Take explicit copies before assigning columns: both frames are derived from a
# filtered slice of milk_data, and writing into them directly can raise pandas'
# SettingWithCopyWarning.
train_heatstress = train_heatstress.copy()
test_heatstress = test_heatstress.copy()

# Scale the continuous features; the scaler is fitted on the training rows only
scaled_columns = ['MeanTemperature', 'MeanTHI_adj']
scaler = StandardScaler()
train_heatstress[scaled_columns] = scaler.fit_transform(train_heatstress[scaled_columns])
test_heatstress[scaled_columns] = scaler.transform(test_heatstress[scaled_columns])

# Train the model with the feature combinations on the training set and print results.
# The names `features` and `train_heatstress` must be kept: objective() refers to them
# as module-level names. `reduction` ends up holding the last combination's value.
for features in feature_combinations:
    avg_prod_heatstress, reduction = fit_xgboost_and_print_results(train_heatstress, test_heatstress, features, baseline_mean)

Baseline mean of NormalizedDailyYield for all days: 0.9981

Selected features: ['MeanTemperature', 'HW']
Best parameters: {'lambda': 0.46284015147225926, 'alpha': 6.478223561589985e-08, 'subsample': 0.9600521035372284, 'colsample_bytree': 0.8380503210108585, 'learning_rate': 0.001911752665349074, 'n_estimators': 870, 'max_depth': 10}
Final model performance: MSE = 0.0605 ± 0.0036
Mean Squared Error on test set: 0.0606
Estimated average milk production during heat stress: 0.9511
Reduction in milk production during heat stress: 4.71%

Selected features: ['MeanTemperature', 'cum_HW']
Best parameters: {'lambda': 0.2256663498326168, 'alpha': 0.00023958171093556817, 'subsample': 0.9854525789899331, 'colsample_bytree': 0.9772673635650675, 'learning_rate': 0.002128098344781793, 'n_estimators': 742, 'max_depth': 11}
Final model performance: MSE = 0.0605 ± 0.0036
Mean Squared Error on test set: 0.0606
Estimated average milk production during heat stress: 0.9512
Reduction in milk production durin

In [None]:
# Create a new DataFrame row for the current farm
new_row = pd.DataFrame({
    'FarmName_Pseudo': [farm_id],
    'Relative Change HeatStress = 1 (%)': [reduction],
})

# Drop any earlier row for this farm before appending, so that re-running the
# cell replaces the farm's entry instead of adding a duplicate
other_farms = results_df[results_df['FarmName_Pseudo'] != farm_id]
results_df = pd.concat([other_farms, new_row], ignore_index=True)

results_df

Unnamed: 0,FarmName_Pseudo,Relative Change HeatStress = 1 (%)
0,a624fb9a,7.164465
1,5c06d92d,21.085694
2,752efd72,10.030104
3,f454e660,4.716855


In [None]:
# Flip the sign of the stored percentages so that a reduction becomes a negative
# relative change. Running this cell a second time flips the sign back.
change_pct_col = 'Relative Change HeatStress = 1 (%)'
results_df[change_pct_col] = results_df[change_pct_col].mul(-1)
results_df

Unnamed: 0,FarmName_Pseudo,Relative Change HeatStress = 1 (%)
0,a624fb9a,-7.164465
1,5c06d92d,-21.085694
2,752efd72,-10.030104
3,f454e660,-4.716855


In [None]:
# Convert the percentage values to fractions (value / 100)
results_df['Relative Change HeatStress = 1'] = results_df['Relative Change HeatStress = 1 (%)'] / 100

# Lookup table keyed by (farm, HeatStress flag); only the HeatStress == 1 case is stored.
# Built by pairing the two columns directly instead of looping over iterrows().
heatstress_change_dict = {
    (farm, 1): change
    for farm, change in zip(
        results_df['FarmName_Pseudo'],
        results_df['Relative Change HeatStress = 1'],
    )
}

In [None]:
# Row-wise lookup of the farm's relative change under heat stress
def assign_relative_change(row):
    """Return the stored relative change for a heat-stressed row, else NaN.

    `row` must support item access for 'HeatStress' and 'FarmName_Pseudo'.
    The value is looked up in the module-level `heatstress_change_dict` under
    the key (farm, 1); NaN is returned when the row is not under heat stress
    or when the farm has no entry.
    """
    heat_stressed = row['HeatStress'] == 1
    if not heat_stressed:
        return np.nan

    lookup_key = (row['FarmName_Pseudo'], 1)
    return heatstress_change_dict.get(lookup_key, np.nan)

# Farm -> relative change, taken from the HeatStress == 1 entries of the lookup table
farm_to_change = {
    farm: change
    for (farm, flag), change in heatstress_change_dict.items()
    if flag == 1
}

# Rows under heat stress; a missing HeatStress value is treated as "not stressed"
is_heat_stressed = milk_data['HeatStress'].eq(1).fillna(False).astype(bool)

# Create the new column with a vectorized lookup instead of a row-wise apply:
# map each farm to its change, then keep the value only on heat-stressed rows
# (all other rows, and farms without an entry, get NaN)
milk_data['FarmHeatStressMilkProduction'] = (
    milk_data['FarmName_Pseudo']
    .map(farm_to_change)
    .astype('float64')
    .where(is_heat_stressed)
)

# Display the first few rows to verify the changes
milk_data.head()

Unnamed: 0,Date,FarmName_Pseudo,SE_Number,Age,LactationNumber,DaysInMilk,YearSeason,DailyYield,PreviousDailyYield,DailyYieldChange,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatStress,Temp15Threshold,HW,cum_HW,MeanTemperature,MeanTHI_adj,FarmHeatStressMilkProduction
0,2022-01-01,a624fb9a,SE-064c0cec-1189,3095,7,191,2022-1,30.77,0.0,0.0,29.739372,1.034655,0.0,0,0,0,0,-3.025,28.012944,
1,2022-01-02,a624fb9a,SE-064c0cec-1189,3096,7,192,2022-1,48.22,30.77,17.45,29.692059,1.624003,0.587699,0,0,0,0,-0.279167,32.898193,
2,2022-01-03,a624fb9a,SE-064c0cec-1189,3097,7,193,2022-1,30.53,48.22,-17.69,29.644756,1.029862,-0.596733,0,0,0,0,2.033333,36.760487,
3,2022-01-04,a624fb9a,SE-064c0cec-1189,3098,7,194,2022-1,42.26,30.53,11.73,29.597463,1.427825,0.396318,0,0,0,0,0.066667,31.939524,
4,2022-01-05,a624fb9a,SE-064c0cec-1189,3099,7,195,2022-1,38.49,42.26,-3.77,29.550181,1.30253,-0.12758,0,0,0,0,-3.7,26.498206,


In [None]:
# Write the augmented frame next to the input file, without the index column
output_csv_path = '../Data/MergedData/XGBMilkFarmYieldData.csv'
milk_data.to_csv(output_csv_path, index=False)

### Variables Explanation for `XGBMilkFarmYieldData.csv`

1. **Date**:
   - Description: The date when the milk yield was recorded.
   - Datatype: `datetime`
   - Format: `YYYY-MM-DD`
   - Example: `2022-01-01`

2. **FarmName_Pseudo**:
   - Description: A pseudo-identifier for the farm where the data was collected.
   - Datatype: `str`
   - Example: `a624fb9a`

3. **SE_Number**:
   - Description: A unique identifier for the cow, which has been formatted to include the farm and the animal number.
   - Datatype: `str`
   - Example: `SE-064c0cec-1189`

4. **Age**:
   - Description: The age of the cow in days.
   - Datatype: `Int64`
   - Example: `3095`

5. **BreedName**:
   - Description: The breed name of the cow.
   - Datatype: `str`
   - Example: `02 SLB`

6. **LactationNumber**:
   - Description: The number assigned to the cow's lactation cycle.
   - Datatype: `Int64`
   - Example: `7`

7. **DaysInMilk**:
   - Description: The number of days the cow has been in milk (lactating) at the time of recording.
   - Datatype: `Int64`
   - Example: `191`

8. **YearSeason**:
   - Description: The seasonal period based on the year and the month range.
   - Datatype: `str`
   - Example: `2022-1`
   - YearSeason parameters in yield datasets:
     - 1: Dec-Feb
     - 2: Mar-May
     - 3: Jun-Aug
     - 4: Sep-Nov

9. **DailyYield**:
   - Description: The total amount of milk produced by the cow in a single day.
   - Datatype: `float`
   - Example: `30.77`

10. **PreviousDailyYield**:
    - Description: The total amount of milk produced by the cow on the previous day.
    - Datatype: `float`
    - Example: `0.0`

11. **DailyYieldChange**:
    - Description: The change in daily milk yield from the previous day.
    - Datatype: `float`
    - Example: `0.0`

12. **ExpectedYield**:
    - Description: The expected amount of milk yield based on certain models or predictions.
    - Datatype: `float`
    - Example: `35.914865`

13. **NormalizedDailyYield**:
    - Description: The daily yield normalized to account for various factors.
    - Datatype: `float`
    - Example: `0.856748`

14. **NormalizedDailyYieldChange**:
    - Description: The change in normalized daily yield from the previous day.
    - Datatype: `float`
    - Example: `0.0`

15. **HeatStress**:
    - Description: A binary variable indicating the presence of heat stress on the cow.
    - Datatype: `Int64`
    - Example: `0`

16. **Temp15Threshold**:
    - Description: A binary variable indicating if the temperature exceeded 15 degrees Celsius on the given day.
    - Datatype: `Int64`
    - Example: `0`

17. **HW**:
    - Description: A binary variable indicating the presence of a heatwave on the day.
    - Datatype: `Int64`
    - Example: `0`

18. **cum_HW**:
    - Description: Cumulative number of heatwave days up to the current date.
    - Datatype: `Int64`
    - Example: `0`

19. **MeanTemperature**:
    - Description: The mean temperature recorded on the day.
    - Datatype: `float`
    - Example: `-3.025`

20. **MeanTHI_adj**:
    - Description: The mean adjusted Temperature-Humidity Index for the day.
    - Datatype: `float`
    - Example: `28.012944`

21. **FarmHeatStressMilkProduction**:
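    - Description: The farm-level relative change in normalized milk yield under heat stress, expressed as a fraction (percentage / 100; negative values indicate a reduction). It is populated only for rows where `HeatStress` is 1 and the farm has an estimate; all other rows are `NaN`.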
    - Datatype: `float`
    - Example: `0.009435`