In [25]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import optuna
import numpy as np

# Set the Optuna logger to output only WARNING and higher levels
optuna.logging.set_verbosity(optuna.logging.WARNING)

sns.set_theme()
sns.set_context("notebook")
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
dtype_dict = {
    'SE_Number': 'str',
    'FarmName_Pseudo': 'str',
    'InseminationDate': 'str',
    'CalvingDate': 'str',
    'Breeder': 'Int64',
    'BreedName': 'str',
    'BirthDate': 'str',
    'YearSeason': 'str',
    'Mother': 'str',
    'Father': 'str',
    'CalvingSireBullID': 'str',
    'CalvingEase': 'str',
    'LactationNumber': 'Int64',
    'PrevInsemination': 'str',
    'NextInsemination': 'str',
    'NINS': 'Int64',
    'NextCalving': 'str',
    'FirstInsemination': 'str',
    'LastInsemination': 'str',
    'FLI': 'Int64',
    'NextFirstInsemination': 'str',
    'NextLastInsemination': 'str',
    'CFI': 'Int64',
    'CLI': 'Int64',
    'GL': 'Int64',
    'CI': 'Int64',
    'MeanTemperature': 'float',
    'MeanRelativeHumidity': 'float',
    'MeanTHI_adj': 'float',
    'HW': 'Int64',
    'cum_HW': 'Int64',
    'MaxTemp15Threshold': 'Int64',
    'HeatStress': 'Int64'
}

# Load the data using the dtype_dict
data = pd.read_csv('../Data/MergedData/HeatStressFertilityData.csv', dtype=dtype_dict)

# Convert the date columns to datetime
date_columns = [
    'InseminationDate',
    'CalvingDate',
    'BirthDate',
    'PrevInsemination',
    'NextInsemination',
    'NextCalving',
    'FirstInsemination',
    'LastInsemination',
    'NextFirstInsemination',
    'NextLastInsemination'
]

for column in date_columns:
    data[column] = pd.to_datetime(data[column], errors='coerce')

# Filter out rows where NINS is NaN
data = data.dropna(subset=['NINS'])

data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,InseminationDate,CalvingDate,Breeder,YearSeason,BreedName,BirthDate,Mother,Father,...,CLI,GL,CI,MeanTemperature,MeanRelativeHumidity,MeanTHI_adj,HW,cum_HW,MaxTemp15Threshold,HeatStress
0,SE-169e580a-3766,169e580a,2022-01-01,NaT,8531,2022-1,02 SLB,2017-01-17,SE-169e580a-3083,9-7543 ColludeX,...,,,,4.541667,0.984417,36.238206,0,0,0,0
1,SE-169e580a-3948,169e580a,2022-01-01,2022-11-05,1412,2022-1,02 SLB,2017-09-02,SE-169e580a-2919,9-7603 Fransisc,...,,281,,4.541667,0.984417,36.238206,0,0,0,0
2,SE-169e580a-4555,169e580a,2022-01-01,2022-10-08,8531,2022-1,02 SLB,2018-10-22,SE-169e580a-3610,9-3891Dragonhea,...,53,280,,4.541667,0.984417,36.238206,0,0,0,0
3,SE-169e580a-4628,169e580a,2022-01-01,2022-10-02,1412,2022-1,02 SLB,2018-12-12,SE-169e580a-3420,9-7749 Foul,...,121,274,,4.541667,0.984417,36.238206,0,0,0,0
4,SE-f454e660-729,f454e660,2022-01-01,2023-04-19,1423,2022-1,04 SJB,2020-09-25,SE-f454e660-433,Luxi 9-4471,...,,283,,1.479167,0.909500,33.479002,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468605,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0
468606,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0
468607,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0
468608,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0


In [27]:
# Specify the farm ID for analysis
farm_id = 'a624fb9a'

# Filter data for the specific farm
farm_data = data[data['FarmName_Pseudo'] == farm_id]

# Remove rows where NINS is NaN
farm_data = farm_data.dropna(subset=['NINS'])

# Calculate the baseline (mean of NINS for all data at the farm)
baseline_mean = farm_data['NINS'].mean()
print(f"Baseline mean of NINS for all days: {baseline_mean:.4f}")

# Define the feature combinations
feature_combinations = [
    ['MeanTemperature', 'HW'],
    ['MeanTemperature', 'cum_HW'],
    ['MeanTHI_adj', 'HW'],
    ['MeanTHI_adj', 'cum_HW']
]

# Function to optimize hyperparameters using Optuna
def objective(trial):
    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
    }

    # Cross-validation to estimate performance
    model = xgb.XGBRegressor(**param)
    scores = cross_val_score(model, train_heatstress[features], train_heatstress[target], cv=5, scoring='neg_mean_squared_error')
    mse = -scores.mean()
    return mse

# Function to fit the model and print results
def fit_xgboost_and_print_results(train_data, test_data, features, baseline_mean):
    print(f"\nSelected features: {features}")

    # Optimize hyperparameters using Optuna
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100, timeout=600)
    best_params = study.best_params
    print(f"Best parameters: {best_params}")

    # Train final model using the best parameters
    best_model = xgb.XGBRegressor(**best_params)
    best_model.fit(train_data[features], train_data[target])

    # K-Fold Cross-Validation for final model evaluation
    final_scores = cross_val_score(best_model, train_data[features], train_data[target], cv=5, scoring='neg_mean_squared_error')
    final_mse = -final_scores.mean()
    final_std = final_scores.std()
    print(f"Final model performance: MSE = {final_mse:.4f} ± {final_std:.4f}")

    # Predictions and evaluation on the test set
    y_pred = best_model.predict(test_data[features])
    mse = mean_squared_error(test_data[target], y_pred)
    print(f"Mean Squared Error on test set: {mse:.4f}")

    # Estimation of average NINS during the condition
    average_nins_condition = y_pred.mean()
    print(f"Estimated average NINS: {average_nins_condition:.4f}")

    # Calculate the change compared to baseline
    change_percentage = ((average_nins_condition - baseline_mean) / baseline_mean) * 100
    print(f"Change in NINS: {change_percentage:.2f}%")

    return average_nins_condition, change_percentage, mse

# Initialize variables to track the best model for each heat stress condition
best_mse_no_heatstress = float('inf')
best_model_results_no_heatstress = {}

best_mse_heatstress = float('inf')
best_model_results_heatstress = {}

# Analyze data for each heat stress condition and feature combination
for heatstress_condition in [0, 1]:
    condition_label = 'no heat stress' if heatstress_condition == 0 else 'during heat stress'
    data_heatstress = farm_data[farm_data['HeatStress'] == heatstress_condition]
    train_heatstress, test_heatstress = train_test_split(data_heatstress, test_size=0.3, random_state=42)
    
    scaler = StandardScaler()
    train_heatstress[['MeanTemperature', 'MeanTHI_adj']] = scaler.fit_transform(train_heatstress[['MeanTemperature', 'MeanTHI_adj']])
    test_heatstress[['MeanTemperature', 'MeanTHI_adj']] = scaler.transform(test_heatstress[['MeanTemperature', 'MeanTHI_adj']])

    for features in feature_combinations:
        avg_nins_condition, change, mse = fit_xgboost_and_print_results(train_heatstress, test_heatstress, features, baseline_mean)

        # Check if this model has the lowest MSE for the current condition
        if heatstress_condition == 0:
            if mse < best_mse_no_heatstress:
                best_mse_no_heatstress = mse
                best_model_results_no_heatstress = {
                    'FarmName_Pseudo': farm_id,
                    'Change in NINS HeatStress = 0 (%)': change
                }
        else:
            if mse < best_mse_heatstress:
                best_mse_heatstress = mse
                best_model_results_heatstress = {
                    'FarmName_Pseudo': farm_id,
                    'Change in NINS HeatStress = 1 (%)': change
                }

# Combine the best results into the final DataFrame
results_df = pd.DataFrame([{
    'FarmName_Pseudo': farm_id,
    'Change in NINS HeatStress = 0 (%)': best_model_results_no_heatstress.get('Change in NINS HeatStress = 0 (%)', None),
    'Change in NINS HeatStress = 1 (%)': best_model_results_heatstress.get('Change in NINS HeatStress = 1 (%)', None)
}])

Baseline mean of NINS for all days: 2.7443

Selected features: ['MeanTemperature', 'HW']


[W 2024-07-31 13:48:21,638] Trial 32 failed with parameters: {'lambda': 5.148726767881126e-08, 'alpha': 2.052816068006168e-06, 'subsample': 0.8840903840435642, 'colsample_bytree': 0.5579733752869998, 'learning_rate': 0.05868970367368797, 'n_estimators': 325, 'max_depth': 13} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/user/anaconda3/envs/GIGACOW/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/n0/fphw_xw93vv749r_ntt01qd80000gn/T/ipykernel_4030/1364121548.py", line 38, in objective
    scores = cross_val_score(model, train_heatstress[features], train_heatstress[target], cv=5, scoring='neg_mean_squared_error')
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/user/anaconda3/envs/GIGACOW/lib/python3.11/site-packages/sk

KeyboardInterrupt: 