In [2]:
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Set the Optuna logger to output only WARNING and higher levels
# (suppresses the per-trial INFO lines during hyperparameter search).
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Apply seaborn's default theme and the "notebook" plotting context to all figures.
sns.set_theme()
sns.set_context("notebook")
# IPython magics (not plain Python): load the autoreload extension and switch it to
# mode 2, which re-imports modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Explicit dtypes for the merged fertility / weather CSV.
# Date columns are read as strings first and converted to datetime below;
# integer columns use pandas' nullable 'Int64' so missing values are allowed.
dtype_dict = {
    'SE_Number': 'str',
    'FarmName_Pseudo': 'str',
    'InseminationDate': 'str',
    'CalvingDate': 'str',
    'Breeder': 'Int64',
    'BreedName': 'str',
    'BirthDate': 'str',
    'YearSeason': 'str',
    'Mother': 'str',
    'Father': 'str',
    'CalvingSireBullID': 'str',
    'CalvingEase': 'str',
    'LactationNumber': 'Int64',
    'PrevInsemination': 'str',
    'NextInsemination': 'str',
    'NINS': 'Int64',
    'NextCalving': 'str',
    'FirstInsemination': 'str',
    'LastInsemination': 'str',
    'FLI': 'Int64',
    'NextFirstInsemination': 'str',
    'NextLastInsemination': 'str',
    'CFI': 'Int64',
    'CLI': 'Int64',
    'GL': 'Int64',
    'CI': 'Int64',
    'PregnancyCheck': 'Int64',
    'MeanTemperature': 'float',
    'MeanRelativeHumidity': 'float',
    'MeanTHI_adj': 'float',
    'HW': 'Int64',
    # The column in the loaded frame is spelled 'cum_HW' (lower-case 'c'), which is
    # also the name used by the feature lists further down. The previous key
    # 'Cum_HW' matched no column, so this dtype was never applied.
    'cum_HW': 'Int64',
    'MaxTemp15Threshold': 'Int64',
    'HeatStress': 'Int64'
}

# Load the data using the dtype_dict
data = pd.read_csv('../Data/MergedData/HeatStressFertilityData.csv', dtype=dtype_dict)

# Convert the date columns to datetime
date_columns = [
    'InseminationDate',
    'CalvingDate',
    'BirthDate',
    'PrevInsemination',
    'NextInsemination',
    'NextCalving',
    'FirstInsemination',
    'LastInsemination',
    'NextFirstInsemination',
    'NextLastInsemination'
]

# errors='coerce' turns unparseable or missing dates into NaT instead of raising.
for column in date_columns:
    data[column] = pd.to_datetime(data[column], errors='coerce')

# Drop rows with a missing PregnancyCheck or NINS value
data = data.dropna(subset=['PregnancyCheck', 'NINS'])

# head(-5) returns every row except the last five; the notebook display truncates it.
data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,InseminationDate,CalvingDate,Breeder,YearSeason,BreedName,BirthDate,Mother,Father,...,GL,CI,PregnancyCheck,MeanTemperature,MeanRelativeHumidity,MeanTHI_adj,HW,cum_HW,MaxTemp15Threshold,HeatStress
0,SE-064c0cec-1189,a624fb9a,2022-07-06,NaT,2746,2022-3,02 SLB,2013-07-12,,,...,,,0,16.520833,0.686333,63.101586,0,0,1,0
1,SE-064c0cec-1189,a624fb9a,2022-07-06,NaT,2746,2022-3,02 SLB,2013-07-12,,,...,,,0,16.520833,0.686333,63.101586,0,0,1,0
2,SE-064c0cec-1189,a624fb9a,2022-07-06,NaT,2746,2022-3,02 SLB,2013-07-12,,,...,,,0,16.520833,0.686333,63.101586,0,0,1,0
3,SE-064c0cec-1189,a624fb9a,2022-07-06,NaT,2746,2022-3,02 SLB,2013-07-12,,,...,,,0,16.520833,0.686333,63.101586,0,0,1,0
4,SE-064c0cec-1189,a624fb9a,2022-07-06,NaT,2746,2022-3,02 SLB,2013-07-12,,,...,,,0,16.520833,0.686333,63.101586,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685386,SE-afdd9a78-1247,afdd9a78,2023-01-23,NaT,4548,2023-1,01 SRB,2021-06-29,SE-afdd9a78-1179,03-9923,...,,,1,-3.758333,0.939917,26.175930,0,0,0,0
685387,SE-afdd9a78-1248,afdd9a78,2023-03-20,NaT,4548,2023-2,01 SRB,2021-07-21,SE-afdd9a78-1192,09-6237,...,,,1,3.537500,0.975167,39.238674,0,0,0,0
685388,SE-afdd9a78-1249,afdd9a78,2022-12-08,NaT,9252,2022-1,01 SRB,2021-07-28,SE-afdd9a78-1113,22-8035,...,,,1,-9.512500,0.899875,19.766537,0,0,0,0
685389,SE-afdd9a78-1252,afdd9a78,2023-01-04,NaT,4548,2023-1,01 SRB,2021-08-17,SE-afdd9a78-1191,03-9920,...,,,0,-6.200000,0.920417,18.947200,0,0,0,0


In [4]:
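# NOTE: this entire cell is commented out (disabled). It is the regression variant of the
# analysis (target 'NINS', XGBRegressor scored by MSE); the active cell below models
# 'PregnancyCheck' with a classifier instead.
# # Specify the farm ID for analysis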
# farm_id = 'a624fb9a'

# # Filter data for the specific farm
# farm_data = data[data['FarmName_Pseudo'] == farm_id]

# # Remove rows where NINS is NaN
# farm_data = farm_data.dropna(subset=['NINS'])

# # Calculate the baseline (mean of NINS for all data at the farm)
# baseline_mean = farm_data['NINS'].mean()
# print(f"Baseline mean of NINS for all days: {baseline_mean:.4f}")

# target = 'NINS'

# # Define the feature combinations
# feature_combinations = [
#     ['MeanTemperature', 'HW'],
#     ['MeanTemperature', 'cum_HW'],
#     ['MeanTHI_adj', 'HW'],
#     ['MeanTHI_adj', 'cum_HW']
# ]

# # Function to optimize hyperparameters using Optuna
# def objective(trial):
#     param = {
#         'verbosity': 0,
#         'objective': 'reg:squarederror',
#         'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
#         'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
#         'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
#         'max_depth': trial.suggest_int('max_depth', 1, 15),
#     }

#     # Cross-validation to estimate performance
#     model = xgb.XGBRegressor(**param)
#     scores = cross_val_score(model, train_heatstress[features], train_heatstress[target], cv=5, scoring='neg_mean_squared_error')
#     mse = -scores.mean()
#     return mse

# # Function to fit the model and print results
# def fit_xgboost_and_print_results(train_data, test_data, features, baseline_mean):
#     print(f"\nSelected features: {features}")

#     # Optimize hyperparameters using Optuna
#     study = optuna.create_study(direction='minimize')
#     study.optimize(objective, n_trials=100, timeout=600)
#     best_params = study.best_params
#     print(f"Best parameters: {best_params}")

#     # Train final model using the best parameters
#     best_model = xgb.XGBRegressor(**best_params)
#     best_model.fit(train_data[features], train_data[target])

#     # K-Fold Cross-Validation for final model evaluation
#     final_scores = cross_val_score(best_model, train_data[features], train_data[target], cv=5, scoring='neg_mean_squared_error')
#     final_mse = -final_scores.mean()
#     final_std = final_scores.std()
#     print(f"Final model performance: MSE = {final_mse:.4f} ± {final_std:.4f}")

#     # Predictions and evaluation on the test set
#     y_pred = best_model.predict(test_data[features])
#     mse = mean_squared_error(test_data[target], y_pred)
#     print(f"Mean Squared Error on test set: {mse:.4f}")

#     # Estimation of average NINS during the condition
#     average_nins_condition = y_pred.mean()
#     print(f"Estimated average NINS: {average_nins_condition:.4f}")

#     # Calculate the change compared to baseline
#     change_percentage = ((average_nins_condition - baseline_mean) / baseline_mean) * 100
#     print(f"Change in NINS: {change_percentage:.2f}%")

#     return average_nins_condition, change_percentage, mse

# # Initialize variables to track the best model for each heat stress condition
# best_mse_no_heatstress = float('inf')
# best_model_results_no_heatstress = {}

# best_mse_heatstress = float('inf')
# best_model_results_heatstress = {}

# # Analyze data for each heat stress condition and feature combination
# for heatstress_condition in [0, 1]:
#     condition_label = 'no heat stress' if heatstress_condition == 0 else 'during heat stress'
#     data_heatstress = farm_data[farm_data['HeatStress'] == heatstress_condition]
#     train_heatstress, test_heatstress = train_test_split(data_heatstress, test_size=0.3, random_state=42)
    
#     scaler = StandardScaler()
#     train_heatstress[['MeanTemperature', 'MeanTHI_adj']] = scaler.fit_transform(train_heatstress[['MeanTemperature', 'MeanTHI_adj']])
#     test_heatstress[['MeanTemperature', 'MeanTHI_adj']] = scaler.transform(test_heatstress[['MeanTemperature', 'MeanTHI_adj']])

#     for features in feature_combinations:
#         avg_nins_condition, change, mse = fit_xgboost_and_print_results(train_heatstress, test_heatstress, features, baseline_mean)

#         # Check if this model has the lowest MSE for the current condition
#         if heatstress_condition == 0:
#             if mse < best_mse_no_heatstress:
#                 best_mse_no_heatstress = mse
#                 best_model_results_no_heatstress = {
#                     'FarmName_Pseudo': farm_id,
#                     'Change in NINS HeatStress = 0 (%)': change
#                 }
#         else:
#             if mse < best_mse_heatstress:
#                 best_mse_heatstress = mse
#                 best_model_results_heatstress = {
#                     'FarmName_Pseudo': farm_id,
#                     'Change in NINS HeatStress = 1 (%)': change
#                 }

# # Combine the best results into the final DataFrame
# results_df = pd.DataFrame([{
#     'FarmName_Pseudo': farm_id,
#     'Change in NINS HeatStress = 0 (%)': best_model_results_no_heatstress.get('Change in NINS HeatStress = 0 (%)', None),
#     'Change in NINS HeatStress = 1 (%)': best_model_results_heatstress.get('Change in NINS HeatStress = 1 (%)', None)
# }])

In [7]:
# Farm whose records are analysed in this cell
farm_id = 'a624fb9a'

# Restrict the full dataset to that farm, keeping only rows with a 0/1 PregnancyCheck
farm_data = data.loc[
    data['FarmName_Pseudo'].eq(farm_id) & data['PregnancyCheck'].isin([0, 1])
]

# Guard: the classifier below requires a strictly binary target
assert farm_data['PregnancyCheck'].isin([0, 1]).all(), "PregnancyCheck should only contain 0 or 1."

# Target column and the four candidate feature sets:
# one climate measure x one heat-wave measure, plus the HeatStress flag.
# NOTE(review): the modelling loop filters rows by HeatStress before fitting, so this
# flag is constant within each fitted subset - confirm it is meant to be a feature.
target = 'PregnancyCheck'
feature_combinations = [
    [climate_feature, heatwave_feature, 'HeatStress']
    for climate_feature in ('MeanTemperature', 'MeanTHI_adj')
    for heatwave_feature in ('HW', 'cum_HW')
]

# Optuna objective: cross-validated ROC AUC of an XGBoost classifier
def objective_classification(trial, X=None, y=None):
    """Score one Optuna trial by mean 5-fold cross-validated ROC AUC.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the XGBoost hyperparameters.
    X, y : optional
        Features and binary target to cross-validate on. If one is omitted, it is
        taken from the notebook globals ``train_heatstress[features]`` /
        ``train_heatstress[target]``. That was the only behaviour before these
        parameters existed and is kept for backward compatibility.

    Returns
    -------
    float
        Mean ROC AUC over the folds (higher is better).
    """
    # Legacy fallback: read the data from the variables set by the analysis loop.
    if X is None:
        X = train_heatstress[features]
    if y is None:
        y = train_heatstress[target]

    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
    }

    model = xgb.XGBClassifier(**param)
    scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    return scores.mean()


# Function to fit the model and print results
def fit_xgboost_and_print_results(train_data, test_data, features):
    """Tune, fit and evaluate an XGBoost classifier on one feature set.

    Hyperparameters are tuned with Optuna on ``train_data`` (maximising the
    cross-validated ROC AUC), a final model is fitted on ``train_data`` and then
    evaluated on ``test_data``. Progress and metrics are printed.

    Parameters
    ----------
    train_data, test_data : DataFrame
        Must contain the ``features`` columns and the column named by the
        notebook-level ``target`` variable.
    features : list of str
        Feature column names to use.

    Returns
    -------
    tuple
        ``(average_pregnancy_probability, roc_auc)``: the mean predicted
        positive-class probability on the test set and the test-set ROC AUC.
    """
    print(f"\nSelected features: {features}")

    X_train = train_data[features]
    y_train = train_data[target]

    # Optimize hyperparameters using Optuna.
    # The sampler is seeded so the sequence of suggested parameters is repeatable,
    # and the objective gets this function's own data instead of notebook globals.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(
        lambda trial: objective_classification(trial, X_train, y_train),
        n_trials=100,
        timeout=600,
    )
    best_params = study.best_params
    print(f"Best parameters: {best_params}")

    # Train final model using the best parameters.
    # best_params only holds the sampled values, so the fixed settings used during
    # tuning are passed again explicitly.
    best_model = xgb.XGBClassifier(objective='binary:logistic', verbosity=0, **best_params)
    best_model.fit(X_train, y_train)

    # K-Fold Cross-Validation for final model evaluation
    final_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='roc_auc')
    final_roc_auc = final_scores.mean()
    final_std = final_scores.std()
    print(f"Final model performance: ROC AUC = {final_roc_auc:.4f} ± {final_std:.4f}")

    # Predictions and evaluation on the test set (column 1 = probability of class 1)
    y_pred_proba = best_model.predict_proba(test_data[features])[:, 1]
    roc_auc = roc_auc_score(test_data[target], y_pred_proba)
    print(f"ROC AUC on test set: {roc_auc:.4f}")

    # Estimation of average Pregnancy probability during the condition
    average_pregnancy_probability = y_pred_proba.mean()
    print(f"Estimated average pregnancy probability: {average_pregnancy_probability:.4f}")

    return average_pregnancy_probability, roc_auc

# Initialize variables to track the best model for each heat stress condition
best_roc_auc_no_heatstress = 0
best_model_results_no_heatstress = {}

best_roc_auc_heatstress = 0
best_model_results_heatstress = {}

# Analyze data for each heat stress condition and feature combination
for heatstress_condition in [0, 1]:
    condition_label = 'no heat stress' if heatstress_condition == 0 else 'during heat stress'
    print(f"\n=== Condition: {condition_label} (HeatStress = {heatstress_condition}) ===")

    data_heatstress = farm_data[farm_data['HeatStress'] == heatstress_condition]
    # One fixed split per condition; kept unscaled so each feature set starts from raw values.
    train_unscaled, test_unscaled = train_test_split(data_heatstress, test_size=0.3, random_state=42)

    for features in feature_combinations:
        # Work on fresh copies. Scaling in place on the split output would re-fit the
        # scaler on columns already scaled by an earlier feature combination, and
        # writing into the split frames can trigger pandas' SettingWithCopyWarning.
        # The names train_heatstress / features are kept because the Optuna objective
        # may read them as globals.
        train_heatstress = train_unscaled.copy()
        test_heatstress = test_unscaled.copy()

        # Fit the scaler on the training rows only, then apply it to the test rows.
        scaler = StandardScaler()
        train_heatstress[features] = scaler.fit_transform(train_heatstress[features])
        test_heatstress[features] = scaler.transform(test_heatstress[features])

        avg_pregnancy_probability, roc_auc = fit_xgboost_and_print_results(train_heatstress, test_heatstress, features)

        # Check if this model has the best ROC AUC for the current condition.
        # NOTE(review): the winning feature set is picked by test-set ROC AUC, so that
        # score is optimistic as an estimate of out-of-sample performance.
        if heatstress_condition == 0:
            if roc_auc > best_roc_auc_no_heatstress:
                best_roc_auc_no_heatstress = roc_auc
                best_model_results_no_heatstress = {
                    'FarmName_Pseudo': farm_id,
                    'Avg Pregnancy Probability HeatStress = 0': avg_pregnancy_probability
                }
        else:
            if roc_auc > best_roc_auc_heatstress:
                best_roc_auc_heatstress = roc_auc
                best_model_results_heatstress = {
                    'FarmName_Pseudo': farm_id,
                    'Avg Pregnancy Probability HeatStress = 1': avg_pregnancy_probability
                }

# Combine the best results into the final one-row DataFrame
# (a value is None when no model was selected for that condition).
results_df = pd.DataFrame([{
    'FarmName_Pseudo': farm_id,
    'Avg Pregnancy Probability HeatStress = 0': best_model_results_no_heatstress.get('Avg Pregnancy Probability HeatStress = 0', None),
    'Avg Pregnancy Probability HeatStress = 1': best_model_results_heatstress.get('Avg Pregnancy Probability HeatStress = 1', None)
}])

results_df


Selected features: ['MeanTemperature', 'HW', 'HeatStress']
Best parameters: {'lambda': 2.238440497507435e-07, 'alpha': 0.0025785609316005846, 'subsample': 0.8090512799887943, 'colsample_bytree': 0.5823783558723196, 'learning_rate': 0.04452991111440105, 'n_estimators': 618, 'max_depth': 9}
Final model performance: ROC AUC = 0.9336 ± 0.0040
ROC AUC on test set: 0.9374
Estimated average pregnancy probability: 0.5047

Selected features: ['MeanTemperature', 'cum_HW', 'HeatStress']
Best parameters: {'lambda': 0.012487880288248944, 'alpha': 0.5707295094426696, 'subsample': 0.8154372614663399, 'colsample_bytree': 0.7839616526116695, 'learning_rate': 0.017295541051614603, 'n_estimators': 403, 'max_depth': 13}
Final model performance: ROC AUC = 0.9336 ± 0.0040
ROC AUC on test set: 0.9377
Estimated average pregnancy probability: 0.5049

Selected features: ['MeanTHI_adj', 'HW', 'HeatStress']
Best parameters: {'lambda': 0.0059758634547327945, 'alpha': 0.08327997385637813, 'subsample': 0.7858836936

Unnamed: 0,FarmName_Pseudo,Avg Pregnancy Probability HeatStress = 0,Avg Pregnancy Probability HeatStress = 1
0,a624fb9a,0.504901,0.551235


In [8]:
# Optuna objective: cross-validated MSE of an XGBoost regressor
def objective_regression(trial, X=None, y=None):
    """Score one Optuna trial by mean 5-fold cross-validated mean squared error.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the XGBoost hyperparameters.
    X, y : optional
        Features and target to cross-validate on. If one is omitted, it is taken
        from the notebook globals ``train_data[features]`` / ``train_data[target]``.
        That was the only behaviour before these parameters existed and is kept
        for backward compatibility.

    Returns
    -------
    float
        Mean MSE over the folds (lower is better).
    """
    # Legacy fallback: read the data from the variables set by the calling loop.
    if X is None:
        X = train_data[features]
    if y is None:
        y = train_data[target]

    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
    }

    model = xgb.XGBRegressor(**param)
    # The scorer returns negated MSE, so flip the sign back to a positive error.
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    mse = -scores.mean()
    return mse

# Fit and evaluate an XGBoost regressor for each feature combination.
# NOTE(review): `successful_pregnancies` is not defined in any visible cell, and the
# recorded output of this cell is a NameError. It must be defined before this cell
# can run - TODO confirm the intended definition.
# NOTE(review): `target` and `feature_combinations` are reused from the classification
# cell, where target is 'PregnancyCheck' - confirm this is the intended regression target.
for features in feature_combinations:
    # Re-split on every iteration, so each feature set starts from unscaled data.
    train_data, test_data = train_test_split(successful_pregnancies, test_size=0.3, random_state=42)

    # Fit the scaler on the training rows only, then apply it to the test rows.
    scaler = StandardScaler()
    train_data[features] = scaler.fit_transform(train_data[features])
    test_data[features] = scaler.transform(test_data[features])

    # Called without explicit data, objective_regression reads the globals
    # train_data / features / target set by this loop.
    study = optuna.create_study(direction='minimize')
    study.optimize(objective_regression, n_trials=100, timeout=600)
    best_params = study.best_params

    # Refit with the best sampled parameters and report the test-set MSE.
    model = xgb.XGBRegressor(**best_params)
    model.fit(train_data[features], train_data[target])
    y_pred = model.predict(test_data[features])
    mse = mean_squared_error(test_data[target], y_pred)
    print(f"MSE for {features}: {mse:.4f}")


NameError: name 'successful_pregnancies' is not defined