In [1]:
import gc
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# Re-create the `np.int` / `np.bool` aliases on the numpy module.
# NOTE(review): presumably a compatibility shim for a library that still
# references these removed aliases (scikit-optimize?) — confirm, and drop it
# once that dependency is upgraded.
np.int = int
np.bool = bool

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_absolute_error, max_error, mean_absolute_percentage_error, mean_squared_error, r2_score, median_absolute_error
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV

In [2]:

# Read the raw spreadsheet; the path is relative to the notebook's working directory.
raw_path = "../dummy_data.xlsx"
scans = pd.read_excel(raw_path)
print(scans.shape)

(550, 135)


In [3]:
# Derive a working frame with upper-cased protocol labels; `scans` itself is left untouched.
carbon = scans.assign(protocol_map=scans['protocol_map'].str.upper())

In [4]:
# Keep only rows whose trigger is 'HCG'.
is_hcg_trigger = carbon['trigger_map'].isin(['HCG'])
carbon = carbon.loc[is_hcg_trigger]

# From those, keep rows with a mature-egg count above -1 and an age at
# collection of at most 35 (rows with a missing value fail either test).
has_mature_count = carbon['No. Mature Eggs'] > -1
is_age_leq_35 = carbon['Age at Egg Collection'] <= 35.0
mat_scans = carbon.loc[has_mature_count & is_age_leq_35]

print(mat_scans.shape)

(263, 135)


In [5]:
# Discard rows that have no 'DoT Follicles' value.
has_dot_follicles = mat_scans['DoT Follicles'].notna()
mat_scans = mat_scans.loc[has_dot_follicles]
print(mat_scans.shape)

(263, 135)


In [6]:
# Number of remaining rows per clinic, most frequent first.
clinic_counts = mat_scans['Clinic'].value_counts()
clinic_counts

Clinic
UK4    28
UK3    27
UK9    26
UK8    25
UK6    24
UK2    23
UK7    23
UK5    22
P1     22
P2     22
UK1    21
Name: count, dtype: int64

In [7]:

# Build the modelling arrays from the filtered scans.

# Features: the '6_mm' ... '26_mm' columns, in ascending order.
size_columns = [f'{mm}_mm' for mm in range(6, 27)]
X = mat_scans[size_columns]

# Target: 'No. Mature Eggs' as a flat array, transformed to log(count + 1).
y = np.log(mat_scans['No. Mature Eggs'].to_numpy() + 1)

# Clinic label of every row as a flat array.
by_clinic = mat_scans['Clinic'].to_numpy()

In [8]:
# Sanity check: (rows, columns) of the feature frame built from the selected `*_mm` columns.
X.shape

(263, 21)

In [9]:
# Sanity check: the flattened, log-transformed target should have one entry per row of X.
y.shape

(263,)

In [10]:
# Sanity check: one clinic label per row; this array is passed as `groups` to the CV splitters.
by_clinic.shape

(263,)

In [11]:

# Search space passed to BayesSearchCV as `search_spaces`.
# Numeric pairs are (low, high) bounds; the tuple of strings lists the
# candidate loss functions.
param_grid = dict(
    max_iter=(500, 5000),
    learning_rate=(0.0001, 0.1),
    l2_regularization=(0.0, 1.0),
    min_samples_leaf=(5, 20),
    loss=('squared_error', 'absolute_error', 'poisson'),
)

# Outer splitter of the nested cross-validation: one group (clinic) is held out per fold.
outer_cv = LeaveOneGroupOut()
splits = outer_cv.get_n_splits(groups=by_clinic)  # number of outer folds

# Accumulators filled once per outer fold by the loop below.
outer_scores, inner_results = [], []

# Placeholders; nothing in this cell appends to them.
shap_values_per_fold, explanation_per_fold, interactions_per_fold = [], [], []

# Row-labelled tables: one row per hyper-parameter / per feature; the loop
# adds columns for every outer fold.
best_params_list = pd.DataFrame(index=list(param_grid))
df_perm_importance = pd.DataFrame(index=X.columns)

# Held-out predictions collected across all outer folds.
pred_vals = pd.DataFrame(columns=['Clinic', 'actual', 'pred'])

# Perform nested cross-validation.
# Outer loop: hold out one clinic at a time for testing.
# Inner loop: Bayesian hyper-parameter search on the remaining clinics, again
# leaving one clinic out per inner fold.
RANDOM_STATE = 42  # fixed seed so the search and the permutation shuffles are repeatable
fold_predictions = []  # one frame of held-out predictions per outer fold

for i, (train_index, test_index) in enumerate(outer_cv.split(X, y, by_clinic)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # `y` is on the log(count + 1) scale; keep the held-out targets on that
    # scale under their own name, because the fitted model predicts on it.
    y_train, y_test_log = y[train_index], y[test_index]
    clinic_train, clinic_test = by_clinic[train_index], by_clinic[test_index]

    rf = HistGradientBoostingRegressor(random_state=RANDOM_STATE)

    inner_cv = LeaveOneGroupOut()

    print(X_train.shape)
    print(y_train.shape)

    bayes_opt = BayesSearchCV(
        estimator=rf,
        search_spaces=param_grid,
        n_iter=30,
        n_points=5,
        scoring='neg_mean_absolute_error',
        cv=inner_cv,
        n_jobs=-1,
        refit=True,
        verbose=1,
        return_train_score=True,
        random_state=RANDOM_STATE,
    )
    # `groups` is needed so the inner LeaveOneGroupOut can split by clinic.
    bayes_opt.fit(X=X_train, y=y_train, groups=clinic_train)

    # Best hyper-parameters and the model refitted with them on the full training fold.
    best_params = bayes_opt.best_params_
    best_model = bayes_opt.best_estimator_

    # Store the best parameters; the Series is aligned on the parameter-name index.
    best_params_list[f'best_params_run_{i}'] = pd.Series(best_params)
    print(f"best inner score_{i}:", bayes_opt.best_score_)
    # Keep the full inner-search table, tagged with the outer fold it belongs to
    # so the rows stay attributable after concatenation.
    inner_results.append(pd.DataFrame(bayes_opt.cv_results_).assign(outer_fold=i))

    # Back-transform predictions and targets from log(count + 1) to counts,
    # so the metrics below are reported on the original scale.
    y_pred = np.exp(best_model.predict(X_test)) - 1
    y_test = np.exp(y_test_log) - 1

    fold_predictions.append(
        pd.DataFrame({'Clinic': clinic_test, 'actual': y_test, 'pred': y_pred})
    )

    mae = mean_absolute_error(y_test, y_pred)
    print(f'mae_{i}', mae)
    r2 = r2_score(y_test, y_pred)
    medae = median_absolute_error(y_test, y_pred)
    # Take the root explicitly: the `squared=False` keyword is no longer
    # accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # NOTE(review): counts of 0 pass the `> -1` filter used upstream; MAPE is
    # dominated by such rows — confirm this metric is meaningful here.
    mape = mean_absolute_percentage_error(y_test, y_pred)
    max_er = max_error(y_test, y_pred)
    outer_scores.append([mae, r2, medae, mape, rmse, max_er])

    # Permutation importance on the held-out fold. The model outputs
    # log-scale values, so it must be scored against the log-scale targets,
    # not the back-transformed counts.
    perm_result = permutation_importance(
        best_model, X_test, y_test_log, n_repeats=5, random_state=RANDOM_STATE
    )

    # Mean and std of the importances for the current outer fold.
    df_perm_importance[f'mean_run_{i}'] = perm_result.importances_mean
    df_perm_importance[f'std_run_{i}'] = perm_result.importances_std

# Assemble all held-out predictions in one pass instead of growing the frame
# inside the loop.
pred_vals = pd.concat(fold_predictions, ignore_index=True)

# Per-feature average of the fold-wise importance means.
fold_means = df_perm_importance.filter(regex=r'^mean_run_\d+$')
df_perm_importance['final_mean'] = fold_means.mean(axis=1)

# Per-feature combination of the fold-wise standard deviations:
# square root of the mean of their squares.
fold_stds = df_perm_importance.filter(regex=r'^std_run_\d+$')
df_perm_importance['final_sd'] = np.sqrt((fold_stds ** 2).mean(axis=1))

# Stack the inner-search result tables of all outer folds.
inner_results_df = pd.concat(inner_results)

# One row per outer fold, one column per metric (same order as appended in the loop).
column_names = ['MAE', 'R2', 'MedAE', 'MAPE', 'RMSE', 'Max Error']
outer_scores_df = pd.DataFrame(outer_scores, columns=column_names)
print("development done")

def _mean_and_std(fold_values):
    """Return (mean, standard deviation) of one metric across the outer folds.

    Both numbers are rounded to 4 decimal places. The standard deviation is
    ``np.std`` with its default settings.
    """
    return round(np.mean(fold_values), 4), round(np.std(fold_values), 4)


# Turn the per-fold rows into one list per metric.
# Each row of `outer_scores` is [MAE, R2, MedAE, MAPE, RMSE, Max Error].
(mae_values, r2_values, medae_values,
 mape_values, rmse_values, maxerror_values) = (list(metric) for metric in zip(*outer_scores))

mae_mean, mae_std = _mean_and_std(mae_values)
r2_mean, r2_std = _mean_and_std(r2_values)
medae_mean, medae_std = _mean_and_std(medae_values)
mape_mean, mape_std = _mean_and_std(mape_values)
rmse_mean, rmse_std = _mean_and_std(rmse_values)
maxerror_mean, maxerror_std = _mean_and_std(maxerror_values)

# Summary table: one row per metric.
cv_averaged = {
    'Metric': ['MAE', 'R2', 'MedAE', 'MAPE', 'RMSE', 'Max Error'],
    'Mean': [mae_mean, r2_mean, medae_mean, mape_mean, rmse_mean, maxerror_mean],
    'Standard Deviation': [mae_std, r2_std, medae_std, mape_std, rmse_std, maxerror_std],
}
final_cv_scores = pd.DataFrame(cv_averaged)

for label, value in (
    ("MAE Mean:", mae_mean),
    ("MAE Standard Deviation:", mae_std),
    ("R2 Mean:", r2_mean),
    ("R2 Standard Deviation:", r2_std),
):
    print(label, value)

# Write every result table to a CSV file inside `folder_name`.
# (`os` and `datetime` come from the notebook's import cell.)

# Create the output folder in the current directory if it does not exist yet.
folder_name = "MII"
os.makedirs(folder_name, exist_ok=True)
model_name = "MII_LEQ35"
# Timestamp in UK order (DD-MM-YYYY HH-MM); it prefixes every file name.
current_date = datetime.now().strftime("%d-%m-%Y %H-%M")

# File-name suffix -> DataFrame to save.
csv_file_paths = {
    "Importances": df_perm_importance,
    "Metrics": final_cv_scores,
    "BestParams": best_params_list,
    "InnerRuns": inner_results_df,
    "OuterScores": outer_scores_df,
    "Predictions": pred_vals
}

# These tables keep their row labels in the index (feature names and
# hyper-parameter names respectively). Writing them with index=False would
# leave rows that cannot be attributed, so the index is written as a named
# first column instead.
index_labels = {"Importances": "feature", "BestParams": "parameter"}

for name, df in csv_file_paths.items():
    file_name = f"{current_date}_HGBR_{model_name}_{name}.csv"
    csv_file_path = os.path.join(folder_name, file_name)
    if name in index_labels:
        df.to_csv(csv_file_path, index=True, index_label=index_labels[name])
    else:
        df.to_csv(csv_file_path, index=False)

print("Files saved to:", folder_name)


(500, 21)
(500,)
Fitting 10 folds for each of 5 candidates, totalling 50 fits
