In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import catboost as cb
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import optuna

file_path = 'merged_2.csv'

df = pd.read_csv(file_path, encoding='euc-kr')
df = df.dropna()
df.head()

Unnamed: 0,id,antihypertensives,dbp,fasting,glucose,height,pulse,sbp,temp,weight,pat_sex,pat_birth
0,0,0.0,79.0,1.0,100.0,169.8,75.0,123.0,37.1,65.7,F,1971
1,0,0.0,70.0,1.0,106.0,171.1,69.0,109.0,36.4,65.0,F,1971
2,0,0.0,70.0,1.0,106.0,169.6,79.0,109.0,37.1,64.1,F,1971
3,1,0.0,82.0,1.0,100.0,148.7,58.0,140.0,35.9,49.4,F,1959
4,1,1.0,73.0,1.0,100.0,151.3,59.0,108.0,36.2,48.9,F,1959


In [6]:
# df = df[df['glucose'] <= 140]

features = df.drop(['glucose'], axis=1)
targets = df['glucose']

label_encoder = LabelEncoder()
features['pat_sex'] = label_encoder.fit_transform(features['pat_sex'])

# Remove 'id' column
features = features.drop(['id'], axis=1)

scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# Split the training data into train and test sets for optimization
kf = KFold(n_splits=5, shuffle=True, random_state=1)

mse_scores = []
fold_indices = []

for fold_index, (train_index, test_index) in enumerate(kf.split(features)):
    X_train_fold, X_test_fold = features[train_index], features[test_index]
    y_train_fold, y_test_fold = targets.values[train_index], targets.values[test_index]
    
    model = cb.CatBoostRegressor(n_estimators=10000, learning_rate=0.005, max_depth=10, random_state=42, verbose=0)
    model.fit(X_train_fold, y_train_fold)
    
    y_pred_fold = model.predict(X_test_fold)
    mse = mean_squared_error(y_test_fold, y_pred_fold)
    
    mse_scores.append(mse)
    fold_indices.append((train_index, test_index))

print("Mean Squared Error (MSE) for each fold:", [f"{mse:.2f}" for mse in mse_scores])
print("Average MSE:", f"{np.mean(mse_scores):.2f}")

Mean Squared Error (MSE) for each fold: ['393.36', '1025.37', '187.68', '446.93', '751.04']
Average MSE: 560.88


In [7]:
# Find the median MSE fold
median_mse_index = np.argsort(mse_scores)[len(mse_scores) // 2]
selected_fold_train_index, selected_fold_test_index = fold_indices[median_mse_index]

print(f"Optimizing for fold with median MSE, fold index: {median_mse_index}")

X_train_opt = features[selected_fold_train_index]
y_train_opt = targets.values[selected_fold_train_index]
X_test_opt = features[selected_fold_test_index]
y_test_opt = targets.values[selected_fold_test_index]

# Objective function for Optuna
def objective(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 5000, 20000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.01, log=True),
        'depth': trial.suggest_int('depth', 6, 10),
        'random_state': 42,
        'verbose': 0
    }
    
    model = cb.CatBoostRegressor(**param)
    model.fit(X_train_opt, y_train_opt)
    y_pred = model.predict(X_test_opt)
    mse = mean_squared_error(y_test_opt, y_pred)
    return mse

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_params)
print("Optimized MSE:", f"{study.best_value:.2f}")

[I 2024-07-10 14:19:10,570] A new study created in memory with name: no-name-41f6fdfc-b35e-4524-b351-1d9750a89b9b


Optimizing for fold with median MSE, fold index: 3


[I 2024-07-10 14:19:56,971] Trial 0 finished with value: 423.97720869483345 and parameters: {'iterations': 20000, 'learning_rate': 0.0036160624950885662, 'depth': 9}. Best is trial 0 with value: 423.97720869483345.
[I 2024-07-10 14:20:08,551] Trial 1 finished with value: 418.17155648517013 and parameters: {'iterations': 5059, 'learning_rate': 0.007842259171626773, 'depth': 9}. Best is trial 1 with value: 418.17155648517013.
[I 2024-07-10 14:20:25,473] Trial 2 finished with value: 390.54725056332154 and parameters: {'iterations': 19481, 'learning_rate': 0.00399641719075067, 'depth': 7}. Best is trial 2 with value: 390.54725056332154.
[I 2024-07-10 14:20:43,155] Trial 3 finished with value: 416.0410166114171 and parameters: {'iterations': 12713, 'learning_rate': 0.004074381834069926, 'depth': 8}. Best is trial 2 with value: 390.54725056332154.
[I 2024-07-10 14:21:29,674] Trial 4 finished with value: 441.8999019838932 and parameters: {'iterations': 9158, 'learning_rate': 0.007333940062215

Best hyperparameters: {'iterations': 15301, 'learning_rate': 0.009602767848120607, 'depth': 6}
Optimized MSE: 357.34


In [8]:
# Train the best model
best_params = study.best_params
best_model = cb.CatBoostRegressor(**best_params, random_state=42, verbose=0)
best_model.fit(X_train_opt, y_train_opt)

# Predict and evaluate the model
y_pred_opt = best_model.predict(X_test_opt)
mse_opt = mean_squared_error(y_test_opt, y_pred_opt)
print("Final Optimized MSE:", f"{mse_opt:.2f}")

# Load the new data
merged_test_2 = pd.read_csv('merged_test_2.csv')

# Preprocess the new data
merged_test_2['pat_sex'] = label_encoder.transform(merged_test_2['pat_sex'])
merged_test_2 = merged_test_2.drop(['id'], axis=1)
merged_test_2 = scaler.transform(merged_test_2)

# Predict the glucose values
glucose_predictions = best_model.predict(merged_test_2)

# Save the predictions to a new CSV file
predictions_df = pd.DataFrame(glucose_predictions, columns=['glucose'])
predictions_df.to_csv('glucose_predictions.csv', index=False)

print("Predictions saved to glucose_predictions.csv")

Final Optimized MSE: 357.34
Predictions saved to glucose_predictions.csv
