In [1]:
%pip install optuna

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.5.0


In [2]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [8]:
import optuna
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

# # Load the training data
# df = pd.read_csv('train_final3_updatedBMI.csv')

# # Split the data into features (X) and target variable (y)
# X = df.drop(['DiagPeriodL90D'], axis=1)
# y = df['DiagPeriodL90D']

# # Split the original training data into training and holdout sets
# X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')
# Select only the desired features
selected_features = ['patient_id', 'patient_zip3','patient_age',
                       'veteran','PM25', 'N02',
                      'affected_site','race_black','race_white','breast_cancer_diagnosis_code_encoded',
                      'bmi']



# Include the target variable in the selected features
selected_features.append('DiagPeriodL90D')

# Subset the DataFrame with selected features
df_selected = df[selected_features]

# Split the data into features (X) and target variable (y)
X = df_selected.drop(['DiagPeriodL90D'], axis=1)
y = df_selected['DiagPeriodL90D']

# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Define the objective function for hyperparameter tuning
def objective(trial):
    # Define hyperparameters to be optimized for each model
    lgbm_params = {
        'n_estimators': trial.suggest_int('lgbm_n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('lgbm_learning_rate', 0.001, 0.1),
        # Add other LGBM hyperparameters here
    }

    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.001, 0.1),
        # Add other XGB hyperparameters here
    }

    gb_params = {
        'n_estimators': trial.suggest_int('gb_n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('gb_learning_rate', 0.001, 0.1),
        # Add other GradientBoosting hyperparameters here
    }

    ada_params = {
        'n_estimators': trial.suggest_int('ada_n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('ada_learning_rate', 0.001, 0.1),
        # Add other AdaBoost hyperparameters here
    }

    catboost_params = {
        'iterations': trial.suggest_int('catboost_iterations', 50, 500),
        'depth': trial.suggest_int('catboost_depth', 4, 10),
        'learning_rate': trial.suggest_float('catboost_learning_rate', 0.001, 0.1),
        # Add other CatBoost hyperparameters here
    }

    # Train individual models on the training set
    lgbm_model = LGBMClassifier(random_state=42, **lgbm_params)
    xgb_model = XGBClassifier(random_state=42, booster='gbtree', **xgb_params)
    gb_model = GradientBoostingClassifier(random_state=42, **gb_params)
    ada_model = AdaBoostClassifier(random_state=42, **ada_params)
    catboost_model = CatBoostClassifier(**catboost_params)

    lgbm_model.fit(X_train, y_train)
    xgb_model.fit(X_train, y_train)
    gb_model.fit(X_train, y_train)
    ada_model.fit(X_train, y_train)
    catboost_model.fit(X_train, y_train)

    # Generate predictions on the holdout set
    proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
    proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
    proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
    proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]
    proba_catboost_holdout = catboost_model.predict_proba(X_holdout)[:, 1]

    # Combine predictions into a DataFrame
    ensemble_predictions_holdout = pd.DataFrame({
        'LGBM': proba_lgbm_holdout,
        'XGB': proba_xgb_holdout,
        'GB': proba_gb_holdout,
        'ADA': proba_ada_holdout,
        'CatBoost': proba_catboost_holdout
    })

    # Train Voting Classifier as a meta-learner on the holdout set
    voting_clf = VotingClassifier(
        estimators=[
            ('lgbm', lgbm_model),
            ('xgb', xgb_model),
            ('gb', gb_model),
            ('ada', ada_model),
            ('catboost', catboost_model)
        ],
        voting='soft'
    )
    voting_clf.fit(ensemble_predictions_holdout, y_holdout)

    # Evaluate accuracy of the Voting Classifier on the holdout set
    holdout_pred = voting_clf.predict(ensemble_predictions_holdout)
    accuracy_meta = accuracy_score(y_holdout, holdout_pred)

    return accuracy_meta

# Set up Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1)  # You can adjust the number of trials

# Get the best hyperparameters from the study
best_params = study.best_params
print("Best Hyperparameters:", best_params)




[I 2024-02-25 07:50:34,250] A new study created in memory with name: no-name-fd9d088a-4158-4cf6-86bd-d3d06e1bdaee


[LightGBM] [Info] Number of positive: 6443, number of negative: 3881
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2123
[LightGBM] [Info] Number of data points in the train set: 10324, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624080 -> initscore=0.506901
[LightGBM] [Info] Start training from score 0.506901
0:	learn: 0.6599077	total: 4.63ms	remaining: 1.89s
1:	learn: 0.6321000	total: 9.04ms	remaining: 1.83s
2:	learn: 0.6086182	total: 12.6ms	remaining: 1.71s
3:	learn: 0.5891891	total: 16.5ms	remaining: 1.67s
4:	learn: 0.5724245	total: 20.3ms	remaining: 1.64s
5:	learn: 0.5580396	total: 24.2ms	remaining: 1.62s
6:	learn: 0.5459520	total: 27.9ms	remaining: 1.6s
7:	learn: 0.5354415	total: 31.8ms	remaining: 1.59s
8:	learn: 0.5264582	total: 35.5ms	remai

[I 2024-02-25 07:50:52,612] Trial 0 finished with value: 0.8272656855151046 and parameters: {'lgbm_n_estimators': 409, 'lgbm_learning_rate': 0.027525896092321804, 'xgb_n_estimators': 63, 'xgb_learning_rate': 0.004467184629502893, 'gb_n_estimators': 356, 'gb_learning_rate': 0.09411491102883605, 'ada_n_estimators': 280, 'ada_learning_rate': 0.0019238552520624333, 'catboost_iterations': 408, 'catboost_depth': 4, 'catboost_learning_rate': 0.07005931841345807}. Best is trial 0 with value: 0.8272656855151046.


Best Hyperparameters: {'lgbm_n_estimators': 409, 'lgbm_learning_rate': 0.027525896092321804, 'xgb_n_estimators': 63, 'xgb_learning_rate': 0.004467184629502893, 'gb_n_estimators': 356, 'gb_learning_rate': 0.09411491102883605, 'ada_n_estimators': 280, 'ada_learning_rate': 0.0019238552520624333, 'catboost_iterations': 408, 'catboost_depth': 4, 'catboost_learning_rate': 0.07005931841345807}


In [10]:
# Load the test data
# test_data = pd.read_csv('test_final3_updatedBMI.csv')
test_data = pd.read_csv('test_final3_updatedBMI.csv')

# Subset the test data with selected features
test_data = test_data[selected_features[:-1]]  # Exclude target variable

# Train models with best hyperparameters
best_lgbm_model = LGBMClassifier(**{k[5:]: v for k, v in best_params.items() if k.startswith('lgbm_')})
best_xgb_model = XGBClassifier(**{k[4:]: v for k, v in best_params.items() if k.startswith('xgb_')})
best_gb_model = GradientBoostingClassifier(**{k[3:]: v for k, v in best_params.items() if k.startswith('gb_')})
best_ada_model = AdaBoostClassifier(**{k[4:]: v for k, v in best_params.items() if k.startswith('ada_')})
best_catboost_model = CatBoostClassifier(**{k[9:]: v for k, v in best_params.items() if k.startswith('catboost_')})

# Fit models on the entire training data
best_lgbm_model.fit(X, y)
best_xgb_model.fit(X, y)
best_gb_model.fit(X, y)
best_ada_model.fit(X, y)
best_catboost_model.fit(X, y)

# Generate predictions on the test data
proba_lgbm_test = best_lgbm_model.predict_proba(test_data)[:, 1]
proba_xgb_test = best_xgb_model.predict_proba(test_data)[:, 1]
proba_gb_test = best_gb_model.predict_proba(test_data)[:, 1]
proba_ada_test = best_ada_model.predict_proba(test_data)[:, 1]
proba_catboost_test = best_catboost_model.predict_proba(test_data)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test,
    'CatBoost': proba_catboost_test
})

# Train Voting Classifier as a meta-learner on the entire training data
voting_clf = VotingClassifier(
    estimators=[
        ('lgbm', best_lgbm_model),
        ('xgb', best_xgb_model),
        ('gb', best_gb_model),
        ('ada', best_ada_model),
        ('catboost', best_catboost_model)
    ],
    voting='soft'
)
voting_clf.fit(X, y)



[LightGBM] [Info] Number of positive: 8060, number of negative: 4846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2123
[LightGBM] [Info] Number of data points in the train set: 12906, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624516 -> initscore=0.508760
[LightGBM] [Info] Start training from score 0.508760
0:	learn: 0.6599533	total: 4.34ms	remaining: 1.77s
1:	learn: 0.6319400	total: 8.28ms	remaining: 1.68s
2:	learn: 0.6082326	total: 12.2ms	remaining: 1.65s
3:	learn: 0.5883276	total: 16.1ms	remaining: 1.62s
4:	learn: 0.5714351	total: 20.3ms	remaining: 1.63s
5:	learn: 0.5569739	total: 24.2ms	remaining: 1.62s
6:	learn: 0.5446151	total: 28.4ms	remaining: 1.63s
7:	learn: 0.5344738	total: 32.5ms	remaining: 1.63s
8:	learn: 0.5254935	total: 37ms	remain

In [11]:

# Make final predictions using the Voting Classifier as meta-learner
# test_predictions = voting_clf.predict(ensemble_predictions_test)
test_predictions = voting_clf.predict(test_data)

# Round off the probabilities to 1 decimal place
rounded_prob_predictions = [round(prob, 1) for prob in test_predictions]

# Assuming 'patient_id' column is present in the test data
patient_ids = test_data['patient_id']

# Ensure that the number of patient IDs matches the number of probability predictions
assert len(patient_ids) == len(rounded_prob_predictions), "Number of patient IDs does not match number of probability predictions"

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': patient_ids,
    'DiagPeriodL90D': rounded_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_optuna.csv', index=False)

from google.colab import files

# Download the file
files.download('submission_optuna.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>