In [1]:
# Feature Selection

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

train_data = pd.read_csv("train_final3_updatedBMI.csv")


# Split the data into features (X) and target variable (y)
X = train_data.drop(columns=["DiagPeriodL90D"])
y = train_data["DiagPeriodL90D"]
X = X.dropna()
y = y[X.index]  # Update target variable accordingly

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a base classifier
base_classifier = RandomForestClassifier()

# Create the RFE model and select 10 features
n_features_to_select = 10
rfe = RFE(estimator=base_classifier, n_features_to_select=n_features_to_select)
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Get the names of the selected features
selected_feature_names = X_train.columns[rfe.support_]

# Create a DataFrame with selected features
selected_features_df = pd.DataFrame(X_train_rfe, columns=selected_feature_names)

# Print the names of selected features
print("Selected Features:")
print(selected_feature_names)

Selected Features:
Index(['patient_id', 'patient_age', 'age_10_to_19', 'widowed',
       'education_highschool', 'unemployment_rate', 'commute_time',
       'affected_site', 'breast_cancer_diagnosis_code_encoded', 'bmi'],
      dtype='object')


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')
# Select only the desired features
selected_features = ['patient_id', 'patient_zip3','patient_age',
                       'disabled','Ozone','PM25', 'N02',
                      'affected_site','patient_race_filled_encoded','breast_cancer_diagnosis_code_encoded',
                      'bmi']



# Include the target variable in the selected features
selected_features.append('DiagPeriodL90D')

# Subset the DataFrame with selected features
df_selected = df[selected_features]

# Split the data into features (X) and target variable (y)
X = df_selected.drop(['DiagPeriodL90D'], axis=1)
y = df_selected['DiagPeriodL90D']

# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Create individual models
lgbm_params = {'n_estimators': [9, 10, 11], 'max_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X, y)
xgb_grid.fit(X, y)
gb_grid.fit(X, y)
ada_grid.fit(X, y)

lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Train individual models on the training set
lgbm_model.fit(X, y)
xgb_model.fit(X, y)
gb_model.fit(X, y)
ada_model.fit(X, y)

# Generate predictions on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})
meta_learner = CatBoostClassifier(random_state=42)
# Train CatBoostClassifier as a meta-learner on the holdout set
meta_learner.fit(ensemble_predictions_holdout, y_holdout)
holdout_pred = meta_learner.predict(ensemble_predictions_holdout)
# Evaluate accuracy of the meta-learner
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (CatBoost) Accuracy: {accuracy_meta}')

test_data = pd.read_csv('test_final3_updatedBMI.csv')

# Subset the test data with selected features
test_data = test_data[selected_features[:-1]]  # Exclude target variable

# Clean feature names in test data (if needed)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Now, generate predictions on your test set using the trained models
proba_lgbm_test = lgbm_model.predict_proba(test_data)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_data)[:, 1]
proba_gb_test = gb_model.predict_proba(test_data)[:, 1]
proba_ada_test = ada_model.predict_proba(test_data)[:, 1]

# Combine test set predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate predictions from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# Round off the probabilities to 1 decimal place
rounded_prob_predictions = [round(prob, 1) for prob in test_prob_predictions]
# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': rounded_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_ift2_bmi.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_ift2_bmi.csv')

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score


# Evaluate the performance on the holdout set
holdout_pred_proba = meta_learner.predict_proba(ensemble_predictions_holdout)[:, 1]
holdout_pred_binary = (holdout_pred_proba > 0.5).astype(int)

# Accuracy
accuracy_meta = accuracy_score(y_holdout, holdout_pred_binary)
print(f'Meta-Learner (CatBoost) Accuracy: {accuracy_meta}')

# Precision, Recall, F1-Score
precision_meta = precision_score(y_holdout, holdout_pred_binary)
recall_meta = recall_score(y_holdout, holdout_pred_binary)
f1_meta = f1_score(y_holdout, holdout_pred_binary)

print(f'Meta-Learner (CatBoost) Precision: {precision_meta}')
print(f'Meta-Learner (CatBoost) Recall: {recall_meta}')
print(f'Meta-Learner (CatBoost) F1-Score: {f1_meta}')

# AUC-ROC
roc_auc_meta = roc_auc_score(y_holdout, holdout_pred_proba)
print(f'Meta-Learner (CatBoost) AUC-ROC: {roc_auc_meta}')

# AUC-PR
average_precision_meta = average_precision_score(y_holdout, holdout_pred_proba)
print(f'Meta-Learner (CatBoost) AUC-PR: {average_precision_meta}')


[LightGBM] [Info] Number of positive: 5373, number of negative: 3231
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000841 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1873
[LightGBM] [Info] Number of data points in the train set: 8604, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624477 -> initscore=0.508595
[LightGBM] [Info] Start training from score 0.508595
[LightGBM] [Info] Number of positive: 5373, number of negative: 3231
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1868
[LightGBM] [Info] Number of data points in the train set: 8604, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624477 -> initscore=0.508595
[LightGBM] 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Meta-Learner (CatBoost) Accuracy: 0.8741285824941906
Meta-Learner (CatBoost) Precision: 0.8569060773480663
Meta-Learner (CatBoost) Recall: 0.9591836734693877
Meta-Learner (CatBoost) F1-Score: 0.9051648672308141
Meta-Learner (CatBoost) AUC-ROC: 0.9445477295958421
Meta-Learner (CatBoost) AUC-PR: 0.9644715494568741


In [5]:
!pip install catboost



In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from catboost import CatBoostClassifier
import re

# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')
# Select only the desired features
selected_features = ['patient_id', 'patient_zip3','patient_age',
                       'veteran','PM25', 'N02',
                      'affected_site','race_black','race_white','breast_cancer_diagnosis_code_encoded',
                      'bmi']



# Include the target variable in the selected features
selected_features.append('DiagPeriodL90D')

# Subset the DataFrame with selected features
df_selected = df[selected_features]

# Split the data into features (X) and target variable (y)
X = df_selected.drop(['DiagPeriodL90D'], axis=1)
y = df_selected['DiagPeriodL90D']

# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Create individual models
lgbm_params = {'n_estimators': [9, 10, 11], 'max_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Train individual models on the training set
lgbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
ada_model.fit(X_train, y_train)

# Generate predictions on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})
meta_learner = CatBoostClassifier(random_state=42)
# Train CatBoostClassifier as a meta-learner on the holdout set
meta_learner.fit(ensemble_predictions_holdout, y_holdout)
holdout_pred = meta_learner.predict(ensemble_predictions_holdout)
# Evaluate accuracy of the meta-learner
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (CatBoost) Accuracy: {accuracy_meta}')

test_data = pd.read_csv('test_final3_updatedBMI.csv')

# Subset the test data with selected features
test_data = test_data[selected_features[:-1]]  # Exclude target variable

# Clean feature names in test data (if needed)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Now, generate predictions on your test set using the trained models
proba_lgbm_test = lgbm_model.predict_proba(test_data)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_data)[:, 1]
proba_gb_test = gb_model.predict_proba(test_data)[:, 1]
proba_ada_test = ada_model.predict_proba(test_data)[:, 1]

# Combine test set predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate predictions from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# # Round off the probabilities to 1 decimal place
# rounded_prob_predictions = [round(prob, 1) for prob in test_prob_predictions]
# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_sfnr1_bmi.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_sfnr1_bmi.csv')

# Evaluate the performance on the holdout set
holdout_pred_proba = meta_learner.predict_proba(ensemble_predictions_holdout)[:, 1]
holdout_pred_binary = (holdout_pred_proba > 0.5).astype(int)

# Accuracy
accuracy_meta = accuracy_score(y_holdout, holdout_pred_binary)
print(f'Meta-Learner (CatBoost) Accuracy: {accuracy_meta}')

# Precision, Recall, F1-Score
precision_meta = precision_score(y_holdout, holdout_pred_binary)
recall_meta = recall_score(y_holdout, holdout_pred_binary)
f1_meta = f1_score(y_holdout, holdout_pred_binary)

print(f'Meta-Learner (CatBoost) Precision: {precision_meta}')
print(f'Meta-Learner (CatBoost) Recall: {recall_meta}')
print(f'Meta-Learner (CatBoost) F1-Score: {f1_meta}')

# AUC-ROC
roc_auc_meta = roc_auc_score(y_holdout, holdout_pred_proba)
print(f'Meta-Learner (CatBoost) AUC-ROC: {roc_auc_meta}')

# AUC-PR
average_precision_meta = average_precision_score(y_holdout, holdout_pred_proba)
print(f'Meta-Learner (CatBoost) AUC-PR: {average_precision_meta}')

[LightGBM] [Info] Number of positive: 4295, number of negative: 2587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001324 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2110
[LightGBM] [Info] Number of data points in the train set: 6882, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624092 -> initscore=0.506953
[LightGBM] [Info] Start training from score 0.506953
[LightGBM] [Info] Number of positive: 4296, number of negative: 2587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2108
[LightGBM] [Info] Number of data points in the train set: 6883, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624146 -> initscore=0.507185
[LightGBM] [Info] Start training from score 0.507185
[LightGBM] [Info] Numb

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Meta-Learner (CatBoost) Accuracy: 0.8276529821843532
Meta-Learner (CatBoost) Precision: 0.8008213552361396
Meta-Learner (CatBoost) Recall: 0.9647495361781077
Meta-Learner (CatBoost) F1-Score: 0.8751753155680225
Meta-Learner (CatBoost) AUC-ROC: 0.9396175992771107
Meta-Learner (CatBoost) AUC-PR: 0.9610840684709366


In [9]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')
selected_features = ['patient_id', 'patient_zip3','patient_age',
                       'PM25', 'N02',
                      'affected_site','race_black','race_white','breast_cancer_diagnosis_code_encoded',
                      'bmi']



# Include the target variable in the selected features
selected_features.append('DiagPeriodL90D')

# Subset the DataFrame with selected features
df_selected = df[selected_features]

# Split the data into features (X) and target variable (y)
X = df_selected.drop(['DiagPeriodL90D'], axis=1)
y = df_selected['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Create individual models
lgbm_params = {'n_estimators': [9, 10, 11], 'max_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Train individual models on the training set
lgbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
ada_model.fit(X_train, y_train)

# Generate predictions on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})
# Train CatBoostClassifier as a meta-learner
meta_learner = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, random_seed=42)

# Train CatBoostClassifier as a meta-learner on the holdout set
meta_learner.fit(ensemble_predictions_holdout, y_holdout)
holdout_pred = meta_learner.predict(ensemble_predictions_holdout)
# Evaluate accuracy of the meta-learner
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (CatBoost) Accuracy: {accuracy_meta}')

test_data = pd.read_csv('test_final3_updatedBMI.csv')
# Subset the test data with selected features
test_data = test_data[selected_features[:-1]]  # Exclude target variable

# Clean feature names in test data (if needed)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Now, generate predictions on your test set using the trained models
proba_lgbm_test = lgbm_model.predict_proba(test_data)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_data)[:, 1]
proba_gb_test = gb_model.predict_proba(test_data)[:, 1]
proba_ada_test = ada_model.predict_proba(test_data)[:, 1]

# Combine test set predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate predictions from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# # Round off the probabilities to 1 decimal place
# rounded_prob_predictions = [round(prob, 1) for prob in test_prob_predictions]
# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_sfnr_bmi.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_sfnr_bmi.csv')


[LightGBM] [Info] Number of positive: 4295, number of negative: 2587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1866
[LightGBM] [Info] Number of data points in the train set: 6882, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624092 -> initscore=0.506953
[LightGBM] [Info] Start training from score 0.506953
[LightGBM] [Info] Number of positive: 4296, number of negative: 2587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001326 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1863
[LightGBM] [Info] Number of data points in the train set: 6883, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624146 -> initscore=0.507185
[LightGBM] [Info] Start training from score 0.507185
[LightGBM] [Info] Numb

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
!pip install catboost



In [11]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')

# Split the data into features (X) and target variable (y)
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Create individual models
cat_params = {'iterations': [100, 150, 200], 'depth': [6, 7, 8], 'learning_rate': [0.1, 0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

cat_grid = GridSearchCV(CatBoostClassifier(random_state=42), cat_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

cat_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

cat_model = cat_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Train individual models on the training set
cat_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
ada_model.fit(X_train, y_train)

# Generate predictions on the holdout set
proba_cat_holdout = cat_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_holdout = pd.DataFrame({
    'CatBoost': proba_cat_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})
# Train LightGBM as a meta-learner
meta_learner = LGBMClassifier(n_estimators=100, max_leaves=31, learning_rate=0.1, random_state=42)

# Train LightGBM as a meta-learner on the holdout set
meta_learner.fit(ensemble_predictions_holdout, y_holdout)
holdout_pred = meta_learner.predict(ensemble_predictions_holdout)
# Evaluate accuracy of the meta-learner
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (LightGBM) Accuracy: {accuracy_meta}')

test_data = pd.read_csv('test_final3_updatedBMI.csv')

# Clean feature names in test data (if needed)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Now, generate predictions on your test set using the trained models
proba_cat_test = cat_model.predict_proba(test_data)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_data)[:, 1]
proba_gb_test = gb_model.predict_proba(test_data)[:, 1]
proba_ada_test = ada_model.predict_proba(test_data)[:, 1]

# Combine test set predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'CatBoost': proba_cat_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate predictions from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# # Round off the probabilities to 1 decimal place
# rounded_prob_predictions = [round(prob, 1) for prob in test_prob_predictions]
# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrml3_bmi.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_pnrml3_bmi.csv')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
62:	learn: 0.3572428	total: 2.21s	remaining: 4.8s
63:	learn: 0.3553671	total: 2.24s	remaining: 4.76s
64:	learn: 0.3530737	total: 2.28s	remaining: 4.74s
65:	learn: 0.3509757	total: 2.31s	remaining: 4.7s
66:	learn: 0.3491192	total: 2.35s	remaining: 4.66s
67:	learn: 0.3474263	total: 2.38s	remaining: 4.62s
68:	learn: 0.3452322	total: 2.41s	remaining: 4.58s
69:	learn: 0.3427775	total: 2.45s	remaining: 4.54s
70:	learn: 0.3407445	total: 2.48s	remaining: 4.5s
71:	learn: 0.3395566	total: 2.52s	remaining: 4.48s
72:	learn: 0.3376705	total: 2.55s	remaining: 4.44s
73:	learn: 0.3356787	total: 2.58s	remaining: 4.4s
74:	learn: 0.3340031	total: 2.62s	remaining: 4.36s
75:	learn: 0.3331435	total: 2.65s	remaining: 4.32s
76:	learn: 0.3319036	total: 2.69s	remaining: 4.29s
77:	learn: 0.3305343	total: 2.73s	remaining: 4.27s
78:	learn: 0.3282363	total: 2.79s	remaining: 4.27s
79:	learn: 0.3265275	total: 2.85s	remaining: 4.27s
80:	learn: 0.3254529	

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier  # Change: Import CatBoostClassifier instead of XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import re

# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')

# Split the data into features (X) and target variable (y)
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Create individual models
lgbm_params = {'n_estimators': [9, 10, 11], 'max_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
catboost_params = {'iterations': [50, 100, 150], 'depth': [6, 7, 8], 'learning_rate': [0.1, 0.2, 0.3]}  # Change: CatBoost parameters
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
catboost_grid = GridSearchCV(CatBoostClassifier(random_state=42), catboost_params, cv=3)  # Change: Use CatBoostClassifier
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X_train, y_train)
catboost_grid.fit(X_train, y_train)  # Change: Fit CatBoost model
gb_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

lgbm_model = lgbm_grid.best_estimator_
catboost_model = catboost_grid.best_estimator_  # Change: Use CatBoost model
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Train individual models on the training set
lgbm_model.fit(X_train, y_train)
catboost_model.fit(X_train, y_train)  # Change: Fit CatBoost model
gb_model.fit(X_train, y_train)
ada_model.fit(X_train, y_train)

# Generate predictions on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_catboost_holdout = catboost_model.predict_proba(X_holdout)[:, 1]  # Change: Use CatBoost model
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'CatBoost': proba_catboost_holdout,  # Change: Use CatBoost model
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})

# Train XGBClassifier as a meta-learner
meta_learner = XGBClassifier(iterations=100, depth=6, learning_rate=0.1, random_seed=42)

# Train CatBoostClassifier as a meta-learner on the holdout set
meta_learner.fit(ensemble_predictions_holdout, y_holdout)
holdout_pred = meta_learner.predict(ensemble_predictions_holdout)
# Evaluate accuracy of the meta-learner
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (CatBoost) Accuracy: {accuracy_meta}')

test_data = pd.read_csv('test_final3_updatedBMI.csv')

# Clean feature names in test data (if needed)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Now, generate predictions on your test set using the trained models
proba_lgbm_test = lgbm_model.predict_proba(test_data)[:, 1]
proba_catboost_test = catboost_model.predict_proba(test_data)[:, 1]  # Change: Use CatBoost model
proba_gb_test = gb_model.predict_proba(test_data)[:, 1]
proba_ada_test = ada_model.predict_proba(test_data)[:, 1]

# Combine test set predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'CatBoost': proba_catboost_test,  # Change: Use CatBoost model
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate predictions from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# # Round off the probabilities to 1 decimal place
# rounded_prob_predictions = [round(prob, 1) for prob in test_prob_predictions]
# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrmx3_bmi.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_pnrmx3_bmi.csv')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
62:	learn: 0.4034994	total: 2.21s	remaining: 1.29s
63:	learn: 0.4026709	total: 2.24s	remaining: 1.26s
64:	learn: 0.4013702	total: 2.27s	remaining: 1.22s
65:	learn: 0.4001525	total: 2.3s	remaining: 1.19s
66:	learn: 0.3989121	total: 2.34s	remaining: 1.15s
67:	learn: 0.3980049	total: 2.38s	remaining: 1.12s
68:	learn: 0.3970373	total: 2.41s	remaining: 1.08s
69:	learn: 0.3960341	total: 2.44s	remaining: 1.05s
70:	learn: 0.3954070	total: 2.48s	remaining: 1.01s
71:	learn: 0.3945769	total: 2.51s	remaining: 976ms
72:	learn: 0.3937540	total: 2.55s	remaining: 942ms
73:	learn: 0.3932725	total: 2.58s	remaining: 906ms
74:	learn: 0.3921079	total: 2.62s	remaining: 872ms
75:	learn: 0.3917542	total: 2.65s	remaining: 836ms
76:	learn: 0.3903706	total: 2.68s	remaining: 802ms
77:	learn: 0.3892827	total: 2.72s	remaining: 767ms
78:	learn: 0.3886461	total: 2.75s	remaining: 732ms
79:	learn: 0.3879585	total: 2.79s	remaining: 698ms
80:	learn: 0.38732

Parameters: { "depth", "iterations", "random_seed" } are not used.





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')

# Split the data into features (X) and target variable (y)
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Create individual models
lgbm_params = {'n_estimators': [9, 10, 11], 'max_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
cat_params = {'iterations': [100, 150, 200], 'depth': [6, 7, 8], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
cat_grid = GridSearchCV(CatBoostClassifier(random_state=42), cat_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
cat_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
cat_model = cat_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Train individual models on the training set
lgbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)
ada_model.fit(X_train, y_train)

# Generate predictions on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_cat_holdout = cat_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'CAT': proba_cat_holdout,
    'ADA': proba_ada_holdout
})

# Train GradientBoostingClassifier as a meta-learner
meta_learner = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train GradientBoostingClassifier as a meta-learner on the holdout set
meta_learner.fit(ensemble_predictions_holdout, y_holdout)
holdout_pred = meta_learner.predict(ensemble_predictions_holdout)
# Evaluate accuracy of the meta-learner
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (GradientBoosting) Accuracy: {accuracy_meta}')

test_data = pd.read_csv('test_final3_updatedBMI.csv')

# Clean feature names in test data (if needed)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Now, generate predictions on your test set using the trained models
proba_lgbm_test = lgbm_model.predict_proba(test_data)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_data)[:, 1]
proba_cat_test = cat_model.predict_proba(test_data)[:, 1]
proba_ada_test = ada_model.predict_proba(test_data)[:, 1]

# Combine test set predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'CAT': proba_cat_test,
    'ADA': proba_ada_test
})

# Generate predictions from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrmg3_bmi.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_pnrmg3_bmi.csv')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
63:	learn: 0.3553671	total: 2.21s	remaining: 4.69s
64:	learn: 0.3530737	total: 2.24s	remaining: 4.66s
65:	learn: 0.3509757	total: 2.28s	remaining: 4.63s
66:	learn: 0.3491192	total: 2.32s	remaining: 4.6s
67:	learn: 0.3474263	total: 2.35s	remaining: 4.56s
68:	learn: 0.3452322	total: 2.38s	remaining: 4.52s
69:	learn: 0.3427775	total: 2.41s	remaining: 4.48s
70:	learn: 0.3407445	total: 2.45s	remaining: 4.45s
71:	learn: 0.3395566	total: 2.48s	remaining: 4.41s
72:	learn: 0.3376705	total: 2.52s	remaining: 4.38s
73:	learn: 0.3356787	total: 2.55s	remaining: 4.34s
74:	learn: 0.3340031	total: 2.58s	remaining: 4.31s
75:	learn: 0.3331435	total: 2.63s	remaining: 4.29s
76:	learn: 0.3319036	total: 2.69s	remaining: 4.29s
77:	learn: 0.3305343	total: 2.75s	remaining: 4.31s
78:	learn: 0.3282363	total: 2.82s	remaining: 4.32s
79:	learn: 0.3265275	total: 2.88s	remaining: 4.32s
80:	learn: 0.3254529	total: 2.94s	remaining: 4.32s
81:	learn: 0.32445

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')

# Split the data into features (X) and target variable (y)
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# Create individual models
lgbm_params = {'n_estimators': [9, 10, 11], 'max_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
cat_params = {'iterations': [50, 100, 150], 'depth': [6, 7, 8], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
cat_grid = GridSearchCV(CatBoostClassifier(random_state=42), cat_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
cat_grid.fit(X_train, y_train)

lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
cat_model = cat_grid.best_estimator_

# Train individual models on the training set
lgbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)

# Generate predictions on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_cat_holdout = cat_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'CAT': proba_cat_holdout
})
# Train AdaBoostClassifier as a meta-learner
meta_learner = AdaBoostClassifier(n_estimators=50, learning_rate=0.1, random_state=42)

# Train AdaBoostClassifier as a meta-learner on the holdout set
meta_learner.fit(ensemble_predictions_holdout, y_holdout)
holdout_pred = meta_learner.predict(ensemble_predictions_holdout)
# Evaluate accuracy of the meta-learner
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (AdaBoost) Accuracy: {accuracy_meta}')

test_data = pd.read_csv('test_final3_updatedBMI.csv')

# Clean feature names in test data (if needed)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Now, generate predictions on your test set using the trained models
proba_lgbm_test = lgbm_model.predict_proba(test_data)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_data)[:, 1]
proba_gb_test = gb_model.predict_proba(test_data)[:, 1]
proba_cat_test = cat_model.predict_proba(test_data)[:, 1]

# Combine test set predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'CAT': proba_cat_test
})

# Generate predictions from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# # Round off the probabilities to 1 decimal place
# rounded_prob_predictions = [round(prob, 1) for prob in test_prob_predictions]
# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrma3_bmi.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_pnrma3_bmi.csv')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
31:	learn: 0.5760352	total: 2.61s	remaining: 5.55s
32:	learn: 0.5736740	total: 2.71s	remaining: 5.5s
33:	learn: 0.5711658	total: 2.8s	remaining: 5.44s
34:	learn: 0.5686615	total: 2.92s	remaining: 5.42s
35:	learn: 0.5663293	total: 3.02s	remaining: 5.37s
36:	learn: 0.5640835	total: 3.13s	remaining: 5.32s
37:	learn: 0.5618891	total: 3.22s	remaining: 5.26s
38:	learn: 0.5597934	total: 3.31s	remaining: 5.18s
39:	learn: 0.5575307	total: 3.42s	remaining: 5.13s
40:	learn: 0.5555024	total: 3.5s	remaining: 5.03s
41:	learn: 0.5534382	total: 3.57s	remaining: 4.93s
42:	learn: 0.5513935	total: 3.65s	remaining: 4.84s
43:	learn: 0.5494286	total: 3.77s	remaining: 4.79s
44:	learn: 0.5474884	total: 3.87s	remaining: 4.74s
45:	learn: 0.5456036	total: 4.04s	remaining: 4.74s
46:	learn: 0.5438271	total: 4.18s	remaining: 4.72s
47:	learn: 0.5421325	total: 4.27s	remaining: 4.62s
48:	learn: 0.5405009	total: 4.34s	remaining: 4.51s
49:	learn: 0.5388395

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')

# Split the data into features (X) and target variable (y)
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Create individual models
lgbm_params = {'n_estimators': [9, 10, 11], 'max_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Train individual models on the training set
lgbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
ada_model.fit(X_train, y_train)

# Generate predictions on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})
# Train CatBoostClassifier as a meta-learner
meta_learner = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, random_seed=42)

# Train CatBoostClassifier as a meta-learner on the holdout set
meta_learner.fit(ensemble_predictions_holdout, y_holdout)
holdout_pred = meta_learner.predict(ensemble_predictions_holdout)
# Evaluate accuracy of the meta-learner
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (CatBoost) Accuracy: {accuracy_meta}')

test_data = pd.read_csv('test_final3_updatedBMI.csv')

# Clean feature names in test data (if needed)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Now, generate predictions on your test set using the trained models
proba_lgbm_test = lgbm_model.predict_proba(test_data)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_data)[:, 1]
proba_gb_test = gb_model.predict_proba(test_data)[:, 1]
proba_ada_test = ada_model.predict_proba(test_data)[:, 1]

# Combine test set predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate predictions from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# # Round off the probabilities to 1 decimal place
# rounded_prob_predictions = [round(prob, 1) for prob in test_prob_predictions]
# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnroc_bmi.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_pnroc_bmi.csv')


[LightGBM] [Info] Number of positive: 4295, number of negative: 2587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17079
[LightGBM] [Info] Number of data points in the train set: 6882, number of used features: 78
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624092 -> initscore=0.506953
[LightGBM] [Info] Start training from score 0.506953
[LightGBM] [Info] Number of positive: 4296, number of negative: 2587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007970 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17026
[LightGBM] [Info] Number of data points in the train set: 6883, number of used features: 78
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624146 -> initscore=0.507185
[LightGBM] [Info] Start training from score 0.507185
[LightGBM] [Info] Nu

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')

# Extract the feature 'breast_cancer_code_encoded' and the target variable
X = df[['breast_cancer_diagnosis_code_encoded','affected_site']]
y = df['DiagPeriodL90D']

# Split the data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# Create individual models
lgbm_params = {'n_estimators': [9, 10, 11], 'max_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Train individual models on the training set
lgbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
ada_model.fit(X_train, y_train)

# Generate predictions on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})

# Train CatBoostClassifier as a meta-learner on the holdout set
meta_learner = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, random_seed=42)
meta_learner.fit(ensemble_predictions_holdout, y_holdout)
holdout_pred = meta_learner.predict(ensemble_predictions_holdout)
# Evaluate accuracy of the meta-learner
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (CatBoost) Accuracy: {accuracy_meta}')

# Generate predictions on the test set using the trained models
test_data = pd.read_csv('test_final3_updatedBMI.csv')
test_data_feature = test_data[['breast_cancer_diagnosis_code_encoded', 'affected_site']]

proba_lgbm_test = lgbm_model.predict_proba(test_data_feature)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_data_feature)[:, 1]
proba_gb_test = gb_model.predict_proba(test_data_feature)[:, 1]
proba_ada_test = ada_model.predict_proba(test_data_feature)[:, 1]

# Combine test set predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate predictions from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrocd_bmi.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_pnrocd_bmi.csv')


[LightGBM] [Info] Number of positive: 4295, number of negative: 2587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51
[LightGBM] [Info] Number of data points in the train set: 6882, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624092 -> initscore=0.506953
[LightGBM] [Info] Start training from score 0.506953
[LightGBM] [Info] Number of positive: 4296, number of negative: 2587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000314 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50
[LightGBM] [Info] Number of data points in the train set: 6883, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624146 -> initscore=0.507185
[LightGBM] [Info] Start training from score 0.507185
[LightGBM] [Info] Number of 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Assuming your DataFrame is named 'df' and the target variable is 'DiagPeriodL90D'
df = pd.read_csv('train_final3_updatedBMI.csv')

# Extract the feature 'breast_cancer_code_encoded' and the target variable
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']

# Split the data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# Create individual models
lgbm_params = {'n_estimators': [9, 10, 11], 'max_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Train individual models on the training set
lgbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
ada_model.fit(X_train, y_train)

# Generate predictions on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})

# Train CatBoostClassifier as a meta-learner on the holdout set
meta_learner = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, random_seed=42)
meta_learner.fit(ensemble_predictions_holdout, y_holdout)
holdout_pred = meta_learner.predict(ensemble_predictions_holdout)
# Evaluate accuracy of the meta-learner
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (CatBoost) Accuracy: {accuracy_meta}')

# Generate predictions on the test set using the trained models
test_data = pd.read_csv('test_final3_updatedBMI.csv')

proba_lgbm_test = lgbm_model.predict_proba(test_data)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_data)[:, 1]
proba_gb_test = gb_model.predict_proba(test_data)[:, 1]
proba_ada_test = ada_model.predict_proba(test_data)[:, 1]

# Combine test set predictions into a DataFrame
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate predictions from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrwe_bmi.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission_pnrwe_bmi.csv')

[LightGBM] [Info] Number of positive: 4295, number of negative: 2587
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006205 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17079
[LightGBM] [Info] Number of data points in the train set: 6882, number of used features: 78
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624092 -> initscore=0.506953
[LightGBM] [Info] Start training from score 0.506953
[LightGBM] [Info] Number of positive: 4296, number of negative: 2587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17026
[LightGBM] [Info] Number of data points in the train set: 6883, number of used features: 78
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624146 -> initscore=0.507185
[LightGBM

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>