In [None]:
!pip install shap
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
# Baseline model + SHAP explanation.
#
# Trains a single GradientBoostingClassifier on mean-imputed features, reports
# its accuracy on a 20% test split, and computes SHAP values for every row of
# the training CSV. The SHAP values (`shap_values`) are consumed by the plotting
# cells that follow.
import shap
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the training data; the target column is 'DiagPeriodL90D'.
df = pd.read_csv('train_final3.csv')

# Split the data into features (X) and target variable (y)
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Impute missing values with the column mean. The imputer is fitted on the
# training split only, so no statistics leak from the test split.
# Note: fit_transform/transform return NumPy arrays, so X_train and X_test lose
# their column names from here on.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Initialize Gradient Boosting classifier with specified parameters
gb_model = GradientBoostingClassifier(
    random_state=42,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    min_samples_leaf=1
)

# Fit the model to the training data
gb_model.fit(X_train, y_train)

# Generate predictions on the test set
gb_predictions = gb_model.predict(X_test)

# Evaluate accuracy
accuracy_gb = accuracy_score(y_test, gb_predictions)
print(f'Gradient Boosting Accuracy: {accuracy_gb}')

# Explain the model on the same feature space it was trained on.
# The model only ever saw imputed values, so the raw frame X (which may still
# contain NaNs) is first pushed through the fitted imputer. Wrapping the result
# in a DataFrame keeps the column names, which the SHAP plots use as labels.
# get_feature_names_out() is used rather than X.columns because SimpleImputer
# can drop columns that were entirely missing during fit.
X_imputed = pd.DataFrame(
    imputer.transform(X),
    columns=imputer.get_feature_names_out(),
    index=X.index,
)
explainer = shap.Explainer(gb_model, X_imputed)
shap_values = explainer(X_imputed)

In [None]:
# Global view: bar chart summarising the SHAP values of all explained rows,
# one bar per feature, using shap's default limit on how many bars are shown.
shap.plots.bar(shap_values)

In [None]:
# Same global bar chart with the display limit raised to 83 bars.
# NOTE(review): 83 is presumably the number of feature columns, so that no
# feature is folded into a "sum of other features" bar - confirm against X.shape[1].
shap.plots.bar(shap_values, max_display=83)

In [None]:
# Local view: bar chart for a single explained row. shap_values[2] selects the
# row at position 2 (the third row) of the data that was passed to the explainer.
shap.plots.bar(shap_values[2], max_display=83)

In [None]:
# meta learner = xgboost (all features trained)
#
# Stacking pipeline:
#   1. Split the training CSV 80/20 into a base-model training set and a holdout set.
#   2. Grid-search four base classifiers (LightGBM, CatBoost, GradientBoosting,
#      AdaBoost) on the training set.
#   3. Use their positive-class probabilities on the holdout set as the four
#      input features of an XGBoost meta-learner.
#   4. Push the test CSV through the same two stages and write a submission file.
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# XGBClassifier is the meta-learner below. It was not imported by this cell or
# by any earlier cell, so a top-to-bottom run failed with NameError.
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score
import re

# Load the training data; the target column is 'DiagPeriodL90D'.
df = pd.read_csv('train_final3.csv')
# Apply the same column-name cleaning that the test data gets further down, so
# that train and test feature names are guaranteed to stay identical.
df.columns = [re.sub(r'[^\w\s]', '', col) for col in df.columns]

# Split the data into features (X) and target variable (y)
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Hyper-parameter grids for the base models.
# 'num_leaves' is the parameter name of LightGBM's scikit-learn wrapper. The
# previously used 'max_leaves' is only an alias and is presumably ignored in
# favour of the wrapper's explicit num_leaves default, which would make that
# grid dimension a no-op.
lgbm_params = {'n_estimators': [9, 10, 11], 'num_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
catboost_params = {'iterations': [50, 100, 150], 'depth': [6, 7, 8], 'learning_rate': [0.1, 0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
# verbose=0 silences CatBoost's per-iteration log, which otherwise floods the
# cell output during the grid search; it does not affect the fitted model.
catboost_grid = GridSearchCV(CatBoostClassifier(random_state=42, verbose=0), catboost_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X_train, y_train)
catboost_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refitted each best estimator
# on the whole of X_train, so no additional .fit() call is needed here.
lgbm_model = lgbm_grid.best_estimator_
catboost_model = catboost_grid.best_estimator_
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Generate positive-class probabilities on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_catboost_holdout = catboost_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame: one column per base model
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'CatBoost': proba_catboost_holdout,
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})

# XGBClassifier as the meta-learner. The previous keyword names (iterations,
# depth, random_seed) are CatBoost's; XGBoost's scikit-learn API uses
# n_estimators, max_depth and random_state.
meta_learner = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)

# Evaluate the meta-learner with out-of-fold predictions. Scoring it on the
# very rows it was fitted on (as before) reports training accuracy, which is
# optimistically biased. cross_val_predict works on clones, so `meta_learner`
# itself is still unfitted after this call.
holdout_pred = cross_val_predict(meta_learner, ensemble_predictions_holdout, y_holdout, cv=5)
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (XGBoost) Accuracy: {accuracy_meta}')

# Fit the final meta-learner on the complete holdout set for test-time scoring
meta_learner.fit(ensemble_predictions_holdout, y_holdout)

test_data = pd.read_csv('test_final3.csv')

# Clean feature names in test data (same rule as applied to the training data)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Select exactly the training feature columns, in training order. This raises a
# KeyError if a feature is missing instead of silently scoring misaligned columns.
test_features = test_data[X.columns]

# Now, generate predictions on the test set using the trained base models
proba_lgbm_test = lgbm_model.predict_proba(test_features)[:, 1]
proba_catboost_test = catboost_model.predict_proba(test_features)[:, 1]
proba_gb_test = gb_model.predict_proba(test_features)[:, 1]
proba_ada_test = ada_model.predict_proba(test_features)[:, 1]

# Combine test set predictions into a DataFrame (same column names and order
# as the frame the meta-learner was fitted on)
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'CatBoost': proba_catboost_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate positive-class probabilities from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrmx3.csv', index=False)

# Download the submission file (google.colab is only available on Colab)
from google.colab import files
files.download('submission_pnrmx3.csv')


In [None]:
# meta learner = lgbm (all features trained)
#
# Stacking pipeline:
#   1. Split the training CSV 80/20 into a base-model training set and a holdout set.
#   2. Grid-search four base classifiers (CatBoost, XGBoost, GradientBoosting,
#      AdaBoost) on the training set.
#   3. Use their positive-class probabilities on the holdout set as the four
#      input features of a LightGBM meta-learner.
#   4. Push the test CSV through the same two stages and write a submission file.
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Load the training data; the target column is 'DiagPeriodL90D'.
df = pd.read_csv('train_final3.csv')
# Apply the same column-name cleaning that the test data gets further down, so
# that train and test feature names are guaranteed to stay identical.
df.columns = [re.sub(r'[^\w\s]', '', col) for col in df.columns]

# Split the data into features (X) and target variable (y)
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Hyper-parameter grids for the base models
cat_params = {'iterations': [100, 150, 200], 'depth': [6, 7, 8], 'learning_rate': [0.1, 0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

# verbose=0 silences CatBoost's per-iteration log, which otherwise floods the
# cell output during the grid search; it does not affect the fitted model.
cat_grid = GridSearchCV(CatBoostClassifier(random_state=42, verbose=0), cat_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

cat_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refitted each best estimator
# on the whole of X_train, so no additional .fit() call is needed here.
cat_model = cat_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Generate positive-class probabilities on the holdout set
proba_cat_holdout = cat_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame: one column per base model
ensemble_predictions_holdout = pd.DataFrame({
    'CatBoost': proba_cat_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})

# LightGBM as the meta-learner. 'num_leaves' is the wrapper's own parameter
# name; the previously used alias 'max_leaves' competes with the wrapper's
# explicit num_leaves default.
meta_learner = LGBMClassifier(n_estimators=100, num_leaves=31, learning_rate=0.1, random_state=42)

# Evaluate the meta-learner with out-of-fold predictions. Scoring it on the
# very rows it was fitted on (as before) reports training accuracy, which is
# optimistically biased. cross_val_predict works on clones, so `meta_learner`
# itself is still unfitted after this call.
holdout_pred = cross_val_predict(meta_learner, ensemble_predictions_holdout, y_holdout, cv=5)
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (LightGBM) Accuracy: {accuracy_meta}')

# Fit the final meta-learner on the complete holdout set for test-time scoring
meta_learner.fit(ensemble_predictions_holdout, y_holdout)

test_data = pd.read_csv('test_final3.csv')

# Clean feature names in test data (same rule as applied to the training data)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Select exactly the training feature columns, in training order. This raises a
# KeyError if a feature is missing instead of silently scoring misaligned columns.
test_features = test_data[X.columns]

# Now, generate predictions on the test set using the trained base models
proba_cat_test = cat_model.predict_proba(test_features)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_features)[:, 1]
proba_gb_test = gb_model.predict_proba(test_features)[:, 1]
proba_ada_test = ada_model.predict_proba(test_features)[:, 1]

# Combine test set predictions into a DataFrame (same column names and order
# as the frame the meta-learner was fitted on)
ensemble_predictions_test = pd.DataFrame({
    'CatBoost': proba_cat_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate positive-class probabilities from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrml3.csv', index=False)

# Download the submission file (google.colab is only available on Colab)
from google.colab import files
files.download('submission_pnrml3.csv')


In [None]:
# meta learner = catboost (all features trained)
#
# Stacking pipeline:
#   1. Split the training CSV 80/20 into a base-model training set and a holdout set.
#   2. Grid-search four base classifiers (LightGBM, XGBoost, GradientBoosting,
#      AdaBoost) on the training set.
#   3. Use their positive-class probabilities on the holdout set as the four
#      input features of a CatBoost meta-learner.
#   4. Push the test CSV through the same two stages and write a submission file.
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Load the training data; the target column is 'DiagPeriodL90D'.
df = pd.read_csv('train_final3.csv')
# Clean the column names with the same rule the sibling stacking cells use; the
# identical rule is applied to the test data below so the names stay aligned.
df.columns = [re.sub(r'[^\w\s]', '', col) for col in df.columns]

# Split the data into features (every column except the target) and the target
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']

# Split the data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter grids for the base models.
# 'num_leaves' is the parameter name of LightGBM's scikit-learn wrapper. The
# previously used 'max_leaves' is only an alias and is presumably ignored in
# favour of the wrapper's explicit num_leaves default, which would make that
# grid dimension a no-op.
lgbm_params = {'n_estimators': [9, 10, 11], 'num_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refitted each best estimator
# on the whole of X_train, so no additional .fit() call is needed here.
lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Generate positive-class probabilities on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame: one column per base model
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'ADA': proba_ada_holdout
})

# CatBoostClassifier as the meta-learner. verbose=0 only silences the
# per-iteration training log; it does not affect the fitted model.
meta_learner = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, random_seed=42, verbose=0)

# Evaluate the meta-learner with out-of-fold predictions. Scoring it on the
# very rows it was fitted on (as before) reports training accuracy, which is
# optimistically biased. cross_val_predict works on clones, so `meta_learner`
# itself is still unfitted after this call.
holdout_pred = cross_val_predict(meta_learner, ensemble_predictions_holdout, y_holdout, cv=5)
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (CatBoost) Accuracy: {accuracy_meta}')

# Fit the final meta-learner on the complete holdout set for test-time scoring
meta_learner.fit(ensemble_predictions_holdout, y_holdout)

# Generate predictions on the test set using the trained models
test_data = pd.read_csv('test_final3.csv')

# Clean feature names in test data (same rule as applied to the training data)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Select exactly the training feature columns, in training order. This raises a
# KeyError if a feature is missing instead of silently scoring misaligned columns.
test_features = test_data[X.columns]

proba_lgbm_test = lgbm_model.predict_proba(test_features)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_features)[:, 1]
proba_gb_test = gb_model.predict_proba(test_features)[:, 1]
proba_ada_test = ada_model.predict_proba(test_features)[:, 1]

# Combine test set predictions into a DataFrame (same column names and order
# as the frame the meta-learner was fitted on)
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'ADA': proba_ada_test
})

# Generate positive-class probabilities from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrwe.csv', index=False)

# Download the submission file (google.colab is only available on Colab)
from google.colab import files
files.download('submission_pnrwe.csv')


In [None]:
# meta learner = gradient boost (all features trained)
#
# Stacking pipeline:
#   1. Split the training CSV 80/20 into a base-model training set and a holdout set.
#   2. Grid-search four base classifiers (LightGBM, XGBoost, CatBoost, AdaBoost)
#      on the training set.
#   3. Use their positive-class probabilities on the holdout set as the four
#      input features of a GradientBoostingClassifier meta-learner.
#   4. Push the test CSV through the same two stages and write a submission file.
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Load the training data; the target column is 'DiagPeriodL90D'.
df = pd.read_csv('train_final3.csv')
# Apply the same column-name cleaning that the test data gets further down, so
# that train and test feature names are guaranteed to stay identical.
df.columns = [re.sub(r'[^\w\s]', '', col) for col in df.columns]

# Split the data into features (X) and target variable (y)
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)


# Hyper-parameter grids for the base models.
# 'num_leaves' is the parameter name of LightGBM's scikit-learn wrapper. The
# previously used 'max_leaves' is only an alias and is presumably ignored in
# favour of the wrapper's explicit num_leaves default, which would make that
# grid dimension a no-op.
lgbm_params = {'n_estimators': [9, 10, 11], 'num_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
cat_params = {'iterations': [100, 150, 200], 'depth': [6, 7, 8], 'learning_rate': [0.1, 0.2, 0.3]}
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
# verbose=0 silences CatBoost's per-iteration log, which otherwise floods the
# cell output during the grid search; it does not affect the fitted model.
cat_grid = GridSearchCV(CatBoostClassifier(random_state=42, verbose=0), cat_params, cv=3)
ada_grid = GridSearchCV(AdaBoostClassifier(random_state=42), ada_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
cat_grid.fit(X_train, y_train)
ada_grid.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refitted each best estimator
# on the whole of X_train, so no additional .fit() call is needed here.
lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
cat_model = cat_grid.best_estimator_
ada_model = ada_grid.best_estimator_

# Generate positive-class probabilities on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_cat_holdout = cat_model.predict_proba(X_holdout)[:, 1]
proba_ada_holdout = ada_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame: one column per base model
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'CAT': proba_cat_holdout,
    'ADA': proba_ada_holdout
})

# GradientBoostingClassifier as the meta-learner
meta_learner = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Evaluate the meta-learner with out-of-fold predictions. Scoring it on the
# very rows it was fitted on (as before) reports training accuracy, which is
# optimistically biased. cross_val_predict works on clones, so `meta_learner`
# itself is still unfitted after this call.
holdout_pred = cross_val_predict(meta_learner, ensemble_predictions_holdout, y_holdout, cv=5)
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (GradientBoosting) Accuracy: {accuracy_meta}')

# Fit the final meta-learner on the complete holdout set for test-time scoring
meta_learner.fit(ensemble_predictions_holdout, y_holdout)

test_data = pd.read_csv('test_final3.csv')

# Clean feature names in test data (same rule as applied to the training data)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Select exactly the training feature columns, in training order. This raises a
# KeyError if a feature is missing instead of silently scoring misaligned columns.
test_features = test_data[X.columns]

# Now, generate predictions on the test set using the trained base models
proba_lgbm_test = lgbm_model.predict_proba(test_features)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_features)[:, 1]
proba_cat_test = cat_model.predict_proba(test_features)[:, 1]
proba_ada_test = ada_model.predict_proba(test_features)[:, 1]

# Combine test set predictions into a DataFrame (same column names and order
# as the frame the meta-learner was fitted on)
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'CAT': proba_cat_test,
    'ADA': proba_ada_test
})

# Generate positive-class probabilities from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrmg3.csv', index=False)

# Download the submission file (google.colab is only available on Colab)
from google.colab import files
files.download('submission_pnrmg3.csv')


In [None]:
# meta learner = ada boost  (all features trained)
#
# Stacking pipeline:
#   1. Split the training CSV 80/20 into a base-model training set and a holdout set.
#   2. Grid-search four base classifiers (LightGBM, XGBoost, GradientBoosting,
#      CatBoost) on the training set.
#   3. Use their positive-class probabilities on the holdout set as the four
#      input features of an AdaBoost meta-learner.
#   4. Push the test CSV through the same two stages and write a submission file.
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Load the training data; the target column is 'DiagPeriodL90D'.
df = pd.read_csv('train_final3.csv')
# Apply the same column-name cleaning that the test data gets further down, so
# that train and test feature names are guaranteed to stay identical.
df.columns = [re.sub(r'[^\w\s]', '', col) for col in df.columns]

# Split the data into features (X) and target variable (y)
X = df.drop(['DiagPeriodL90D'], axis=1)
y = df['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter grids for the base models.
# 'num_leaves' is the parameter name of LightGBM's scikit-learn wrapper. The
# previously used 'max_leaves' is only an alias and is presumably ignored in
# favour of the wrapper's explicit num_leaves default, which would make that
# grid dimension a no-op.
lgbm_params = {'n_estimators': [9, 10, 11], 'num_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
cat_params = {'iterations': [50, 100, 150], 'depth': [6, 7, 8], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
# verbose=0 silences CatBoost's per-iteration log, which otherwise floods the
# cell output during the grid search; it does not affect the fitted model.
cat_grid = GridSearchCV(CatBoostClassifier(random_state=42, verbose=0), cat_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
cat_grid.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refitted each best estimator
# on the whole of X_train, so no additional .fit() call is needed here.
lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
cat_model = cat_grid.best_estimator_

# Generate positive-class probabilities on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_cat_holdout = cat_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame: one column per base model
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'CAT': proba_cat_holdout
})

# AdaBoostClassifier as the meta-learner
meta_learner = AdaBoostClassifier(n_estimators=50, learning_rate=0.1, random_state=42)

# Evaluate the meta-learner with out-of-fold predictions. Scoring it on the
# very rows it was fitted on (as before) reports training accuracy, which is
# optimistically biased. cross_val_predict works on clones, so `meta_learner`
# itself is still unfitted after this call.
holdout_pred = cross_val_predict(meta_learner, ensemble_predictions_holdout, y_holdout, cv=5)
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (AdaBoost) Accuracy: {accuracy_meta}')

# Fit the final meta-learner on the complete holdout set for test-time scoring
meta_learner.fit(ensemble_predictions_holdout, y_holdout)

test_data = pd.read_csv('test_final3.csv')

# Clean feature names in test data (same rule as applied to the training data)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Select exactly the training feature columns, in training order. This raises a
# KeyError if a feature is missing instead of silently scoring misaligned columns.
test_features = test_data[X.columns]

# Now, generate predictions on the test set using the trained base models
proba_lgbm_test = lgbm_model.predict_proba(test_features)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_features)[:, 1]
proba_gb_test = gb_model.predict_proba(test_features)[:, 1]
proba_cat_test = cat_model.predict_proba(test_features)[:, 1]

# Combine test set predictions into a DataFrame (same column names and order
# as the frame the meta-learner was fitted on)
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'CAT': proba_cat_test
})

# Generate positive-class probabilities from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnrma3.csv', index=False)

# Download the submission file (google.colab is only available on Colab)
from google.colab import files
files.download('submission_pnrma3.csv')


In [None]:
# top shap values trained on meta learner adaboost(score: top3=0.808, top4=0.807, top9=0.801, top2=0.778)
#
# Same stacking pipeline as the all-features AdaBoost cell, restricted to the
# three columns listed in `selected_columns`:
#   1. Split the training CSV 80/20 into a base-model training set and a holdout set.
#   2. Grid-search four base classifiers (LightGBM, XGBoost, GradientBoosting,
#      CatBoost) on the training set.
#   3. Use their positive-class probabilities on the holdout set as the four
#      input features of an AdaBoost meta-learner.
#   4. Push the test CSV through the same two stages and write a submission file.

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import re

# Load the training data; the target column is 'DiagPeriodL90D'.
df = pd.read_csv('train_final3.csv')
# Feature subset used by every base model in this cell
selected_columns = ['patient_age', 'breast_cancer_diagnosis_code_encoded', 'metastatic_cancer_diagnosis_code_encoded']

# Split the data into features (X) and target variable (y) using only selected columns
X = df[selected_columns]
y = df['DiagPeriodL90D']
# Split the original training data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter grids for the base models.
# 'num_leaves' is the parameter name of LightGBM's scikit-learn wrapper. The
# previously used 'max_leaves' is only an alias and is presumably ignored in
# favour of the wrapper's explicit num_leaves default, which would make that
# grid dimension a no-op.
lgbm_params = {'n_estimators': [9, 10, 11], 'num_leaves': [31, 32, 33], 'learning_rate': [0.2, 0.3]}
xgb_params = {'n_estimators': [18, 19, 20], 'max_depth': [3, 4, 5], 'learning_rate': [0.2, 0.3]}
gb_params = {'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.2, 0.3]}
cat_params = {'iterations': [50, 100, 150], 'depth': [6, 7, 8], 'learning_rate': [0.01, 0.1, 0.2]}

lgbm_grid = GridSearchCV(LGBMClassifier(random_state=42), lgbm_params, cv=3)
xgb_grid = GridSearchCV(XGBClassifier(random_state=42, booster='gbtree'), xgb_params, cv=3)
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3)
# verbose=0 silences CatBoost's per-iteration log, which otherwise floods the
# cell output during the grid search; it does not affect the fitted model.
cat_grid = GridSearchCV(CatBoostClassifier(random_state=42, verbose=0), cat_params, cv=3)

lgbm_grid.fit(X_train, y_train)
xgb_grid.fit(X_train, y_train)
gb_grid.fit(X_train, y_train)
cat_grid.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refitted each best estimator
# on the whole of X_train, so no additional .fit() call is needed here.
lgbm_model = lgbm_grid.best_estimator_
xgb_model = xgb_grid.best_estimator_
gb_model = gb_grid.best_estimator_
cat_model = cat_grid.best_estimator_

# Generate positive-class probabilities on the holdout set
proba_lgbm_holdout = lgbm_model.predict_proba(X_holdout)[:, 1]
proba_xgb_holdout = xgb_model.predict_proba(X_holdout)[:, 1]
proba_gb_holdout = gb_model.predict_proba(X_holdout)[:, 1]
proba_cat_holdout = cat_model.predict_proba(X_holdout)[:, 1]

# Combine predictions into a DataFrame: one column per base model
ensemble_predictions_holdout = pd.DataFrame({
    'LGBM': proba_lgbm_holdout,
    'XGB': proba_xgb_holdout,
    'GB': proba_gb_holdout,
    'CAT': proba_cat_holdout
})

# AdaBoostClassifier as the meta-learner
meta_learner = AdaBoostClassifier(n_estimators=50, learning_rate=0.1, random_state=42)

# Evaluate the meta-learner with out-of-fold predictions. Scoring it on the
# very rows it was fitted on (as before) reports training accuracy, which is
# optimistically biased. cross_val_predict works on clones, so `meta_learner`
# itself is still unfitted after this call.
holdout_pred = cross_val_predict(meta_learner, ensemble_predictions_holdout, y_holdout, cv=5)
accuracy_meta = accuracy_score(y_holdout, holdout_pred)
print(f'Meta-Learner (AdaBoost) Accuracy: {accuracy_meta}')

# Fit the final meta-learner on the complete holdout set for test-time scoring
meta_learner.fit(ensemble_predictions_holdout, y_holdout)

test_data = pd.read_csv('test_final3.csv')

# Clean feature names in test data (if needed)
test_data.columns = [re.sub(r'[^\w\s]', '', col) for col in test_data.columns]

# Restrict the test data to the same feature subset, in the same order, once
test_features = test_data[selected_columns]

# Now, generate predictions on the test set using the trained base models
proba_lgbm_test = lgbm_model.predict_proba(test_features)[:, 1]
proba_xgb_test = xgb_model.predict_proba(test_features)[:, 1]
proba_gb_test = gb_model.predict_proba(test_features)[:, 1]
proba_cat_test = cat_model.predict_proba(test_features)[:, 1]

# Combine test set predictions into a DataFrame (same column names and order
# as the frame the meta-learner was fitted on)
ensemble_predictions_test = pd.DataFrame({
    'LGBM': proba_lgbm_test,
    'XGB': proba_xgb_test,
    'GB': proba_gb_test,
    'CAT': proba_cat_test
})

# Generate positive-class probabilities from the meta-learner on the test set
test_prob_predictions = meta_learner.predict_proba(ensemble_predictions_test)[:, 1]

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': test_prob_predictions
})

# Write the submission DataFrame to a CSV file
submission_df.to_csv('submission_pnraot3.csv', index=False)

# Download the submission file (google.colab is only available on Colab)
from google.colab import files
files.download('submission_pnraot3.csv')

In [None]:
# top shap values trained on meta learner xgboost

In [None]:
# top shap values trained on meta learner catboost

In [None]:
# top shap values trained on meta learner lgbm

In [None]:
# top shap values trained on meta learner gradient boost