In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the read-only input mount and echo its
# full path, one per line, in os.walk order.
input_file_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl
/kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_42.cpkt
/kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt
/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [2]:
!pip install -q /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl
!mkdir /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [3]:
from tabpfn import TabPFNClassifier

import os
from collections import defaultdict

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import probplot

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import f_classif
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import Binarizer, FunctionTransformer, OrdinalEncoder, PowerTransformer, StandardScaler

%matplotlib inline



In [4]:
def balanced_log_loss(y_true, y_pred):
    """Binary log loss in which each class contributes equally.

    The negative log-likelihood is averaged separately over the class-0 and
    class-1 samples and the two per-class means are then averaged, so the
    majority class cannot dominate the score.

    Parameters
    ----------
    y_true : array-like of 0/1 labels (list, ndarray or pandas Series).
    y_pred : array-like of predicted probabilities of class 1, same length
        as ``y_true``. Values are clipped to [1e-15, 1 - 1e-15] before the log.

    Returns
    -------
    numpy.float64
        The balanced log loss. If only one class occurs in ``y_true`` the loss
        of that class alone is returned (instead of NaN from a 0-division).
        NaN is returned for empty input.
    """
    # Coerce up front so plain Python lists work and pandas index alignment
    # can never reorder the pairing of labels and predictions.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.clip(np.asarray(y_pred, dtype=np.float64), 1e-15, 1 - 1e-15)

    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)

    # Only classes that actually occur contribute a term; dividing by a zero
    # class count would otherwise turn the whole score into NaN.
    class_losses = []
    if N_0 > 0:
        class_losses.append(-1 / N_0 * np.sum((1 - y_true) * np.log(1 - y_pred)))
    if N_1 > 0:
        class_losses.append(-1 / N_1 * np.sum(y_true * np.log(y_pred)))

    if not class_losses:
        return np.float64(np.nan)
    return np.sum(class_losses) / len(class_losses)

In [5]:
# Directory holding the competition CSVs; each file name below is appended to it.
PATH = "/kaggle/input/icr-identify-age-related-conditions/"

# Read the four tables in a fixed order and bind them to their names in one go.
train, test, greeks, sample_submission = (
    pd.read_csv(f"{PATH}{csv_name}")
    for csv_name in ('train.csv', 'test.csv', 'greeks.csv', 'sample_submission.csv')
)

In [6]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [7]:
# Column dtypes and non-null counts; shows which features contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 58 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Id      617 non-null    object 
 1   AB      617 non-null    float64
 2   AF      617 non-null    float64
 3   AH      617 non-null    float64
 4   AM      617 non-null    float64
 5   AR      617 non-null    float64
 6   AX      617 non-null    float64
 7   AY      617 non-null    float64
 8   AZ      617 non-null    float64
 9   BC      617 non-null    float64
 10  BD      617 non-null    float64
 11  BN      617 non-null    float64
 12  BP      617 non-null    float64
 13  BQ      557 non-null    float64
 14  BR      617 non-null    float64
 15  BZ      617 non-null    float64
 16  CB      615 non-null    float64
 17  CC      614 non-null    float64
 18  CD      617 non-null    float64
 19  CF      617 non-null    float64
 20  CH      617 non-null    float64
 21  CL      617 non-null    float64
 22  CR

In [8]:
# Summary statistics with extra tail percentiles, one row per described
# feature: the target is excluded and the 'count' row is removed before
# transposing.
numeric_descr = (
    train
    .drop(columns='Class')
    .describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])
    .drop(index='count')
    .transpose()
)
numeric_descr

Unnamed: 0,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max
AB,0.477149,0.468388,0.081187,0.119644,0.152973,0.252107,0.354659,0.559763,1.07936,2.164873,6.161666
AF,3502.013221,2300.322717,192.59328,192.59328,1018.985044,2197.34548,3120.31896,4361.63739,6957.80734,10377.993534,28688.18766
AH,118.624513,127.83895,85.200147,85.200147,85.200147,85.200147,85.200147,113.73954,209.99315,541.428776,1910.123198
AM,38.968552,69.728226,3.177522,5.185988,7.152831,12.270314,20.53311,39.139886,111.939207,410.511686,630.51823
AR,10.128242,10.518877,8.138688,8.138688,8.138688,8.138688,8.138688,8.138688,17.119562,34.467239,178.943634
AX,5.545576,2.551696,0.699861,1.03544,2.870316,4.128294,5.031912,6.431634,9.247024,13.169435,38.27088
AY,0.06032,0.416817,0.025578,0.025578,0.025578,0.025578,0.025578,0.036845,0.123992,0.214405,10.315851
AZ,10.566447,4.350645,3.396778,3.396778,3.396778,8.12958,10.46132,12.969516,16.861631,22.914324,38.971568
BC,8.053012,65.166943,1.2299,1.2299,1.2299,1.2299,1.2299,5.081244,11.996796,50.660495,1463.693448
BD,5350.388655,3021.326641,1693.62432,2221.149688,3041.642788,4155.70287,4997.96073,6035.8857,7955.457632,10131.207181,53060.59924


In [9]:
# Numeric part of the training table, and the names of its feature columns
# (everything numeric except the target).
numeric_data = train.select_dtypes(include="number")
numeric_cols = numeric_data.columns.drop("Class")
numeric_cols

Index(['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN', 'BP',
       'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU',
       'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 'EB',
       'EE', 'EG', 'EH', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI', 'FL', 'FR',
       'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL'],
      dtype='object')

In [10]:
# Candidate transforms, keyed by the label used as column name in the score
# table. Dict order defines the column order, which also decides ties in idxmax.
_candidate_transforms = {
    "Original": lambda values: values,
    "Log": np.log,
    "Sqrt": np.sqrt,
    "Reciprocal": np.reciprocal,
    "BoxCox": lambda values: stats.boxcox(values)[0],
    "YeoJohnson": lambda values: stats.yeojohnson(values)[0],
}

# For every feature, score each transform by the squared correlation
# coefficient of its probability plot (scipy's default reference distribution).
_r2_by_feature = {}
for feature in numeric_cols:
    observed = train[feature].dropna()
    squared_r = []
    for transform in _candidate_transforms.values():
        # probplot returns ((osm, osr), (slope, intercept, r)); keep only r.
        fit_params = probplot(transform(observed), rvalue=True)[1]
        r_value = fit_params[-1]
        squared_r.append(r_value * r_value)
    _r2_by_feature[feature] = tuple(squared_r)

# Rows = features, columns = transforms.
r2_scores = pd.DataFrame(_r2_by_feature, index=tuple(_candidate_transforms)).T

# Label of the best-scoring transform per feature.
r2_scores["Winner"] = r2_scores.idxmax(axis=1)
r2_scores

Unnamed: 0,Original,Log,Sqrt,Reciprocal,BoxCox,YeoJohnson,Winner
AB,0.537195,0.976071,0.820288,0.919818,0.998107,0.991143,BoxCox
AF,0.761133,0.871797,0.945411,0.344347,0.955054,0.955098,YeoJohnson
AH,0.237829,0.56764,0.415764,0.678381,0.686058,0.686064,YeoJohnson
AM,0.383144,0.958737,0.715838,0.903156,0.996761,0.995844,BoxCox
AR,0.158397,0.42177,0.299171,0.505032,0.515449,0.515495,YeoJohnson
AX,0.7455,0.917768,0.912345,0.488744,0.937514,0.950425,YeoJohnson
AY,0.0386,0.572584,0.231879,0.641586,0.633691,0.626888,Reciprocal
AZ,0.942194,0.902788,0.953193,0.722089,0.956941,0.957626,YeoJohnson
BC,0.057783,0.740394,0.308317,0.723142,0.738835,0.744935,YeoJohnson
BD,0.412282,0.92437,0.730386,0.918367,0.961509,0.96152,YeoJohnson


In [11]:
def _columns_won_by(transform_name):
    """Return the index of features whose 'Winner' in `r2_scores` is `transform_name`."""
    return r2_scores.loc[r2_scores['Winner'] == transform_name].index


# One feature group per candidate transform.
no_transform_cols = _columns_won_by('Original')
log_transform_cols = _columns_won_by('Log')
sqrt_transform_cols = _columns_won_by('Sqrt')
reciprocal_transform_cols = _columns_won_by('Reciprocal')
boxcox_transform_cols = _columns_won_by('BoxCox')
yeojohnson_transform_cols = _columns_won_by('YeoJohnson')

In [12]:
# Features for which every candidate transform scores R^2 <= 0.7,
# collected in table order as a plain list of names.
_r2_only = r2_scores.drop(columns='Winner')
problematic_cols = _r2_only.index[(_r2_only <= 0.7).all(axis=1)].tolist()

problematic_cols

['AH', 'AR', 'AY', 'BZ', 'DF', 'DV']

In [13]:
# A feature is treated as "semi-constant" when its minimum is (numerically) equal
# to its median, i.e. at least half of its observed values sit at the minimum.
semi_constant_mask = np.isclose(numeric_descr["min"], numeric_descr["50%"])
# Summary rows of just those features.
semi_constant_descr = numeric_descr[semi_constant_mask]
# feature name -> median; the medians serve as Binarizer thresholds in the preprocessing pipeline.
semi_const_cols_thresholds = semi_constant_descr["50%"].to_dict()

In [14]:
semi_const_cols = semi_const_cols_thresholds.keys()

# Take the semi-constant features out of every transform group in one pass.
# errors='ignore' because a given group need not contain any of them.
# sqrt_transform_cols is not filtered here.
(
    no_transform_cols,
    log_transform_cols,
    reciprocal_transform_cols,
    boxcox_transform_cols,
    yeojohnson_transform_cols,
) = (
    group.drop(semi_const_cols, errors='ignore')
    for group in (
        no_transform_cols,
        log_transform_cols,
        reciprocal_transform_cols,
        boxcox_transform_cols,
        yeojohnson_transform_cols,
    )
)

In [15]:
# Features whose best-scoring transform is the square root. The transform
# selection can pick "Sqrt", but with remainder="drop" a feature that belongs
# to no branch below would vanish silently, so it gets its own branch.
# Semi-constant features are excluded here because they are binarised instead
# (a column must not appear in two branches).
_sqrt_cols = sqrt_transform_cols.drop(
    list(semi_const_cols_thresholds), errors="ignore"
).to_list()

# Preprocessing: per-group normalising transform + standardisation, ordinal
# encoding of object columns, binarisation of semi-constant features, and
# finally KNN imputation of whatever is still missing. Output is a DataFrame.
preliminary_preprocess = make_pipeline(
    make_column_transformer(
        # Already close to normal: standardise only.
        (
            StandardScaler(),
            no_transform_cols.to_list(),
        ),
        # log -> standardise
        (
            make_pipeline(
                FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            log_transform_cols.to_list(),
        ),
        # 1/x -> standardise
        (
            make_pipeline(
                FunctionTransformer(func=np.reciprocal, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            reciprocal_transform_cols.to_list(),
        ),
        # Box-Cox / Yeo-Johnson with built-in standardisation.
        (
            PowerTransformer(method="box-cox", standardize=True),
            boxcox_transform_cols.to_list(),
        ),
        (
            PowerTransformer(method="yeo-johnson", standardize=True),
            yeojohnson_transform_cols.to_list(),
        ),
        # Object-dtype columns: fill gaps with the mode, then integer-encode;
        # categories unseen during fit are encoded as -1.
        (
            make_pipeline(
                SimpleImputer(strategy="most_frequent"),
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ),
            make_column_selector(dtype_include=object),  # type: ignore
        ),
        # Semi-constant features: median-impute, then flag values above the
        # feature's own median threshold.
        *[
            (
                make_pipeline(
                    SimpleImputer(strategy="median"),
                    Binarizer(threshold=thresh),
                ),
                [col],
            )
            for col, thresh in semi_const_cols_thresholds.items()
        ],
        # sqrt -> standardise. Placed last so the auto-generated transformer
        # names and the order of the existing output columns do not change.
        (
            make_pipeline(
                FunctionTransformer(func=np.sqrt, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            _sqrt_cols,
        ),
        remainder="drop",
        verbose_feature_names_out=False,
    ),
    KNNImputer(n_neighbors=10, weights="distance"),
).set_output(transform="pandas")

In [16]:
# Fit the preprocessing pipeline on all training rows (target and Id removed) and transform them.
# NOTE(review): the fit happens before the cross-validation split, so validation rows
# contribute to the scaler / power-transform / imputer statistics (mild leakage).
X_preliminary = preliminary_preprocess.fit_transform(train.drop(["Class", 'Id'], axis=1))

In [17]:
# Target labels, row-aligned with X_preliminary.
y = train['Class']
# X_preliminary, y

In [18]:
# Fixed hyper-parameters for each model family, written one setting per line.

# LightGBM
lgbm_params = dict(
    max_depth=9,
    num_leaves=5,
    n_estimators=348,
    learning_rate=0.05242474282360704,
    colsample_bytree=0.1,
    min_split_gain=0.001,
    reg_alpha=0.004218313752787253,
    reg_lambda=0.041919397508377844,
)

# XGBoost
xgb_params = dict(
    max_depth=9,
    n_estimators=505,
    learning_rate=0.16815024170969353,
    subsample=0.8,
    min_child_weight=0.1,
    max_delta_step=0.1,
    colsample_bytree=0.5,
    colsample_bylevel=0.1,
    min_split_loss=0.001,
    reg_alpha=0.001,
    reg_lambda=0.0001,
)

# CatBoost
cb_params = dict(
    depth=8,
    l2_leaf_reg=2.4583124963339733,
    iterations=622,
    learning_rate=0.032999747029433896,
)

# Support-vector classifier (probability=True enables predict_proba)
svc_params = dict(
    C=3.0293143161895992,
    kernel='rbf',
    probability=True,
)

In [19]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as imb_make_pipeline
from statistics import mean
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier

# estimators = [
# #     ('xgb', XGBClassifier(**xgb_params, random_state = 42)),
# #     ('lgbm', LGBMClassifier(**lgbm_params, random_state = 42, verbose = -1)),
# #     ('cb', CatBoostClassifier(random_state = 42, verbose = False)),
# #     ('svc', SVC(**svc_params)),
# # #    ('randomforest', RandomForestClassifier(random_state = 42)),
# # #    ('tabpfn1', TabPFNClassifier(N_ensemble_configurations=32,device='cuda:0')),
# # #     ('tabpfn2', TabPFNClassifier(N_ensemble_configurations=64,device='cuda:0')),
# # #     ('tabpfn3', TabPFNClassifier(N_ensemble_configurations=128,device='cuda:0'))
# #     ('voting1', VotingClassifier(
# #                 [
# #                     ("lgbm", LGBMClassifier(random_state=42, **lgbm_params)),
# #                     ("xgb", XGBClassifier(random_state=42, **xgb_params)),
# #                     ('cb', CatBoostClassifier(random_state = 42, verbose = False)),
# #                     ('rf', RandomForestClassifier(random_state = 42)),
# #                     ("svc", SVC(random_state=42, **svc_params)),
# #                 ],
# #                 voting="soft",
# #                 weights=(0.3, 0.3, 0.2,0.1, 0.1)
# #             )),
# #     ('voting2', VotingClassifier(
# #                 [
# #                     ("lgbm", LGBMClassifier(random_state=42, **lgbm_params)),
# #                     ("xgb", XGBClassifier(random_state=42, **xgb_params)),
# #                     ('cb', CatBoostClassifier(random_state = 42, verbose = False)),
# #                     #('rf', RandomForestClassifier(random_state = 42)),
# #                     ("svc", SVC(random_state=42, **svc_params)),
# #                 ],
# #                 voting="soft",
# #                 weights=(0.35, 0.35, 0.15, 0.15)
# #             )),
# #     ('voting3', VotingClassifier(
# #                 [
# #                     ("lgbm", LGBMClassifier(random_state=42, **lgbm_params)),
# #                     ("xgb", XGBClassifier(random_state=42, **xgb_params)),
# # #                     ('cb', CatBoostClassifier(random_state = 42, verbose = False)),
# #                     #('rf', RandomForestClassifier(random_state = 42)),
# #                     ("svc", SVC(random_state=42, **svc_params)),
# # #                     ('tabpfn1', tabpfnclassifier(n_ensemble_configurations=64))
                    
# #                 ],
# #                 voting="soft",
# #                 weights=(0.45, 0.45, 0.1)
# #           )),
#     ('stacked', StackingClassifier(estimators = [
#          ('voting1', VotingClassifier(
#                 [
#                     ("lgbm", LGBMClassifier(random_state=42, **lgbm_params)),
#                     ("xgb", XGBClassifier(random_state=42, **xgb_params)),
#                     ('cb', CatBoostClassifier(random_state = 42, verbose = False)),
#                     ('rf', RandomForestClassifier(random_state = 42)),
#                     ("svc", SVC(random_state=42, **svc_params)),
#                 ],
#                 voting="soft",
#                 weights=(0.3, 0.3, 0.2,0.1, 0.1)
#             )),
#     ('voting2', VotingClassifier(
#                 [
#                     ("lgbm", LGBMClassifier(random_state=42, **lgbm_params)),
#                     ("xgb", XGBClassifier(random_state=42, **xgb_params)),
#                     ('cb', CatBoostClassifier(random_state = 42, verbose = False)),
#                     #('rf', RandomForestClassifier(random_state = 42)),
#                     ("svc", SVC(random_state=42, **svc_params)),
#                 ],
#                 voting="soft",
#                 weights=(0.35, 0.35, 0.15, 0.15)
#             )),
#     ('voting3', VotingClassifier(
#                 [
#                     ("lgbm", LGBMClassifier(random_state=42, **lgbm_params)),
#                     ("xgb", XGBClassifier(random_state=42, **xgb_params)),
# #                     ('cb', CatBoostClassifier(random_state = 42, verbose = False)),
#                     #('rf', RandomForestClassifier(random_state = 42)),
#                     ("svc", SVC(random_state=42, **svc_params)),
# #                     ('tabpfn1', tabpfnclassifier(n_ensemble_configurations=64))
                    
#                 ],
#                 voting="soft",
#                 weights=(0.45, 0.45, 0.1)
#           ))
#     ], final_estimator = LogisticRegression()))
    
# ]

# Bagged, stratified K-fold training of the soft-voting ensemble.
#
# For each of N_BAGS bags the data is split into N_FOLDS stratified folds with
# a bag-specific seed. On every fold the majority class of the training part
# is randomly under-sampled, the voting ensemble is fitted, and its class-1
# probabilities for the held-out part are accumulated. Each row is held out
# exactly once per bag, so dividing the accumulator by N_BAGS gives the
# bag-averaged out-of-fold prediction that is scored at the end.
results = {}  # not populated by this cell; kept so the name stays defined
N_BAGS = 20
N_FOLDS = 10
BAGGING_SEED = 42

# Draw the per-bag seeds from a seeded generator: with the global, unseeded
# NumPy RNG the folds, the under-sampling and therefore the score changed on
# every run.
seeds = np.random.default_rng(BAGGING_SEED).integers(0, 20000, size=N_BAGS)

# "Voting Bag: b Fold: f" -> fitted (under-sampler, ensemble) pipeline.
classifiers = {}
# Sum over bags of the out-of-fold class-1 probability of every training row.
y_proba = np.zeros(len(y), dtype=np.float64)

for bag, seed in enumerate(seeds):
    seed = int(seed)  # plain int for the random_state arguments below
    skf = StratifiedKFold(n_splits=N_FOLDS, random_state=seed, shuffle=True)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_preliminary, y)):
        X_train, X_val = X_preliminary.iloc[train_idx], X_preliminary.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        estimator = VotingClassifier(
            [
                ("lgbm", LGBMClassifier(random_state=42, **lgbm_params)),
                ("xgb", XGBClassifier(random_state=42, **xgb_params)),
                ('cb', CatBoostClassifier(**cb_params, random_state=42, verbose=False)),
            ],
            voting="soft",
            weights=(0.45, 0.45, 0.1),
        )
        # The sampler is only applied while fitting; prediction uses all rows.
        pipe = imb_make_pipeline(RandomUnderSampler(random_state=seed), estimator)
        pipe.fit(X_train, y_train)

        y_proba[val_idx] += pipe.predict_proba(X_val)[:, 1]
        classifiers[f"Voting Bag: {bag} Fold: {fold}"] = pipe

# Average over bags and report the competition metric on the out-of-fold predictions.
y_proba_averaged = y_proba / N_BAGS
balanced_log_loss(y, y_proba_averaged)

0.24381731286241543

In [20]:
# When every numeric column of the test table sums to (about) zero, shift all
# numeric values by a tiny epsilon before running the fitted preprocessing.
# NOTE(review): presumably this keeps the log / reciprocal / Box-Cox branches
# finite on an all-zero placeholder test set — confirm.
_numeric_column_sums = test.select_dtypes("number").sum()
if np.isclose(_numeric_column_sums, 0).all():
    test_numeric_cols = test.select_dtypes("number").columns
    test[test_numeric_cols] += 1e-9

# Apply the preprocessing fitted on the training data (Id is not a feature).
X_test = preliminary_preprocess.transform(test.drop(columns=['Id']))

In [21]:
# Average the class-1 probability of every stored fold model and write the submission file.
test_ids = test['Id']

if not classifiers:
    raise ValueError("No fitted classifiers available; run the training cell first.")

# Explicit float accumulator. np.zeros_like(test_ids) would inherit the object
# dtype of the string Id column and sum the probabilities as Python objects.
y_test = np.zeros(len(test_ids), dtype=np.float64)
for classifier in classifiers.values():
    # The stored pipelines hold only (under-sampler, voting ensemble) and were
    # fitted on preprocessed features, so they take X_test, not the raw test table.
    y_test += classifier.predict_proba(X_test)[:, 1]

y_test_averaged = y_test / len(classifiers)
submission = pd.DataFrame(
    {
        "Id": test_ids,
        "class_0": 1 - y_test_averaged,
        "class_1": y_test_averaged,
    }
)

submission.to_csv("submission.csv", index=False)

In [22]:
# submission

In [23]:
# import optuna

# def objective(trial):
#     xgb_params = {
#         "max_depth": trial.suggest_int('max_depth', 7, 11),
#         "n_estimators": trial.suggest_int('n_estimators', 500, 600),
#         "learning_rate": trial.suggest_float('learning_rate', 1e-2, 3e-1),
#         "subsample": 0.8,
#         "min_child_weight": 0.1,
#         "max_delta_step": 0.1,
#         "colsample_bytree": trial.suggest_categorical('colsample_bytree', [0.1, 0.3, 0.5, 0.8]),
#         "colsample_bylevel": trial.suggest_categorical('colsample_bylevel', [0.1, 0.3, 0.5, 0.8]),
#         "min_split_loss": trial.suggest_categorical('min_split_loss', [1e-5, 1e-4, 1e-3, 1e-2]),
#         "reg_alpha": trial.suggest_categorical('reg_alpha', [1e-4, 1e-3, 1e-2, 1e-4]),
#         "reg_lambda": trial.suggest_categorical('reg_lambda', [1e-4, 1e-3, 1e-2, 1e-4]),
#     }
    
#     log_loss_ls = []
    
#     for train_idx, val_idx in skf.split(X_preliminary, y):
#         X_train, X_val = X_preliminary.iloc[train_idx], X_preliminary.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
#         estimator = XGBClassifier(**xgb_params)
    
#         pipe = imb_make_pipeline(RandomUnderSampler(random_state=42), estimator)
#         pipe.fit(X_train, y_train)

#         y_pred_prob = pipe.predict_proba(X_val)[:, 1]
#         # y_pred = pipe.predict(X_val)

#         log_loss_ls.append(balanced_log_loss(y_val, y_pred_prob))

# # Return the log loss you want to minimize
#     return np.mean(log_loss_ls)

# # Create the Optuna study and optimize the objective function
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Get the best hyperparameters and their performance
# best_params = study.best_params
# best_log_loss = study.best_value

# print("Best Hyperparameters:", best_params)
# print("Best Log Loss:", best_log_loss)


In [24]:
# import optuna

# def objective(trial):
#     lgbm_params = {
#         "max_depth": trial.suggest_int('max_depth', 9, 15),
#         "num_leaves": trial.suggest_categorical('num_leaves', [4,5,6]),
#         "n_estimators": trial.suggest_int('n_estimators', 300, 600),
#         "learning_rate": trial.suggest_float('learning_rate', 5e-2, 5e-1),
#         "colsample_bytree": 0.1,
#         "colsample_bylevel": 0.1,
#         "min_split_gain": 0.001,
#         "reg_alpha": trial.suggest_float('reg_alpha', 1e-4, 5e-3),
#         "reg_lambda": trial.suggest_float('reg_lambda', 1e-2, 5e-1)
#     }
    
#     log_loss_ls = []
    
#     for train_idx, val_idx in skf.split(X_preliminary, y):
#         X_train, X_val = X_preliminary.iloc[train_idx], X_preliminary.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
#         estimator = LGBMClassifier(**lgbm_params, verbose = -1)
    
#         pipe = imb_make_pipeline(RandomUnderSampler(random_state=42), estimator)
#         pipe.fit(X_train, y_train)

#         y_pred_prob = pipe.predict_proba(X_val)[:, 1]
#         # y_pred = pipe.predict(X_val)

#         log_loss_ls.append(balanced_log_loss(y_val, y_pred_prob))

# # Return the log loss you want to minimize
#     return np.mean(log_loss_ls)

# # Create the Optuna study and optimize the objective function
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Get the best hyperparameters and their performance
# best_params = study.best_params
# best_log_loss = study.best_value

# print("Best Hyperparameters:", best_params)
# print("Best Log Loss:", best_log_loss)


In [25]:
# import optuna

# skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)

# def objective(trial):
#     cb_params = {
#         "depth": trial.suggest_int('depth', 4, 9),
#         "l2_leaf_reg": trial.suggest_float('l2_leaf_reg', 1, 3),
#         "iterations": trial.suggest_int('iterations', 600, 1000),
#         "learning_rate": trial.suggest_float('learning_rate', 3e-2, 5e-1),
#     }
    
#     log_loss_ls = []
    
#     for train_idx, val_idx in skf.split(X_preliminary, y):
#         X_train, X_val = X_preliminary.iloc[train_idx], X_preliminary.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
#         estimator = CatBoostClassifier(**cb_params, verbose = False)
    
#         pipe = imb_make_pipeline(RandomUnderSampler(random_state=42), estimator)
#         pipe.fit(X_train, y_train)

#         y_pred_prob = pipe.predict_proba(X_val)[:, 1]
#         # y_pred = pipe.predict(X_val)

#         log_loss_ls.append(balanced_log_loss(y_val, y_pred_prob))

# # Return the log loss you want to minimize
#     return np.mean(log_loss_ls)

# # Create the Optuna study and optimize the objective function
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Get the best hyperparameters and their performance
# best_params = study.best_params
# best_log_loss = study.best_value

# print("Best Hyperparameters:", best_params)
# print("Best Log Loss:", best_log_loss)


In [26]:
# import optuna

# def objective(trial):
#     rf_params = {
#         'n_estimators': trial.suggest_int('n_estimators', 50, 500),
#         'max_depth': trial.suggest_int('max_depth', 1, 50),
#         'min_samples_split':trial.suggest_int('min_samples_split', 1, 20),
#         'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
#         'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
#         'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
#     }
    
#     log_loss_ls = []
    
#     for train_idx, val_idx in skf.split(X_preliminary, y):
#         X_train, X_val = X_preliminary.iloc[train_idx], X_preliminary.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
#         estimator = RandomForestClassifier(**rf_params, verbose = False)
    
#         pipe = imb_make_pipeline(RandomUnderSampler(random_state=42), estimator)
#         pipe.fit(X_train, y_train)

#         y_pred_prob = pipe.predict_proba(X_val)[:, 1]
#         # y_pred = pipe.predict(X_val)

#         log_loss_ls.append(balanced_log_loss(y_val, y_pred_prob))

# # Return the log loss you want to minimize
#     return np.mean(log_loss_ls)

# # Create the Optuna study and optimize the objective function
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Get the best hyperparameters and their performance
# best_params = study.best_params
# best_log_loss = study.best_value

# print("Best Hyperparameters:", best_params)
# print("Best Log Loss:", best_log_loss)


In [27]:
# import optuna

# def objective(trial):
#     svc_params = {
#         'probability': True,
#         'C': trial.suggest_float('C', 1, 4),
#         'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])
#     }
    
#     log_loss_ls = []
    
#     for train_idx, val_idx in skf.split(X_preliminary, y):
#         X_train, X_val = X_preliminary.iloc[train_idx], X_preliminary.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
#         estimator = SVC(**svc_params)
    
#         pipe = imb_make_pipeline(RandomUnderSampler(random_state=42), estimator)
#         pipe.fit(X_train, y_train)

#         y_pred_prob = pipe.predict_proba(X_val)[:, 1]
#         # y_pred = pipe.predict(X_val)

#         log_loss_ls.append(balanced_log_loss(y_val, y_pred_prob))

# # Return the log loss you want to minimize
#     return np.mean(log_loss_ls)

# # Create the Optuna study and optimize the objective function
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Get the best hyperparameters and their performance
# best_params = study.best_params
# best_log_loss = study.best_value

# print("Best Hyperparameters:", best_params)
# print("Best Log Loss:", best_log_loss)
