In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping

from catboost import CatBoostClassifier, Pool

from sklearn.metrics import roc_auc_score, accuracy_score

import random
from sklearn.model_selection import KFold , StratifiedKFold
from xgboost import XGBClassifier
import optuna

from sklearn.linear_model import Lasso, Ridge

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

USE_OPTUNA = False

# Encoding category feature into numerical pipeline

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")
original = pd.read_csv("/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv")

original['Depression'] = original['Depression'].map({'Yes': 1, 'No': 0})
# train = pd.concat([train, original], ignore_index=True)

# Use original data to train a model for stacking

In [3]:
original = original.fillna('None').astype('string')
train = train.fillna('None').astype('string')
test = train.fillna('None').astype('string')

y = original['Depression']
X = original.drop(['Depression'], axis=1)


def cleaning(data):
    threshold = 10
    cat_feats = ["Sleep Duration", "Dietary Habits", "Degree"]
    
    for feat in cat_feats:
        data.loc[data[feat].value_counts(dropna=False)[data[feat]].values < threshold, feat] = 'None'
    
    return data

X = cleaning(X)
test = cleaning(test)

catboost_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'learning_rate': 0.08114394459649094,
    'iterations': 1000,
    'depth': 6,
    'random_strength':0,
    'l2_leaf_reg': 0.7047064221215757,
    'task_type':'GPU',
    'random_seed':42,
    'verbose':False    
}

cv = StratifiedKFold(5, shuffle=True, random_state=0)
cv_splits = cv.split(X, y)
scores = []
test_preds = []
models=[]
X_test_pool = Pool(test, cat_features=X.columns.values)
for i, (train_idx, val_idx) in enumerate(cv_splits):
    model = CatBoostClassifier(**catboost_params)
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=X.columns.values)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=X.columns.values)
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=False, early_stopping_rounds=200)
    models.append(model)
    val_pred = model.predict(X_valid_pool)
    score = accuracy_score(y_val_fold, val_pred)
    scores.append(score)
print(score)

train = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")
train = train.fillna('None').astype('string')
test = test.fillna('None').astype('string')

train = cleaning(train)
test = cleaning(test)

train_preds = np.zeros(train.shape[0])
train_pool = Pool(train.drop(columns=['Depression']), cat_features=X.columns.values)
for model in models:
    train_preds += model.predict_proba(train_pool)[:, 1] / len(models)

test_preds = np.zeros(test.shape[0])
test_pool = Pool(test , cat_features=X.columns.values)
for model in models:
    test_preds += model.predict_proba(test_pool)[:, 1] / len(models) 

train = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")

train['org_oof'] = train_preds
test['org_oof'] = test_preds

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


0.958904109589041


In [4]:
cat_cols = test.select_dtypes(include='object').columns

train.drop(columns=['Name','id'] , inplace =True)
test.drop(columns=['Name','id'] , inplace =True)


cat_cols = list(train.select_dtypes(include=['category','object']).columns.tolist())
num_cols = list(train.select_dtypes(include=['int','float']).columns)
if 'Depression' in num_cols:
    num_cols.remove('Depression')

train[cat_cols].fillna("unknown",inplace=True)
test[cat_cols].fillna("unknown",inplace=True)

train[num_cols].fillna(999, inplace=True)
test[num_cols].fillna(999, inplace=True)


train[cat_cols] = train[cat_cols].astype('category')
test[cat_cols] = test[cat_cols].astype('category')

feature_cols = cat_cols + num_cols

In [5]:
# nan_ratio = train[feature_cols].isna().mean()
# high_nan_cols = nan_ratio[nan_ratio > 0.7].index.tolist()


# feature_cols = [col for col in feature_cols if col not in high_nan_cols]
# cat_cols = [col for col in cat_cols if col not in high_nan_cols]

In [6]:
class CategoricalEncoder:
    def __init__(self, train, test):
        self.train = train
        self.test = test

    def one_hot_encode(self, cat_cols, feature_cols):
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoder.fit(self.train[cat_cols])

        new_cat_cols = [f'onehot_{i}' for i in range(len(encoder.get_feature_names_out()))]
        self.train[new_cat_cols] = encoder.transform(self.train[cat_cols])
        self.train[new_cat_cols] = self.train[new_cat_cols].astype('category')

        self.test[new_cat_cols] = encoder.transform(self.test[cat_cols])
        self.test[new_cat_cols] = self.test[new_cat_cols].astype('category')

        for col in cat_cols:
            feature_cols.remove(col)
        feature_cols.extend(new_cat_cols)

        return self.train, self.test, new_cat_cols, feature_cols

    def label_encode(self, cat_cols, feature_cols):

        encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        self.train[cat_cols] = encoder.fit_transform(self.train[cat_cols]).astype("float32")
        self.test[cat_cols] = encoder.transform(self.test[cat_cols]).astype("float32")
        
        return self.train, self.test, cat_cols, feature_cols

    def frequency_encode(self, cat_cols, feature_cols, drop_org=False):

        new_cat_cols = []
        for col in cat_cols:
            freq_encoding = self.train[col].value_counts().to_dict()

            self.train[f"{col}_freq"] = self.train[col].map(freq_encoding).astype('float32')
            self.test[f"{col}_freq"] = self.test[col].map(freq_encoding).astype('float32')

            new_col_name = f"{col}_freq"
            new_cat_cols.append(col)
            feature_cols.append(new_col_name)
            if drop_org:
                feature_cols.remove(col)

        return self.train, self.test, new_cat_cols, feature_cols

In [7]:
encoder = CategoricalEncoder(train, test)
train, test, cat_cols, feature_cols = encoder.frequency_encode(cat_cols, feature_cols,drop_org=False)
train, test, cat_cols, feature_cols = encoder.label_encode(cat_cols, feature_cols)

In [8]:
# train = train.fillna('None').astype('string')
# test = test.fillna('None').astype('string')

In [9]:
train[feature_cols].head()

Unnamed: 0,Gender,City,Working Professional or Student,Profession,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Family History of Mental Illness,Age,...,org_oof,Gender_freq,City_freq,Working Professional or Student_freq,Profession_freq,Sleep Duration_freq,Dietary Habits_freq,Degree_freq,Have you ever had suicidal thoughts ?_freq,Family History of Mental Illness_freq
0,0.0,50.0,1.0,10.0,29.0,7.0,33.0,0.0,0.0,49.0,...,0.002593,63236.0,5226.0,112799.0,2862.0,32726.0,44741.0,4305.0,71138.0,70758.0
1,1.0,93.0,1.0,55.0,27.0,20.0,63.0,1.0,0.0,26.0,...,0.240375,77464.0,4606.0,112799.0,24906.0,38784.0,46227.0,4348.0,69562.0,70758.0
2,1.0,97.0,0.0,,15.0,7.0,21.0,1.0,0.0,33.0,...,0.966341,77464.0,5176.0,27901.0,,32142.0,44741.0,5856.0,69562.0,70758.0
3,1.0,64.0,1.0,55.0,27.0,15.0,28.0,1.0,1.0,22.0,...,0.667784,77464.0,4966.0,112799.0,24906.0,38784.0,49705.0,5030.0,69562.0,69942.0
4,0.0,37.0,1.0,9.0,15.0,20.0,28.0,1.0,1.0,30.0,...,0.084959,63236.0,4398.0,112799.0,3161.0,32142.0,46227.0,5030.0,69562.0,69942.0


In [10]:
train[cat_cols].head()

Unnamed: 0,Gender,City,Working Professional or Student,Profession,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Family History of Mental Illness
0,0.0,50.0,1.0,10.0,29.0,7.0,33.0,0.0,0.0
1,1.0,93.0,1.0,55.0,27.0,20.0,63.0,1.0,0.0
2,1.0,97.0,0.0,,15.0,7.0,21.0,1.0,0.0
3,1.0,64.0,1.0,55.0,27.0,15.0,28.0,1.0,1.0
4,0.0,37.0,1.0,9.0,15.0,20.0,28.0,1.0,1.0


In [11]:
def FE(df, feature_cols, cat_cols,if_train=True):
    df['job_sat']  = (df['Work Pressure']/df['Job Satisfaction']).astype("float")
    df['age_wp'] = (df['Age'] * df['Work Pressure']).astype("float")

    if if_train:
        feature_cols += ['job_sat','age_wp']
    
    return df, feature_cols, cat_cols

In [12]:
# train['Depression'] = train['Depression'].map({'Yes': 1, 'No': 0 , 1 : 1, 0 : 0})

train , feature_cols, cat_cols= FE(train, feature_cols, cat_cols)
test ,_,_= FE(test, feature_cols, cat_cols,if_train=False)

X = train[feature_cols]
y = train['Depression']

In [13]:
callbacks = [log_evaluation(period=1000), early_stopping(stopping_rounds=250)]
def objective(trial):
    
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'random_state': 42,
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 6, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 0.8),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 0.8),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
        'n_estimators': 3000,
        'verbose' : -1,
        'device':'gpu'
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = []
    
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = lgb.LGBMClassifier(**param)
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='logloss',
                  callbacks = callbacks
                  
                )

        y_pred = model.predict_proba(X_val)[:, 1]
        score = roc_auc_score(y_val, y_pred)
        cv_results.append(score)

    return np.mean(cv_results)

In [14]:
if USE_OPTUNA:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20)  


    print("Best hyperparameters: ", study.best_params)
    lgbm_params = study.best_params

In [15]:
def train_models(model_type='lgbm'):
    models, scores = [], []
    oof_preds = np.zeros(len(X))

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        lgbm_params = {
            'objective': 'binary',
            'n_estimators': 3000,
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'random_state': 42,
            'learning_rate': 0.009471852762599606,
            'num_leaves': 21,
            'max_depth': 11,
            'min_data_in_leaf': 87,
            'feature_fraction': 0.40471223405843854,
            'bagging_fraction': 0.6642922498166329, 
            'bagging_freq': 3,
            'device':'gpu',
            'verbose' : -1,
        }

        cat_params = {
            'loss_function': 'Logloss',
            'eval_metric': 'AUC',
            'learning_rate': 0.08114394459649094,
            'iterations': 1000,
            'depth': 6,
            'random_strength':0,
            'l2_leaf_reg': 0.7047064221215757,
            'task_type':'GPU',
            'random_seed':42,
            'verbose':0
        }

        xgb_params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'n_estimators': 3000,
            'learning_rate': 0.01,
            'max_depth': 6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'tree_method': 'gpu_hist',  
        }
    



        if model_type=='lgbm':
            model = lgb.LGBMClassifier(**lgbm_params)
    
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                eval_metric='auc',
                callbacks=callbacks,
                # categorical_feature=cat_cols
            )
            models.append(model)

        elif model_type == 'cat':
            model = CatBoostClassifier(**cat_params)
            model.fit(
                X_train, y_train,
                eval_set=(X_val, y_val),
                early_stopping_rounds=250,
            )
            models.append(model)

        elif model_type == 'xgb':
            model = XGBClassifier(**xgb_params)
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=250,
                verbose=0
            )
            models.append(model)


        y_pred_proba = model.predict_proba(X_val)[:, 1]
        oof_preds[val_index] = y_pred_proba
        auc_score = roc_auc_score(y_val, y_pred_proba)
        scores.append(auc_score)
    

    oof_auc = roc_auc_score(y, oof_preds)
    print(f"oof_auc : {oof_auc}")
    print(f"each auc : {scores}")

    return models ,oof_preds

In [16]:
LGBM_models , lgbm_oof = train_models(model_type='lgbm')
cat_models , cat_oof = train_models(model_type='cat')
xgb_models , xgb_oof = train_models(model_type='xgb')



Training until validation scores don't improve for 250 rounds
[1000]	valid_0's auc: 0.975254	valid_0's binary_logloss: 0.15195
Early stopping, best iteration is:
[1607]	valid_0's auc: 0.975449	valid_0's binary_logloss: 0.151126
Training until validation scores don't improve for 250 rounds
[1000]	valid_0's auc: 0.975498	valid_0's binary_logloss: 0.149131
[2000]	valid_0's auc: 0.975936	valid_0's binary_logloss: 0.147629
[3000]	valid_0's auc: 0.976031	valid_0's binary_logloss: 0.147414
Did not meet early stopping. Best iteration is:
[2976]	valid_0's auc: 0.976039	valid_0's binary_logloss: 0.147409
Training until validation scores don't improve for 250 rounds
[1000]	valid_0's auc: 0.973627	valid_0's binary_logloss: 0.154598
Early stopping, best iteration is:
[1506]	valid_0's auc: 0.973788	valid_0's binary_logloss: 0.153772
Training until validation scores don't improve for 250 rounds
[1000]	valid_0's auc: 0.974047	valid_0's binary_logloss: 0.15466
Early stopping, best iteration is:
[1693]	

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


oof_auc : 0.9751451153314925
each auc : [0.9752153804458775, 0.9751897941214928, 0.9733301134660043, 0.9735332771194258, 0.9757525296343346, 0.9750714882813774, 0.9745190529717287, 0.9793324041948668, 0.9741332348418026, 0.9754803033186779]
oof_auc : 0.9754610617204087
each auc : [0.975328157352429, 0.975755088790322, 0.9735015384414646, 0.973862571092969, 0.9760064467430859, 0.9752150745848812, 0.9745262203978085, 0.9795532696184257, 0.9745078092843712, 0.9764570094896382]


In [17]:
lgbm_preds = np.zeros(test[feature_cols].shape[0])
for model in LGBM_models:
    lgbm_preds += model.predict_proba(test[feature_cols])[:, 1] / len(LGBM_models)

cat_preds = np.zeros(test[feature_cols].shape[0])
for model in cat_models:
    cat_preds += model.predict_proba(test[feature_cols])[:, 1] / len(cat_models)

xgb_preds = np.zeros(test[feature_cols].shape[0])
for model in xgb_models:
    xgb_preds += model.predict_proba(test[feature_cols])[:, 1] / len(xgb_models)


# All category feature pipeline

In [18]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

train = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")

# train['org_oof'] = train_preds
# test['org_oof'] = test_preds

train = train.fillna('None').astype('string')
test = test.fillna('None').astype('string')


y = train['Depression'].astype(int)
X = train.drop(['Depression'], axis=1)

X = cleaning(X)
test = cleaning(test)


catboost_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'learning_rate': 0.08114394459649094,
    'iterations': 1000,
    'depth': 6,
    'random_strength': 0,
    'l2_leaf_reg': 0.7047064221215757,
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': False
}

cv = StratifiedKFold(10, shuffle=True, random_state=0)
p2_oof_preds = np.zeros(len(train))
scores = []
p2_cat_models = []
X_test_pool = Pool(test, cat_features=X.columns.values)

for i, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    model = CatBoostClassifier(**catboost_params)
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=X.columns.values)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=X.columns.values)
    
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=False, early_stopping_rounds=200)
    p2_cat_models.append(model)
    
    val_pred_proba = model.predict_proba(X_valid_pool)[:, 1]
    p2_oof_preds[val_idx] = val_pred_proba  
    
    val_pred = model.predict(X_valid_pool)
    score = accuracy_score(y_val_fold, val_pred)
    scores.append(score)


p2_cat_preds = np.zeros(test.shape[0])
test_pool = Pool(test , cat_features=test.columns.values)
for model in p2_cat_models:
    p2_cat_preds += model.predict_proba(test_pool)[:, 1] / len(p2_cat_models) 

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


In [19]:
oof_data = pd.DataFrame({
    'lgbm': lgbm_oof,
    'cat': cat_oof,
    'xgb': xgb_oof,
    'p2_cat': p2_oof_preds
})
lasso = Lasso(alpha=0.001) 
lasso.fit(oof_data, y)

oof_preds = lasso.predict(oof_data)
final_auc = roc_auc_score(y, oof_preds)
print(f"Final AUC after LASSO blending: {final_auc}")

Final AUC after LASSO blending: 0.9758025702990527


In [20]:
thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold = 0.5
best_accuracy = 0

for threshold in thresholds:
    y_pred = (oof_preds >= threshold).astype(int)
    
    acc = accuracy_score(y, y_pred)
    
    if acc > best_accuracy:
        best_accuracy = acc
        best_threshold = threshold

print(best_threshold)
print(best_accuracy)

0.48
0.9402842928216063


In [21]:
sub = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')

# preds = lgbm_preds * 0.5 + cat_preds * 0.5

res = pd.DataFrame({
    'lgbm': lgbm_preds,
    'cat': cat_preds,
    'xgb': xgb_preds,
    'p2_cat': p2_cat_preds
})
preds = lasso.predict(res)

sub['Depression'] = (p2_cat_preds > 0.5).astype(int)

# sub['Depression'] = final_predictions

sub.to_csv('submission.csv', index=False)

sub.head()

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0
