# Setting

In [1]:
import numpy as np
import polars as pl
import pandas as pd
from sklearn.base import clone
import seaborn as sns
import optuna
import os
import category_encoders as ce
from IPython.display import clear_output

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import *
from sklearn.metrics import *

# Import Data

In [2]:
%%time

train = pd.read_csv('/kaggle/input/playground-series-s4e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e11/test.csv')
sample = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
original = pd.read_csv('/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv')

original['Depression'] = original['Depression'].map({'No':0, 'Yes':1})

train = train.drop('id', axis = 1)
test = test.drop('id', axis = 1)

train = pd.concat(objs = [train, original])

CPU times: user 448 ms, sys: 131 ms, total: 579 ms
Wall time: 780 ms


# Feature Engineering

In [3]:
def binding_age(df, cols):
    bins = [i for i in range(0, 105, 5)]
    labels = [f'{i}-{i+4}' for i in range(0, 100, 5)]
    df['Age_Bind'] = pd.cut(df[cols], bins=bins, labels=labels)
    return df

In [4]:
def age_statistics(train, test):
    age_bind_depression = train.groupby('Age_Bind')['Depression'].mean()
    age_bind_ratio = train['Age_Bind'].value_counts(normalize = True)
    age_bind_depression_cali = pd.merge(age_bind_depression, age_bind_ratio, how = 'inner', left_on = age_bind_depression.index, right_on = age_bind_ratio.index).dropna()
    age_bind_depression_cali = age_bind_depression_cali.rename(columns = {"key_0":'Age_Bind', "Depression" : "Depression Rate"})
    age_bind_depression_cali['Calibrate Score'] = age_bind_depression_cali['Depression Rate'] * age_bind_depression_cali['proportion']
    
    for col in age_bind_depression_cali.columns:
        age_bind_depression_cali[col] = age_bind_depression_cali[col].astype('str')
    
    train = pd.merge(train, age_bind_depression_cali, how = 'left', on = 'Age_Bind')
    test = pd.merge(test, age_bind_depression_cali, how = 'left', on = 'Age_Bind')
    return train, test

In [5]:
train = binding_age(train, 'Age')
test = binding_age(test, 'Age')
train, test = age_statistics(train, test)

In [6]:
# def cleaning_student(train, test):
#     # 'Working Professional or Student' 열에서 'Student'인 경우 Profession을 'Student'로 설정
#     train.loc[train['Working Professional or Student'] == 'Student', 'Profession'] = "Student"
#     test.loc[test['Working Professional or Student'] == 'Student', 'Profession'] = "Student"
#     return train, test

# train, test = cleaning_student(train, test)

# Null Checking

In [7]:
# for col in train.columns:
#     if train[col].nunique() == 5:
#         print(f"Number of {col} ===> [{train[col].nunique()}]")
        
# train[['Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction', 'Financial Stress']].isna().sum() / train.shape[0]

### [Null] 'Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction'

In [8]:
# def print_statistics_by_group(train, column_name):
#     # 결측치가 없는 데이터 처리
#     not_null_data = train.loc[~train[column_name].isna(), :]
#     # 그룹별 통계량 계산
#     not_null_stats = not_null_data.groupby('Working Professional or Student')[column_name].agg(['mean', 'size'])

#     # 결측치가 있는 데이터 처리
#     null_data = train.loc[train[column_name].isna(), :]
#     # 그룹별 통계량 계산 (결측치가 있는 경우)
#     null_stats = null_data.groupby('Working Professional or Student')[column_name].agg(['mean', 'size'])
    
#     # 출력
#     print(f"Statistics for {column_name}:")
#     print("Not Null Data Statistics:")
#     display(not_null_stats)
#     print("\nNull Data Statistics:")
#     display(null_stats)
#     print("\n")

# # 예시로 각 컬럼에 대해 통계 확인하기
# columns_to_check = ['Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction', 'Financial Stress']

# for column in columns_to_check:
#     print_statistics_by_group(train, column)

In [9]:
# not_null_work_pressure = train.loc[~train['Work Pressure'].isna(), :]
# not_null_work_pressure.groupby('Working Professional or Student')['Work Pressure'].agg(['mean', 'size'])

# not_null_work_pressure[not_null_work_pressure['Working Professional or Student'] != 'Student']

# null_work_pressure = train.loc[train['Work Pressure'].isna(), :]
# null_work_pressure.groupby('Working Professional or Student')['Work Pressure'].agg(['mean', 'size'])

In [10]:
def fill_null(train, test, cols):
    for col in cols:
        if col in ['Academic Pressure', 'Study Satisfaction']:
            # 학생 그룹에 대해 결측치 처리
            for df in [train, test]:
                avg_value = round(df[df['Working Professional or Student'] == 'Student'][col].mean())
                df.loc[df['Working Professional or Student'] == 'Student', col] = df.loc[df['Working Professional or Student'] == 'Student', col].fillna(avg_value)

        elif col in ['Work Pressure', 'Job Satisfaction']:
            # 직장인 그룹에 대해 결측치 처리
            for df in [train, test]:
                avg_value = round(df[df['Working Professional or Student'] != 'Student'][col].mean())
                df.loc[df['Working Professional or Student'] != 'Student', col] = df.loc[df['Working Professional or Student'] != 'Student', col].fillna(avg_value)

        elif col == "CGPA":
            # 학생 그룹에 대해 결측치 처리 (CGPA)
            for df in [train, test]:
                avg_value = df[df['Working Professional or Student'] == 'Student'][col].mean()
                df.loc[df['Working Professional or Student'] == 'Student', col] = df.loc[df['Working Professional or Student'] == 'Student', col].fillna(avg_value)

    return train, test

# def fill_null(train, test, cols):
#     for col in cols:
#         if col in ['Academic Pressure', 'Study Satisfaction']:
#             # 학생 그룹에 대해 결측치 처리
#             for df in [train, test]:
#                 avg_value = round(train[train['Working Professional or Student'] == 'Student'][col].mean())
#                 df.loc[df['Working Professional or Student'] == 'Student', col] = df.loc[df['Working Professional or Student'] == 'Student', col].fillna(avg_value)

#         elif col in ['Work Pressure', 'Job Satisfaction']:
#             # 직장인 그룹에 대해 결측치 처리
#             for df in [train, test]:
#                 avg_value = round(train[train['Working Professional or Student'] != 'Student'][col].mean())
#                 df.loc[df['Working Professional or Student'] != 'Student', col] = df.loc[df['Working Professional or Student'] != 'Student', col].fillna(avg_value)

#         elif col == "CGPA":
#             # 학생 그룹에 대해 결측치 처리 (CGPA)
#             for df in [train, test]:
#                 avg_value = train[train['Working Professional or Student'] == 'Student'][col].mean()
#                 df.loc[df['Working Professional or Student'] == 'Student', col] = df.loc[df['Working Professional or Student'] == 'Student', col].fillna(avg_value)

#     return train, test

# 사용할 컬럼 목록
pressure_satisfaction_cols = ['Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction', 'CGPA']

train, test = fill_null(train, test, pressure_satisfaction_cols)

In [11]:
# train[pressure_satisfaction_cols] = train[pressure_satisfaction_cols].fillna(0)
# test[pressure_satisfaction_cols] = test[pressure_satisfaction_cols].fillna(0)

# Feature Enginnering
### Total Score (Pressure + Satisfaction)

In [12]:
# train[['Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction', 'Depression']].corr(method = 'spearman')

In [13]:
# corr_df = train[['Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction', 'Depression']].corr(method = 'spearman')

# sns.heatmap(corr_df, annot = True, fmt = '.3f', cmap = 'RdYlBu_r')

In [14]:
# # 상관 행렬 계산 (Spearman 방법)
# tmp_train = train[['Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction', 'Depression']].copy()
# tmp_test = test[['Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction']].copy()

# corr_df = tmp_train.corr(method='spearman')

# # 각 column에 대한 상관 계수 추출
# corr_academic = corr_df.loc['Academic Pressure', 'Depression']
# corr_work = corr_df.loc['Work Pressure', 'Depression']
# corr_study = corr_df.loc['Study Satisfaction', 'Depression']
# corr_job = corr_df.loc['Job Satisfaction', 'Depression']

# # total_score 생성
# train['total_score'] = (tmp_train['Academic Pressure'].fillna(0) * corr_academic) + \
#                         (tmp_train['Work Pressure'].fillna(0) * corr_work) + \
#                         (tmp_train['Study Satisfaction'].fillna(0) * corr_study) + \
#                         (tmp_train['Job Satisfaction'].fillna(0) * corr_job)

# test['total_score'] = (tmp_test['Academic Pressure'].fillna(0) * corr_academic) + \
#                         (tmp_test['Work Pressure'].fillna(0) * corr_work) + \
#                         (tmp_test['Study Satisfaction'].fillna(0) * corr_study) + \
#                         (tmp_test['Job Satisfaction'].fillna(0) * corr_job)

In [15]:
# train[['total_score', 'Depression']].corr()

In [16]:
degree = {
    "BCom": "B.Com", "B.Com": "B.Com", "B.Comm": "B.Com",
    "B.Tech": "B.Tech", "BTech": "B.Tech", "B.T": "B.Tech",
    "BSc": "B.Sc", "B.Sc": "B.Sc", "Bachelor of Science": "B.Sc",
    "BArch": "B.Arch", "B.Arch": "B.Arch",
    "BA": "B.A", "B.A": "B.A",
    "BBA": "BBA", "BB": "BBA",
    "BCA": "BCA",
    "BE": "BE",
    "BEd": "B.Ed", "B.Ed": "B.Ed",
    "BPharm": "B.Pharm", "B.Pharm": "B.Pharm",
    "BHM": "BHM",
    "LLB": "LLB", "LL B": "LLB", "LL BA": "LLB", "LL.Com": "LLB", "LLCom": "LLB",
    "MCom": "M.Com", "M.Com": "M.Com",
    "M.Tech": "M.Tech", "MTech": "M.Tech", "M.T": "M.Tech",
    "MSc": "M.Sc", "M.Sc": "M.Sc", "Master of Science": "M.Sc",
    "MBA": "MBA",
    "MCA": "MCA",
    "MD": "MD",
    "ME": "ME",
    "MEd": "M.Ed", "M.Ed": "M.Ed",
    "MArch": "M.Arch", "M.Arch": "M.Arch",
    "MPharm": "M.Pharm", "M.Pharm": "M.Pharm",
    "MA": "MA", "M.A": "MA",
    "MPA": "MPA",
    "LLM": "LLM",
    "PhD": "PhD",
    "MBBS": "MBBS",
    "CA": "CA",
    "Class 12": "Class 12", "12th": "Class 12",
    "Class 11": "Class 11", "11th": "Class 11"
}

train['Degree'] = train['Degree'].map(degree)
test['Degree'] = test['Degree'].map(degree)

train['Degree'].unique()

array(['BHM', 'LLB', 'B.Pharm', 'BBA', 'MCA', 'MD', 'B.Sc', 'ME',
       'B.Arch', 'BCA', 'BE', 'MA', 'B.Ed', 'B.Com', 'MBA', 'M.Com', nan,
       'B.A', 'Class 12', 'M.Tech', 'PhD', 'M.Ed', 'M.Sc', 'B.Tech',
       'LLM', 'MBBS', 'M.Pharm', 'MPA', 'Class 11', 'M.Arch'],
      dtype=object)

# Clean Data

In [17]:
def clean_columns(df, column, valid_categories):
    df[column] = df[column].apply(lambda x : x if x in valid_categories else 'Noise')
    return df

valid_sleep_duration = ["Less than 5 hours", "5-6 hours", "6-7 hours", "7-8 hours", "More than 8 hours"]
valid_dietary_habits = ['Healthy', 'Moderate', 'Unhealthy']

train = clean_columns(train, 'Sleep Duration', valid_sleep_duration)
train = clean_columns(train, 'Dietary Habits', valid_dietary_habits)
test = clean_columns(test, 'Sleep Duration', valid_sleep_duration)
test = clean_columns(test, 'Dietary Habits', valid_dietary_habits)

In [18]:
def removieNoise(df, columns, threshold=100):
    
    for column in columns:
        value_counts = df[column].value_counts()
        low_freq_categories = value_counts[value_counts < threshold].index
        df[column] = df[column].apply(lambda x: x if x not in low_freq_categories else 'Other')
    
    return df

train = removieNoise(train, ['Name', 'City', 'Profession', 'Degree'])
test = removieNoise(test, ['Name', 'City', 'Profession', 'Degree'])

In [19]:
# cat_c = [col for col in train.columns if col != 'Depression']
# train[cat_c].dtypes
train = train.fillna('None').astype('str')
test = test.fillna('None').astype('str')

# Catboost Model

In [20]:
%%time

y = train['Depression']
X = train.drop(['Depression'], axis = 1)

def CAT_(X, y, test, catboost_params, fold=10, seed=0):
    SKF = StratifiedKFold(n_splits=fold, shuffle = True, random_state = seed)
    train_accuracies = []
    valid_accuracies = []
    test_predictions = []
    oof_predictions = np.zeros(len(X))
    
    test_pool = Pool(test, cat_features = X.columns.values)
    
    for fold, (train_idx, val_idx) in enumerate(SKF.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        train_pool = Pool(X_train, y_train, cat_features = X.columns.values)
        val_pool = Pool(X_val, y_val, cat_features = X.columns.values)
        
        model = CatBoostClassifier(**catboost_params, random_state = seed, verbose = 0)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)
        train_predictions = model.predict(train_pool)
        train_accuracy = accuracy_score(y_train, train_predictions)
        train_accuracies.append(train_accuracy)
        
        val_predictions = model.predict(val_pool)
        valid_accuracy = accuracy_score(y_val, val_predictions)
        valid_accuracies.append(valid_accuracy)
        
        oof_predictions[val_idx] = model.predict_proba(val_pool)[:,1]
        
        test_fold_pred = model.predict_proba(test_pool)[:,1]
        test_predictions.append(test_fold_pred)
        
        print(f'Fold {fold + 1} Train Accuracy: {train_accuracy:.7f}, Valid Accuracy: {valid_accuracy:.7f}')
    
    overall_train_accuracy = np.mean(train_accuracies)
    overall_valid_accuracy = np.mean(valid_accuracies)
    
    print(f'Overall Train Accuracy: {overall_train_accuracy:.7f}')
    print(f'Overall Valid Accuracy: {overall_valid_accuracy:.7f}')
    print(f'Gap Between Train-Valid : {abs(overall_train_accuracy - overall_valid_accuracy):.7f}')
    mean_oof_preds = np.mean(oof_predictions)
    mean_test_preds = np.mean(test_predictions, axis=0)
    
    return mean_oof_preds, mean_test_preds

CPU times: user 23.6 ms, sys: 0 ns, total: 23.6 ms
Wall time: 22.9 ms


# Train

In [21]:
%%time
Params4 = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'learning_rate': 0.08114394459649094,
    'iterations': 1000,
    'depth': 6,
    'random_strength':0,
    'l2_leaf_reg': 0.7047064221215757,
    'task_type': 'GPU'} # LB : 0.94381

meanOFF, meanTest = CAT_(X, y, test, Params4, fold=10, seed = 0)

Default metric period is 5 because AUC is/are not implemented for GPU


Fold 1 Train Accuracy: 0.9445203, Valid Accuracy: 0.9367583


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 2 Train Accuracy: 0.9454433, Valid Accuracy: 0.9437387


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 3 Train Accuracy: 0.9463197, Valid Accuracy: 0.9397599


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 4 Train Accuracy: 0.9447917, Valid Accuracy: 0.9422728


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 5 Train Accuracy: 0.9459086, Valid Accuracy: 0.9427614


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 6 Train Accuracy: 0.9447142, Valid Accuracy: 0.9411559


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 7 Train Accuracy: 0.9440166, Valid Accuracy: 0.9424084


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 8 Train Accuracy: 0.9462813, Valid Accuracy: 0.9405236


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 9 Train Accuracy: 0.9462581, Valid Accuracy: 0.9396161


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 10 Train Accuracy: 0.9437916, Valid Accuracy: 0.9393368
Overall Train Accuracy: 0.9452045
Overall Valid Accuracy: 0.9408332
Gap Between Train-Valid : 0.0043714
CPU times: user 8min 1s, sys: 44.5 s, total: 8min 46s
Wall time: 3min 16s


## Finding Best 
**[1] original 합치기 + Degree 카테고리 정리 + clean columns + remove_noise + Catboost (기본 파라미터)**

- Overall Train Accuracy: 0.9450541
- Overall Valid Accuracy: 0.9406308
- Gap Between Train-Valid : 0.0044233

**[2] (No original) + Degree 카테고리 정리 + clean columns + remove_noise + Catboost (기본 파라미터)**

- Overall Train Accuracy: 0.9447374
- Overall Valid Accuracy: 0.9400355
- Gap Between Train-Valid : 0.0047019

**[3] original 합치기 + Degree 카테고리 정리 + (나이대 + 나이대별 통계) + clean columns + remove_noise + Catboost (기본 파라미터)**

- Overall Train Accuracy: 0.9455993
- Overall Valid Accuracy: 0.9404632
- Gap Between Train-Valid : 0.0051361

**[4] original 합치기 + Degree 카테고리 정리 + 나이대 + 나이대별 통계 + (직업 Student 정리) + clean columns + remove_noise + Catboost (기본 파라미터)**

- Overall Train Accuracy: 0.9453643
- Overall Valid Accuracy: 0.9404911
- Gap Between Train-Valid : 0.0048732

**[5] original 합치기 + Degree 카테고리 정리 + 나이대 + 나이대별 통계 + (Null 처리) + clean columns + remove_noise + Catboost (기본 파라미터)**

- Overall Train Accuracy: 0.9460073
- Overall Valid Accuracy: 0.9405958
- Gap Between Train-Valid : 0.0054115

**[6] original 합치기 + Degree 카테고리 정리 + 나이대 + 나이대별 통계 + (Null 처리 with 'train avg') + clean columns + remove_noise + Catboost (기본 파라미터)**

- Overall Train Accuracy: 0.9456428
- Overall Valid Accuracy: 0.9406866
- Gap Between Train-Valid : 0.0049562

In [22]:
# my_Params1 = {
#     'loss_function': 'Logloss',
#     'eval_metric': 'AUC',
#     'learning_rate': 0.07788937579814562,
#     'iterations': 1744,
#     'depth': 4,
#     'random_strength':100,
#     'l2_leaf_reg': 0.007488708299233241,
#     'task_type': 'GPU'} # LB : 0.94381


# meanOFF, meanTest = CAT_(X, y, test, my_Params1, fold=10, seed = 0)

In [23]:
import gc
gc.collect()

0

# Submit

In [24]:
# %%time

# sample['Depression'] = np.round(meanTest)

# sample.to_csv('submission.csv', index=False)

# sample.head()

# Optuna HPO

In [25]:
# import optuna
# from optuna.samplers import TPESampler
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import accuracy_score
# from catboost import CatBoostClassifier, Pool
# import numpy as np


# y = train['Depression']
# X = train.drop(['Depression'], axis = 1)

# # 주어진 하이퍼파라미터 Params4를 기준으로 범위 재설정
# def objective(trial):
#     # Optuna에서 튜닝할 하이퍼파라미터 범위 정의
# #     catboost_params = {
# #         'loss_function': 'Logloss',
# #         'eval_metric': 'AUC',
# #         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),  # 학습률 범위
# #         'iterations': trial.suggest_int('iterations', 500, 2000),  # 반복 횟수
# #         'depth': trial.suggest_int('depth', 4, 12),  # 트리 깊이 범위
# #         'random_strength': trial.suggest_int('random_strength', 0, 100),  # 랜덤 강도
# #         'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 1e2),  # L2 정규화
# #         'task_type': 'GPU',
# #         'random_state': 0,
# #         'verbose': 0
# #     }

#     catboost_params = {
#         'loss_function': 'Logloss',
#         'eval_metric': 'AUC',
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),  # 좁힌 학습률 범위
#         'iterations': trial.suggest_int('iterations', 900, 2000),  # 1000에서 2000 사이
#         'depth': trial.suggest_int('depth', 3, 8),  # 깊이는 4에서 6 사이로 제한
#         'random_strength': trial.suggest_int('random_strength', 0, 100),  # 랜덤 강도 범위
#         'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 1e1),  # L2 정규화 범위 좁힘
#         'task_type': 'GPU',
#         'random_state': 0,
#         'verbose': 0
#     }
    
#     # Stratified KFold 설정
#     fold = 10
#     SKF = StratifiedKFold(n_splits=fold, shuffle=True, random_state=0)
#     train_accuracies = []
#     valid_accuracies = []
    
#     oof_predictions = np.zeros(len(X))  # Out of fold 예측값
    
#     for fold_idx, (train_idx, val_idx) in enumerate(SKF.split(X, y)):
# #         print(f"\n--- Fold {fold_idx + 1} ---")  # 각 fold마다 시작 로그
        
#         X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
#         train_pool = Pool(X_train, y_train, cat_features=X.columns.values.tolist())
#         val_pool = Pool(X_val, y_val, cat_features=X.columns.values.tolist())
        
#         model = CatBoostClassifier(**catboost_params)
#         model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)
        
#         # 훈련 데이터 예측
#         train_predictions = model.predict(train_pool)
#         train_accuracy = accuracy_score(y_train, train_predictions)
#         train_accuracies.append(train_accuracy)
        
#         # 검증 데이터 예측
#         val_predictions = model.predict(val_pool)
#         valid_accuracy = accuracy_score(y_val, val_predictions)
#         valid_accuracies.append(valid_accuracy)
        
#         oof_predictions[val_idx] = model.predict_proba(val_pool)[:, 1]  # OOF 예측값
        
#         # 각 fold의 결과 출력
# #         print(f"Train Accuracy for Fold {fold_idx + 1}: {train_accuracy:.3f}")
# #         print(f"Validation Accuracy for Fold {fold_idx + 1}: {valid_accuracy:.3f}")
        
#     overall_train_accuracy = np.mean(train_accuracies)
#     overall_valid_accuracy = np.mean(valid_accuracies)
    
#     print(f"\nOverall Train Accuracy: {overall_train_accuracy:.3f}")
#     print(f"Overall Validation Accuracy: {overall_valid_accuracy:.3f}")
    
#     # 최적화할 값 반환 (validation accuracy를 최대로)
#     return overall_valid_accuracy  # Valid accuracy를 반환해서 최적화

In [26]:
# %%time
# # Optuna로 최적화 수행
# study = optuna.create_study(direction='maximize', sampler=TPESampler())
# study.optimize(objective, n_trials=30)

In [27]:
# # 최적 하이퍼파라미터 출력
# print(f"Best parameters: {study.best_params}")
# print(f"Best validation accuracy: {study.best_value:.9f}")

In [28]:
# # 최적화된 파라미터로 최종 모델 학습
# final_catboost_params = study.best_params

# final_catboost_params.update({
#     'loss_function': 'Logloss',
#     'eval_metric': 'AUC',
#     'task_type': 'GPU'
# })

# # # 최종 모델 훈련 및 예측 (교차 검증을 통해 예측 수행)
# # fold = 10
# # SKF = StratifiedKFold(n_splits=fold, shuffle=True, random_state=0)
# # test_predictions = []

# # for train_idx, val_idx in SKF.split(X, y):
# #     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
# #     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
# #     train_pool = Pool(X_train, y_train, cat_features=X.columns.values.tolist())
# #     test_pool = Pool(test, cat_features=X.columns.values.tolist())
    
# #     model = CatBoostClassifier(**final_catboost_params)
# #     model.fit(train_pool, eval_set=(train_pool, y_train), early_stopping_rounds=50)
    
# #     # 테스트 데이터 예측
# #     test_fold_pred = model.predict_proba(test_pool)[:, 1]
# #     test_predictions.append(test_fold_pred)

# meanOFF, meanTest = CAT_(X, y, test, final_catboost_params, fold=10, seed = 0)

In [29]:
# 결과를 'submission.csv'로 저장
sample['Depression'] = np.round(meanTest)  # 예측 결과 반올림하여 저장

# 결과 파일 저장
sample.to_csv('submission.csv', index=False)

# 제출 파일 확인
sample.head()

Unnamed: 0,id,Depression
0,140700,0.0
1,140701,0.0
2,140702,0.0
3,140703,1.0
4,140704,0.0
