In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


# 필요한 라이브러리 import

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import time

In [5]:
file_path = '/content/drive/MyDrive/서울시 고립은둔청년 실태조사(청년조사)_분류.csv'
df = pd.read_csv(file_path, encoding = 'euc-kr')

In [6]:
# 데이터 확인 및 전처리
target = '【KEY_1】 고립은둔청년'


# 종속변수를 이진 분류로 변환
df[target] = df[target].apply(lambda x: 1 if x == '해당' else 0)

# 전처리(기타)

In [7]:
# a10, a11열 이상치 제거 함수 (각각 300개, 98개 제거)
def remove_outlier(df, column):
  q3 = df[column].quantile(0.75)
  q1 = df[column].quantile(0.25)

  IQR = q3 - q1
  max_val = q3 + 1.5 * q3

  cond = (df[column] >= 0) & (df[column] < max_val)
  return df[cond]

# 이상치 제거
df = remove_outlier(df, '【A10】 지난 2주간 교류 상대(명)')
df = remove_outlier(df, '【A11】 지난 2주 동안 교류 횟수(회)')

In [8]:
X = df.drop(target, axis = 1)
y = df[target]

In [9]:
cols_object = X.select_dtypes(include = 'object').columns
cols_numeric = X.select_dtypes(exclude = 'object').columns

# 전처리 (인코딩)

In [10]:
le_X = X.copy()
# oe_X = X.copy()

In [11]:
# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in cols_object:
  le_X[col] = le.fit_transform(le_X[col])


In [12]:
# oe_X = pd.get_dummies(oe_X , columns = cols_object)

In [13]:
# train, test 데이터 분리
X_train_le, X_test_le, y_train, y_test = train_test_split(le_X, y, test_size=0.2, random_state=2024)
X_train_cat, X_test_cat, y_train, y_test = train_test_split(X, y , test_size = 0.2, random_state = 2024)
# X_train_oe, X_test_oe, y_train, y_test = train_test_split(oe_X, y, test_size=0.2, random_state=2024)

# 하이퍼파라미터 튜닝

In [22]:
def objective_rf(trial, X_train, y_train):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10)
    }
    rf_model = RandomForestClassifier(**params, random_state=42)
    scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
    return scores.mean()


def objective_xgb(trial, X_train, y_train):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3)
    }
    xgb_model = XGBClassifier(**params, random_state=42)
    scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='accuracy')
    return scores.mean()

def objective_cat(trial, X_train, y_train):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 2, 16),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 10),
        'thread_count': 4,  # CPU 코어 수에 맞게 조정
        'early_stopping_rounds': 100,  # 100번 반복 후 성능 개선 없으면 중지
    }
    cat_model = CatBoostClassifier(**params, random_state=42, verbose=1)
    scores = cross_val_score(cat_model, X_train, y_train, cv=5, scoring='accuracy')
    return scores.mean()


In [23]:
start_time = time.time()
# 랜덤 포레스트 최적화
# study_rf_le = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
# study_rf_le.optimize(lambda trial: objective_rf(trial, X_train_le, y_train), n_trials=50)
# best_params_rf_le = study_rf_le.best_params
# print(f'Best parameters for Random Forest with Label Encoding: {best_params_rf_le}')

# # study_rf_oe = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
# # study_rf_oe.optimize(lambda trial: objective_rf(trial, X_train_oe, y_train), n_trials=50)
# # best_params_rf_oe = study_rf_oe.best_params
# # print(f'Best parameters for Random Forest with One-Hot Encoding: {best_params_rf_oe}')

# # XGBoost 최적화
# study_xgb_le = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
# study_xgb_le.optimize(lambda trial: objective_xgb(trial, X_train_le, y_train), n_trials=50)
# best_params_xgb_le = study_xgb_le.best_params
# print(f'Best parameters for XGBoost with Label Encoding: {best_params_xgb_le}')

# # study_xgb_oe = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
# # study_xgb_oe.optimize(lambda trial: objective_xgb(trial, X_train_oe, y_train), n_trials=50)
# # best_params_xgb_oe = study_xgb_oe.best_params
# # print(f'Best parameters for XGBoost with One-Hot Encoding: {best_params_xgb_oe}')

# CatBoost 최적화 (인코딩 없이)
study_cat = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study_cat.optimize(lambda trial: objective_cat(trial, X_train_le, y_train), n_trials=50)
best_params_cat = study_cat.best_params
print(f'Best parameters for CatBoost: {best_params_cat}')

# 최적화된 모델로 스태킹 모델 구성
estimators = [
    ('rf_le', RandomForestClassifier(**best_params_rf_le, random_state=42)),
    # ('rf_oe', RandomForestClassifier(**best_params_rf_oe, random_state=42)),
    ('xgb_le', XGBClassifier(**best_params_xgb_le, random_state=42)),
    # ('xgb_oe', XGBClassifier(**best_params_xgb_oe, random_state=42)),
    ('cat', CatBoostClassifier(**best_params_cat, random_state=42))
]

stacking_model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(random_state=42))
stacking_model.fit(X_train_le, y_train)
y_pred_stacking = stacking_model.predict(X_test_le)
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
precision_stacking = precision_score(y_test, y_pred_stacking)
recall_stacking = recall_score(y_test, y_pred_stacking)
f1_stacking = f1_score(y_test, y_pred_stacking)

print(f'Stacking Model Accuracy: {accuracy_stacking:.4f}')
print(f'Precision: {precision_stacking:.4f}')
print(f'Recall: {recall_stacking:.4f}')
print(f'F1 Score: {f1_stacking:.4f}')
print(classification_report(y_test, y_pred_stacking))

end_time = time.time()
print(start_time - end_time)


[I 2024-05-20 07:07:22,643] A new study created in memory with name: no-name-71328089-6ad0-44b5-adb7-8468f1ca874c


0:	learn: 0.3958994	total: 13.6ms	remaining: 5.94s
1:	learn: 0.2630532	total: 509ms	remaining: 1m 50s
2:	learn: 0.2231728	total: 2.75s	remaining: 6m 37s
3:	learn: 0.1944656	total: 4.97s	remaining: 8m 57s
4:	learn: 0.1740397	total: 7.14s	remaining: 10m 16s
5:	learn: 0.1526591	total: 10.2s	remaining: 12m 11s
6:	learn: 0.1386022	total: 13.2s	remaining: 13m 29s
7:	learn: 0.1280085	total: 15.4s	remaining: 13m 43s
8:	learn: 0.1186876	total: 17.5s	remaining: 13m 53s
9:	learn: 0.1094094	total: 19.7s	remaining: 14m 1s
10:	learn: 0.0999601	total: 21.9s	remaining: 14m 8s
11:	learn: 0.0926325	total: 24.8s	remaining: 14m 36s
12:	learn: 0.0868774	total: 27.9s	remaining: 15m 9s
13:	learn: 0.0818887	total: 30.1s	remaining: 15m 8s
14:	learn: 0.0761767	total: 32.3s	remaining: 15m 7s
15:	learn: 0.0698061	total: 34.5s	remaining: 15m 7s
16:	learn: 0.0658727	total: 36.7s	remaining: 15m 5s
17:	learn: 0.0625830	total: 39.4s	remaining: 15m 17s
18:	learn: 0.0577278	total: 39.7s	remaining: 14m 33s
19:	learn: 0.0

[W 2024-05-20 08:13:26,893] Trial 0 failed with parameters: {'iterations': 437, 'depth': 16, 'learning_rate': 0.22227824312530747, 'l2_leaf_reg': 5.9865888553855235} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-23-eb278a974f1c>", line 26, in <lambda>
    study_cat.optimize(lambda trial: objective_cat(trial, X_train_le, y_train), n_trials=50)
  File "<ipython-input-22-81c74acb8fd9>", line 33, in objective_cat
    scores = cross_val_score(cat_model, X_train, y_train, cv=5, scoring='accuracy')
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 515, in cross_val_score
    cv_results = cross_validate(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 266, in cross_validate
    results = parallel(
  File "/u

284:	learn: 0.0039498	total: 11m 42s	remaining: 6m 14s


KeyboardInterrupt: 

In [None]:
# 문자열 값이 남아있는지 확인
print("Checking for non-numeric values:")
for col in le_X.columns:
    if le_X[col].dtype == 'object':
        print(f"Non-numeric values found in column: {col}")
        print(le_X[col].unique())