In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


# 필요한 라이브러리 import

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import time

In [6]:
file_path = '/content/drive/MyDrive/서울시 고립은둔청년 실태조사(청년조사)_분류.csv'
df = pd.read_csv(file_path, encoding = 'euc-kr')

In [7]:
# 데이터 확인 및 전처리
target = '【KEY_1】 고립은둔청년'


# 종속변수를 이진 분류로 변환
df[target] = df[target].apply(lambda x: 1 if x == '해당' else 0)

# 전처리(기타)

In [8]:
# a10 이상치 제거 (300개 제거)
print(df.shape)
a10_q3 = df['【A10】 지난 2주간 교류 상대(명)'].quantile(0.75)
a10_q1 = df['【A10】 지난 2주간 교류 상대(명)'].quantile(0.25)
a10_IQR = a10_q3 - a10_q1

# 최솟값은 구할 필요 없다. 어차피 제일 작은 값은 0명이므로, 음수는 전부 없애버리면 됨.
a10_max = a10_q3 + 1.5 * a10_IQR

#print(a10_max)


cond = (df['【A10】 지난 2주간 교류 상대(명)'] >= 0) & (df['【A10】 지난 2주간 교류 상대(명)'] < a10_max)
df = df[cond]
print(df.shape)

(5513, 51)
(5208, 51)


In [9]:
# a11 이상치 제거 (98개 제거)
print(df.shape)

a11_q3 = df['【A11】 지난 2주 동안 교류 횟수(회)'].quantile(0.75)
a11_q1 = df['【A11】 지난 2주 동안 교류 횟수(회)'].quantile(0.25)
a11_IQR = a11_q3 - a11_q1

# 최솟값은 구할 필요 없다. 어차피 제일 작은 값은 0회이므로, 음수는 전부 없애버리면 됨.
a11_max = a11_q3 + 1.5 * a11_IQR

# print(a11_max)


cond = (df['【A11】 지난 2주 동안 교류 횟수(회)'] >= 0) & (df['【A11】 지난 2주 동안 교류 횟수(회)'] < a10_max)
df = df[cond]

print(df.shape)


(5208, 51)
(5110, 51)


In [10]:
X = df.drop(target, axis = 1)
y = df[target]

In [11]:
cols_object = X.select_dtypes(include = 'object').columns
cols_numeric = X.select_dtypes(exclude = 'object').columns

In [12]:
cols_numeric

Index(['【SQ2】 연령', '【A10】 지난 2주간 교류 상대(명)', '【A11】 지난 2주 동안 교류 횟수(회)',
       '【A18_5】 외로움 척도 총점', '【B12_10】 우울 척도점수'],
      dtype='object')

# 전처리 (인코딩)

In [13]:
le_X = X.copy()
oe_X = X.copy()

In [14]:
# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in cols_object:
  le_X[col] = le.fit_transform(le_X[col])


In [15]:
# 원핫 인코딩
oe_X = pd.get_dummies(oe_X, columns = cols_object)

In [16]:
# train, test 데이터 분리
X_train_le, X_test_le, y_train, y_test = train_test_split(le_X, y, test_size=0.2, random_state=2024)
X_train_oe, X_test_oe, y_train, y_test = train_test_split(oe_X, y, test_size=0.2, random_state=2024)
X_train_cat, X_test_cat, y_train, y_test = train_test_split(oX, y, test_size=0.2, random_state=2024)

# 전처리 (스케일링)

In [17]:
scalers = {
    'RobustScaler': RobustScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler()
}

# 하이퍼파라미터 튜닝

In [18]:
def objective_rf(trial, X_train, y_train, scaler, num_features):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20)
    }
    model = RandomForestClassifier(**params)
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', scalers[scaler], num_features)
        ],
        remainder='passthrough'
    )
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy').mean()
    return score

def objective_xgb(trial, X_train, y_train, scaler, num_features):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3)
    }
    model = XGBClassifier(**params)
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', scalers[scaler], num_features)
        ],
        remainder='passthrough'
    )
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy').mean()
    return score



In [19]:
results = []
best_params_list = []
start_time = time.time()

num_features = ['【A10】 지난 2주간 교류 상대(명)', '【A11】 지난 2주 동안 교류 횟수(회)']  # 예시 피처 리스트

for encoding, (X_train, X_test) in tqdm([('Label Encoding', (X_train_le, X_test_le)), ('One-Hot Encoding', (X_train_oe, X_test_oe))], desc='Encoding'):
    for scaler in tqdm(scalers, desc='Scaler'):
        # 첫 번째 모델에 대한 하이퍼파라미터 최적화
        study_rf = optuna.create_study(direction='maximize', sampler=TPESampler())
        study_rf.optimize(lambda trial: objective_rf(trial, X_train, y_train, scaler, num_features), n_trials=50)
        best_params_rf = study_rf.best_params

        # RandomForest 모델 초기화 및 학습
        rf_model = RandomForestClassifier(**best_params_rf)
        preprocessor_rf = ColumnTransformer(
            transformers=[
                ('num', scalers[scaler], num_features)
            ],
            remainder='passthrough'
        )
        pipeline_rf = Pipeline([('preprocessor', preprocessor_rf), ('model', rf_model)])
        pipeline_rf.fit(X_train, y_train)

        # 두 번째 모델에 대한 하이퍼파라미터 최적화
        study_xgb = optuna.create_study(direction='maximize', sampler=TPESampler())
        study_xgb.optimize(lambda trial: objective_xgb(trial, X_train, y_train, scaler, num_features), n_trials=50)
        best_params_xgb = study_xgb.best_params

        # XGBoost 모델 초기화 및 학습
        xgb_model = XGBClassifier(**best_params_xgb)
        preprocessor_xgb = ColumnTransformer(
            transformers=[
                ('num', scalers[scaler], num_features)
            ],
            remainder='passthrough'
        )
        pipeline_xgb = Pipeline([('preprocessor', preprocessor_xgb), ('model', xgb_model)])
        pipeline_xgb.fit(X_train, y_train)

        # 테스트 데이터에 대한 예측 및 정확도 계산
        y_pred_rf = pipeline_rf.predict(X_test)
        accuracy_rf = accuracy_score(y_test, y_pred_rf)

        y_pred_xgb = pipeline_xgb.predict(X_test)
        accuracy_xgb = accuracy_score(y_test, y_pred_xgb)


        # 결과 저장
        results.append((encoding, scaler, accuracy_rf, accuracy_xgb))
        best_params_list.append((encoding, scaler, best_params_rf, best_params_xgb))

# 결과 출력
for (encoding, scaler, accuracy_rf, accuracy_xgb), (enc, sc, best_rf, best_xgb) in zip(results, best_params_list):
    print(f"Encoding: {encoding}, Scaler: {scaler}")
    print(f"Random Forest Accuracy: {accuracy_rf:.4f}, Best Params: {best_rf}")
    print(f"XGBoost Accuracy: {accuracy_xgb:.4f}, Best Params: {best_xgb}")
    print()

end_time = time.time()
elapsed_time = end_time - start_time
print(f"총 소요 시간 : {elapsed_time} 초")


Encoding:   0%|          | 0/2 [00:00<?, ?it/s]
Scaler:   0%|          | 0/3 [00:00<?, ?it/s][A[I 2024-05-19 06:35:02,441] A new study created in memory with name: no-name-773e1243-f937-4c07-ae8c-2be7df8f7f99
[I 2024-05-19 06:35:16,079] Trial 0 finished with value: 0.9491167219806496 and parameters: {'n_estimators': 359, 'max_depth': 29, 'min_samples_split': 4}. Best is trial 0 with value: 0.9491167219806496.
[I 2024-05-19 06:35:43,326] Trial 1 finished with value: 0.9500956148830028 and parameters: {'n_estimators': 839, 'max_depth': 22, 'min_samples_split': 6}. Best is trial 1 with value: 0.9500956148830028.
[I 2024-05-19 06:35:48,256] Trial 2 finished with value: 0.948137529814187 and parameters: {'n_estimators': 199, 'max_depth': 30, 'min_samples_split': 16}. Best is trial 1 with value: 0.9500956148830028.
[I 2024-05-19 06:35:50,342] Trial 3 finished with value: 0.9476482329950653 and parameters: {'n_estimators': 104, 'max_depth': 18, 'min_samples_split': 5}. Best is trial 1 with v

Encoding: Label Encoding, Scaler: RobustScaler
Random Forest Accuracy: 0.9472, Best Params: {'n_estimators': 266, 'max_depth': 24, 'min_samples_split': 5}
XGBoost Accuracy: 0.9511, Best Params: {'n_estimators': 216, 'max_depth': 26, 'learning_rate': 0.12416585469516189}

Encoding: Label Encoding, Scaler: MinMaxScaler
Random Forest Accuracy: 0.9423, Best Params: {'n_estimators': 881, 'max_depth': 29, 'min_samples_split': 2}
XGBoost Accuracy: 0.9472, Best Params: {'n_estimators': 541, 'max_depth': 7, 'learning_rate': 0.030749344967256842}

Encoding: Label Encoding, Scaler: StandardScaler
Random Forest Accuracy: 0.9491, Best Params: {'n_estimators': 500, 'max_depth': 15, 'min_samples_split': 4}
XGBoost Accuracy: 0.9442, Best Params: {'n_estimators': 236, 'max_depth': 28, 'learning_rate': 0.19691124411262545}

Encoding: One-Hot Encoding, Scaler: RobustScaler
Random Forest Accuracy: 0.9423, Best Params: {'n_estimators': 486, 'max_depth': 16, 'min_samples_split': 13}
XGBoost Accuracy: 0.9491




In [26]:
# X_train_le, X_test_le, y_train, y_test

scale = RobustScaler()
X_train_le[num_features] = scale.fit_transform(X_train_le[num_features])
X_test_le[num_features] = scale.transform(X_test_le[num_features])

xgb_model = XGBClassifier(n_estimators=216, max_depth=26, learning_rate=0.12416585469516189)
xgb_model.fit(X_train_le, y_train)
pred = xgb_model.predict(X_test_le)

print(pred)
print(accuracy_score(y_test, pred))

[0 0 0 ... 1 0 0]
0.9461839530332681


In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

accuracy = accuracy_score(y_test, pred)

# 정밀도, 재현율, F1 점수 계산
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)

# 평가 결과 출력
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# 상세한 분류 보고서 출력
print(classification_report(y_test, pred))

Test Accuracy: 0.9462
Precision: 0.7831
Recall: 0.6373
F1 Score: 0.7027
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       920
           1       0.78      0.64      0.70       102

    accuracy                           0.95      1022
   macro avg       0.87      0.81      0.84      1022
weighted avg       0.94      0.95      0.94      1022



In [None]:
y_test.value_counts()