# 0) 필요한 라이브러리 설치

In [None]:
!pip install optuna
!pip install catboost

# 1) 필요한 라이브러리 import

In [None]:
# Standard library
import time  # wall-clock timing of the tuning / training run

# Third-party
import numpy as np  # numerical computing
import optuna  # hyperparameter optimisation
import pandas as pd  # data loading and manipulation
from catboost import CatBoostClassifier  # CatBoost classifier
from imblearn.over_sampling import SMOTE  # oversampling for imbalanced data
from optuna.samplers import TPESampler  # TPE sampler for the Optuna studies
# NOTE: the class is spelled StackingClassifier; the previous spelling
# "StakingClassifier" does not exist in sklearn.ensemble and raised ImportError.
from sklearn.ensemble import RandomForestClassifier, StackingClassifier  # random forest / stacking ensemble
from sklearn.metrics import (  # evaluation metrics
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import cross_val_score, train_test_split  # CV scoring / train-test split
from sklearn.preprocessing import LabelEncoder  # label encoding of categorical columns
from xgboost import XGBClassifier  # XGBoost classifier

# 2) 데이터 불러오기

In [None]:
# Path of the source CSV file.
file_path = '/content/drive/MyDrive/파이널 프로젝트/data/서울시 고립은둔청년 실태조사(청년조사)_분류.csv'

# The file is decoded with the EUC-KR codec.
df = pd.read_csv(
    file_path,
    encoding='euc-kr',
)

# 3) 전처리

In [None]:
# Name of the target (dependent variable) column.
target = '【KEY_1】 고립은둔청년'

# Binarise the target: rows whose value equals '해당' become 1, every other
# value (including missing ones) becomes 0.
df[target] = df[target].eq('해당').astype('int64')

In [None]:
# Outlier filter used for the A10 and A11 columns.
# NOTE(review): the row counts recorded earlier (300 and 98 rows removed) were
# obtained with the previous fence formula and should be re-checked.
def remove_outlier(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Drop rows whose value in ``column`` lies outside the accepted range.

    A row is kept when its value is non-negative and does not exceed the
    upper Tukey fence ``Q3 + 1.5 * IQR``. Rows whose value is NaN fail the
    comparison and are dropped as well.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column to inspect.

    Returns:
        The rows of ``df`` that pass the filter; ``df`` itself is not modified.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)

    # The fence is built from the inter-quartile range; the earlier version
    # used ``q3 + 1.5 * q3`` and never used the IQR it had computed.
    iqr = q3 - q1
    upper_fence = q3 + 1.5 * iqr

    # ``between`` is inclusive on both ends, so a value sitting exactly on the
    # fence is kept; this also keeps a column with IQR == 0 from being emptied.
    in_range = df[column].between(0, upper_fence)
    return df[in_range]

# Filter out-of-range rows for the A10 column first, then for the A11 column.
for outlier_col in (
    '【A10】 지난 2주간 교류 상대(명)',
    '【A11】 지난 2주 동안 교류 횟수(회)',
):
    df = remove_outlier(df, outlier_col)

In [None]:
# Separate the target series (y) from the remaining feature columns (X).
y = df[target]
X = df.drop(columns=[target])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every object-dtype feature column into integer codes.
# The result is a new frame (le_X); X itself is left untouched.
cols_object = X.select_dtypes(include='object').columns

# A single encoder is re-fitted for each column in turn, so afterwards `le`
# only holds the mapping of the last encoded column.
le = LabelEncoder()

le_X = X.assign(**{col: le.fit_transform(X[col]) for col in cols_object})

# 4) train, test 데이터 분리



In [None]:
# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
# The split is stratified on y so that both parts keep the class ratio of the
# target; the notebook oversamples the training part with SMOTE afterwards,
# i.e. the target is treated as imbalanced, and an unstratified split could
# leave the test set with a skewed share of the minority class.
X_train_le, X_test_le, y_train, y_test = train_test_split(
    le_X,
    y,
    test_size=0.2,
    random_state=2024,
    stratify=y,
)

In [None]:
# Oversample the training split with SMOTE (fixed seed); the test split is
# not resampled.
# NOTE(review): the tuning cells cross-validate on this already-resampled
# data, so synthetic rows may end up in both the training and validation
# folds — the CV scores are possibly optimistic; confirm whether that matters.
smote = SMOTE(random_state=2024)
new_X_train_le, new_y_train = smote.fit_resample(X_train_le, y_train)

# 5) 하이퍼파라미터 튜닝

In [None]:
# RandomForest
def objective_rf(trial, X_train, y_train):
    """Optuna objective: mean 5-fold CV accuracy of a random forest.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Features handed to ``cross_val_score``.
        y_train: Labels handed to ``cross_val_score``.

    Returns:
        Mean accuracy over the five folds.
    """
    # Sample the search space (integer ranges, bounds as passed to suggest_int).
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
    )
    fold_accuracies = cross_val_score(
        forest, X_train, y_train, cv=5, scoring='accuracy'
    )
    return fold_accuracies.mean()

# XGBoost
def objective_xgb(trial, X_train, y_train):
    """Optuna objective: mean 5-fold CV accuracy of an XGBoost classifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Features handed to ``cross_val_score``.
        y_train: Labels handed to ``cross_val_score``.

    Returns:
        Mean accuracy over the five folds.
    """
    # Sample the search space in a fixed order: trees, depth, learning rate.
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)

    booster = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        random_state=42,
    )
    fold_accuracies = cross_val_score(
        booster, X_train, y_train, cv=5, scoring='accuracy'
    )
    return fold_accuracies.mean()

# CatBoost
def objective_cat(trial, X_train, y_train):
    """Optuna objective: mean 5-fold CV accuracy of a CatBoost classifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Features handed to ``cross_val_score``.
        y_train: Labels handed to ``cross_val_score``.

    Returns:
        Mean accuracy over the five folds.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 200, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 128),
    }

    # Same seed and logging settings as the final CatBoost model and the other
    # objectives: random_state=42 so the tuned configuration is evaluated with
    # the seed it is later trained with, and verbose=0 because per-iteration
    # logging for every fold of every trial floods the notebook output.
    cat_model = CatBoostClassifier(**params, random_state=42, verbose=0)
    scores = cross_val_score(cat_model, X_train, y_train, cv=5, scoring='accuracy')
    return scores.mean()


In [None]:
# Start of the timed section.
start_time = time.time()


def _run_study(objective, n_trials):
    """Run a maximising TPE study (sampler seed 42) for ``objective``.

    The objective is evaluated on the resampled training data
    (``new_X_train_le`` / ``new_y_train``). Returns the finished study.
    """
    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
    study.optimize(
        lambda trial: objective(trial, new_X_train_le, new_y_train),
        n_trials=n_trials,
    )
    return study


# Random forest: 50 trials.
study_rf_le = _run_study(objective_rf, n_trials=50)
best_params_rf_le = study_rf_le.best_params
print(f'Best parameters for Random Forest with Label Encoding: {best_params_rf_le}')

# XGBoost: 50 trials.
study_xgb_le = _run_study(objective_xgb, n_trials=50)
best_params_xgb_le = study_xgb_le.best_params
print(f'Best parameters for XGBoost with Label Encoding: {best_params_xgb_le}')

# CatBoost: 20 trials.
study_cat = _run_study(objective_cat, n_trials=20)
best_params_cat = study_cat.best_params
print(f'Best parameters for CatBoost with Label Encoding: {best_params_cat}')

# 6) 머신러닝 모델 최적화 및 스태킹 앙상블 평가



In [None]:
def _fit_and_score(model):
    """Fit ``model`` on the resampled training data and score it on the test split.

    Returns a ``(predictions, accuracy)`` tuple for ``X_test_le`` / ``y_test``.
    """
    model.fit(new_X_train_le, new_y_train)
    predictions = model.predict(X_test_le)
    return predictions, accuracy_score(y_test, predictions)


# Random forest with the tuned parameters.
rf_model_le = RandomForestClassifier(**best_params_rf_le, random_state=42)
y_pred_rf, accuracy_rf = _fit_and_score(rf_model_le)

# XGBoost with the tuned parameters.
xgb_model_le = XGBClassifier(**best_params_xgb_le, random_state=42)
y_pred_xgb, accuracy_xgb = _fit_and_score(xgb_model_le)

# CatBoost with the tuned parameters (silent training).
cat_model = CatBoostClassifier(**best_params_cat, random_state=42, verbose=0)
y_pred_cat, accuracy_cat = _fit_and_score(cat_model)

# Stacking ensemble: the three tuned models as base estimators, a random
# forest as the final estimator.
estimators = [
    ('rf_le', rf_model_le),
    ('xgb_le', xgb_model_le),
    ('cat', cat_model),
]
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(random_state=42),
)
y_pred_stacking, accuracy_stacking = _fit_and_score(stacking_model)

# Additional test-set metrics for the stacking model only.
precision_stacking = precision_score(y_test, y_pred_stacking)
recall_stacking = recall_score(y_test, y_pred_stacking)
f1_stacking = f1_score(y_test, y_pred_stacking)

# Report: accuracy of each single model, then the stacking metrics.
print(f'Random Forest Accuracy: {accuracy_rf:.4f}')
print(f'XGBoost Accuracy: {accuracy_xgb:.4f}')
print(f'CatBoost Accuracy: {accuracy_cat:.4f}')

print(f'Stacking Model Accuracy: {accuracy_stacking:.4f}')
print(f'Precision: {precision_stacking:.4f}')
print(f'Recall: {recall_stacking:.4f}')
print(f'F1 Score: {f1_stacking:.4f}')
print(classification_report(y_test, y_pred_stacking))

# Elapsed time since start_time was set in the tuning cell.
end_time = time.time()
print(f'Time taken: {end_time - start_time:.2f} seconds')
