In [4]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [5]:
# Load the pre-encoded Jeju merchant table from the working directory.
csv_path = 'encoded_jeju.csv'
df = pd.read_csv(csv_path)

In [6]:
# Show the loaded frame as the cell output (pandas elides the middle
# rows/columns of a large frame, as the "..." entries in the output show).
df

Unnamed: 0,id,placeID,MCT_NM,MCT_NAVER_NAME,UE_CNT_GRP,UE_AMT_GRP,MON_UE_CNT_RAT,TUE_UE_CNT_RAT,WED_UE_CNT_RAT,THU_UE_CNT_RAT,...,MCT_TYPE_죽,MCT_TYPE_중식,MCT_TYPE_카페/디저트,MCT_TYPE_탕/국/면류,MCT_TYPE_탕/찌개,MCT_TYPE_한식,MCT_TYPE_해산물,MCT_TYPE_label,center_group_label.2,center_group_label.3
0,1,,(사)한국수상레저안전협회 제주제주시지부,,1_상위 10% 이하,1_상위 10% 이하,1.000000,0.000000,0.000000,0.000000,...,False,False,False,False,False,False,False,0,2,2
1,5,1.247913e+09,(주) 비케이알 버거킹 제주화북DT점,버거킹 제주화북DT점,6_90% 초과,5_75~90%,0.115321,0.125206,0.146623,0.148270,...,False,False,False,False,False,False,False,1,35,35
2,6,,(주) 성우디엔에프,,4_50~75%,2_10~25%,0.139785,0.129032,0.129032,0.236559,...,False,False,False,False,False,False,False,2,2,2
3,7,1.827123e+09,(주) 신세계푸드 제주신화월드 고래라면,고래라면 제주신화월드점,5_75~90%,4_50~75%,0.152174,0.086957,0.130435,0.079710,...,False,False,False,False,False,False,False,0,21,21
4,8,1.199772e+09,(주) 신세계푸드 제주신화월드 윤경양식당,윤경양식당 신화월드점,5_75~90%,5_75~90%,0.157692,0.092308,0.076923,0.111538,...,False,False,False,False,False,False,False,1,21,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5873,9244,1.575929e+09,희양양,,3_25~50%,5_75~90%,0.184211,0.105263,0.131579,0.131579,...,False,True,False,False,False,False,False,11,5,5
5874,9248,1.996719e+09,히아담,히아담,3_25~50%,2_10~25%,0.096774,0.112903,0.209677,0.161290,...,False,False,False,False,False,False,False,1,5,5
5875,9250,1.762730e+09,히포파운드,,4_50~75%,2_10~25%,0.185185,0.000000,0.000000,0.000000,...,False,False,False,False,False,False,False,2,5,5
5876,9251,3.216359e+07,힘찬장어,,1_상위 10% 이하,1_상위 10% 이하,0.166667,0.000000,0.333333,0.000000,...,False,False,False,False,False,False,False,2,5,5


In [None]:
# Dependent variable (target).
# NOTE(review): presumably the label-encoded form of UE_AMT_GRP — confirm.
target_col = 'UE_AMT_GRP_encoded'

# Independent variables (features), organised by theme. Dict insertion order
# is preserved, so flattening the groups yields one ordered column list.
feature_groups = {
    # Day of week
    'day_of_week': [
        'MON_UE_CNT_RAT',
        'TUE_UE_CNT_RAT',
        'WED_UE_CNT_RAT',
        'THU_UE_CNT_RAT',
        'FRI_UE_CNT_RAT',
        'SAT_UE_CNT_RAT',
        'SUN_UE_CNT_RAT',
        'peak_day_label',
        'day_type_encoded',
        'weekday_ratio',
        'weekend_ratio',
        'day_entropy',
    ],
    # Time of day
    'time_of_day': [
        'HR_5_11_UE_CNT_RAT',
        'HR_12_13_UE_CNT_RAT',
        'HR_14_17_UE_CNT_RAT',
        'HR_18_22_UE_CNT_RAT',
        'HR_23_4_UE_CNT_RAT',
        'peak_time_label',
        'peak_ratio',
        'peak_min_diff',
        'top2_gap',
        'time_type_label',
    ],
    # Local customers
    'local': [
        'LOCAL_UE_CNT_RAT',
    ],
    # Gender
    'gender': [
        'RC_M12_MAL_CUS_CNT_RAT',
        'RC_M12_FME_CUS_CNT_RAT',
    ],
    # Age group
    'age': [
        'RC_M12_AGE_UND_20_CUS_CNT_RAT',
        'RC_M12_AGE_30_CUS_CNT_RAT',
        'RC_M12_AGE_40_CUS_CNT_RAT',
        'RC_M12_AGE_50_CUS_CNT_RAT',
        'RC_M12_AGE_OVR_60_CUS_CNT_RAT',
        'main_age_group_label',
        'age_entropy',
        'young_ratio',
        'old_ratio',
        'age_concentration',
        'expected_age',
    ],
    # Business type
    'business_type': [
        'MCT_TYPE_label',
    ],
    # Location
    'location': [
        'center_group_label',
    ],
}

# Flat, ordered list of model input columns.
feature_cols = [col for cols in feature_groups.values() for col in cols]

In [22]:
# Nominal (categorical) feature columns, passed to CatBoost as cat_features.
cat_features = [
    # day-of-week labels
    'peak_day_label',
    'day_type_encoded',
    # time-of-day labels
    'peak_time_label',
    'time_type_label',
    # age, business type and location labels
    'main_age_group_label',
    'MCT_TYPE_label',
    'center_group_label',
]

In [23]:
# Separate the model inputs (feature columns) from the target column.
X, y = df[feature_cols], df[target_col]

In [24]:
# 2. Train/validation split
# Hold out 20% of the rows for validation, stratified on y, with a fixed
# seed so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [25]:
# 3. Build the CatBoost Pools
# Both pools declare the same categorical columns.
shared_pool_args = {'cat_features': cat_features}
train_pool = Pool(data=X_train, label=y_train, **shared_pool_args)
val_pool = Pool(data=X_val, label=y_val, **shared_pool_args)

In [26]:
# 4. Model definition
# Hyperparameters are gathered in one mapping so they can be inspected or
# adjusted in a single place before the classifier is constructed.
catboost_params = dict(
    loss_function='MultiClass',   # multi-class classification objective
    eval_metric='MultiClass',     # metric tracked on the eval set
    iterations=500,
    learning_rate=0.1,
    depth=6,
    early_stopping_rounds=50,     # stop after 50 iterations without improvement
    random_seed=42,
    verbose=100,                  # log every 100 iterations
)
model = CatBoostClassifier(**catboost_params)


In [27]:
# 5. Training
# Fit on the training pool while monitoring the validation pool; with
# use_best_model=True the model is shrunk to its best eval-set iteration.
model.fit(X=train_pool, use_best_model=True, eval_set=val_pool)

0:	learn: 1.7696123	test: 1.7718403	best: 1.7718403 (0)	total: 44.3ms	remaining: 22.1s
100:	learn: 1.3539500	test: 1.5224798	best: 1.5224798 (100)	total: 2.29s	remaining: 9.04s
200:	learn: 1.1439572	test: 1.5017848	best: 1.5015634 (182)	total: 4.51s	remaining: 6.72s
300:	learn: 0.9872337	test: 1.4952067	best: 1.4944981 (296)	total: 6.67s	remaining: 4.41s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1.492830339
bestIteration = 337

Shrink model to first 338 iterations.


<catboost.core.CatBoostClassifier at 0x145147b50>

In [28]:
# 6. Prediction and evaluation
# For a MultiClass model, CatBoost's predict() returns an (n_samples, 1)
# column of class labels. Flatten it so y_pred is 1-D like y_val, which keeps
# the sklearn metrics and any later element-wise comparison well-defined.
# NOTE(review): val_pool is also the eval_set used for early stopping and
# best-iteration selection in fit(), so this score is likely optimistic —
# consider a separate held-out test split for the final estimate.
y_pred = model.predict(val_pool).ravel()
print("Accuracy:", accuracy_score(y_val, y_pred))

Accuracy: 0.3494897959183674


In [20]:
# NOTE(review): this cell carries execution count 20, older than the
# training/prediction cells above (27-28), and its saved output reports 0.34
# accuracy versus the 0.349 printed above — it looks like a leftover from an
# earlier run and duplicates the report cell that follows. Consider removing it.
report_text = classification_report(y_val, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.41      0.40      0.40       241
           1       0.32      0.41      0.36       279
           2       0.28      0.27      0.27       275
           3       0.31      0.32      0.31       209
           4       0.24      0.07      0.11       108
           5       0.51      0.61      0.56        64

    accuracy                           0.34      1176
   macro avg       0.35      0.35      0.34      1176
weighted avg       0.33      0.34      0.33      1176



In [29]:
# Per-class precision / recall / F1 on the validation split.
val_report = classification_report(y_val, y_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.41      0.44      0.42       241
           1       0.35      0.41      0.38       279
           2       0.27      0.28      0.27       275
           3       0.32      0.31      0.31       209
           4       0.31      0.12      0.17       108
           5       0.50      0.59      0.54        64

    accuracy                           0.35      1176
   macro avg       0.36      0.36      0.35      1176
weighted avg       0.34      0.35      0.34      1176

