In [4]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# 1. Data preparation
# Name of the label column; it is cast to int when y is built below.
TARGET = 'host_is_superhost'

# Feature columns used by the model. The order of this list is the column
# order of X, so keep it stable.
strategy_cols = [
    'amenities_cnt',
    'availability_365',
    'price',
    'host_about_length_group',
    'room_type',
    'name_length_group',
    'description_length_group',
    'host_has_profile_pic',
    'host_response_time_score',
    'type_amenity_score',
    'common_amenity_score',
    'host_acceptance_rate_score',
    'host_identity_verified',
    'is_long_term',
    'accommodates',
]

# NOTE: `df` is not created in this cell; it must already exist.
X = df.loc[:, strategy_cols]
y = df[TARGET].astype(int)

# 2. Group the feature columns by dtype
categorical_cols = list(
    X.select_dtypes(include=['object', 'category', 'bool']).columns
)
# Not referenced by the transformer below; kept as a named list of the
# int64/float64 columns.
numerical_cols = list(
    X.select_dtypes(include=['int64', 'float64']).columns
)

# 3. Preprocessing step
# Categorical columns are one-hot encoded (categories unseen during fit are
# ignored at transform time). Every column that is NOT in categorical_cols is
# forwarded unchanged via remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore', drop=None),
            categorical_cols,
        ),
    ],
    remainder='passthrough',
)

# 4. Full pipeline: the preprocessing step followed by a random-forest
#    classifier, so raw feature frames can be passed straight to fit/predict.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=1000,
        max_depth=30,
        min_samples_split=15,
        min_samples_leaf=10,
        random_state=42,
        class_weight='balanced',
        # Build the 1000 trees on all available cores instead of one.
        # random_state still fixes the per-tree seeds. Note that this setting
        # is stored with the estimator, so prediction is parallelised too.
        n_jobs=-1,
    ))
])

# 5. Training
# Stratified 80/20 split with a fixed seed; only the training portion is used
# to fit the pipeline in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
pipeline.fit(X_train, y_train)

# 6. Persist the fitted pipeline (preprocessing + classifier) to disk
model_path = 'superhost_pipeline_rf.pkl'
joblib.dump(pipeline, model_path)
print(f"✅ 파이프라인 모델이 '{model_path}'로 저장되었습니다.")


✅ 파이프라인 모델이 'superhost_pipeline_rf.pkl'로 저장되었습니다.
