In [2]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline


# Location of the training CSV (put the absolute path here)
csv_path = 'superhost.csv'

# Read the CSV: the first row supplies the column names and the `id`
# column becomes the index.
df = pd.read_csv(csv_path, header=0, index_col='id', encoding='utf-8-sig')

# 1. Target and feature definitions
TARGET = 'host_is_superhost'

strategy_cols = [
    'amenities_cnt',
    'availability_365',
    'price',
    'host_about_length_group',
    'room_type',
    'name_length_group',
    'description_length_group',
    'host_has_profile_pic',
    'host_response_time_score',
    'type_amenity_score',
    'common_amenity_score',
    'host_acceptance_rate_score',
    'host_identity_verified',
    'is_long_term',
    'accommodates',
]

X = df[strategy_cols]
y = df[TARGET].astype(int)

# 2. Split the feature columns by dtype
categorical_cols = list(X.select_dtypes(include=['object', 'category', 'bool']).columns)
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# 3. Preprocessing: one-hot encode the categorical columns; every column not
#    listed in `categorical_cols` is passed through unchanged.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(drop=None, handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough',
)

# 4. Full pipeline: preprocessing followed by a random forest classifier
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    (
        'classifier',
        RandomForestClassifier(
            n_estimators=1000,
            max_depth=30,
            min_samples_leaf=10,
            min_samples_split=15,
            class_weight='balanced',
            random_state=42,
        ),
    ),
])

# 5. Split the data into train/test sets and fit the model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
pipeline.fit(X_train, y_train)

# Store the one-hot encoded feature names.
# `preprocessor` is the very same object as the pipeline's 'preprocessing'
# step, so calling `preprocessor.fit_transform(X)` here would re-fit the
# encoder on the full data (test rows included) *after* the classifier was
# trained, which can leave the encoder out of sync with the classifier.
# Only `transform` is called on the already-fitted step.
fitted_preprocessor = pipeline.named_steps['preprocessing']
encoded = fitted_preprocessor.transform(X)
if hasattr(encoded, 'toarray'):
    # ColumnTransformer may return a scipy sparse matrix; densify it so that
    # pandas builds one column per encoded feature.
    encoded = encoded.toarray()
# Label the columns with the transformer's output names (by default they are
# prefixed with the transformer name, e.g. `cat__...` / `remainder__...`)
# instead of anonymous integer positions.
X_encoded = pd.DataFrame(
    encoded,
    columns=fitted_preprocessor.get_feature_names_out(),
)
train_columns = X_encoded.columns

# -------------------------------
# 점수 계산 함수 정의
# -------------------------------

def response_time_to_score(response_time_str):
    """Map a host response-time label to an ordinal score.

    Args:
        response_time_str: Label such as ``'within an hour'``. Matching is
            case-insensitive and ignores surrounding whitespace.

    Returns:
        4 for 'within an hour', 3 for 'within a few hours', 2 for
        'within a day', 1 for 'a few days or more', and 0 for any other
        value, including missing / non-string input such as ``None`` or NaN.
    """
    # Missing values (None, float NaN, ...) have no `.lower()`; treat them the
    # same way as an unrecognised label instead of raising AttributeError.
    if not isinstance(response_time_str, str):
        return 0

    mapping = {
        'within an hour': 4,
        'within a few hours': 3,
        'within a day': 2,
        'a few days or more': 1
    }
    return mapping.get(response_time_str.strip().lower(), 0)

def _rate_percent_to_score(rate_percent):
    """Bucket a percentage (0-100 scale) into a quartile score.

    Returns 1 for rates up to 25%, 2 up to 50%, 3 up to 75%, 4 above that,
    and 0 when the rate is missing (``None`` / NaN).
    """
    # A NaN rate fails every `<=` comparison and would otherwise fall through
    # to the top score; report it as 0, the same fallback used for an unknown
    # response time.
    if pd.isna(rate_percent):
        return 0

    rate = rate_percent / 100
    for upper_bound, score in ((0.25, 1), (0.5, 2), (0.75, 3)):
        if rate <= upper_bound:
            return score
    return 4


def response_rate_to_score(rate_percent):
    """Convert a host response rate in percent to a score.

    Args:
        rate_percent: Response rate on a 0-100 scale.

    Returns:
        Quartile score 1-4 (<=25, <=50, <=75, otherwise 4), or 0 when the
        rate is missing (``None`` / NaN).
    """
    return _rate_percent_to_score(rate_percent)


def acceptance_rate_to_score(rate_percent):
    """Convert a host acceptance rate in percent to a score.

    Args:
        rate_percent: Acceptance rate on a 0-100 scale.

    Returns:
        Quartile score 1-4 (<=25, <=50, <=75, otherwise 4), or 0 when the
        rate is missing (``None`` / NaN).
    """
    return _rate_percent_to_score(rate_percent)

# Amenities that count towards the "common" score regardless of room type.
common_amenities = ['Carbon monoxide alarm', 'Essentials', 'Hangers', 'Smoke alarm', 'Wifi']

# Amenities that count towards the type-specific score, keyed by room type group.
type_amenity_dict = {
    'high': ['Air conditioning', 'Building staff', 'Elevator', 'Gym', 'Heating', 'Paid parking off premises', 'Shampoo'],
    'low-mid': ['Cleaning products', 'Dining table', 'Exterior security cameras on property', 'Free street parking',
                'Freezer', 'Laundromat nearby', 'Lock on bedroom door', 'Microwave'],
    'mid': ['Cooking basics', 'Kitchen', 'Oven'],
    'upper-mid': ['Bathtub', 'Cleaning products', 'Cooking basics', 'Dishes and silverware', 'Elevator', 'Freezer']
}

def calc_amenity_scores(amenities_list, room_new_type):
    """Compute the share of reference amenities that a listing offers.

    Args:
        amenities_list: Iterable of amenity names offered by the listing.
            ``None`` is treated as an empty list.
        room_new_type: Key into ``type_amenity_dict`` (e.g. ``'mid'``). An
            unknown key yields a type score of 0.

    Returns:
        Tuple ``(common_score, type_score)``, each rounded to 3 decimals:
        the fraction of ``common_amenities`` and of the type-specific
        amenities present in ``amenities_list``.
    """
    # De-duplicate the input so that a repeated amenity is counted once;
    # counting every occurrence could push a score above 1.0.
    offered = set(amenities_list or ())

    common_match = (
        len(offered.intersection(common_amenities)) / len(common_amenities)
        if common_amenities else 0
    )
    type_amenities = type_amenity_dict.get(room_new_type, [])
    type_match = (
        len(offered.intersection(type_amenities)) / len(type_amenities)
        if type_amenities else 0
    )
    return round(common_match, 3), round(type_match, 3)

# -------------------------------
# 입력값 전처리 함수
# -------------------------------
def preprocess_input(new_df, train_cols):
    """One-hot encode ``new_df`` and align its columns to ``train_cols``.

    Args:
        new_df: DataFrame of raw (un-encoded) feature rows.
        train_cols: Column labels the result must have, in order.

    Returns:
        A new DataFrame with exactly ``train_cols`` as columns: columns
        produced by ``pd.get_dummies`` are kept where they match, columns in
        ``train_cols`` that are absent are filled with 0, and any other
        columns are dropped. ``new_df`` itself is not modified.
    """
    new_encoded = pd.get_dummies(new_df, drop_first=False)
    # A single reindex adds the missing columns (filled with 0), drops the
    # extra ones and applies the training order. Unlike inserting columns one
    # by one, it never writes into the frame returned by get_dummies (which
    # may be the caller's own frame when nothing needed encoding) and does not
    # fragment the frame.
    return new_encoded.reindex(columns=train_cols, fill_value=0)


In [4]:
# Example of user-supplied listing information
user_input = {
    'host_response_time': 'within an hour',
    'host_response_rate': 85,
    'host_acceptance_rate': 78,
    'amenities': [
        'Wifi',
        'Essentials',
        'Hangers',
        'Oven',
        'Kitchen',
    ],
    'room_new_type': 'mid',
}

# Turn the raw inputs into the score features
response_time_score = response_time_to_score(
    user_input['host_response_time']
)
response_rate_score = response_rate_to_score(
    user_input['host_response_rate']
)
acceptance_rate_score = acceptance_rate_to_score(
    user_input['host_acceptance_rate']
)
common_amenity_score, type_amenity_score = calc_amenity_scores(
    user_input['amenities'],
    user_input['room_new_type'],
)

# Single-row input for prediction (left un-encoded)
new_data = pd.DataFrame(
    [
        {
            'amenities_cnt': 12,
            'availability_365': 200,
            'price': 150,
            'host_about_length_group': 'medium',
            'room_type': 'Entire home/apt',
            'name_length_group': 'short',
            'description_length_group': 'long',
            'host_has_profile_pic': 1,
            'host_response_time_score': response_time_score,
            'type_amenity_score': type_amenity_score,
            'common_amenity_score': common_amenity_score,
            'host_acceptance_rate_score': acceptance_rate_score,
            'host_identity_verified': 1,
            'is_long_term': 0,
            'accommodates': 3,
        }
    ]
)

# Predict directly: the raw frame is handed to the pipeline without any
# manual preprocessing.
pred = pipeline.predict(new_data)
# Second column of predict_proba is kept as the superhost probability.
proba = pipeline.predict_proba(new_data)[:, 1]

print("예측 결과 (슈퍼호스트 여부):", pred[0])
print("슈퍼호스트 확률:", round(proba[0], 3))


예측 결과 (슈퍼호스트 여부): 0
슈퍼호스트 확률: 0.38
