In [15]:
import pandas as pd
import pingouin as pg
import numpy as np
import shap
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.compose import ColumnTransformer
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import platform
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder



# Matplotlib text setup: select a Korean-capable font (Malgun Gothic, noted in
# the original as the Windows choice) and disable the Unicode minus glyph so
# negative tick labels still render with that font.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Location of the dataset; put an absolute path here if needed.
csv_path = 'superhost.csv'

# Read the CSV: the first row supplies the column names and the `id` column
# becomes the index.
read_options = dict(header=0, index_col='id', encoding='utf-8-sig')
df = pd.read_csv(csv_path, **read_options)


FileNotFoundError: [Errno 2] No such file or directory: 'superhost.csv'

In [None]:
# 1. Score-conversion helpers
def response_time_to_score(response_time_str):
    """Map a host response-time label to an ordinal score.

    Args:
        response_time_str: Label such as ``'within an hour'``; matching is
            case-insensitive.

    Returns:
        4 for ``'within an hour'`` down to 1 for ``'a few days or more'``,
        or 0 when the value is not one of the known labels. Non-string
        values (e.g. ``None`` or a float NaN) also yield 0.
    """
    mapping = {
        'within an hour': 4,
        'within a few hours': 3,
        'within a day': 2,
        'a few days or more': 1
    }
    # Missing values are not strings and have no .lower(); treat them like
    # any other unrecognised label instead of raising AttributeError.
    if not isinstance(response_time_str, str):
        return 0
    return mapping.get(response_time_str.lower(), 0)

def response_rate_to_score(rate_percent):
    """Convert a response rate given in percent into a quartile score.

    Args:
        rate_percent: Rate on a 0-100 scale.

    Returns:
        1 for rates up to 25%, 2 up to 50%, 3 up to 75%, otherwise 4.
        A value that fails every ``<=`` comparison (such as NaN) also
        lands in the final bucket and returns 4.
    """
    fraction = rate_percent / 100
    # Walk the inclusive upper bounds in ascending order; the first bound
    # that contains the fraction decides the score.
    for score, upper_bound in enumerate((0.25, 0.5, 0.75), start=1):
        if fraction <= upper_bound:
            return score
    return 4

def acceptance_rate_to_score(rate_percent):
    """Convert an acceptance rate given in percent into a quartile score.

    Args:
        rate_percent: Rate on a 0-100 scale.

    Returns:
        1 for rates up to 25%, 2 up to 50%, 3 up to 75%, otherwise 4.
        A value that fails every ``<=`` comparison (such as NaN) also
        returns 4.
    """
    fraction = rate_percent / 100
    # (inclusive upper bound, score) pairs in ascending order.
    bands = ((0.25, 1), (0.5, 2), (0.75, 3))
    matching_scores = (score for upper_bound, score in bands if fraction <= upper_bound)
    return next(matching_scores, 4)

# Amenities checked for every listing, regardless of room type.
common_amenities = ['Carbon monoxide alarm', 'Essentials', 'Hangers', 'Smoke alarm', 'Wifi']
# Amenities checked per `room_new_type` key.
type_amenity_dict = {
    'high': ['Air conditioning', 'Building staff', 'Elevator', 'Gym', 'Heating', 'Paid parking off premises', 'Shampoo'],
    'low-mid': ['Cleaning products', 'Dining table', 'Exterior security cameras on property', 'Free street parking', 
                'Freezer', 'Laundromat nearby', 'Lock on bedroom door', 'Microwave'],
    'mid': ['Cooking basics', 'Kitchen', 'Oven'],
    'upper-mid': ['Bathtub', 'Cleaning products', 'Cooking basics', 'Dishes and silverware', 'Elevator', 'Freezer']
}

def calc_amenity_scores(amenities_list, room_new_type):
    """Score how many reference amenities a listing offers.

    Args:
        amenities_list: Iterable of amenity names offered by the listing.
            ``None`` is treated as an empty list.
        room_new_type: Key into ``type_amenity_dict``; an unknown key gives
            a type score of 0.

    Returns:
        A ``(common_score, type_score)`` tuple. Each score is the fraction of
        the reference amenities present in ``amenities_list``, rounded to
        three decimals.
    """
    # Deduplicate first: a repeated amenity must count once, otherwise the
    # ratio could exceed 1.0.
    offered = set(amenities_list) if amenities_list is not None else set()

    common_match = len(offered.intersection(common_amenities)) / len(common_amenities)

    type_amenities = type_amenity_dict.get(room_new_type, [])
    if type_amenities:
        type_match = len(offered.intersection(type_amenities)) / len(type_amenities)
    else:
        # No reference list for this room type: avoid dividing by zero.
        type_match = 0
    return round(common_match, 3), round(type_match, 3)

# 2. Example user input (raw values, before any scoring or encoding)
user_input_raw = dict(
    amenities_cnt=12,
    availability_365=200,
    price=150,
    host_about_length_group='medium',
    room_type='Entire home/apt',
    name_length_group='short',
    description_length_group='long',
    host_has_profile_pic=1,
    host_response_time='within an hour',
    host_response_rate=85,
    host_acceptance_rate=78,
    host_identity_verified=1,
    is_long_term=0,
    accommodates=3,
    amenities=['Wifi', 'Essentials', 'Hangers', 'Oven', 'Kitchen'],
    room_new_type='mid',
)

# 3. Score calculation and preprocessing
# Fields copied through unchanged; they come first in the feature row.
user_input_processed = {
    field: user_input_raw[field]
    for field in (
        'amenities_cnt',
        'availability_365',
        'price',
        'host_about_length_group',
        'room_type',
        'name_length_group',
        'description_length_group',
        'host_has_profile_pic',
    )
}
# Host responsiveness fields enter the row as ordinal scores.
user_input_processed.update(
    host_response_time_score=response_time_to_score(user_input_raw['host_response_time']),
    host_response_rate_score=response_rate_to_score(user_input_raw['host_response_rate']),
    host_acceptance_rate_score=acceptance_rate_to_score(user_input_raw['host_acceptance_rate']),
)
# Remaining pass-through fields.
user_input_processed.update({
    field: user_input_raw[field]
    for field in ('host_identity_verified', 'is_long_term', 'accommodates')
})

# Append the amenity scores
common_score, type_score = calc_amenity_scores(
    user_input_raw['amenities'],
    user_input_raw['room_new_type'],
)
user_input_processed.update(
    common_amenity_score=common_score,
    type_amenity_score=type_score,
)

# 4. Convert to a single-row DataFrame
new_data = pd.DataFrame([user_input_processed])

# 5. Match the column order used when the model was trained
# The pkl file holds the column list saved at training time.
# NOTE(review): `joblib` is not imported in the cells visible here; confirm it
# is imported before this cell runs.
train_columns = joblib.load("train_columns.pkl")

def preprocess_input(new_df, train_cols):
    """One-hot encode ``new_df`` and align it to the training columns.

    Args:
        new_df: DataFrame of raw feature rows; it is not modified.
        train_cols: Column labels, in order, that the result must have.

    Returns:
        A new DataFrame whose columns are exactly ``train_cols``. Columns
        absent after encoding are filled with 0, and encoded columns not
        listed in ``train_cols`` are dropped.
    """
    new_encoded = pd.get_dummies(new_df, drop_first=False)
    # A single reindex adds the missing columns (as 0), drops the extras and
    # applies the training order, instead of inserting columns one at a time.
    return new_encoded.reindex(columns=train_cols, fill_value=0)

# Align the single-row input with the training feature layout.
X_new = preprocess_input(new_data, train_columns)

# 6. Load the model and predict
# The pkl file holds the saved, trained model.
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted source. `joblib` is also not imported in the cells visible here;
# confirm it is imported before this cell runs.
model = joblib.load("superhost_rf_model.pkl")

pred = model.predict(X_new)
class_probabilities = model.predict_proba(X_new)
# Keep only the second probability column (index 1).
proba = class_probabilities[:, 1]

# 7. Print the result
predicted_label = pred[0]
superhost_probability = round(proba[0], 3)
print("예측 결과 (슈퍼호스트 여부):", predicted_label)
print("슈퍼호스트 확률:", superhost_probability)