# 1. Library & Seed Setting

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
import requests
import pickle
import lightgbm as lgb
import joblib
import re
import math

from bs4 import BeautifulSoup
from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll import scope
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
from tqdm import tqdm

plt.rcParams['font.family'] = 'NanumGothic'

In [2]:
def seed_setting(seed=1004) :
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_setting()

In [37]:
bus_data = pd.read_csv("bus_feature.csv")
subway_data = pd.read_csv("subway_feature.csv")
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
apart_data = pd.read_csv('Apart_data.csv', encoding='cp949')

  train_data = pd.read_csv("train.csv")


In [70]:
def Entire_Preprocessing(df) :
    # 문자열 컬럼만 찾아서 좌우 공백 제거
    df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

    # 전화번호, 팩스번호 k-홈페이지, 고용보험관리번호, k-등록일자, k-수정일자, 관리비 업로드, 단지소개기존clob 삭제
    df = df.drop(columns=['k-전화번호', 'k-팩스번호', 'k-홈페이지', '고용보험관리번호', 'k-등록일자', 'k-수정일자', '관리비 업로드', '단지소개기존clob'])

    # 본번, 부번, 시군구 삭제
    df = df.drop(columns=['본번', '부번', '시군구'])

    # 계약년월 분해
    df['계약(연)'] = df['계약년월'] // 100
    df['계약(월)'] = df['계약년월'] % 100
    df = df.drop(columns=['계약년월'])

    # 계약일 → 계약(일)
    df = df.rename(columns={"계약일" : "계약(일)"})

    # 건축년도 → 건물연식
    df['건물연식'] = df['건축년도'] - df['계약(연)']

    # 해제사유발생일 전처리
    df['해제사유발생여부'] = df['해제사유발생일'].notnull().astype(int)

    # 세대당_주차대수 특성 생성
    df['세대당_주차대수'] = df.apply(
    lambda row: row['주차대수'] / row['k-전체세대수'] if pd.notnull(row['주차대수']) and pd.notnull(row['k-전체세대수']) else np.nan,
    axis=1)

    # 등기신청일자 전처리
    df['등기신청여부'] = df['등기신청일자'].notnull().astype(int)

    # 불필요 특성 제거
    if 'target' in df.columns :
        columns_to_keep = [
        '전용면적(㎡)', '해제사유발생여부', 'k-전용면적별세대현황(60㎡이하)', '건물연식',
        'k-전용면적별세대현황(60㎡~85㎡이하)', '세대당_주차대수', '계약(연)', '계약(월)',
        '좌표X', '좌표Y', '아파트명', '등기신청여부', 'k-복도유형', 'k-단지분류(아파트,주상복합등등)', '도로명', 'target'
        ]
    else :
        columns_to_keep = [
        '전용면적(㎡)', '해제사유발생여부', 'k-전용면적별세대현황(60㎡이하)', '건물연식',
        'k-전용면적별세대현황(60㎡~85㎡이하)', '세대당_주차대수', '계약(연)', '계약(월)',
        '좌표X', '좌표Y', '아파트명', '등기신청여부', 'k-복도유형', 'k-단지분류(아파트,주상복합등등)', '도로명'
        ]

    df = df[columns_to_keep]

    # 특성 이름에서 k- 빼기
    df.columns = df.columns.str.replace('k-', '')

    # 군집화
    ## 카카오 API 호출 함수
    def get_coords_kakao(address, api_key):
        url = "https://dapi.kakao.com/v2/local/search/address.json"
        headers = {"Authorization": f"KakaoAK {api_key}"}
        params = {"query": address}
        response = requests.get(url, headers=headers, params=params)
        result = response.json()
        
        try:
            x = float(result['documents'][0]['x'])
            y = float(result['documents'][0]['y'])
            return x, y
        except IndexError:
            return None, None

    ## 도로명을 기반으로 좌표X와 좌표Y를 받아옴 (결측치에 한해서)
    def fill_missing_coords(row):
        if pd.isna(row['좌표X']) or pd.isna(row['좌표Y']):
            coords = roadname_to_coords.get(row['도로명'])
            if coords:
                return pd.Series(coords)
        return pd.Series([row['좌표X'], row['좌표Y']])
    
    roadname_to_coords = {}
    unique_roads = df.loc[df[['좌표X', '좌표Y']].isnull().any(axis=1), '도로명'].dropna().unique()

    api_key = '13b7b7a0b7a853100b56c56f19f6bc24'

    for road in tqdm(unique_roads) :
        x, y = get_coords_kakao(road, api_key)
        if x is not None and y is not None :
            roadname_to_coords[road] = (x, y)

    df[['좌표X', '좌표Y']] = df.apply(fill_missing_coords, axis=1)

    return df

In [28]:
df = Entire_Preprocessing(train_data)

100%|██████████| 8441/8441 [09:56<00:00, 14.15it/s]


In [None]:
df.to_csv('preprocessed_data.csv', index=False)

In [47]:
df = pd.read_csv('preprocessed_data.csv')

In [49]:
def create_high_price_apartment_feature(df, apart_data):
    high_price_roads = set(
        data['도로명'] for _, data in apart_data[
            apart_data['거래금액(만원)'].str.replace(',', '').astype(int) > 200000
        ].iterrows()
    )
    
    df_result = df.copy()
    df_result['고가아파트'] = df_result['도로명'].isin(high_price_roads).astype(int)
    
    return df_result

df = create_high_price_apartment_feature(df, apart_data)

In [53]:
df['좌표X'].isnull().sum()

22191

In [54]:
df = df.dropna(subset=['좌표X', '좌표Y'])

In [55]:
# 군집화
kmeans = joblib.load('kmeans_model.pkl')

df['cluster'] = kmeans.predict(df[['좌표X', '좌표Y']])
df = df.drop(columns=['좌표X', '좌표Y'])

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [56]:
df.head(3)

Unnamed: 0,전용면적(㎡),해제사유발생여부,전용면적별세대현황(60㎡이하),건물연식,전용면적별세대현황(60㎡~85㎡이하),세대당_주차대수,계약(연),계약(월),아파트명,등기신청여부,복도유형,"단지분류(아파트,주상복합등등)",도로명,target,고가아파트,cluster
0,79.97,0,20.0,-30,250.0,0.97037,2017,12,개포6차우성,1,계단식,아파트,언주로 3,124000,1,3
1,79.97,0,20.0,-30,250.0,0.97037,2017,12,개포6차우성,1,계단식,아파트,언주로 3,123500,1,3
2,54.98,0,20.0,-30,250.0,0.97037,2017,12,개포6차우성,1,계단식,아파트,언주로 3,91500,1,3


# 2. Modeling

In [57]:
def clean_column_name(name):
    return re.sub(r'[^\uac00-\ud7a3a-zA-Z0-9_]', '_', name)

df.columns = [clean_column_name(col) for col in df.columns]

In [58]:
# object 타입 특성 변환
categorical_features = df.select_dtypes(include=['object']).columns

for col in categorical_features:
    df[col] = df[col].astype('category')

In [59]:
X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
X_train.head(1)

Unnamed: 0,전용면적___,해제사유발생여부,전용면적별세대현황_60_이하_,건물연식,전용면적별세대현황_60__85_이하_,세대당_주차대수,계약_연_,계약_월_,아파트명,등기신청여부,복도유형,단지분류_아파트_주상복합등등_,도로명,고가아파트,cluster
331805,74.2,0,,-31,,,2020,7,경남,1,,,섬밭로 265,0,2


In [61]:
# 수치형 특성 스케일링(Data Leakage 방지 적용)
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns

scaler = RobustScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

In [62]:
model = LGBMRegressor()

param_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 300, 3000, 10)),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.2),
    'num_leaves' : scope.int(hp.quniform('num_leaves', 2, 50, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 0, 40, 1)),
    'min_data_in_leaf' : scope.int(hp.quniform('min_data_in_leaf', 0, 50, 1)),
    'feature_fraction_bynode' : hp.uniform('feature_fraction_bynode', 0.001, 1.0),
    'bagging_fraction' : hp.uniform('bagging_fraction', 0.001, 1.0),
    'bagging_freq' : scope.int(hp.quniform('bagging_freq', 0, 30, 1)),
    'min_child_weight': hp.uniform('min_child_weight', 0, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'drop_rate' : hp.uniform('drop_rate', 0, 1)
}

In [63]:
def objective(params):
    lgb_model = lgb.LGBMRegressor(
        n_estimators=params['n_estimators'],
        learning_rate=params['learning_rate'],
        num_leaves=params['num_leaves'],
        max_depth=params['max_depth'],
        min_data_in_leaf=params['min_data_in_leaf'],
        feature_fraction_bynode=params['feature_fraction_bynode'],
        bagging_fraction=params['bagging_fraction'],
        bagging_freq=params['bagging_freq'],
        min_child_weight=params['min_child_weight'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
        drop_rate=params['drop_rate'],
    )
    
    lgb_model.fit(X_train, y_train)
    
    lgb_pred = lgb_model.predict(X_test)
    
    mse = mean_squared_error(y_test, lgb_pred)
    rmse = math.sqrt(mse)
    
    return rmse

In [64]:
trials = Trials()
best = fmin(fn=objective, space=param_space, algo=tpe.suggest, max_evals=50, trials=trials)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018662 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11904                    
[LightGBM] [Info] Number of data points in the train set: 877304, number of used features: 14
[LightGBM] [Info] Start training from score 57544.421636
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11904                                             
[LightGBM] [Info] Number of data points in the train set: 877304, number of used features: 14
[LightGBM] [Info] Start training from score 57544.421636                       
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.

In [65]:
print(f"Best Hyperparameters: {best}")

Best Hyperparameters: {'bagging_fraction': 0.8546484922120094, 'bagging_freq': 0.0, 'drop_rate': 0.8251814575286062, 'feature_fraction_bynode': 0.4696145898729537, 'learning_rate': 0.1670756766194666, 'max_depth': 25.0, 'min_child_weight': 6.336352253884298, 'min_data_in_leaf': 23.0, 'n_estimators': 1580.0, 'num_leaves': 49.0, 'reg_alpha': 0.36887459188045524, 'reg_lambda': 0.22340494687276646}


In [66]:
best_model = lgb.LGBMRegressor(
    n_estimators=int(best['n_estimators']),
    learning_rate=best['learning_rate'],
    num_leaves=int(best['num_leaves']),
    max_depth=int(best['max_depth']),
    min_data_in_leaf=int(best['min_data_in_leaf']),
    feature_fraction_bynode=best['feature_fraction_bynode'],
    bagging_fraction=best['bagging_fraction'],
    bagging_freq=int(best['bagging_freq']),
    min_child_weight=best['min_child_weight'],
    reg_alpha=best['reg_alpha'],
    reg_lambda=best['reg_lambda'],
    drop_rate=best['drop_rate'],
)

best_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11904
[LightGBM] [Info] Number of data points in the train set: 877304, number of used features: 14
[LightGBM] [Info] Start training from score 57544.421636


In [67]:
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
rmse



6799.898164350143

In [68]:
feature_importance = best_model.feature_importances_

# 특성 중요도를 데이터프레임으로 정리
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
})

# 중요도가 높은 순으로 정렬
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df

Unnamed: 0,Feature,Importance
0,전용면적___,14707
6,계약_연_,13071
12,도로명,11118
8,아파트명,10960
3,건물연식,10821
7,계약_월_,8900
14,cluster,2943
13,고가아파트,1157
5,세대당_주차대수,683
2,전용면적별세대현황_60_이하_,660


# 3. Submission

In [71]:
test = Entire_Preprocessing(test_data)

100%|██████████| 2052/2052 [02:33<00:00, 13.34it/s]


In [72]:
test = create_high_price_apartment_feature(test, apart_data)

In [None]:
coord_mask = test[['좌표X', '좌표Y']].notna().all(axis=1)

test['cluster'] = 'NaN'

test.loc[coord_mask, 'cluster'] = kmeans.predict(test.loc[coord_mask, ['좌표X', '좌표Y']])
test = test.drop(columns=['좌표X', '좌표Y'])

KeyError: "None of [Index(['좌표X', '좌표Y'], dtype='object')] are in the [columns]"

In [75]:
test.head(1)

Unnamed: 0,전용면적(㎡),해제사유발생여부,전용면적별세대현황(60㎡이하),건물연식,전용면적별세대현황(60㎡~85㎡이하),세대당_주차대수,계약(연),계약(월),아파트명,등기신청여부,복도유형,"단지분류(아파트,주상복합등등)",도로명,고가아파트,cluster
0,79.97,0,20.0,-36,250.0,0.97037,2023,7,개포6차우성,1,계단식,아파트,언주로 3,1,3


In [76]:
def test_preprocessing(df) :
    df.columns = [clean_column_name(col) for col in df.columns]
    categorical_features = df.select_dtypes(include=['object']).columns

    for col in categorical_features:
        df[col] = df[col].astype('category')
        
    df[numeric_features] = scaler.transform(df[numeric_features])
    return df

In [77]:
test = test_preprocessing(test)

In [78]:
test.head(1)

Unnamed: 0,전용면적___,해제사유발생여부,전용면적별세대현황_60_이하_,건물연식,전용면적별세대현황_60__85_이하_,세대당_주차대수,계약_연_,계약_월_,아파트명,등기신청여부,복도유형,단지분류_아파트_주상복합등등_,도로명,고가아파트,cluster
0,-0.094108,0.0,-0.388258,-1.5,-0.014374,-0.205102,1.142857,0.166667,개포6차우성,0.0,계단식,아파트,언주로 3,1.0,3


In [79]:
pred = best_model.predict(test)

ValueError: train and valid dataset categorical_feature do not match.

In [149]:
submission = pd.read_csv('sample_submission.csv')
submission.head(1)

Unnamed: 0,target
0,179048


In [160]:
submission['target'] = pred.astype(int)

In [161]:
submission.head()

Unnamed: 0,target
0,190039
1,252180
2,299319
3,249677
4,195089


In [162]:
submission.to_csv('submission_1.csv', index=False)