# Library & Data Loading

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
import requests
import pickle
import joblib
import re
import math
import lightgbm as lgb

from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll import scope
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.neighbors import BallTree
from autogluon.tabular import TabularPredictor
from autogluon.tabular import FeatureMetadata
from tqdm import tqdm

plt.rcParams['font.family'] = 'NanumGothic'

In [2]:
def seed_setting(seed=1004):
    """Make the notebook's random draws repeatable.

    Seeds Python's ``random`` module and NumPy's legacy global generator with
    ``seed`` and exports the same value as the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed shared by all three settings.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once, with the default value, as soon as the helper exists.
seed_setting()

In [3]:
# Load every input table from the current working directory.
train = pd.read_csv('train.csv')                  # training rows; later passed to Entire_Preprocessing
test = pd.read_csv('test.csv')                    # rows to predict; preprocessed the same way further down
subway = pd.read_csv('subway_feature.csv')        # station table; its '위도'/'경도' columns feed the subway BallTree
bus = pd.read_csv('bus_feature.csv')              # bus-stop table; its 'Y좌표'/'X좌표' columns feed the bus BallTree
submission = pd.read_csv('sample_submission.csv') # template whose 'target' column is overwritten with predictions

  train = pd.read_csv('train.csv')


# Preprocessing

In [4]:
def Entire_Preprocessing(df, api_key=None, request_timeout=10):
    """Clean the raw frame and back-fill missing coordinates via Kakao geocoding.

    Steps:
      1. Strip leading/trailing whitespace from every text column.
      2. Drop identifier/contact/bookkeeping columns that are not used as features.
      3. For rows where '좌표X' or '좌표Y' is missing, geocode the row's '도로명'
         with the Kakao Local "search address" API and write both coordinates.

    The input frame is not modified; a new frame is returned.

    Args:
        df: Raw DataFrame containing the dropped columns as well as
            '도로명', '좌표X' and '좌표Y'.
        api_key: Kakao REST API key. When omitted, the ``KAKAO_API_KEY``
            environment variable is used. A key is only required if at least
            one road name actually has to be geocoded.
        request_timeout: Seconds to wait for each HTTP request.

    Returns:
        The cleaned DataFrame. Rows whose road name could not be geocoded keep
        their missing coordinates.

    Raises:
        KeyError: If one of the columns to drop is absent.
        RuntimeError: If geocoding is needed but no API key is available.
        requests.HTTPError: If Kakao answers with an HTTP error status
            (for example an invalid key or an exhausted quota).
    """
    def _is_text(col):
        # Classic object columns plus pandas' dedicated string dtypes.
        return col.dtype == "object" or pd.api.types.is_string_dtype(col.dtype)

    # Strip surrounding whitespace from text columns only.
    df = df.apply(lambda col: col.str.strip() if _is_text(col) else col)

    df = df.drop(columns=['본번', '부번', '시군구', 'k-전화번호', 'k-팩스번호',
                          'k-홈페이지', '고용보험관리번호', 'k-등록일자', 'k-수정일자',
                          '관리비 업로드', '단지소개기존clob'])

    def get_coords_kakao(address, api_key):
        """Return ``(x, y)`` of the first Kakao hit for ``address``, else ``(None, None)``."""
        url = "https://dapi.kakao.com/v2/local/search/address.json"
        headers = {"Authorization": f"KakaoAK {api_key}"}
        params = {"query": address}
        response = requests.get(url, headers=headers, params=params,
                                timeout=request_timeout)
        # Fail loudly on HTTP errors: otherwise a bad key would silently turn
        # every lookup into "no result" below.
        response.raise_for_status()

        try:
            first_hit = response.json()['documents'][0]
            return float(first_hit['x']), float(first_hit['y'])
        except (IndexError, KeyError, TypeError, ValueError):
            # No match, or a payload without usable coordinates.
            return None, None

    # Only road names attached to at least one incomplete coordinate pair are
    # geocoded, and each distinct name is requested once.
    missing_mask = df[['좌표X', '좌표Y']].isnull().any(axis=1)
    unique_roads = df.loc[missing_mask, '도로명'].dropna().unique()

    roadname_to_coords = {}
    if len(unique_roads) > 0:
        if api_key is None:
            api_key = os.environ.get('KAKAO_API_KEY')
        if not api_key:
            raise RuntimeError(
                "A Kakao REST API key is required to geocode missing coordinates: "
                "pass api_key=... or set the KAKAO_API_KEY environment variable."
            )

        for road in tqdm(unique_roads):
            x, y = get_coords_kakao(road, api_key)
            if x is not None and y is not None:
                roadname_to_coords[road] = (x, y)

    if roadname_to_coords:
        # Vectorised fill: look the road name up only for incomplete rows and
        # write BOTH coordinates whenever the lookup succeeded.
        road_of_missing = df['도로명'].where(missing_mask)
        filled_x = road_of_missing.map(
            {road: xy[0] for road, xy in roadname_to_coords.items()})
        filled_y = road_of_missing.map(
            {road: xy[1] for road, xy in roadname_to_coords.items()})
        found = filled_x.notna()
        # Positional assignment keeps this safe even with a non-unique index.
        df.loc[found, '좌표X'] = filled_x[found].to_numpy()
        df.loc[found, '좌표Y'] = filled_y[found].to_numpy()

    return df

In [9]:
# Clean the training table and geocode road names for rows with missing coordinates.
# This performs one HTTP request per distinct road name; the recorded run
# issued 8441 requests and took about ten minutes.
df = Entire_Preprocessing(train)

100%|██████████| 8441/8441 [10:06<00:00, 13.92it/s]


In [11]:
# Persist the preprocessed frame so the slow geocoding step need not be repeated.
df.to_csv('preprocessed_data.csv', index=False)

In [12]:
# Reload the cached preprocessing result; from here on `df` comes from disk.
df = pd.read_csv('preprocessed_data.csv')

In [None]:
# Drop the '등기신청일자' and '해제사유발생일' columns from the training frame.
# NOTE(review): this cell has no execution count and the test frame never drops
# these columns — confirm whether the fitted model saw them.
df = df.drop(columns=['등기신청일자', '해제사유발생일'])

# Skip (군집화, 버스·지하철 거리, 금리)

In [13]:
# Clustering & nearest bus / subway distance
# Training rows that still lack a coordinate are discarded here.
df = df.dropna(subset=['좌표X', '좌표Y'])
# SECURITY NOTE(review): joblib.load unpickles the file, which can execute
# arbitrary code — only load a model file from a trusted source.
# NOTE(review): the notebook does not show how 'kmeans_model.pkl' was fitted.
kmeans = joblib.load('kmeans_model.pkl')

# Cluster label predicted from the (X, Y) coordinate pair.
df['cluster'] = kmeans.predict(df[['좌표X', '좌표Y']])

def haversine(lat1, lon1, lat2, lon2):
    """Pairwise great-circle distance between two sets of points.

    All inputs are in degrees (they are converted with ``np.radians``). The
    first point set is expanded along a new trailing axis so that it
    broadcasts against the second set.

    Args:
        lat1, lon1: Latitudes/longitudes of the first set. Scalars, lists,
            NumPy arrays and pandas Series are accepted.
        lat2, lon2: Latitudes/longitudes of the second set, same accepted types.

    Returns:
        ``numpy.ndarray`` of distances scaled by R = 6371.0 (Earth radius in
        km). For 1-D inputs of lengths n and m the shape is ``(n, m)``; a
        scalar first point is treated as a set of length 1.
    """
    R = 6371.0

    # Convert to plain float arrays first: indexing a pandas Series or a NumPy
    # scalar with [:, np.newaxis] fails.
    lat1 = np.radians(np.atleast_1d(np.asarray(lat1, dtype=float)))[:, np.newaxis]
    lon1 = np.radians(np.atleast_1d(np.asarray(lon1, dtype=float)))[:, np.newaxis]
    lat2 = np.radians(np.asarray(lat2, dtype=float))
    lon2 = np.radians(np.asarray(lon2, dtype=float))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1]; clip so that
    # sqrt(1 - a) never yields NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

# Nearest bus stop / subway station per apartment.
# Every coordinate table is converted to (latitude, longitude) in radians,
# the layout that BallTree's haversine metric works with.
apt_coords_rad = np.deg2rad(df[['좌표Y', '좌표X']].to_numpy())
bus_coords_rad = np.deg2rad(bus[['Y좌표', 'X좌표']].to_numpy())
subway_coords_rad = np.deg2rad(subway[['위도', '경도']].to_numpy())

# One tree per transit type; both are reused later for the test rows.
bus_tree = BallTree(bus_coords_rad, metric='haversine')
subway_tree = BallTree(subway_coords_rad, metric='haversine')

# k=1 -> a single nearest neighbour, so each result has exactly one column.
dist_bus_rad, _ = bus_tree.query(apt_coords_rad, k=1)
dist_sub_rad, _ = subway_tree.query(apt_coords_rad, k=1)

# Angular distance (radians) times 6371.0 (Earth radius in km) -> kilometres.
df['closest_bus'] = dist_bus_rad.ravel() * 6371.0
df['closest_sub'] = dist_sub_rad.ravel() * 6371.0

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [16]:
# Interest rate — step 1: split the YYYYMM contract code into year and month.
df['계약(연)'] = df['계약년월'] // 100
df['계약(월)'] = df['계약년월'] % 100
df = df.drop(columns=['계약년월'])

# Monthly period of the contract. It is built through to_datetime because
# PeriodIndex(year=..., month=...) is deprecated since pandas 2.2; the result
# is a period[M] column either way.
df['연_월'] = pd.to_datetime(
    {'year': df['계약(연)'], 'month': df['계약(월)'], 'day': 1}
).dt.to_period('M')

# Interest rate that takes effect from each listed month ("YYYY-MM" -> rate).
rates = {
    "2025-02": 2.75, "2024-11": 3.00, "2024-10": 3.25, "2023-01": 3.50,
    "2022-11": 3.25, "2022-10": 3.00, "2022-08": 2.50, "2022-07": 2.25,
    "2022-05": 1.75, "2022-04": 1.50, "2022-01": 1.25, "2021-11": 1.00,
    "2021-08": 0.75, "2020-05": 0.50, "2020-03": 0.75, "2019-10": 1.25,
    "2019-07": 1.50, "2018-11": 1.75, "2017-11": 1.50, "2016-06": 1.25,
    "2015-06": 1.50, "2015-03": 1.75, "2014-10": 2.00, "2014-08": 2.25,
    "2013-05": 2.50, "2012-10": 2.75, "2012-07": 3.00, "2011-06": 3.25,
    "2011-03": 3.00, "2011-01": 2.75, "2010-11": 2.50, "2010-07": 2.25,
    "2009-02": 2.00, "2009-01": 2.50, "2008-12": 3.00, "2008-11": 4.00,
    "2008-10": 5.00, "2008-08": 5.25, "2007-08": 5.00, "2007-07": 4.75,
    "2006-08": 4.50
}

# One row per rate change, ordered chronologically.
rate_changes = pd.DataFrame(list(rates.items()), columns=['연_월', '금리'])
rate_changes['연_월'] = pd.PeriodIndex(rate_changes['연_월'], freq='M')
rate_changes = rate_changes.sort_values('연_월').reset_index(drop=True)

# Each rate applies from its own month up to the month before the next change.
rate_changes['start_month'] = rate_changes['연_월']
next_start = rate_changes['연_월'].shift(-1)
rate_changes['end_month'] = next_start - 1

# The latest rate has no successor; keep its interval open far into the future.
rate_changes.loc[rate_changes.index[-1], 'end_month'] = pd.Period('2099-12', freq='M')

def assign_rate(contract_period):
    """Look up the interest rate in force during ``contract_period``.

    Scans the module-level ``rate_changes`` table for rows whose
    ``start_month``..``end_month`` interval (both ends inclusive) contains the
    given period.

    Args:
        contract_period: Value comparable with the table's month columns
            (the notebook passes monthly ``pd.Period`` values).

    Returns:
        The '금리' value of the first matching row, or ``None`` when no
        interval contains the period.
    """
    has_started = rate_changes['start_month'] <= contract_period
    not_yet_ended = rate_changes['end_month'] >= contract_period
    matched = rate_changes[has_started & not_yet_ended]
    if matched.empty:
        return None
    return matched.iloc[0]['금리']
    
# Resolve each distinct contract month once and broadcast the result, instead
# of filtering the rate table again for every single row. Months that match no
# interval end up as missing values.
rate_by_period = {period: assign_rate(period) for period in df['연_월'].unique()}
df['금리'] = df['연_월'].map(rate_by_period)

# AutoML Model Training

In [17]:
# Columns to scale: every int64/float64 column except the label.
# The same list is reused later to transform the test frame.
numeric_features = list(filter(
    lambda name: name != 'target',
    df.select_dtypes(include=['int64', 'float64']).columns,
))

# Fit the scaler on the training data, then overwrite those columns in place.
scaler = RobustScaler()
scaler.fit(df[numeric_features])
df[numeric_features] = scaler.transform(df[numeric_features])

In [18]:
# Train an AutoGluon tabular regressor on the engineered training frame.
# Models are written to the 'autogluon_models' directory and scored by RMSE.
predictor = TabularPredictor(
    label='target',
    path='autogluon_models',
    eval_metric='root_mean_squared_error'
).fit(
    train_data=df,
    # Overall budget; the captured log reports it as 3600s.
    time_limit=3600,
    presets='best_quality',
    # Restrict the search to two LightGBM variants, told apart by name suffix:
    # one with extra_trees enabled ('XT') and one without ('Default').
    hyperparameters={
        'GBM': [
            {
                'extra_trees': True,
                'ag_args': {'name_suffix': 'XT'}
            },
            {
                'extra_trees': False,
                'ag_args': {'name_suffix': 'Default'}
            }
        ],
    },

    # Request up to 20 HPO trials per model.
    # NOTE(review): in the captured log each tuning phase prints
    # "Stopping HPO to satisfy time limit..." with its progress bar at 0/20, and
    # only a single trial (T1) per model is fitted — confirm whether the trial
    # count or the time limit should change.
    hyperparameter_tune_kwargs={
        'num_trials': 20,
        'scheduler': 'local',
        'searcher': 'auto'
    }
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.0
Python Version:     3.11.5
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Nov 5 00:21:55 UTC 2024
CPU Count:          24
Memory Avail:       16.25 GB / 23.36 GB (69.6%)
Disk Space Avail:   552.64 GB / 1006.85 GB (54.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of the 3600s of rem

[36m(_ray_fit pid=12732)[0m [1000]	valid_set's rmse: 8542.69
[36m(_ray_fit pid=12729)[0m [1000]	valid_set's rmse: 7707.48[32m [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(_ray_fit pid=12732)[0m [2000]	valid_set's rmse: 7707.43
[36m(_ray_fit pid=12731)[0m [2000]	valid_set's rmse: 7419.05
[36m(_ray_fit pid=12732)[0m [3000]	valid_set's rmse: 7355.79[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=12730)[0m [3000]	valid_set's rmse: 7018.86
[36m(_ray_fit pid=12731)[0m [3000]	valid_set's rmse: 7018.45


[36m(_ray_fit pid=12732)[0m 	Ran out of time, early stopping on iteration 3380. Best iteration is:
[36m(_ray_fit pid=12732)[0m 	[3380]	valid_set's rmse: 7267.16


[36m(_ray_fit pid=13468)[0m [1000]	valid_set's rmse: 7948.89
[36m(_ray_fit pid=13538)[0m [1000]	valid_set's rmse: 8331.61
[36m(_ray_fit pid=13468)[0m [2000]	valid_set's rmse: 7168.61[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=13537)[0m [2000]	valid_set's rmse: 7129.41[32m [repeated 2x across cluster][0m


[36m(_ray_fit pid=13468)[0m 	Ran out of time, early stopping on iteration 2864. Best iteration is:[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=13468)[0m 	[2864]	valid_set's rmse: 6861.4[32m [repeated 4x across cluster][0m
[36m(_dystack pid=10831)[0m 	Stopping HPO to satisfy time limit...
  0%|          | 0/20 [04:23<?, ?it/s]
[36m(_ray_fit pid=13594)[0m 	Ran out of time, early stopping on iteration 2648. Best iteration is:[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=13594)[0m 	[2648]	valid_set's rmse: 6870.15[32m [repeated 3x across cluster][0m
[36m(_dystack pid=10831)[0m Fitted model: LightGBMXT_BAG_L1/T1 ...
[36m(_dystack pid=10831)[0m 	-6962.2446	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=10831)[0m 	263.66s	 = Training   runtime
[36m(_dystack pid=10831)[0m 	204.12s	 = Validation runtime
[36m(_dystack pid=10831)[0m Hyperparameter tuning model: LightGBMDefault_BAG_L1 ... Tuning model for up to 267.02s of the 625.9

[36m(_ray_fit pid=14342)[0m [1000]	valid_set's rmse: 6553.86[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=14341)[0m [1000]	valid_set's rmse: 6810.3[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=14342)[0m [2000]	valid_set's rmse: 6106.01
[36m(_ray_fit pid=14338)[0m [2000]	valid_set's rmse: 6248.11
[36m(_ray_fit pid=14341)[0m [2000]	valid_set's rmse: 6413.23[32m [repeated 2x across cluster][0m


[36m(_ray_fit pid=14341)[0m 	Ran out of time, early stopping on iteration 2360. Best iteration is:
[36m(_ray_fit pid=14341)[0m 	[2360]	valid_set's rmse: 6345.02
[36m(_ray_fit pid=14336)[0m 	Ran out of time, early stopping on iteration 2789. Best iteration is:
[36m(_ray_fit pid=14336)[0m 	[2789]	valid_set's rmse: 5831.59


[36m(_ray_fit pid=15053)[0m [1000]	valid_set's rmse: 6487.25
[36m(_ray_fit pid=15142)[0m [1000]	valid_set's rmse: 6601.93
[36m(_ray_fit pid=15136)[0m [1000]	valid_set's rmse: 6404.74[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=15053)[0m [2000]	valid_set's rmse: 6070.57
[36m(_ray_fit pid=15142)[0m [2000]	valid_set's rmse: 6174.18
[36m(_ray_fit pid=15136)[0m [2000]	valid_set's rmse: 5948.55[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=15053)[0m [3000]	valid_set's rmse: 5894.6


[36m(_ray_fit pid=15053)[0m 	Ran out of time, early stopping on iteration 3253. Best iteration is:[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=15053)[0m 	[3253]	valid_set's rmse: 5865[32m [repeated 3x across cluster][0m
[36m(_dystack pid=10831)[0m 	Stopping HPO to satisfy time limit...
  0%|          | 0/20 [04:18<?, ?it/s]
[36m(_dystack pid=10831)[0m Fitted model: LightGBMDefault_BAG_L1/T1 ...
[36m(_dystack pid=10831)[0m 	-5945.9836	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=10831)[0m 	258.5s	 = Training   runtime
[36m(_dystack pid=10831)[0m 	180.96s	 = Validation runtime
[36m(_dystack pid=10831)[0m Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.00s of the 366.80s of remaining time.
[36m(_ray_fit pid=15267)[0m 	Ran out of time, early stopping on iteration 2892. Best iteration is:[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=15267)[0m 	[2892]	valid_set's rmse: 5624.7[32m [repeated 3x across clust

[36m(_ray_fit pid=15942)[0m [1000]	valid_set's rmse: 6477.57


[36m(_ray_fit pid=15943)[0m 	Ran out of time, early stopping on iteration 1768. Best iteration is:
[36m(_ray_fit pid=15943)[0m 	[1768]	valid_set's rmse: 6287.64
[36m(_ray_fit pid=15942)[0m 	Ran out of time, early stopping on iteration 1754. Best iteration is:
[36m(_ray_fit pid=15942)[0m 	[1753]	valid_set's rmse: 6289.65


[36m(_ray_fit pid=16471)[0m [1000]	valid_set's rmse: 6595.63[32m [repeated 4x across cluster][0m


[36m(_ray_fit pid=16468)[0m 	Ran out of time, early stopping on iteration 1753. Best iteration is:[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=16468)[0m 	[1753]	valid_set's rmse: 6548.45[32m [repeated 3x across cluster][0m
[36m(_dystack pid=10831)[0m 	Stopping HPO to satisfy time limit...
  0%|          | 0/20 [02:20<?, ?it/s]
[36m(_dystack pid=10831)[0m Fitted model: LightGBMXT_BAG_L2/T1 ...
[36m(_dystack pid=10831)[0m 	-6386.1365	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=10831)[0m 	140.73s	 = Training   runtime
[36m(_dystack pid=10831)[0m 	37.49s	 = Validation runtime
[36m(_dystack pid=10831)[0m Hyperparameter tuning model: LightGBMDefault_BAG_L2 ... Tuning model for up to 164.94s of the 224.98s of remaining time.
  0%|          | 0/20 [00:00<?, ?it/s]
[36m(_ray_fit pid=16471)[0m 	Ran out of time, early stopping on iteration 1837. Best iteration is:[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=16471)[0m 	[1837]	val

[36m(_ray_fit pid=17149)[0m [1000]	valid_set's rmse: 5826.95[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=17520)[0m [1000]	valid_set's rmse: 6009.86[32m [repeated 3x across cluster][0m


[36m(_ray_fit pid=17149)[0m 	Ran out of time, early stopping on iteration 2399. Best iteration is:
[36m(_ray_fit pid=17149)[0m 	[2399]	valid_set's rmse: 5786.39
[36m(_ray_fit pid=17147)[0m 	Ran out of time, early stopping on iteration 2304. Best iteration is:
[36m(_ray_fit pid=17147)[0m 	[2269]	valid_set's rmse: 6023.76


[36m(_ray_fit pid=17631)[0m [1000]	valid_set's rmse: 6249.44[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=17520)[0m [2000]	valid_set's rmse: 5978.31


[36m(_ray_fit pid=17520)[0m 	Ran out of time, early stopping on iteration 2473. Best iteration is:
[36m(_ray_fit pid=17520)[0m 	[2473]	valid_set's rmse: 5962.22


[36m(_ray_fit pid=17631)[0m [2000]	valid_set's rmse: 6207.19


[36m(_ray_fit pid=17631)[0m 	Ran out of time, early stopping on iteration 2418. Best iteration is:
[36m(_ray_fit pid=17631)[0m 	[2410]	valid_set's rmse: 6191.24


[36m(_ray_fit pid=17829)[0m [1000]	valid_set's rmse: 6162.34
[36m(_ray_fit pid=17828)[0m [1000]	valid_set's rmse: 6075.26
[36m(_ray_fit pid=17829)[0m [2000]	valid_set's rmse: 6125.28
[36m(_ray_fit pid=17829)[0m [3000]	valid_set's rmse: 6098.68
[36m(_ray_fit pid=17829)[0m [4000]	valid_set's rmse: 6073.94


[36m(_ray_fit pid=17829)[0m 	Ran out of time, early stopping on iteration 4418. Best iteration is:
[36m(_ray_fit pid=17829)[0m 	[4418]	valid_set's rmse: 6065.88
[36m(_dystack pid=10831)[0m 	Stopping HPO to satisfy time limit...
  0%|          | 0/20 [02:23<?, ?it/s]
[36m(_dystack pid=10831)[0m Fitted model: LightGBMDefault_BAG_L2/T1 ...
[36m(_dystack pid=10831)[0m 	-6041.1576	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=10831)[0m 	143.27s	 = Training   runtime
[36m(_dystack pid=10831)[0m 	26.88s	 = Validation runtime
[36m(_dystack pid=10831)[0m Fitting model: WeightedEnsemble_L3 ... Training model for up to 360.00s of the 80.91s of remaining time.
[36m(_dystack pid=10831)[0m 	Ensemble Weights: {'LightGBMDefault_BAG_L1/T1': 0.52, 'LightGBMDefault_BAG_L2/T1': 0.28, 'LightGBMXT_BAG_L2/T1': 0.16, 'LightGBMXT_BAG_L1/T1': 0.04}
[36m(_dystack pid=10831)[0m 	-5870.1972	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=10831)[0m 	0.35

  0%|          | 0/20 [00:00<?, ?it/s]

	Memory not enough to fit 8 folds in parallel. Will train 4 folds in parallel instead (Estimated 16.59% memory usage per fold, 66.37%/80.00% total).
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (4 workers, per: cpus=6, gpus=0, memory=16.59%)
	Stopping HPO to satisfy time limit...
Fitted model: LightGBMXT_BAG_L1/T1 ...
	-6188.3124	 = Validation score   (-root_mean_squared_error)
	688.47s	 = Training   runtime
	735.63s	 = Validation runtime
Hyperparameter tuning model: LightGBMDefault_BAG_L1 ... Tuning model for up to 817.03s of the 2034.88s of remaining time.


  0%|          | 0/20 [00:00<?, ?it/s]

	Memory not enough to fit 8 folds in parallel. Will train 4 folds in parallel instead (Estimated 16.55% memory usage per fold, 66.21%/80.00% total).
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (4 workers, per: cpus=6, gpus=0, memory=16.55%)
	Stopping HPO to satisfy time limit...
Fitted model: LightGBMDefault_BAG_L1/T1 ...
	-5584.5134	 = Validation score   (-root_mean_squared_error)
	672.81s	 = Training   runtime
	657.28s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.00s of the 1361.17s of remaining time.
	Ensemble Weights: {'LightGBMDefault_BAG_L1/T1': 0.792, 'LightGBMXT_BAG_L1/T1': 0.208}
	-5539.4574	 = Validation score   (-root_mean_squared_error)
	0.25s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting 2 L2 models, fit_strategy="sequential" ...
Hyperparameter tuning model: LightGBMXT_BAG_L2 ... Tuning model for up to 612.4s of the 1360.82s of remaining time.


  0%|          | 0/20 [00:00<?, ?it/s]

	Memory not enough to fit 8 folds in parallel. Will train 4 folds in parallel instead (Estimated 17.43% memory usage per fold, 69.73%/80.00% total).
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (4 workers, per: cpus=6, gpus=0, memory=17.43%)
	Stopping HPO to satisfy time limit...
Fitted model: LightGBMXT_BAG_L2/T1 ...
	-5889.5428	 = Validation score   (-root_mean_squared_error)
	645.85s	 = Training   runtime
	639.2s	 = Validation runtime
Hyperparameter tuning model: LightGBMDefault_BAG_L2 ... Tuning model for up to 612.4s of the 714.1s of remaining time.


  0%|          | 0/20 [00:00<?, ?it/s]

	Memory not enough to fit 8 folds in parallel. Will train 4 folds in parallel instead (Estimated 17.46% memory usage per fold, 69.83%/80.00% total).
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (4 workers, per: cpus=6, gpus=0, memory=17.46%)
	Memory not enough to fit 8 folds in parallel. Will train 4 folds in parallel instead (Estimated 17.58% memory usage per fold, 70.33%/80.00% total).
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (4 workers, per: cpus=6, gpus=0, memory=17.58%)
	Memory not enough to fit 8 folds in parallel. Will train 4 folds in parallel instead (Estimated 17.49% memory usage per fold, 69.95%/80.00% total).
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (4 workers, per: cpus=6, gpus=0, memory=17.49%)
	Memory not enough to fit 8 folds in parallel. Will train 4 folds in parallel instead (Estimated 17.57% memory usage per fold, 70.30%/80.00% total).
	F

# Test & Submission

In [19]:
# Apply the same cleaning and geocoding to the test table
# (the recorded run issued 2052 lookups in under three minutes).
test_df = Entire_Preprocessing(test)

100%|██████████| 2052/2052 [02:38<00:00, 12.98it/s]


# Skip

In [20]:
# Clustering & nearest bus / subway distance for the test rows.
# SECURITY NOTE(review): joblib.load unpickles the file, which can execute
# arbitrary code — only load a model file from a trusted source.
kmeans = joblib.load('kmeans_model.pkl')

# Test rows cannot be dropped, so rows without coordinates are kept and their
# derived features are left as NaN.
coords_exist = test_df[['좌표X', '좌표Y']].notna().all(axis=1)

# Predict only when at least one row has coordinates: an empty selection would
# make kmeans.predict raise, while the tree queries below are already guarded.
if coords_exist.any():
    test_df.loc[coords_exist, 'cluster'] = kmeans.predict(
        test_df.loc[coords_exist, ['좌표X', '좌표Y']]
    )

test_df.loc[~coords_exist, 'cluster'] = np.nan

# (latitude, longitude) in radians, matching how bus_tree / subway_tree were built.
test_apt_coords_rad = np.radians(
    test_df.loc[coords_exist, ['좌표Y', '좌표X']].to_numpy()
)

if len(test_apt_coords_rad) > 0:
    dist_bus_rad, _ = bus_tree.query(test_apt_coords_rad, k=1)
    dist_sub_rad, _ = subway_tree.query(test_apt_coords_rad, k=1)

    # Angular distance (radians) times 6371.0 (Earth radius in km) -> kilometres.
    test_df.loc[coords_exist, 'closest_bus'] = dist_bus_rad.flatten() * 6371.0
    test_df.loc[coords_exist, 'closest_sub'] = dist_sub_rad.flatten() * 6371.0

# Also guarantees that both columns exist when no row had coordinates.
test_df.loc[~coords_exist, 'closest_bus'] = np.nan
test_df.loc[~coords_exist, 'closest_sub'] = np.nan

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [22]:
# Interest rate for the test rows.
test_df['계약(연)'] = test_df['계약년월'] // 100
test_df['계약(월)'] = test_df['계약년월'] % 100

# Monthly period of the contract. It is built through to_datetime because
# PeriodIndex(year=..., month=...) is deprecated since pandas 2.2.
test_df['연_월'] = pd.to_datetime(
    {'year': test_df['계약(연)'], 'month': test_df['계약(월)'], 'day': 1}
).dt.to_period('M')

# The rate table is NOT redefined here: assign_rate reads the `rate_changes`
# frame built in the training section, so a second copy of the `rates` dict in
# this cell had no effect on the lookup and could silently drift out of sync.
test_rate_by_period = {
    period: assign_rate(period) for period in test_df['연_월'].unique()
}
test_df['금리'] = test_df['연_월'].map(test_rate_by_period)

# Start

In [29]:
# Treat empty strings as missing values.
test_df = test_df.replace('', np.nan)

# Reuse the scaler fitted on the training frame (no refit), applied to the
# same column list.
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

In [30]:
# AutoGluon prediction for the prepared test frame
pred = predictor.predict(test_df)



In [31]:
# Round the predictions to whole numbers and store them as integers.
submission['target'] = pred.round().astype(int)

In [32]:
# Quick look at the first ten submission rows.
submission.head(10)

Unnamed: 0,target
0,171520
1,274593
2,328479
3,278501
4,212132
5,229851
6,236032
7,222793
8,175831
9,381545


In [34]:
# Write the final submission file, without the DataFrame index.
submission.to_csv('AutoML_final.csv', index=False)