In [2]:
# --- 1. 라이브러리 임포트 ---
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb # LightGBM 임포트
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [5]:
# 경로를 os.path.join()으로 수정
data_dir = os.path.join(os.getenv('HOME'), 'aiffel/kaggle_kakr_housing/data')

# train, test 데이터 경로 설정
train_data_path = os.path.join(data_dir, 'train.csv')
test_data_path = os.path.join(data_dir, 'test.csv')

# 데이터 로드
train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)

In [9]:
# --- 3. 기본 데이터 준비 ---
y_original = train['price']
y_log = np.log1p(y_original)
test_id = test['id']     # 테스트 ID 저장

# 가격 컬럼 및 ID 컬럼 드롭 (ID는 test_id로 저장했으므로)
train_df = train.drop(columns=['price'])

# 나중에 분리하기 위해 학습 데이터 길이 저장
train_len = len(train_df)

# 학습 데이터와 테스트 데이터 합치기
data = pd.concat((train_df, test), axis=0).reset_index(drop=True)
# ID 컬럼은 test_id로 저장했으므로 전체 데이터에서 제거
data = data.drop(columns=['id'])
print("Train and test data concatenated.")

Train and test data concatenated.


In [10]:
# --- 4. Feature Engineering (위치 정보 유지!) ---

# 4.1. Date 처리
def convert_date(df: pd.DataFrame) -> pd.DataFrame:
    if 'date' in df.columns:
        df['date'] = df['date'].apply(lambda x: int(str(x)[:6]))
    else:
        print("Warning: 'date' column not found.")
    return df
data = convert_date(data)

# 4.2. Year/Age 특징 생성
if 'date' in data.columns and 'yr_built' in data.columns:
    data['sale_year'] = data['date'].astype(str).str[:4].astype(int)
    data['age'] = data['sale_year'] - data['yr_built']
else:
    print("Warning: Could not create 'age' feature.")

if 'sale_year' in data.columns and 'yr_renovated' in data.columns and 'yr_built' in data.columns:
    effective_renovation_year = np.where(
        data['yr_renovated'] == 0, data['yr_built'], data['yr_renovated']
    )
    data['age_renovated'] = data['sale_year'] - effective_renovation_year
else:
     print("Warning: Could not create 'age_renovated' feature.")

# 4.3. Area/Ratio 특징 생성
data['sqft_basement'] = data['sqft_basement'].fillna(0) # 결측치 처리

area_cols = ['sqft_living', 'sqft_basement', 'sqft_above', 'sqft_lot', 'sqft_living15', 'sqft_lot15']
if all(col in data.columns for col in area_cols):
    data['total_sqft'] = data[area_cols].sum(axis=1)
    data['basement_finished_ratio'] = data['sqft_basement'] / data['total_sqft']
    data['basement_finished_ratio'] = data['basement_finished_ratio'].fillna(0)
    data.loc[data['total_sqft'] == 0, 'basement_finished_ratio'] = 0
else:
    print(f"Warning: Could not create 'total_sqft'. Missing components.")
    if 'basement_finished_ratio' not in data.columns: data['basement_finished_ratio'] = 0

if 'sqft_living' in data.columns and 'sqft_lot' in data.columns:
    data['living_lot_ratio'] = data['sqft_living'] / data['sqft_lot']
    data.loc[data['sqft_lot'] == 0, 'living_lot_ratio'] = 0
else:
    print("Warning: Could not create 'living_lot_ratio'.")
    if 'living_lot_ratio' not in data.columns: data['living_lot_ratio'] = 0

# --- [중요] 파생 위치 Feature 추가 영역 ---
print("\nPlaceholder: Add derived location features (zipcode stats, distance, clusters) here for better performance.")
# 예시: data['zip_mean_price'] = data.groupby('zipcode')['price'].transform('mean')
#      (주의: 위 예시는 Data Leakage 발생 가능. K-Fold 내에서 처리 필요)
# 지금은 Raw 위도, 경도, 우편번호만 사용합니다.

# --- 4.4. 범주형 Feature 처리: Zipcode (One-Hot Encoding) ---
if 'zipcode' in data.columns:
    print("Applying One-Hot Encoding to 'zipcode'...")
    data['zipcode'] = data['zipcode'].astype(str) # 문자열로 변환
    data = pd.get_dummies(data, columns=['zipcode'], prefix='zip', dummy_na=False)
    print(f"Zipcode encoded. Number of columns increased to: {data.shape[1]}")
else:
    print("Warning: 'zipcode' column not found for encoding.")

# --- 4.5. 불필요 컬럼 제거 (위치 정보는 유지!) ---
cols_to_drop = ['date', 'yr_built', 'yr_renovated', 'sale_year', 'sqft_basement']
cols_to_drop_existing = [col for col in cols_to_drop if col in data.columns]
data = data.drop(columns=cols_to_drop_existing, errors='ignore')
print(f"\nColumns dropped: {cols_to_drop_existing}")
print(f"Total columns remaining: {data.shape[1]}")
# print("Remaining columns:", data.columns.tolist()) # 너무 많으면 주석 처리


Placeholder: Add derived location features (zipcode stats, distance, clusters) here for better performance.
Applying One-Hot Encoding to 'zipcode'...
Zipcode encoded. Number of columns increased to: 94

Columns dropped: ['date', 'yr_built', 'yr_renovated', 'sale_year', 'sqft_basement']
Total columns remaining: 89


In [11]:
# --- 5. 데이터 분리 ---
X = data[:train_len]
X_test = data[train_len:] # 테스트셋 (나중에 최종 예측 시 사용)
print("\nData split back into X (train features) and X_test (test features).")
print("Shape of X:", X.shape)
print("Shape of X_test:", X_test.shape)


Data split back into X (train features) and X_test (test features).
Shape of X: (15035, 89)
Shape of X_test: (6468, 89)


In [12]:
# --- 6. 스케일링 (수치형만, One-Hot 제외) ---
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
zip_cols = [col for col in X.columns if col.startswith('zip_')]
numerical_cols_to_scale = [col for col in numerical_cols if col not in zip_cols]

if numerical_cols_to_scale:
    print(f"\nScaling {len(numerical_cols_to_scale)} numerical features...")
    scaler = StandardScaler()
    X[numerical_cols_to_scale] = scaler.fit_transform(X[numerical_cols_to_scale])
    # 테스트 데이터에도 동일한 스케일러 적용
    X_test[numerical_cols_to_scale] = scaler.transform(X_test[numerical_cols_to_scale])
    print("Scaling complete.")
else:
    print("\nNo numerical columns found to scale.")


Scaling 19 numerical features...
Scaling complete.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [13]:
# --- 7. K-Fold 교차 검증 (최적 파라미터 사용) ---

# 7.1. 이전 RandomizedSearch에서 찾은 최적 파라미터 사용
best_params = {'bagging_fraction': 0.7159005811655073, 'feature_fraction': 0.6644885149016018, 'lambda_l1': 5.233480488540084, 'lambda_l2': 1.7079750342958222, 'learning_rate': 0.021496797273657196, 'max_depth': 10, 'metric': 'rmse', 'n_estimators': 1851, 'num_leaves': 39, 'objective': 'regression_l1', 'seed': 42}

final_params = {}
for key, value in best_params.items():
    if isinstance(value, list) and len(value) > 0: final_params[key] = value[0]
    else: final_params[key] = value
if 'random_state' not in final_params: final_params['random_state'] = 42
final_params.pop('seed', None)
if 'n_estimators' not in final_params: final_params['n_estimators'] = 1851 # best_params에 있었음
final_params['verbose'] = -1 # 로그 줄이기

print("\nUsing best parameters for K-Fold CV:", final_params)

# 7.2. K-Fold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 7.3. K-Fold 루프 실행
original_scale_fold_rmses = []
print("\nRunning K-Fold CV with best parameters AND raw location features...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_log)):
    print(f"--- Fold {fold+1} ---")
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_log_fold, y_val_log_fold = y_log.iloc[train_idx], y_log.iloc[val_idx]
    y_val_original_fold = y_original.iloc[val_idx]

    model = lgb.LGBMRegressor(**final_params)
    model.fit(X_train_fold, y_train_log_fold,
              eval_set=[(X_val_fold, y_val_log_fold)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(100, verbose=False)])

    y_pred_log = model.predict(X_val_fold)
    y_pred_original = np.expm1(y_pred_log)
    y_pred_original[y_pred_original < 0] = 0

    rmse = np.sqrt(mean_squared_error(y_val_original_fold, y_pred_original))
    original_scale_fold_rmses.append(rmse)
    print(f"Fold {fold+1} Original Scale RMSE: {rmse:.4f}")

# 7.4. 최종 평균 CV RMSE 출력
mean_original_rmse = np.mean(original_scale_fold_rmses)
print(f"\n======================================================")
print(f"Estimated Average CV RMSE (Original Scale) with raw location features: {mean_original_rmse:.4f}")
print(f"======================================================")

target_rmse = 110000
print(f"Target RMSE: {target_rmse}")
if mean_original_rmse < target_rmse:
    print("CV score is below the target! Good sign.")
else:
    print(f"CV score is still above the target by {mean_original_rmse - target_rmse:.4f}.")
    print("-> Next step: Implement derived location features (zipcode stats, distance, clusters) for further improvement.")


Using best parameters for K-Fold CV: {'bagging_fraction': 0.7159005811655073, 'feature_fraction': 0.6644885149016018, 'lambda_l1': 5.233480488540084, 'lambda_l2': 1.7079750342958222, 'learning_rate': 0.021496797273657196, 'max_depth': 10, 'metric': 'rmse', 'n_estimators': 1851, 'num_leaves': 39, 'objective': 'regression_l1', 'random_state': 42, 'verbose': -1}

Running K-Fold CV with best parameters AND raw location features...
--- Fold 1 ---
Fold 1 Original Scale RMSE: 110956.5094
--- Fold 2 ---
Fold 2 Original Scale RMSE: 103203.3774
--- Fold 3 ---
Fold 3 Original Scale RMSE: 113497.6596
--- Fold 4 ---
Fold 4 Original Scale RMSE: 145738.0378
--- Fold 5 ---
Fold 5 Original Scale RMSE: 115096.9355

Estimated Average CV RMSE (Original Scale) with raw location features: 117698.5039
Target RMSE: 110000
CV score is still above the target by 7698.5039.
-> Next step: Implement derived location features (zipcode stats, distance, clusters) for further improvement.


In [14]:
# --- 8. [참고] 최종 예측 및 제출 파일 생성 (대회 종료됨) ---
# 필요 시 주석 해제하여 실행 (현재는 CV 결과 확인이 주 목적)
print("\nTraining final model...")
final_model = lgb.LGBMRegressor(**final_params)
final_model.fit(X, y_log)
print("Predicting test data...")
test_pred_log = final_model.predict(X_test)
test_pred_original = np.expm1(test_pred_log)
test_pred_original[test_pred_original < 0] = 0
submission = pd.DataFrame({'id': test_id, 'price': test_pred_original})
submission_path = os.path.join(data_dir, 'submission_lgbm_with_raw_location_v1.csv')
submission.to_csv(submission_path, index=False)
print(f"Submission file saved to: {submission_path}")


Training final model...
Predicting test data...
Submission file saved to: /aiffel/aiffel/kaggle_kakr_housing/data/submission_lgbm_with_raw_location_v1.csv
