In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import uniform, randint
from xgboost import XGBRegressor

# Install a global 'ignore' filter: no warning category or module is given, so
# it applies to every warning raised after this point.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Load the training and test CSV files from the working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

# Summarise columns, non-null counts and dtypes of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [3]:
# Per-column null counts of the training frame, computed once and reused.
null_counts = train_df.isnull().sum()

missing_data = pd.DataFrame({
    'missing': null_counts,
    'percentage': null_counts / len(train_df) * 100,
})

# Keep only the columns that actually contain nulls and show the worst first.
missing_data = missing_data.loc[missing_data['missing'] > 0]
print(missing_data.sort_values(by='missing', ascending=False))

# Columns with more than 30% of their values missing.
missing_col = missing_data.loc[missing_data['percentage'] > 30].index.tolist()

              missing  percentage
PoolQC           1453   99.520548
MiscFeature      1406   96.301370
Alley            1369   93.767123
Fence            1179   80.753425
MasVnrType        872   59.726027
FireplaceQu       690   47.260274
LotFrontage       259   17.739726
GarageType         81    5.547945
GarageYrBlt        81    5.547945
GarageFinish       81    5.547945
GarageQual         81    5.547945
GarageCond         81    5.547945
BsmtExposure       38    2.602740
BsmtFinType2       38    2.602740
BsmtQual           37    2.534247
BsmtCond           37    2.534247
BsmtFinType1       37    2.534247
MasVnrArea          8    0.547945
Electrical          1    0.068493


In [4]:
# Drop the columns listed in missing_col; errors='ignore' keeps the cell
# re-runnable once those columns are already gone.
train_df = train_df.drop(columns=missing_col, errors='ignore')

# Numeric gaps: fill with the column mean.
missing_num = ['LotFrontage', 'GarageYrBlt', 'MasVnrArea']
for col in missing_num:
    train_df[col] = train_df[col].fillna(train_df[col].mean())

# Categorical gaps: fill with the most frequent value.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not modify train_df when Copy-on-Write is enabled.
missing_obj = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'Electrical']
for col in missing_obj:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])


In [5]:
# Re-run the null summary on the current training frame.
remaining_nulls = train_df.isnull().sum()
missing_data = pd.DataFrame({
    'missing': remaining_nulls,
    'percentage': remaining_nulls / len(train_df) * 100,
})

# Only columns that still contain nulls are kept, largest count first.
has_nulls = missing_data['missing'] > 0
missing_data = missing_data[has_nulls]
print(missing_data.sort_values(by='missing', ascending=False))

Empty DataFrame
Columns: [missing, percentage]
Index: []


In [6]:
# Mean sale price per neighborhood.
# NOTE(review): the means are taken over every row of train_df, including rows
# that the later train/validation split holds out, so validation scores may be
# optimistic.
price_by_nb = train_df.groupby("Neighborhood")["SalePrice"].mean()

# Bin the neighborhood means into four quantile-based groups.
nb_groups = pd.qcut(
    price_by_nb,
    q=4,
    labels=["Low", "MedLow", "MedHigh", "High"],
)

# Neighborhood name -> group label lookup, applied to the training rows.
nb_group_map = dict(nb_groups.items())
train_df["Neighborhood_group"] = train_df["Neighborhood"].map(nb_group_map)

In [7]:
def simplify_ms_subclass(x):
    """Collapse an MSSubClass code into a coarse dwelling category.

    Returns "SingleFam", "PUD" or "MultiFam" for the codes listed below and
    "Other" for any value that appears in none of the lists.
    """
    categories = (
        ("SingleFam", (20, 30, 40, 45, 50, 60, 70, 75, 80, 85)),
        ("PUD", (120, 150, 160, 180)),
        ("MultiFam", (90, 190)),
    )
    for label, codes in categories:
        if x in codes:
            return label
    return "Other"
    
# Derive the coarse dwelling category for every training row.
train_df["MSSubClass_simplified"] = train_df["MSSubClass"].map(simplify_ms_subclass)

In [8]:
def simplify_sale_type(x):
    """Map a SaleType value to "Normal" (WD, CWD, VWD) or "Other" (anything else)."""
    return "Normal" if x in ("WD", "CWD", "VWD") else "Other"

# Derive the binary sale-type group for every training row.
train_df["SaleType_group"] = train_df["SaleType"].map(simplify_sale_type)

In [9]:
def simplify_sale_condition(x):
    """Return "Normal" when the sale condition equals "Normal", otherwise "Abnormal"."""
    return "Normal" if x == "Normal" else "Abnormal"
    
# Derive the binary sale-condition group for every training row.
train_df["SaleCondition_group"] = train_df["SaleCondition"].map(simplify_sale_condition)

In [16]:
# Features fed to the model: raw numeric columns plus the engineered groups.
numeric_cols = ['TotalBsmtSF', 'GrLivArea', 'GarageArea','LotArea', 'OverallCond', 'OverallQual', 'YearRemodAdd']
categorical_cols = ['MSSubClass_simplified', 'Neighborhood_group', 'SaleCondition_group', 'SaleType_group']

# One-hot encode the engineered categorical features; the printed sums are the
# number of rows in each category.
X_categorical = pd.get_dummies(train_df[categorical_cols])
print(X_categorical.sum())

# Request a float matrix explicitly: combining integer columns with boolean
# dummy columns would otherwise make to_numpy() fall back to an object array.
X = pd.concat([train_df[numeric_cols], X_categorical], axis=1).to_numpy(dtype=float)
y = train_df['SalePrice'].to_numpy()

# Hold out 20% of the rows for validation; the seed is fixed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

MSSubClass_simplified_MultiFam       82
MSSubClass_simplified_PUD           160
MSSubClass_simplified_SingleFam    1218
Neighborhood_group_High             278
Neighborhood_group_Low              415
Neighborhood_group_MedHigh          398
Neighborhood_group_MedLow           369
SaleCondition_group_Abnormal        262
SaleCondition_group_Normal         1198
SaleType_group_Normal              1271
SaleType_group_Other                189
dtype: int64


In [18]:
xgb = XGBRegressor(random_state=42)

# Hyperparameter search space.
# scipy's uniform(loc, scale) samples from [loc, loc + scale] and
# randint(low, high) excludes `high`.
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 6),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0.2, 0.5),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0.3, 0.5),
}

# MSE-based scorer; greater_is_better=False negates the value so that the
# search can maximise it.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Randomized search with 5-fold cross-validation. random_state fixes the
# sampled candidates so the search is reproducible across runs.
rscv = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    scoring=mse_scorer,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

# Fit on the training split only. The validation split is no longer passed as
# eval_set: no early stopping is configured, so it was only evaluated (and
# ignored) in every fit, and keeping it out of the search leaves it held out.
rscv.fit(X_train, y_train)

# Best model, refit on the whole training split by the search.
best_xgb = rscv.best_estimator_

# Evaluate on the held-out validation split.
y_pred = best_xgb.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)
print(f"Validation RMSE: {rmse:.2f}, R2: {r2:.4f}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Validation RMSE: 27415.08, R2: 0.9020


In [20]:
# Apply the same feature engineering to the test frame, reusing the
# neighborhood -> group mapping learned from the training data.
test_df["MSSubClass_simplified"] = test_df["MSSubClass"].apply(simplify_ms_subclass)
test_df["Neighborhood_group"] = test_df["Neighborhood"].map(nb_group_map)
test_df["SaleType_group"] = test_df["SaleType"].apply(simplify_sale_type)
test_df["SaleCondition_group"] = test_df["SaleCondition"].apply(simplify_sale_condition)

# The model was fitted on a positional numpy matrix, so the test dummies must
# have exactly the training dummy columns in the same order. They are
# recomputed from train_df here so this cell does not depend on X_categorical
# still holding the training encoding. Categories absent from the test frame
# become all-zero columns, and unseen ones are dropped.
train_dummy_cols = pd.get_dummies(train_df[categorical_cols]).columns
X_categorical = pd.get_dummies(test_df[categorical_cols]).reindex(
    columns=train_dummy_cols, fill_value=0
)

# NOTE: X is rebound from the training matrix to the test matrix here; the
# prediction cell below reads it under this name.
X = pd.concat([test_df[numeric_cols], X_categorical], axis=1).to_numpy(dtype=float)

In [None]:
# Predict sale prices for the feature matrix currently bound to X.
y_pred_test = best_xgb.predict(X)

# Two-column submission frame: the test Ids alongside their predicted prices.
submission = test_df[['Id']].assign(SalePrice=y_pred_test)

# submission.to_csv('submission.csv', index=False)