In [59]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
"""
주택 가격 데이터를 활용한 회귀 모델 학습 및 예측
DecisionTreeRegressor를 활용한 모델 구축 및 성능 평가

주택 가격 데이터를 기반으로 회귀 모델을 학습하여 주택 가격을 예측해야 합니다.
"""

In [34]:
# 데이터 불러오기
df = pd.read_csv('20250620_143716_train.csv')

In [35]:
# 데이터 정보 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [36]:
# 데이터 구조 확인
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [37]:
# 결측값 처리
# 결측치 개수 확인
bf_counts = df.isnull().sum()
print(bf_counts[bf_counts > 0])

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [38]:
# 결측치가 500개 이상인 열 삭제
for col in df :
    if bf_counts[col] > 500 :
        df = df.drop(col, axis=1)

In [39]:
# 500개 이상 결측치 제거 확인
af_counts = df.isnull().sum()
print(af_counts[af_counts > 0])

LotFrontage     259
MasVnrArea        8
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Electrical        1
GarageType       81
GarageYrBlt      81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64


In [40]:
# LotFrontage 열 평균으로 대체
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())

In [41]:
# 결과 확인
print(df['LotFrontage'].isnull().sum())

0


In [52]:
# 범주형 데이터 인코딩
categorical_cols = df.select_dtypes(include='object').columns
df = pd.get_dummies(df, columns=categorical_cols)

In [None]:
# 결과 확인
df.select_dtypes(include='object').columns

Index([], dtype='object')

In [58]:
# 불필요한 Id 열 제거
df = df.drop(columns=['Id'], axis=1)

In [62]:
# 결과 확인
'Id' in df.columns

False

In [64]:
# 학습 및 테스트 데이터 분리 : 8:2 비율
X = df.drop('SalePrice', axis=1)    # Feature : 나머지 열
y = df['SalePrice']                 # Target : SalePrice

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
# 모델 학습
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [68]:
# 모델 평가
pred = model.predict(X_test)

mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

print('MAE(Mean Absolute Error; 평균 절대 오차): ', mae, ' >> 작을수록 좋음')
print('MSE(Mean Squared Error; 평균 제곱 오차) : ', mse, ' >> 작을수록 좋음')
print('RMSE(Root Mean Squared Error; 평균 제곱근 오차) : ', rmse, ' >> 작을수록 좋음')
print('R^2(R-squared; 결정 계수) : ', r2, ' >> 1에 가까울 수록 좋음')

MAE(Mean Absolute Error; 평균 절대 오차):  27599.695205479453  >> 작을수록 좋음
MSE(Mean Squared Error; 평균 제곱 오차) :  1779381093.921233  >> 작을수록 좋음
RMSE(Root Mean Squared Error; 평균 제곱근 오차) :  42182.710841305976  >> 작을수록 좋음
R^2(R-squared; 결정 계수) :  0.768017546339301  >> 1에 가까울 수록 좋음
