In [1]:
# 문제정의(KAGGLE : https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview)

# ---------------------------
## 1. 해결과제
# ---------------------------
# - 주어진 주택 데이터(물리적 특성 + 주변 환경)를 이용해 **최종 판매 가격(SalePrice)** 예측
# - 단순 선형 회귀가 아니라 다양한 회귀 기법을 적용해 성능 최적화가 목표

# ---------------------------
# 2. 타겟 변수
# ---------------------------
# - **SalePrice**  
# - 주택의 실제 판매 가격 (회귀 문제의 종속 변수)

# ---------------------------
# 학습에 사용할 X 후보 선정 
# # ---------------------------
# features = [
#     "OverallQual",   # 전반적 자재/마감 품질
#     "GrLivArea",     # 지상 생활 면적
#     "GarageCars",    # 차고 수용 차량 수
#     "GarageArea",    # 차고 면적
#     "TotalBsmtSF",   # 지하 전체 면적
#     "1stFlrSF",      # 1층 면적
#     "FullBath",      # 전체 욕실 개수
#     "YearBuilt",     # 건축 연도
#     "YearRemodAdd",  # 리모델링 연도
#     "KitchenQual",   # 주방 품질
#     "Fireplaces",    # 벽난로 개수
#     "Neighborhood"   # 위치
# ]
# # 선택 Feature 값 정리 (표)

# | Feature       | 타입     | 값 범위 / 카테고리 |
# |---------------|---------|--------------------|
# | OverallQual   | 정수형  | 1 ~ 10 (1=매우 나쁨, 10=매우 우수) |
# | GrLivArea     | 수치형  | 약 300 ~ 5600+ (지상 생활 면적, sq ft) |
# | GarageCars    | 정수형  | 0 ~ 4 (차고에 주차 가능한 차량 수) |
# | GarageArea    | 수치형  | 0 ~ 1400+ (차고 면적, sq ft) |
# | TotalBsmtSF   | 수치형  | 0 ~ 6000+ (지하 전체 면적, sq ft) |
# | 1stFlrSF      | 수치형  | 약 300 ~ 4000+ (1층 면적, sq ft) |
# | FullBath      | 정수형  | 0 ~ 3+ (전체 욕실 개수) |
# | YearBuilt     | 정수형  | 1870 ~ 2010 (건축 연도) |
# | YearRemodAdd  | 정수형  | 1950 ~ 2010 (리모델링 연도, 없으면 YearBuilt 동일) |
# | KitchenQual   | 범주형  | {Ex=우수, Gd=좋음, TA=보통, Fa=나쁨} |
# | Fireplaces    | 정수형  | 0 ~ 3 (벽난로 개수) |
# | Neighborhood  | 범주형  | 25개 지역 코드 (예: CollgCr, Veenker, Crawfor, NoRidge, Mitchel 등) |


In [2]:
# ---------------------------------
# 라이브러리 가져오기
# ---------------------------------

In [3]:
import pandas as pd
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
# ---------------------------------
# EDA
# ---------------------------------
train.shape, test.shape

((1460, 81), (1459, 80))

In [5]:
train.head()
train.tail()
train.sample(5)

test.head()
test.tail()
test.sample(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
960,2421,20,RL,75.0,9532,Pave,,Reg,Lvl,AllPub,...,0,368,,GdPrv,,0,2,2007,WD,Normal
715,2176,20,RL,,14860,Pave,,IR2,Lvl,AllPub,...,0,0,,,,0,6,2008,WD,Normal
986,2447,70,RM,,10337,Pave,Pave,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,5,2007,WD,Normal
481,1942,60,RL,,14067,Pave,,Reg,Lvl,AllPub,...,0,0,,GdPrv,,0,9,2008,WD,Normal
568,2029,160,FV,24.0,2280,Pave,Pave,Reg,Lvl,AllPub,...,0,0,,,,0,4,2008,WD,Normal


In [6]:
# train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [7]:
train.describe()
# test.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [8]:
train.describe(include='O')
test.describe(include='O')

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1455,1459,107,1459,1459,1457,1459,1459,1459,1459,...,1383,1381,1381,1381,1459,3,290,51,1458,1459
unique,5,2,2,4,4,1,5,3,25,9,...,6,3,4,5,3,2,4,3,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Ex,MnPrv,Shed,WD,Normal
freq,1114,1453,70,934,1311,1457,1081,1396,218,1251,...,853,625,1293,1328,1301,2,172,46,1258,1204


In [9]:
train.isnull().sum()
print(train.isnull().sum().to_string())  # 생략없이 다 표시하는 거

print(test.isnull().sum().to_string())

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType        872
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [10]:
# ---------------------------------
# 데이터 전처리
# ---------------------------------|
# y값 분리
target = train.pop('SalePrice')

In [11]:
train.shape, test.shape

((1460, 80), (1459, 80))

In [12]:
# # 후보데이터만 사용 - 선택

features = [
    "OverallQual",   # 전반적 자재/마감 품질
    "GrLivArea",     # 지상 생활 면적
    "GarageCars",    # 차고 수용 차량 수
    "GarageArea",    # 차고 면적
    "TotalBsmtSF",   # 지하 전체 면적
    "1stFlrSF",      # 1층 면적
    "FullBath",      # 전체 욕실 개수
    "YearBuilt",     # 건축 연도
    "YearRemodAdd",  # 리모델링 연도
    "KitchenQual",   # 주방 품질 (범주형)
    "Fireplaces",    # 벽난로 개수
    "Neighborhood"   # 위치 (범주형)
]
train = train [features]
# train.info()
test = test [features]
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   OverallQual   1459 non-null   int64  
 1   GrLivArea     1459 non-null   int64  
 2   GarageCars    1458 non-null   float64
 3   GarageArea    1458 non-null   float64
 4   TotalBsmtSF   1458 non-null   float64
 5   1stFlrSF      1459 non-null   int64  
 6   FullBath      1459 non-null   int64  
 7   YearBuilt     1459 non-null   int64  
 8   YearRemodAdd  1459 non-null   int64  
 9   KitchenQual   1458 non-null   object 
 10  Fireplaces    1459 non-null   int64  
 11  Neighborhood  1459 non-null   object 
dtypes: float64(3), int64(7), object(2)
memory usage: 136.9+ KB


In [13]:
# 결측치 제거 / 채우기 (수치 - 최소/최대/평균/중앙)

# train.isnull().sum()
test.isnull().sum()

OverallQual     0
GrLivArea       0
GarageCars      1
GarageArea      1
TotalBsmtSF     1
1stFlrSF        0
FullBath        0
YearBuilt       0
YearRemodAdd    0
KitchenQual     1
Fireplaces      0
Neighborhood    0
dtype: int64

In [14]:
# 수치형 컬럼 결측치 채우기(최소/ 최대/ 평균/ 중앙값 중 최소값 선택)
test['GarageCars'] = test['GarageCars'].fillna(test['GarageCars'].min())
test['GarageArea'] = test['GarageArea'].fillna(test['GarageArea'].min())
test['TotalBsmtSF'] = test['TotalBsmtSF'].fillna(test['TotalBsmtSF'].min())

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   OverallQual   1459 non-null   int64  
 1   GrLivArea     1459 non-null   int64  
 2   GarageCars    1459 non-null   float64
 3   GarageArea    1459 non-null   float64
 4   TotalBsmtSF   1459 non-null   float64
 5   1stFlrSF      1459 non-null   int64  
 6   FullBath      1459 non-null   int64  
 7   YearBuilt     1459 non-null   int64  
 8   YearRemodAdd  1459 non-null   int64  
 9   KitchenQual   1458 non-null   object 
 10  Fireplaces    1459 non-null   int64  
 11  Neighborhood  1459 non-null   object 
dtypes: float64(3), int64(7), object(2)
memory usage: 136.9+ KB


In [15]:
# 범주형 컬럼(X) 결측치 채우기(최빈값 / 사분위수 범위내 중 최소값/최대값..)

test['KitchenQual'] = test['KitchenQual'].fillna(test['KitchenQual'].mode()[0]) 
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   OverallQual   1459 non-null   int64  
 1   GrLivArea     1459 non-null   int64  
 2   GarageCars    1459 non-null   float64
 3   GarageArea    1459 non-null   float64
 4   TotalBsmtSF   1459 non-null   float64
 5   1stFlrSF      1459 non-null   int64  
 6   FullBath      1459 non-null   int64  
 7   YearBuilt     1459 non-null   int64  
 8   YearRemodAdd  1459 non-null   int64  
 9   KitchenQual   1459 non-null   object 
 10  Fireplaces    1459 non-null   int64  
 11  Neighborhood  1459 non-null   object 
dtypes: float64(3), int64(7), object(2)
memory usage: 136.9+ KB


In [16]:
# 이상치 제거 / 채우기


In [17]:
# 인코딩(LIGHTGBM 사용) -> 범주형

train['KitchenQual'] = train['KitchenQual'].astype('category')
train['Neighborhood'] = train['Neighborhood'].astype('category')

test['KitchenQual'] = test['KitchenQual'].astype('category')
test['Neighborhood'] = test['Neighborhood'].astype('category')

train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   OverallQual   1460 non-null   int64   
 1   GrLivArea     1460 non-null   int64   
 2   GarageCars    1460 non-null   int64   
 3   GarageArea    1460 non-null   int64   
 4   TotalBsmtSF   1460 non-null   int64   
 5   1stFlrSF      1460 non-null   int64   
 6   FullBath      1460 non-null   int64   
 7   YearBuilt     1460 non-null   int64   
 8   YearRemodAdd  1460 non-null   int64   
 9   KitchenQual   1460 non-null   category
 10  Fireplaces    1460 non-null   int64   
 11  Neighborhood  1460 non-null   category
dtypes: category(2), int64(10)
memory usage: 118.0 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   OverallQual   1459 non-

In [22]:
# ---------------------------------
# 검증 데이터 분할 train - val (8,2)
# ---------------------------------

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1168, 12), (292, 12), (1168,), (292,))

In [26]:
# ---------------------------------
# 학습 및 평가(MSE, MAE , R2..)
# ---------------------------------

import lightgbm as lgb

model = lgb.LGBMRegressor(random_state=0, verbose=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
y_pred

array([341764.75749812, 141688.47910067, 120677.81758381, 228575.75211451,
        92251.28852243, 113043.99740768, 225809.54820198, 127978.36444965,
       563876.8571553 , 146429.51181508, 200409.89106504, 173586.80944635,
       220551.87875481, 117533.88502192, 128088.77204484, 132381.60556189,
       207334.32809832, 117411.78140702, 144575.01788236, 172587.93147109,
       128809.21044769, 143704.1417425 , 109073.95609355, 175374.0920053 ,
       188612.94734439, 190423.50138547, 176349.39322053,  71867.25871788,
       306058.79134878, 123242.47692563, 118737.94715063, 181409.04171498,
       140585.05457823, 288160.12655409, 404077.30328448, 176080.76221067,
       267782.91798305, 130654.79465395, 250707.29123974, 299490.56030609,
       202327.97574432, 130428.26202987, 189989.11439171, 302960.56851511,
       388234.26728087, 141637.48155231, 122178.45559983, 123184.20341559,
       154213.64188508,  97465.09571438, 401084.00749283, 141343.96728508,
       178784.83192722,  

In [27]:
# 평가

from sklearn.metrics import mean_squared_error # MSE(평균 제곱 오차)
from sklearn.metrics import mean_absolute_error # MAE(평균 절대 오차)
from sklearn.metrics import root_mean_squared_error # RMSE(제곱 평균 오차 루트)
from sklearn.metrics import r2_score # R2Score(결정계수)

lr_mse = mean_squared_error(y_val, y_pred)
lr_mae = mean_absolute_error(y_val, y_pred)
lr_rmse = root_mean_squared_error(y_val, y_pred)
lr_r2 = r2_score(y_val, y_pred)

print("MSE: ", lr_mse) # 오차를 제곱해 평균을 낸 값 -> 값 클수록 예측이 실제와 다름
print("MAE: ", lr_mae) # 오차를 절대값으로 계싼 -> 값 클수록 예측이 실제와 다름
print("RMSE: ", lr_rmse) # mse에다 루트씌움 -> 값 클수록 예측이 실제와 다름
print("R2: ", lr_r2)

# MSE:  1138255660.1912904 -> 다른 모델과 비교시 사용(튜닝이전/이후 비교) 낮은값 코드 사용
# MAE:  19091.970142754628 -> 예측값과 실제 집값의 차이가 평균적으로 19000달러
# RMSE:  33738.044700179205 -> 예측값과 실제 집값의 차이가 33000 - 약 20% 정도 오차
# R2:  0.8351751535112408 -> 모델이 집값에 대한 예측을 83.5% 정도 설명이 가능(우수한 성능)

MSE:  1138255660.1912904
MAE:  19091.970142754628
RMSE:  33738.044700179205
R2:  0.8351751535112408


In [28]:
# ---------------------------------
# 파일로 저장
# ---------------------------------

pred = model.predict(test)
# pred

# lgbmc.classes_

submit = pd.DataFrame({'pred':pred})
submit.to_csv("result.csv", index=False)

pd.read_csv("result.csv")

Unnamed: 0,pred
0,123748.199989
1,142817.723116
2,172142.033646
3,185112.445658
4,197993.811938
...,...
1454,69201.626749
1455,82823.938828
1456,160958.191536
1457,100570.797983


In [31]:
# -------------------------------
# Sample X 전달 -> 집값 예측 확인 
# -------------------------------

sample = {
    "OverallQual" : 5,   # 전반적 자재/마감 품질
    "GrLivArea" : 1800,     # 지상 생활 면적
    "GarageCars" : 2,    # 차고 수용 차량 수
    "GarageArea" : 500,     # 차고 면적
    "TotalBsmtSF" : 800,   # 지하 전체 면적
    "1stFlrSF" : 1200,      # 1층 면적
    "FullBath" : 2 ,      # 전체 욕실 개수
    "YearBuilt" : 2005,     # 건축 연도
    "YearRemodAdd" : 2010,  # 리모델링 연도
    "KitchenQual" : "TA",   # 주방 품질
    "Fireplaces" : 1,    # 벽난로 개수
    "Neighborhood": "CollgCr"   # 위치
}

sample_df = pd.DataFrame([sample])
sample_df['Neighborhood'] = sample_df['Neighborhood'].astype('category')
sample_df['KitchenQual'] = sample_df['KitchenQual'].astype('category')

sample_df.info()

pred_price = model.predict(sample_df)
print("예측 집값: " , pred_price)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   OverallQual   1 non-null      int64   
 1   GrLivArea     1 non-null      int64   
 2   GarageCars    1 non-null      int64   
 3   GarageArea    1 non-null      int64   
 4   TotalBsmtSF   1 non-null      int64   
 5   1stFlrSF      1 non-null      int64   
 6   FullBath      1 non-null      int64   
 7   YearBuilt     1 non-null      int64   
 8   YearRemodAdd  1 non-null      int64   
 9   KitchenQual   1 non-null      category
 10  Fireplaces    1 non-null      int64   
 11  Neighborhood  1 non-null      category
dtypes: category(2), int64(10)
memory usage: 446.0 bytes
예측 집값:  [187981.40745232]
