In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the test split from CSV; header=0 takes the column names from the
# first row of the file.
test = pd.read_csv(
    filepath_or_buffer="../06_machine_learning/data/house_test.csv",
    header=0,
)
# Preview the first five rows.
test.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


* Id: 고유 식별자
* MSSubClass: 주택 유형
* MSZoning: 주택 구역
* LotFrontage: 부지가 거리와 접한 길이 (피트 단위)
* LotArea: 부지 면적 (평방 피트)
* Street: 도로 유형
* Alley: 골목 접근 유형
* LotShape: 부지 형태
* LandContour: 토지 윤곽
* Utilities: 유틸리티 유형
* LotConfig: 부지 구성
* LandSlope: 토지 경사도
* Neighborhood: 인근 지역
* Condition1: 주요 도로 또는 철도 근접도
* Condition2: 주요 도로 또는 철도 근접도 (두 번째 조건이 있는 경우)
* BldgType: 건물 유형
* HouseStyle: 주택 스타일
* OverallQual: 전체 재료 및 마감 품질
* OverallCond: 전체 상태
* YearBuilt: 건축 연도
* YearRemodAdd: 리모델링 연도
* RoofStyle: 지붕 스타일
* RoofMatl: 지붕 재료
* Exterior1st: 외부 마감재 1
* Exterior2nd: 외부 마감재 2
* MasVnrType: 석조 베니어(외벽 치장 마감재) 유형
* MasVnrArea: 석조 베니어(외벽 치장 마감재) 면적 (평방 피트)
* ExterQual: 외부 품질
* ExterCond: 외부 상태
* Foundation: 기초 유형
* BsmtQual: 지하실 품질
* BsmtCond: 지하실 상태
* BsmtExposure: 지하실 노출 정도
* BsmtFinType1: 지하실 마감 유형 1
* BsmtFinSF1: 지하 마감 면적 1 (평방 피트)
* BsmtFinType2: 지하실 마감 유형 2
* BsmtFinSF2: 지하 마감 면적 2 (평방 피트)
* BsmtUnfSF: 지하 미마감 면적 (평방 피트)
* TotalBsmtSF: 지하 전체 면적 (평방 피트)
* Heating: 난방 유형
* HeatingQC: 난방 품질 및 상태
* CentralAir: 중앙 에어컨 여부
* Electrical: 전기 시스템
* 1stFlrSF: 1층 면적 (평방 피트)
* 2ndFlrSF: 2층 면적 (평방 피트)
* LowQualFinSF: 저품질 마감 면적 (평방 피트)
* GrLivArea: 지상 생활 면적 (평방 피트)
* BsmtFullBath: 지하 전체 욕실 개수
* BsmtHalfBath: 지하 반 욕실 개수
* FullBath: 지상 전체 욕실 개수
* HalfBath: 지상 반 욕실 개수
* BedroomAbvGr: 지상 침실 개수
* KitchenAbvGr: 지상 주방 개수
* KitchenQual: 주방 품질
* TotRmsAbvGrd: 지상 총 방 개수 (욕실 제외)
* Functional: 주택 기능성
* Fireplaces: 벽난로 개수
* FireplaceQu: 벽난로 품질
* GarageType: 차고 유형
* GarageYrBlt: 차고 건축 연도
* GarageFinish: 차고 내부 마감 상태
* GarageCars: 차고에 주차 가능한 차량 수
* GarageArea: 차고 면적 (평방 피트)
* GarageQual: 차고 품질
* GarageCond: 차고 상태
* PavedDrive: 포장된 진입로 여부
* WoodDeckSF: 목재 데크 면적 (평방 피트)
* OpenPorchSF: 개방형 현관 면적 (평방 피트)
* EnclosedPorch: 폐쇄형 현관 면적 (평방 피트)
* 3SsnPorch: 3계절 현관 면적 (평방 피트)
* ScreenPorch: 스크린 현관 면적 (평방 피트)
* PoolArea: 수영장 면적 (평방 피트)
* PoolQC: 수영장 품질
* Fence: 울타리 품질
* MiscFeature: 기타 기능 (예: 테니스 코트, 2차 차고 등)
* MiscVal: 기타 기능의 가치 (달러)
* MoSold: 판매 월
* YrSold: 판매 연도
* SaleType: 판매 유형
* SaleCondition: 판매 조건
* SalePrice: 판매 가격 (목표 변수)

In [4]:
# Read the training split from CSV; header=0 takes the column names from the
# first row of the file.
train = pd.read_csv(
    filepath_or_buffer="../06_machine_learning/data/house_train.csv",
    header=0,
)
# Preview the first five rows.
train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Print a structural summary of the test frame: index range, column count,
# and the non-null count and dtype of each column.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the test frame.
test.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1459.0,1459.0,1232.0,1459.0,1459.0,1459.0,1459.0,1459.0,1444.0,1458.0,...,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,57.378341,68.580357,9819.161069,6.078821,5.553804,1971.357779,1983.662783,100.709141,439.203704,...,472.768861,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181,2007.769705
std,421.321334,42.74688,22.376841,4955.517327,1.436812,1.11374,30.390071,21.130467,177.6259,455.268042,...,217.048611,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432,1.30174
min,1461.0,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,1825.5,20.0,58.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,...,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,2190.0,50.0,67.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.5,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2554.5,70.0,80.0,11517.5,7.0,6.0,2001.0,2004.0,164.0,753.5,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [7]:
# Print a structural summary of the training frame: index range, column
# count, and the non-null count and dtype of each column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [8]:
# Count the missing values in each column of the test frame.
# NOTE(review): the displayed Series is truncated by pandas for long
# results, so most columns are not visible in the rendered output.
test.isna().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [9]:
# Show the column labels of the training frame.
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive