라이브러리 확인

In [1]:
import seaborn as sns                                                                     # 데이터 시각화 라이브러리
import pandas as pd       
import numpy as np                                                                # 데이터 처리 라이브러리
from sklearn.model_selection import train_test_split, RandomizedSearchCV                  # 학습 데이터와 테스트 데이터로 나누는 라이브러리
from sklearn.impute import SimpleImputer                                                  # 결측치 처리 라이브러리                          
from sklearn.compose import ColumnTransformer                                             # 
from sklearn.pipeline import Pipeline                                                     # 파이프라인을 만들기 위한 라이브러리                     
from sklearn.tree import DecisionTreeRegressor                                            # 결정트리 회귀 모델 라이브러리       
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score       # 모델 평가 라이브러리
from sklearn.metrics import make_scorer, mean_squared_error                               # 모델 평가 라이브러리      
from sklearn.preprocessing import OneHotEncoder, StandardScaler                           # 원핫인코딩, 표준화 라이브러리

데이터

In [2]:
# 데이터 불러오기
submission = pd.read_csv('./house-prices/sample_submission.csv')
train = pd.read_csv('./house-prices/train.csv')
test = pd.read_csv('./house-prices/test.csv')

# 데이터 분리 
y = train['SalePrice']                      # SalePrice 컬럼을 y 데이터로 사용
train = train.drop('SalePrice', axis=1)     # SalePrice 컬럼을 삭제한 데이터를 train 데이터로 사용
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

탐색적 자료 분석 : 불 필요한 변수 제거 

In [3]:
numerical_cols = train.select_dtypes(include=['int64', 'float64']).columns      # 수치 데이터 컬럼만 추출
categorical_cols = train.select_dtypes(include=['object']).columns              # 범주형 데이터 컬럼만 추출

# 수치 데이터 전처리(결측치 처리, 표준화)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),      # 결측치를 중앙값으로 채우기
    ('scaler', StandardScaler())                        # 표준화
])

# 범주형 데이터 전처리(결측치 처리, 원핫인코딩)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),   # 결측치를 최빈값으로 채우기
    ('onehot', OneHotEncoder(handle_unknown='ignore'))      # 원핫인코딩
])

# 전처리 파이프라인 만들기
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),       # 수치 데이터 파이프라인
        ('cat', categorical_transformer, categorical_cols)    # 범주형 데이터 파이프라인
    ])

모델 정의

In [4]:
model = DecisionTreeRegressor()   # 모델 생성

# 파이프라인 구축 
pipline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])    # 전처리 파이프라인과 모델을 연결

# 랜덤 서치 
# 훈련/ 테스트 데이터 분리 
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.2, random_state=0)   # 훈련 데이터와 테스트 데이터로 분리

# 랜덤 서치를 위한 하이퍼파라미터 설정
param_distributions = {
    # 'preprocessor__num__imputer__strategy': ['mean', 'median'],           # 수치 데이터 결측치 처리 전략
    'regressor__max_depth': np.concatenate((np.arange(3, 11), [None])),     # 결정트리의 최대 깊이
    'regressor__min_samples_split': np.arange(2, 11),                       # 분할되기 위해 노드가 가져야 하는 최소 샘플 수
    'regressor__min_samples_leaf': np.arange(1, 5)                          # 리프 노드가 가져야 하는 최소 샘플 수
}


def rmse(y_true, y_pred):
    return -np.sqrt(mean_squared_error(y_true, y_pred))     # rmse 함수 정의(음수 값 보정)

rmse_score = make_scorer(rmse, greater_is_better=False)     # rmse 함수를 이용하여 rmse_score 생성

random_search = RandomizedSearchCV(pipline, param_distributions, n_iter=10, cv=5, random_state=0, scoring=rmse_score)  # 랜덤 서치
random_search.fit(X_train, y_train)   # 랜덤 서치 훈련

print('최적 하이퍼파라미터: ', random_search.best_params_)   # 최적 하이퍼파라미터 출력

최적 하이퍼파라미터:  {'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 2, 'regressor__max_depth': 3}


In [6]:
results = pd.DataFrame(random_search.cv_results_)   # 결과 데이터프레임 생성
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__min_samples_split,param_regressor__min_samples_leaf,param_regressor__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.075914,0.003023,0.007204,0.0002852247,9,1,,"{'regressor__min_samples_split': 9, 'regressor...",40235.857145,44156.303039,40646.850571,44249.161428,31492.745794,40156.183595,4648.777895,9
1,0.029655,0.000323,0.006604,0.0004472466,4,4,4.0,"{'regressor__min_samples_split': 4, 'regressor...",44387.021954,49254.533715,40079.757015,44214.107883,35172.863829,42621.656879,4724.261678,2
2,0.026045,0.000568,0.006907,0.0004829173,8,2,3.0,"{'regressor__min_samples_split': 8, 'regressor...",39086.292995,52900.746949,44607.001888,51362.613882,36313.489691,44854.029081,6532.686425,1
3,0.066846,0.002189,0.007061,0.0006384805,6,3,10.0,"{'regressor__min_samples_split': 6, 'regressor...",34325.308235,47858.400513,43137.866334,42457.71842,32598.940126,40075.646726,5737.4253,10
4,0.039728,0.000592,0.007125,0.0004669014,7,4,6.0,"{'regressor__min_samples_split': 7, 'regressor...",43526.35615,45851.067986,40628.098781,44686.925847,33254.644081,41589.418569,4514.433204,5
5,0.047288,0.001107,0.007257,0.0007972027,2,2,7.0,"{'regressor__min_samples_split': 2, 'regressor...",34639.608731,47861.43703,43203.265651,41869.531119,34178.768289,40350.522164,5245.273184,7
6,0.033952,0.00014,0.007079,0.0003449102,4,4,5.0,"{'regressor__min_samples_split': 4, 'regressor...",44393.015978,46281.884088,40195.390179,44952.387562,34572.223459,42078.980253,4270.244505,4
7,0.061893,0.002849,0.007621,0.0004813643,2,3,9.0,"{'regressor__min_samples_split': 2, 'regressor...",35516.274544,48047.321573,42295.784715,42662.324674,32744.877701,40253.316641,5468.779234,8
8,0.030802,0.001126,0.007002,8.449576e-07,2,4,4.0,"{'regressor__min_samples_split': 2, 'regressor...",44387.021954,49254.533715,40079.757015,44214.107883,35172.863829,42621.656879,4724.261678,2
9,0.046004,0.000943,0.007304,0.0004005815,8,2,7.0,"{'regressor__min_samples_split': 8, 'regressor...",33705.154862,47021.247919,43705.248319,43727.32845,33989.021241,40429.600158,5509.154641,6


모델 예측 및 평가 

In [5]:
y_val_pred = np.round(random_search.predict(X_valid), 0)   # 테스트 데이터 예측

rmse = np.sqrt(mean_squared_error(y_valid, y_val_pred))     # rmse 계산
print('RMSE: ', rmse)   # rmse 출력

RMSE:  42278.02605340749


In [7]:
test_pred = np.round(random_search.predict(test), 0).astype('int64')   # 테스트 데이터 예측
submission_df = pd.DataFrame({'Id': test['Id'], 'SalePrice': np.round(test_pred)})   # 제출 데이터프레임 생성
submission_df.to_csv('./house_output/submission.csv', index=False)   # 제출 파일 생성