# 집 가격 예측하기
## 데이터 필드
| **ID**             | 집을 구분하는 번호                                        |
|--------------------|-----------------------------------------------------------|
| **date**           | 집을 구매한 날짜                                           |
| **price**          | 집의 가격(Target variable)                                |
| **bedrooms**       | 침실의 수                                                 |
| **bathrooms**      | 화장실의 수                                               |
| **sqft_living**    | 주거 공간의 평방 피트(면적)                               |
| **sqft_lot**       | 부지의 평방 피트(면적)                                    |
| **floors**         | 집의 층 수                                                |
| **waterfront**     | 집의 전방에 강이 흐르는지 유무 (a.k.a. 리버뷰)            |
| **view**           | 집이 얼마나 좋아 보이는지의 정도                           |
| **condition**      | 집의 전반적인 상태                                         |
| **grade**          | King County grading 시스템 기준으로 매긴 집의 등급       |
| **sqft_above**     | 지하실을 제외한 평방 피트(면적)                           |
| **sqft_basement**  | 지하실의 평방 피트(면적)                                  |
| **yr_built**       | 지어진 년도                                               |
| **yr_renovated**   | 집을 재건축한 년도                                       |
| **zipcode**        | 우편번호                                                  |
| **lat**            | 위도                                                      |
| **long**           | 경도                                                      |
| **sqft_living15**  | 2015년 기준 주거 공간의 평방 피트(면적, 집을 재건축했다면, 변화가 있을 수 있음) |
| **sqft_lot15**     | 2015년 기준 부지의 평방 피트(면적, 집을 재건축했다면, 변화가 있을 수 있음)     |

In [35]:
# import modules
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import seaborn as sns

# 추가 로드
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [36]:
def print_separator():
    print('='*80)

In [37]:
# 데이터의 총갯수 15000개정도

# 결측치가 없는건 이미 암
# lms 노드기준 date컬럼을 int로만, target값 분포만 log로 정규화시킴
# TODO date컬럼을 좀더 세분화
# TODO 원핫인코딩

# TODO 이상치 확인 후 제거
# TODO 마지막 노드기준 LGBMRegressor 모델에서 평가한 기준으로만 submission을 작성했지만 여러 모델로 평가해보기

In [38]:
# data load
data_dir = os.getenv('HOME') + '/aiffel/workspace/06_study/predict_home_price/data'

train_data_path = os.path.join(data_dir, 'train.csv')
test_data_path = os.path.join(data_dir, 'test.csv') 

org_train = pd.read_csv(train_data_path)
train = org_train
org_test = pd.read_csv(test_data_path)
test = org_test

### 필요없는 컬럼 제거 및 y변수에 target값 (노드내용)

In [39]:
# train dataset -> remove target,id cols
y = train['price']
# 로그변환
y = np.log1p(y)

del train['price']
del train['id']

print_separator()
print(train.columns)
print_separator()
print(train.info())

Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15035 entries, 0 to 15034
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           15035 non-null  object 
 1   bedrooms       15035 non-null  int64  
 2   bathrooms      15035 non-null  float64
 3   sqft_living    15035 non-null  int64  
 4   sqft_lot       15035 non-null  int64  
 5   floors         15035 non-null  float64
 6   waterfront     15035 non-null  int64  
 7   view           15035 non-null  int64  
 8   condition      15035 non-null  int64  
 9   grade          15035 non-null  int64  
 10  sqft_above     15035 non-null  int64  
 11  sqft_basement  15035 non-null  in

In [40]:
# test dataset -> remove id col
del test['id']

print_separator()
print(test.columns)
print_separator()
print(test.info())

Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6468 entries, 0 to 6467
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           6468 non-null   object 
 1   bedrooms       6468 non-null   int64  
 2   bathrooms      6468 non-null   float64
 3   sqft_living    6468 non-null   int64  
 4   sqft_lot       6468 non-null   int64  
 5   floors         6468 non-null   float64
 6   waterfront     6468 non-null   int64  
 7   view           6468 non-null   int64  
 8   condition      6468 non-null   int64  
 9   grade          6468 non-null   int64  
 10  sqft_above     6468 non-null   int64  
 11  sqft_basement  6468 non-null   int6

## date 컬럼 좀더 세분화

### train 데이터셋

In [41]:
# train dataset
# date column -> date
train = org_train
train['date'] = pd.to_datetime(train['date']) # 2014-10-13 < 이형식으로 나옴
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day

del train['date'] # datetime 컬럼제거

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15035 entries, 0 to 15034
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bedrooms       15035 non-null  int64  
 1   bathrooms      15035 non-null  float64
 2   sqft_living    15035 non-null  int64  
 3   sqft_lot       15035 non-null  int64  
 4   floors         15035 non-null  float64
 5   waterfront     15035 non-null  int64  
 6   view           15035 non-null  int64  
 7   condition      15035 non-null  int64  
 8   grade          15035 non-null  int64  
 9   sqft_above     15035 non-null  int64  
 10  sqft_basement  15035 non-null  int64  
 11  yr_built       15035 non-null  int64  
 12  yr_renovated   15035 non-null  int64  
 13  zipcode        15035 non-null  int64  
 14  lat            15035 non-null  float64
 15  long           15035 non-null  float64
 16  sqft_living15  15035 non-null  int64  
 17  sqft_lot15     15035 non-null  int64  
 18  year  

### test 데이터에도 적용

In [42]:
# test dataset
# date column -> date
test['date'] = pd.to_datetime(test['date']) # 2014-10-13 < 이형식으로 나옴
test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day

del test['date'] # datetime 컬럼제거

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6468 entries, 0 to 6467
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bedrooms       6468 non-null   int64  
 1   bathrooms      6468 non-null   float64
 2   sqft_living    6468 non-null   int64  
 3   sqft_lot       6468 non-null   int64  
 4   floors         6468 non-null   float64
 5   waterfront     6468 non-null   int64  
 6   view           6468 non-null   int64  
 7   condition      6468 non-null   int64  
 8   grade          6468 non-null   int64  
 9   sqft_above     6468 non-null   int64  
 10  sqft_basement  6468 non-null   int64  
 11  yr_built       6468 non-null   int64  
 12  yr_renovated   6468 non-null   int64  
 13  zipcode        6468 non-null   int64  
 14  lat            6468 non-null   float64
 15  long           6468 non-null   float64
 16  sqft_living15  6468 non-null   int64  
 17  sqft_lot15     6468 non-null   int64  
 18  year    

## 훈련 및 결과 저장

In [56]:
random_state = 2022


def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
    # GridSearch model
    grid_model = GridSearchCV(model, param_grid=param_grid, \
                        scoring='neg_mean_squared_error', \
                        cv=5, verbose=verbose, n_jobs=n_jobs)
    
    # model fit
    grid_model.fit(train, y)
    
    # grid result
    params = grid_model.cv_results_['params']
    score = grid_model.cv_results_['mean_test_score']
    
    # results 에 grid result
    results = pd.DataFrame(params)
    results['score'] = score
    
    # rmsle
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    results = results.sort_values(by=['RMSLE'])
    
    return results, grid_model.best_params_

In [59]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [1, 10, 30],
}

# 1. 모델 선택
model = LGBMRegressor(random_state=random_state)

# 하이퍼파라미터 찾기
result, best_params = my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5)
print(best_params)
result

Fitting 5 folds for each of 9 candidates, totalling 45 fits
{'max_depth': 10, 'n_estimators': 200}


Unnamed: 0,max_depth,n_estimators,score,RMSLE
5,10,200,-0.02632,0.162235
8,30,200,-0.026339,0.162294
7,30,100,-0.027148,0.164767
4,10,100,-0.027162,0.164809
6,30,50,-0.029388,0.171429
3,10,50,-0.029405,0.171478
2,1,200,-0.04529,0.212813
1,1,100,-0.05502,0.234564
0,1,50,-0.073394,0.270914


In [55]:
model = LGBMRegressor(max_depth=10, n_estimators=100, random_state=random_state)
model.fit(train, y)
y_pred = model.predict(test)


prediction

array([13.10126089, 13.05951585, 14.19303769, ..., 13.0188345 ,
       12.71504377, 12.98150748])

In [None]:
def save_submission(model, train, y, test, model_name):
    # 찾은 하이퍼파라미터로 학습 및 예측
    model.fit(train, y)
    prediction = model.predict(test)
    prediction = np.expm1(prediction)
    
    # csv 로 저장 (submission)
    data_dir = os.getenv('HOME') + '/aiffel/workspace/06_study/predict_home_price/data'
    submission_path = join(data_dir, 'sample_submission.csv')
    submission = pd.read_csv(submission_path)
    submission['price'] = prediction
    
    submission_csv_path = '{}/submission_{}_RMSLE.csv'.format(data_dir, model_name)
    submission.to_csv(submission_csv_path, index=False)
    print('{} saved!'.format(submission_csv_path))

save_submission(model, train, y, test, 'lgbm', rmsle='0.164399')

In [None]:
#

In [None]:
# 각 모델별로 학습하고 결과 저장하기
# 
gboost = GradientBoostingRegressor(random_state=random_state)
xgboost = XGBRegressor(random_state=random_state)
lightgbm = LGBMRegressor(random_state=random_state)
rdforest = RandomForestRegressor(random_state=random_state)

models = [gboost, xgboost, lightgbm, rdforest]
def get_scores(models, train, y):
    df = {}
    for model in models:
        # 모델 이름 획득
        model_name = model.__class__.__name__

        # train, test 데이터셋 분리 8:2
        X_train, X_test, y_train, y_test = train_test_split(train,y,test_size=0.2,random_state=random_state)

        # 모델 학습
        model.fit(X_train,y_train)

        # 예측
        y_pred = model.predict(X_test)

        # 예측 결과의 rmse값 저장
        df[model_name] = rmse(y_test,y_pred)

        # data frame에 저장
        score_df = pd.DataFrame(df, index=['RMSE']).T.sort_values('RMSE', ascending=False)
    return score_df

get_scores(models, train, y)

## 원 핫 인코딩

### train 데이터 셋

In [22]:
all_cols = list(train.columns)
all_cols

['date',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15',
 'year',
 'month',
 'day']

In [26]:
def len_unique_raw(train,test,cols):
    for col in cols:
        # 값 갯수가 같은 컬럼
        if len(set(train[col])) == len(set(test[col])):
            print(f'colum이름 {col}')
            same_col.append(col)
        
# ['floors','waterfront','view','condition','yr_built']
one_hot_encoding_cols = ['floors','waterfront','view','condition','yr_built']
train.shape

(15035, 22)

In [27]:
def one_hot_encoding(df,cols):
    for col in cols:
        dummy = pd.get_dummies(data=train[col],prefix=col)
        df = pd.concat([df, dummy], axis=1)
        
    return df

train = one_hot_encoding(train,one_hot_encoding_cols)

train.shape

(15035, 156)

### test 데이터 셋도 적용

In [28]:
test = one_hot_encoding(train,one_hot_encoding_cols)

test.shape

(15035, 290)

### bedrooms 컬럼 확인 후 원핫 인코딩

In [31]:
print_separator()
print('bedrooms 값 갯수',len(set(train['bedrooms'])))
print_separator()
dummy_bedroom = pd.get_dummies(data=train['bedrooms'],prefix='bedrooms')
print(dummy_bedroom)
print_separator()
train = pd.concat([train,dummy_bedroom], axis=1)
train.info()

bedrooms 값 갯수 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
       bedrooms_0  bedrooms_1  bedrooms_2  bedrooms_3  bedrooms_4  bedrooms_5  \
0               0           0           0           1           0           0   
1               0           0           1           0           0           0   
2               0           0           0           1           0           0   
3               0           0           0           1           0           0   
4               0           0           0           1           0           0   
...           ...         ...         ...         ...         ...         ...   
15030           0           0           0           0           1           0   
15031           0           0           0           0           1           0   
15032           0           0           0           1           0           0   
15033           0           0           0           0           1           0   
15034           0           0           1           0       

### bathrooms 확인 후 원핫 인코딩

In [36]:
# 15000개중 29개
print_separator()
print('bathrooms 값 갯수',len(set(train['bathrooms'])))
print_separator()
dummy_bathroom = pd.get_dummies(data=train['bathrooms'],prefix='bathrooms')
print(dummy_bathroom)
print_separator()
train = pd.concat([train,dummy_bathroom], axis=1)
train.info()

bathrooms 값 갯수 29
       bathrooms_0.0  bathrooms_0.5  bathrooms_0.75  bathrooms_1.0  \
0                  0              0               0              1   
1                  0              0               0              1   
2                  0              0               0              0   
3                  0              0               0              0   
4                  0              0               0              0   
...              ...            ...             ...            ...   
15030              0              0               0              0   
15031              0              0               0              0   
15032              0              0               0              0   
15033              0              0               0              0   
15034              0              0               1              0   

       bathrooms_1.25  bathrooms_1.5  bathrooms_1.75  bathrooms_2.0  \
0                   0              0               0              0   

### floors 확인 후 원핫 인코딩

In [40]:
# 15000개중 6개
print_separator()
print('floors 값 갯수',len(set(train['floors'])))
print_separator()
dummy_floor = pd.get_dummies(data=train['floors'],prefix='floors')
print(dummy_floor)
print_separator()
train = pd.concat([train,dummy_floor], axis=1)
train.info()

floors 값 갯수 6
       floors_1.0  floors_1.5  floors_2.0  floors_2.5  floors_3.0  floors_3.5
0               1           0           0           0           0           0
1               1           0           0           0           0           0
2               1           0           0           0           0           0
3               0           0           1           0           0           0
4               1           0           0           0           0           0
...           ...         ...         ...         ...         ...         ...
15030           0           0           1           0           0           0
15031           0           0           1           0           0           0
15032           0           0           0           0           1           0
15033           0           0           1           0           0           0
15034           0           0           1           0           0           0

[15035 rows x 6 columns]
<class 'pandas.core.fram

### waterfront 확인 후 원핫 인코딩

In [43]:
# 15000개중 2개 (강물이 흐르는 유무)
print_separator()
print('waterfront 값 갯수',len(set(train['waterfront'])))
print_separator()
dummy_waterfront = pd.get_dummies(data=train['waterfront'],prefix='waterfront')
print(dummy_waterfront)
print_separator()
train = pd.concat([train,dummy_waterfront], axis=1)
train.info()

waterfront 값 갯수 2
       waterfront_0  waterfront_1
0                 1             0
1                 1             0
2                 1             0
3                 1             0
4                 1             0
...             ...           ...
15030             1             0
15031             1             0
15032             1             0
15033             1             0
15034             1             0

[15035 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15035 entries, 0 to 15034
Data columns (total 83 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              15035 non-null  int64         
 1   date            15035 non-null  datetime64[ns]
 2   price           15035 non-null  float64       
 3   bedrooms        15035 non-null  int64         
 4   bathrooms       15035 non-null  float64       
 5   sqft_living     15035 non-null  int64         
 6   sqft_lot        15

### view 확인 후 원핫 인코딩

In [45]:
# 15000개중 5개 (집이 얼마나 좋아보이는지)
print_separator()
print('view 값 갯수',len(set(train['view'])))
print_separator()
dummy_view = pd.get_dummies(data=train['view'],prefix='view')
print(dummy_view)
print_separator()

train = pd.concat([train,dummy_view], axis=1)
train.info()

view 값 갯수 5
       view_0  view_1  view_2  view_3  view_4
0           1       0       0       0       0
1           1       0       0       0       0
2           1       0       0       0       0
3           1       0       0       0       0
4           1       0       0       0       0
...       ...     ...     ...     ...     ...
15030       1       0       0       0       0
15031       1       0       0       0       0
15032       1       0       0       0       0
15033       1       0       0       0       0
15034       1       0       0       0       0

[15035 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15035 entries, 0 to 15034
Data columns (total 88 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              15035 non-null  int64         
 1   date            15035 non-null  datetime64[ns]
 2   price           15035 non-null  float64       
 3   bedrooms        15035 non-null  int6

### condition 확인후 원핫 인코딩

In [47]:
# 15000개중 5개 (집의 전반적인 상태)
print_separator()
print('condition 값 갯수',len(set(train['condition'])))
print_separator()
dummy_condition = pd.get_dummies(data=train['condition'],prefix='condition')
print(dummy_condition)
print_separator()

train = pd.concat([train,dummy_condition], axis=1)
train.info()

condition 값 갯수 5
       condition_1  condition_2  condition_3  condition_4  condition_5
0                0            0            1            0            0
1                0            0            1            0            0
2                0            0            1            0            0
3                0            0            1            0            0
4                0            0            1            0            0
...            ...          ...          ...          ...          ...
15030            0            0            1            0            0
15031            0            0            1            0            0
15032            0            0            1            0            0
15033            0            0            1            0            0
15034            0            0            1            0            0

[15035 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15035 entries, 0 to 15034
Data columns (total 93 column

### grade 원핫 인코딩

In [50]:
# 15000개중 12개
print_separator()
print('grade 값 갯수',len(set(train['grade'])))
print_separator()
dummy_grade = pd.get_dummies(data=train['grade'],prefix='grade')
print(dummy_grade)
print_separator()

train = pd.concat([train,dummy_grade], axis=1)
train.info()

grade 값 갯수 12
       grade_1  grade_3  grade_4  grade_5  grade_6  grade_7  grade_8  grade_9  \
0            0        0        0        0        0        1        0        0   
1            0        0        0        0        1        0        0        0   
2            0        0        0        0        0        0        1        0   
3            0        0        0        0        0        1        0        0   
4            0        0        0        0        0        1        0        0   
...        ...      ...      ...      ...      ...      ...      ...      ...   
15030        0        0        0        0        0        0        0        1   
15031        0        0        0        0        0        0        0        1   
15032        0        0        0        0        0        0        1        0   
15033        0        0        0        0        0        0        1        0   
15034        0        0        0        0        0        1        0        0   

       grade_

### yr_built 원 핫 인코딩

In [61]:
# 15000개중 166개 (지어진 연도)
print_separator()
print('yr_built 값 갯수',len(set(train['yr_built'])))
print_separator()
dummy_yr_built = pd.get_dummies(data=train['yr_built'],prefix='yr_built')
print(dummy_yr_built)
print_separator()

train = pd.concat([train,dummy_yr_built], axis=1)
train.info()

yr_built 값 갯수 116
       yr_built_1900  yr_built_1901  yr_built_1902  yr_built_1903  \
0                  0              0              0              0   
1                  0              0              0              0   
2                  0              0              0              0   
3                  0              0              0              0   
4                  0              0              0              0   
...              ...            ...            ...            ...   
15030              0              0              0              0   
15031              0              0              0              0   
15032              0              0              0              0   
15033              0              0              0              0   
15034              0              0              0              0   

       yr_built_1904  yr_built_1905  yr_built_1906  yr_built_1907  \
0                  0              0              0              0   
1              

### yr_renovated 원핫 인코딩

In [63]:
# 15000개중 65개
print_separator()
print('yr_renovated 값 갯수',len(set(train['yr_renovated'])))
print_separator()
dummy_yr_renovated = pd.get_dummies(data=train['yr_renovated'],prefix='yr_renovated')
print(dummy_yr_renovated)
print_separator()

train = pd.concat([train,dummy_yr_renovated], axis=1)
train.info()

yr_renovated 값 갯수 65
       yr_renovated_0  yr_renovated_1940  yr_renovated_1944  \
0                   1                  0                  0   
1                   1                  0                  0   
2                   1                  0                  0   
3                   1                  0                  0   
4                   1                  0                  0   
...               ...                ...                ...   
15030               1                  0                  0   
15031               1                  0                  0   
15032               1                  0                  0   
15033               1                  0                  0   
15034               1                  0                  0   

       yr_renovated_1945  yr_renovated_1946  yr_renovated_1948  \
0                      0                  0                  0   
1                      0                  0                  0   
2                      0

In [69]:
# # 15000개중 6개
# print_separator()
# print('yr_renovated 값 갯수',len(set(train['yr_renovated'])))
# print_separator()
# dummy_yr_renovated = pd.get_dummies(data=train['yr_renovated'],prefix='yr_renovated')
# print(dummy_yr_renovated)
# print_separator()

# train = pd.concat([train,dummy_yr_renovated], axis=1)
# train.info()



In [80]:
test = org_train
all_cols = list(test.columns)
all_cols

['id',
 'date',
 'price',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15',
 'year',
 'month',
 'day']

In [82]:
def len_unique_raw(df,cols):
    for col in cols:
        print(f'{col} 컬럼 값 갯수 {len(set(df[col]))}')
        
len_unique_raw(test,all_cols)
test.shape
# ['bedrooms','bathrooms','floors','waterfront','view','condition','grade','yr_built','yr_renovated']

id 컬럼 값 갯수 15035
date 컬럼 값 갯수 365
price 컬럼 값 갯수 3263
bedrooms 컬럼 값 갯수 11
bathrooms 컬럼 값 갯수 29
sqft_living 컬럼 값 갯수 863
sqft_lot 컬럼 값 갯수 7678
floors 컬럼 값 갯수 6
waterfront 컬럼 값 갯수 2
view 컬럼 값 갯수 5
condition 컬럼 값 갯수 5
grade 컬럼 값 갯수 12
sqft_above 컬럼 값 갯수 789
sqft_basement 컬럼 값 갯수 271
yr_built 컬럼 값 갯수 116
yr_renovated 컬럼 값 갯수 65
zipcode 컬럼 값 갯수 70
lat 컬럼 값 갯수 4692
long 컬럼 값 갯수 716
sqft_living15 컬럼 값 갯수 682
sqft_lot15 컬럼 값 갯수 6958
year 컬럼 값 갯수 2
month 컬럼 값 갯수 12
day 컬럼 값 갯수 31


(15035, 24)

In [83]:
one_hot_encoding_cols = ['bedrooms','bathrooms','floors','waterfront','view','condition','grade','yr_built','yr_renovated']

def one_hot_encoding(df,cols):
    for col in cols:
        dummy = pd.get_dummies(data=train[col],prefix=col)
        df = pd.concat([df, dummy], axis=1)
        
    return df

test = one_hot_encoding(test,one_hot_encoding_cols)

test.shape
#


# print_separator()
# print('yr_renovated 값 갯수',len(set(train['yr_renovated'])))



# dummy_yr_renovated = pd.get_dummies(data=train['yr_renovated'],prefix='yr_renovated')
# print(dummy_yr_renovated)
# print_separator()

# train = pd.concat([train,dummy_yr_renovated], axis=1)
# train.info()



(15035, 275)