In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.tree import ExtraTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [3]:
apart_sep_nowon = pd.read_csv('노원구.csv', encoding='cp949')

In [4]:
x = apart_sep_nowon.drop(['물건금액'],axis=1)
y = apart_sep_nowon['물건금액']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=123) # x, y를 train과 test 7:3 비율로 분리

# 연속형 변수 열 입력
numeric_features = ['매매지수','신고년도','건물면적','층정보','건축년도','경과년도']
numeric_transformer = StandardScaler()

# 범주형 변수 열 입력
categorical_features = ['자치구명','법정동명','건물명','브랜드점수']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') # 범주형 데이터가 x_train, x_test에 고르게 들어가지 않는 경우 pass 하기 위해 handel_unknown param ='ignore' 로 설정

preprocessor = ColumnTransformer(
    transformers=[ # List of (name, transformer, column(s))
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)]) # 파이프라인 단계 입력

preprocessor_pipe.fit(x_train, x_test) # 변수 변환

# 변환된 변수로 transform
x_train_transformed = preprocessor_pipe.transform(x_train)
x_test_transformed = preprocessor_pipe.transform(x_test)

In [5]:
models = {'KNeighbors':KNeighborsRegressor(),'Randomforest':RandomForestRegressor(),'ExtraTree':ExtraTreeRegressor(),'MLP':MLPRegressor(), 'SGD':SGDRegressor(),
'SVM':SVR(),'CatBoost':CatBoostRegressor(),'LightGBM':LGBMRegressor(),'XGBoost':XGBRegressor()}

In [None]:
model_score = {} # 모델 성능 저장용 dict

# 모델간 점수 비교
for name, attr in models.items():
    model = attr
    print(f'{name} model tranning ...')
    model.fit(x_train_transformed, y_train)
    predict_y= model.predict(x_test_transformed)
    model_score_list = []
    model_score_list.append(f'{name} RMSE : {(mean_squared_error(y_test, predict_y)**0.5):.1f}')
    model_score_list.append(f'{name} MAE : {(mean_absolute_error(y_test, predict_y)**0.5):.1f}')
    model_score_list.append(f'{name} R2 score : {(r2_score(y_test, predict_y)*100):.2f} %')
    model_score[f'{name}'] = model_score_list
    model_score_list = []

In [7]:
# 모델간 점수 비교 결과 출력
for name, score in model_score.items():
    print(f'{name}')
    print(f'{score}')
    print()

KNeighbors
['KNeighbors RMSE : 49512378.1', 'KNeighbors MAE : 5718.6', 'KNeighbors R2 score : 92.61 %']

Randomforest
['Randomforest RMSE : 45102920.2', 'Randomforest MAE : 5405.4', 'Randomforest R2 score : 93.87 %']

ExtraTree
['ExtraTree RMSE : 50804490.5', 'ExtraTree MAE : 5693.8', 'ExtraTree R2 score : 92.22 %']

MLP
['MLP RMSE : 451292082.4', 'MLP MAE : 20322.2', 'MLP R2 score : -513.97 %']

SGD
['SGD RMSE : 66167763.1', 'SGD MAE : 6945.6', 'SGD R2 score : 86.80 %']

SVM
['SVM RMSE : 186790996.8', 'SVM MAE : 11640.1', 'SVM R2 score : -5.18 %']

CatBoost
['CatBoost RMSE : 40794724.9', 'CatBoost MAE : 5279.9', 'CatBoost R2 score : 94.98 %']

LightGBM
['LightGBM RMSE : 43444983.6', 'LightGBM MAE : 5436.8', 'LightGBM R2 score : 94.31 %']

XGBoost
['XGBoost RMSE : 42993633.3', 'XGBoost MAE : 5424.0', 'XGBoost R2 score : 94.43 %']



In [8]:
# MLP GridSearch
model1 = MLPRegressor()

param_grid_mlp={'hidden_layer_sizes': [(32,64),(128,64),(128,256)],
            'batch_size':  [50,100,200],
            'learning_rate_init': [0.01,0.05],
            'max_iter': [100,300,400]
            }

gs1 = GridSearchCV(model1, param_grid_mlp, scoring='neg_mean_squared_error', n_jobs=-1, cv=10, verbose=False)

gs1.fit(x_train_transformed, y_train)

gs1_test_score = mean_squared_error(y_train, gs1.predict(x_train_transformed))
print(f'Best RMSE {(-gs1.best_score_)**0.5} params {gs1.best_params_}')
print()



Best RMSE 41299066.89491322 params {'batch_size': 50, 'hidden_layer_sizes': (128, 256), 'learning_rate_init': 0.05, 'max_iter': 400}



In [9]:
# Catboost Gridsearch

model2 = CatBoostRegressor()

param_grid_catboost = {
                        'learning_rate': [0.01, 0.05],
                        'iterations': [100, 300, 500],
                        'depth': [6, 8, 10], #In most cases, the optimal depth ranges from 4 to 10. Values in the range from 6 to 10 are recommended.
                        'l2_leaf_reg': [1, 3, 5, 7, 10]
                       }

grid_search_result = model2.grid_search(param_grid_catboost, 
                                       X=x_train_transformed, 
                                       y=y_train, 
                                       plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 444615661.0972139	test: 443840709.0448757	best: 443840709.0448757 (0)	total: 26ms	remaining: 2.57s
1:	learn: 440499945.1689555	test: 439707814.7572554	best: 439707814.7572554 (1)	total: 29.3ms	remaining: 1.44s
2:	learn: 436384247.3306293	test: 435586353.4307979	best: 435586353.4307979 (2)	total: 33.5ms	remaining: 1.08s
3:	learn: 432277763.7520026	test: 431470057.7427204	best: 431470057.7427204 (3)	total: 37.6ms	remaining: 902ms
4:	learn: 428249783.5413616	test: 427428132.4654296	best: 427428132.4654296 (4)	total: 41.5ms	remaining: 789ms
5:	learn: 424247734.4984148	test: 423413555.4157631	best: 423413555.4157631 (5)	total: 45.6ms	remaining: 714ms
6:	learn: 420269817.0161630	test: 419417322.6012632	best: 419417322.6012632 (6)	total: 49.6ms	remaining: 659ms
7:	learn: 416327938.1184555	test: 415475869.3080344	best: 415475869.3080344 (7)	total: 53.6ms	remaining: 616ms
8:	learn: 412442630.7171338	test: 411580569.1406052	best: 411580569.1406052 (8)	total: 57.6ms	remaining: 582ms
9:	

In [10]:
grid_search_result['params'] # Catboost Hyperparameter

{'depth': 10, 'l2_leaf_reg': 1, 'iterations': 500, 'learning_rate': 0.05}

In [11]:
# LightGBM GridSearch
model3 = LGBMRegressor()

param_grid_lgbm={'n_estimators': [100, 200, 300],
            'learning_rate' : [0.01, 0.05],
            'num_leaves' : [30, 50, 100]
            }

            
gs3 = GridSearchCV(model3, param_grid_lgbm, scoring='neg_mean_squared_error', n_jobs=-1, cv=10, verbose=False) # scoring = https://scikit-learn.org/stable/modules/model_evaluation.html#scoring

gs3.fit(x_train_transformed, y_train)

gs3_test_score = mean_squared_error(y_train, gs3.predict(x_train_transformed))
print(f'Best RMSE {(-gs3.best_score_)**0.5} params {gs3.best_params_}')


Best RMSE 39653747.55346821 params {'learning_rate': 0.05, 'n_estimators': 300, 'num_leaves': 100}


In [12]:
# XGBoost GridSearch

model4 = XGBRegressor()

param_grid_xgb={'n_estimators': [50, 100, 200, 250]}

            
gs4 = GridSearchCV(model4, param_grid_xgb, scoring='neg_mean_squared_error', n_jobs=-1, cv=10, verbose=False)

gs4.fit(x_train_transformed, y_train)

gs4_test_score = mean_squared_error(y_train, gs4.predict(x_train_transformed))
print(f'Best RMSE {(-gs4.best_score_)**0.5} params {gs4.best_params_}')

Best RMSE 39725664.19745018 params {'n_estimators': 250}


In [13]:
# RandomForest GridSearch
# n_estimators = number of trees in the foreset

model5 = RandomForestRegressor()

param_grid_rf={'n_estimators': [50, 100, 200]}

            
gs5 = GridSearchCV(model5, param_grid_rf, scoring='neg_mean_squared_error', n_jobs=-1, cv=10, verbose=False)

gs5.fit(x_train_transformed, y_train)

gs5_test_score = mean_squared_error(y_train, gs5.predict(x_train_transformed))
print(f'Best RMSE {(-gs5.best_score_)**0.5} params {gs5.best_params_}')

Best RMSE 44167056.8646646 params {'n_estimators': 200}
