In [4]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['font.size'] = 16
plt.rcParams['figure.figsize'] = 20, 10
plt.rcParams['axes.unicode_minus'] = False

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook_connected'

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
# 지표를 하나만 설정할 경우
from sklearn.model_selection import cross_val_score
# 지표를 하나 이상 설정할 경우
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 모델의 최적의 하이퍼파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor

# 차원축소
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 군집화
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth

# import pytagcloud
from IPython.display import Image

# 저장
import pickle

from tqdm import tqdm


# 딥러닝 관련
import tensorflow as tf
from tensorflow import keras

from tensorflow.python.client import device_lib

from pycaret.regression import *

# GPU 사용
gpus = tf.config.experimental.list_physical_devices('GPU')
# gpu가 있다면..
if len(gpus) > 0 :
    try :
        for gpu in gpus :
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e :
        print(e)

In [5]:
tf.__version__

'2.5.0'

In [6]:
keras.__version__

'2.5.0'

In [7]:
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 8850295132635499951,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 4018601984
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 8145828450785417568
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5"]

In [8]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## 예측 코드

In [13]:
train = pd.read_csv('data/train_부산,대구,강원(임대료)_대전,충남(지하철).csv', encoding='cp949')
test = pd.read_csv('data/test_C2152결측채움.csv', encoding='cp949')
submission = pd.read_csv('data/sample_submission.csv')
agegender_info= pd.read_csv('data/age_gender_info.csv')

In [14]:
train.drop(['Unnamed: 0'], axis=1, inplace= True)
test.drop(['Unnamed: 0'], axis=1, inplace= True)

In [17]:
print(train.shape)
print(test.shape)
print(submission.shape)

(2952, 15)
(1022, 14)
(150, 2)


In [19]:
clf = setup(data = train, target = '등록차량수')

Unnamed: 0,Description,Value
0,session_id,8446
1,Target,등록차량수
2,Original Data,"(2952, 15)"
3,Missing Values,True
4,Numeric Features,6
5,Categorical Features,8
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(2066, 1289)"


In [21]:
best_5 = compare_models(sort = 'MAE', n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,8.0894,916.4624,29.4847,0.9947,0.1051,0.0324,0.712
dt,Decision Tree Regressor,12.5229,3436.2791,54.8812,0.9811,0.1465,0.0451,0.041
rf,Random Forest Regressor,27.0199,3528.8537,58.4468,0.9802,0.1447,0.0714,0.501
ridge,Ridge Regression,33.6714,3232.3135,56.3694,0.9816,0.1682,0.0964,0.049
xgboost,Extreme Gradient Boosting,34.8396,2789.7121,52.395,0.9842,0.1567,0.1072,1.089
lightgbm,Light Gradient Boosting Machine,40.5747,4658.5495,67.8216,0.9735,0.1844,0.1192,0.153
omp,Orthogonal Matching Pursuit,42.0538,4455.9879,66.5979,0.9744,0.2348,0.1559,0.263
catboost,CatBoost Regressor,48.979,4517.7937,66.9934,0.9744,0.212,0.1559,1.381
gbr,Gradient Boosting Regressor,82.8912,11750.4508,108.1657,0.933,0.3054,0.2624,0.418
lasso,Lasso Regression,112.5459,22504.7619,149.5215,0.873,0.4279,0.3222,0.466


In [23]:
blended = blend_models(estimator_list = best_5, fold = 10)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,26.1162,3851.9703,62.0642,0.9792,0.1517,0.076
1,21.6158,1894.4554,43.5253,0.9887,0.1941,0.0954
2,21.4663,1897.2622,43.5576,0.9901,0.0989,0.0529
3,22.8815,2323.7855,48.2057,0.9891,0.17,0.0862
4,17.2028,741.7447,27.235,0.9951,0.1298,0.0627
5,20.0284,1126.8473,33.5685,0.9915,0.1252,0.0649
6,20.6631,1207.8578,34.7542,0.9927,0.1098,0.0594
7,20.2477,1279.6066,35.7716,0.9947,0.0885,0.0477
8,19.7327,1212.4165,34.8198,0.9939,0.1065,0.0551
9,23.8288,2064.0731,45.4321,0.9862,0.1217,0.0579


In [24]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,16.4498,875.4254,29.5876,0.9957,0.0822,0.0403


In [25]:
final_model = finalize_model(blended)

In [27]:
test['임대료'] = test['임대료'].replace('-', np.NaN)

In [33]:
test['임대보증금'] = test['임대보증금'].replace('-', np.NaN)

In [34]:
predictions = predict_model(final_model, data = test)
predictions

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,Label
0,C1072,754,아파트,경기도,국민임대,39.79,116,14,H,22830000,189840,0.0,2,683,669.409557
1,C1072,754,아파트,경기도,국민임대,46.81,30,14,A,36048000,249930,0.0,2,683,726.401152
2,C1072,754,아파트,경기도,국민임대,46.90,112,14,H,36048000,249930,0.0,2,683,677.847699
3,C1072,754,아파트,경기도,국민임대,46.90,120,14,H,36048000,249930,0.0,2,683,677.844977
4,C1072,754,아파트,경기도,국민임대,51.46,60,14,H,43497000,296780,0.0,2,683,673.539396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,C1267,675,아파트,경상남도,행복주택,36.77,126,38,L,,,0.0,1,467,375.207412
1018,C2189,382,아파트,전라북도,국민임대,29.19,96,45,H,6872000,106400,0.0,2,300,232.741834
1019,C2189,382,아파트,전라북도,국민임대,29.19,20,45,H,6872000,106400,0.0,2,300,232.086939
1020,C2189,382,아파트,전라북도,국민임대,39.45,202,45,H,13410000,144600,0.0,2,300,234.894066


In [35]:
submission['num'] = predictions['Label']
submission['num']

0      669.409557
1      726.401152
2      677.847699
3      677.844977
4      673.539396
          ...    
145    192.525197
146    192.525416
147    192.412553
148    192.412553
149    192.412553
Name: num, Length: 150, dtype: float64

In [36]:
submission.to_csv('pycaret_test1_0708.csv', index = False)