In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

plt.rc('font', family='Malgun Gothic')  # 폰트 지정
plt.rc('axes', unicode_minus=False)  # 마이너스 폰트 설정
%config InlineBackend.figure_format = 'retina'  # 그래프 글씨 뚜렷

# Read the training and test tables from CSV.
df_train = pd.read_csv("../data/df_train.csv")
df_test = pd.read_csv("../data/df_test.csv")

# Build the regression target as a weighted sum of the four casualty-count columns
# (weights 10, 5, 3 and 1 respectively), then drop the raw count columns.
casualty_columns = ['사망자수', '중상자수', '경상자수', '부상자수']
df_train['target'] = (
    df_train['사망자수'] * 10
    + df_train['중상자수'] * 5
    + df_train['경상자수'] * 3
    + df_train['부상자수']
)
df_train = df_train.drop(columns=casualty_columns)

# Separate the target from the features and hold out 30% of the rows (fixed seed).
y = df_train['target']
X = df_train.drop(columns='target')

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# pycaret

In [2]:
# 필요한 라이브러리 불러오기
from pycaret.regression import *

In [3]:
# Initialise the PyCaret regression experiment on the full training frame.
# `session_id` pins the random seed; in the recorded run PyCaret made its own
# train / hold-out split of 27,720 / 11,881 rows.
reg = setup(
    data=df_train,
    target='target',
    session_id=42,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,target
2,Target type,Regression
3,Original data shape,"(39601, 32)"
4,Transformed data shape,"(39601, 32)"
5,Transformed train set shape,"(27720, 32)"
6,Transformed test set shape,"(11881, 32)"
7,Numeric features,31
8,Preprocess,True
9,Imputation type,simple


In [4]:
# Compare the candidate regressors and keep the top-ranked one.
# NOTE(review): the recorded leaderboard is ordered by R2 (descending), which looks
# like the default ranking metric - confirm it matches the metric you are optimising.
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,2.1478,9.8362,3.1319,0.0257,0.4607,0.6276,0.356
ridge,Ridge Regression,2.1486,9.8443,3.1333,0.0249,0.4612,0.6283,0.027
br,Bayesian Ridge,2.1494,9.8465,3.1336,0.0247,0.4611,0.6287,0.043
lr,Linear Regression,2.1492,9.8473,3.1337,0.0246,0.4613,0.6286,0.585
lightgbm,Light Gradient Boosting Machine,2.1532,9.8684,3.1371,0.0225,0.4616,0.6278,0.343
omp,Orthogonal Matching Pursuit,2.1522,9.8742,3.1381,0.0218,0.4616,0.6293,0.026
lar,Least Angle Regression,2.154,9.9268,3.1466,0.0163,0.4618,0.6286,0.028
catboost,CatBoost Regressor,2.1618,9.9566,3.1514,0.0134,0.4636,0.6295,1.267
rf,Random Forest Regressor,2.168,9.9995,3.1582,0.0092,0.465,0.631,0.69
xgboost,Extreme Gradient Boosting,2.1664,10.0009,3.1585,0.0089,0.465,0.6306,0.601


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [5]:
# Train the final model.
# Takes the estimator selected by compare_models (best_model) and fits the final
# version of it; per the PyCaret docs finalize_model refits on the entire dataset,
# hold-out included.
final_model = finalize_model(best_model)

In [9]:
# Persist the finalized pipeline and read it back.
# The target directory is created first so the cell also works on a fresh checkout
# where ../model/ does not exist yet (save_model does not create it).
MODEL_PATH = '../model/003pycaret_regression_model'
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Save the model
save_model(final_model, MODEL_PATH)

# Load the model back from disk
loaded_model = load_model(MODEL_PATH)

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded


# 제출

In [11]:
# Load the sample submission table; its ECLO column is filled with model predictions.
sub_file = pd.read_csv("../data/sample_submission.csv")

In [12]:
# Score the test set with the finalized model and copy the predictions into the
# submission frame. The assignment aligns on index, so sub_file and df_test are
# expected to share the same row index.
test_predictions = predict_model(final_model, data=df_test)
sub_file['ECLO'] = test_predictions['prediction_label']

In [13]:
# Round predictions to the nearest whole number (the column stays floating point).
sub_file['ECLO'] = sub_file['ECLO'].round()

In [14]:
# Distribution of the rounded predictions, most frequent value first.
sub_file['ECLO'].value_counts()

5.0    7969
4.0    2604
6.0     287
3.0      92
7.0      11
Name: ECLO, dtype: int64

In [15]:
# Write the submission file without the index column.
# The output directory is created first: to_csv raises if the parent directory
# is missing, which would break a run on a fresh checkout.
SUBMISSION_PATH = "../data/sub_file/004pycaret.csv"
Path(SUBMISSION_PATH).parent.mkdir(parents=True, exist_ok=True)
sub_file.to_csv(SUBMISSION_PATH, index=False)