# 라이브러리

In [45]:
import pandas as pd
import random
import os
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

In [46]:
# 한글 폰트 깨짐 현상 해결을 위한 나눔 폰트 설치
# 코드 1회 실행 후 주석 처리하고 런타임 재시작 및 모두 실행
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

In [47]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

## 데이터 로드

In [48]:
# 경로 설정
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
# csv 파일 읽어오기
# root = '/content/drive/MyDrive/최종프로젝트/교통/분석/2nd_modified_data/'
root = '/content/drive/MyDrive/Project/'
C4_depsouth = pd.read_csv(root + 'Data_sungsan_depsouth.csv', encoding='cp949')
C4_depsouth_test = pd.read_csv(root + 'sungsan_depsouth_test.csv', encoding='cp949')

In [50]:
# 데이터 확인
print(C4_depsouth.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24096 entries, 0 to 24095
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            24096 non-null  object 
 1   dow             24096 non-null  int64  
 2   time            24096 non-null  object 
 3   branch_name     24096 non-null  object 
 4   district_name   24096 non-null  object 
 5   branch_num      24096 non-null  object 
 6   dep_point       24096 non-null  object 
 7   arr_point       24096 non-null  object 
 8   lane            24096 non-null  int64  
 9   distance        24096 non-null  int64  
 10  volume          24096 non-null  int64  
 11  speed           24096 non-null  float64
 12  classification  0 non-null      float64
dtypes: float64(2), int64(4), object(7)
memory usage: 2.4+ MB
None


In [51]:
# 결측치 확인
print(C4_depsouth.isnull().sum())

date                  0
dow                   0
time                  0
branch_name           0
district_name         0
branch_num            0
dep_point             0
arr_point             0
lane                  0
distance              0
volume                0
speed                 0
classification    24096
dtype: int64


In [52]:
# date 컬럼과 time 컬럼을 합쳐 datetime이라는 컬럼 만들기
C4_depsouth['datetime'] = C4_depsouth['date'] + ' ' + C4_depsouth['time']
C4_depsouth_test['datetime'] = C4_depsouth_test['date'] + ' ' + C4_depsouth_test['time']

In [53]:
# date 컬럼과 time 컬럼 제거
# C4_depsouth = C4_depsouth.drop(C4_depsouth[['date', 'time']], axis=1)

In [54]:
# datetime 문자형 컬럼을 datetime 자료형으로 변환
C4_depsouth['datetime'] = pd.to_datetime(C4_depsouth['datetime'])
C4_depsouth_test['datetime'] = pd.to_datetime(C4_depsouth_test['datetime'])

# classification 컬럼값 변경

In [55]:
C4_depsouth.describe()

Unnamed: 0,dow,lane,distance,volume,speed,classification
count,24096.0,24096.0,24096.0,24096.0,24096.0,0.0
mean,3.997012,3.0,1177.0,3006.297643,45.591413,
std,1.998295,0.0,0.0,1161.459452,9.800949,
min,1.0,3.0,1177.0,162.0,8.73,
25%,2.0,3.0,1177.0,2227.0,40.3,
50%,4.0,3.0,1177.0,3535.0,45.89,
75%,6.0,3.0,1177.0,3893.0,52.6,
max,7.0,3.0,1177.0,5120.0,66.71,


In [56]:
C4_depsouth.loc[C4_depsouth['speed'] >= C4_depsouth['speed'].mean(), 'classification'] = 1
C4_depsouth.loc[C4_depsouth['speed'] < 15, 'classification'] = 3
C4_depsouth.loc[(C4_depsouth['speed'] >= 15) 
                      & (C4_depsouth['speed'] < 25) 
                      & ((C4_depsouth['volume'] >= C4_depsouth['volume'].mean())), 'classification'] = 3
C4_depsouth.loc[(C4_depsouth['speed'] >= 15) 
                      & (C4_depsouth['speed'] < C4_depsouth['speed'].mean()) 
                      & ((C4_depsouth['volume'] < C4_depsouth['volume'].mean())), 'classification'] = 2
C4_depsouth.loc[(C4_depsouth['speed'] >= 25) 
                      & (C4_depsouth['speed'] < C4_depsouth['speed'].mean()) 
                      & ((C4_depsouth['volume'] >= C4_depsouth['volume'].mean())), 'classification'] = 2

In [57]:
C4_depsouth['year'] = C4_depsouth['datetime'].dt.year
C4_depsouth['month'] = C4_depsouth['datetime'].dt.month
C4_depsouth['day'] = C4_depsouth['datetime'].dt.day
C4_depsouth['hour'] = C4_depsouth['datetime'].dt.hour

In [58]:
# C4_depsouth = C4_depsouth.drop(['datetime'], axis=1)

In [59]:
C4_depsouth

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2019-01-01,2,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1981,60.80,1.0,2019-01-01 00:00:00,2019,1,1,0
1,2019-01-01,2,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,2415,59.86,1.0,2019-01-01 01:00:00,2019,1,1,1
2,2019-01-01,2,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1553,62.31,1.0,2019-01-01 02:00:00,2019,1,1,2
3,2019-01-01,2,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1027,65.04,1.0,2019-01-01 03:00:00,2019,1,1,3
4,2019-01-01,2,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,918,64.03,1.0,2019-01-01 04:00:00,2019,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24091,2021-09-30,4,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3656,24.12,3.0,2021-09-30 19:00:00,2021,9,30,19
24092,2021-09-30,4,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3784,35.29,2.0,2021-09-30 20:00:00,2021,9,30,20
24093,2021-09-30,4,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3831,40.94,2.0,2021-09-30 21:00:00,2021,9,30,21
24094,2021-09-30,4,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3387,46.57,1.0,2021-09-30 22:00:00,2021,9,30,22


# 월별로 데이터 나누기

In [60]:
C4_ds_month = C4_depsouth['month']
C4_ds_month_list  = sorted(set(C4_ds_month))
C4_ds_month_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [61]:
month_data = []
for i in range(0, len(C4_ds_month_list)):
  month = C4_depsouth[C4_depsouth['month'] == C4_ds_month_list[i]]
  month = month.reset_index(drop=True)
  month_data.append(month)

In [62]:
train_jan = month_data[0]
train_feb = month_data[1]
train_mar = month_data[2]
train_apr = month_data[3]
train_may = month_data[4]
train_jun = month_data[5]
train_jul = month_data[6]
train_aug = month_data[7]
train_sep = month_data[8]
train_oct = month_data[9]
train_nov = month_data[10]
train_dec = month_data[11]

In [63]:
C4_depsouth_test['year'] = C4_depsouth_test['datetime'].dt.year
C4_depsouth_test['month'] = C4_depsouth_test['datetime'].dt.month
C4_depsouth_test['day'] = C4_depsouth_test['datetime'].dt.day
C4_depsouth_test['hour'] = C4_depsouth_test['datetime'].dt.hour

In [64]:
C4_ds_test_mon = C4_depsouth_test['month']
C4_ds_test_mon_list  = sorted(set(C4_ds_test_mon))
C4_ds_test_mon_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [65]:
month_test_data = []
for i in range(0, len(C4_ds_month_list)):
  month = C4_depsouth_test[C4_depsouth_test['month'] == C4_ds_test_mon_list[i]]
  month = month.reset_index(drop=True)
  month_test_data.append(month)

In [66]:
test_jan = month_test_data[0]
test_feb = month_test_data[1]
test_mar = month_test_data[2]
test_apr = month_test_data[3]
test_may = month_test_data[4]
test_jun = month_test_data[5]
test_jul = month_test_data[6]
test_aug = month_test_data[7]
test_sep = month_test_data[8]
test_oct = month_test_data[9]
test_nov = month_test_data[10]
test_dec = month_test_data[11]

In [67]:
test_dec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            744 non-null    object        
 1   dow             744 non-null    int64         
 2   time            744 non-null    object        
 3   branch_name     744 non-null    object        
 4   district_name   744 non-null    object        
 5   branch_num      744 non-null    object        
 6   dep_point       744 non-null    object        
 7   arr_point       744 non-null    object        
 8   lane            744 non-null    int64         
 9   distance        744 non-null    int64         
 10  volume          744 non-null    int64         
 11  speed           744 non-null    float64       
 12  classification  0 non-null      float64       
 13  datetime        744 non-null    datetime64[ns]
 14  year            744 non-null    int64         
 15  month 

# 1월 데이터 머신러닝

## 데이터 가공

In [68]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [69]:
X1 = train_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [70]:
y1 = train_jan[['datetime', 'classification']]
X1_1 = X1.drop(columns = ['datetime', 'classification'])
y1_1 = X1.datetime

In [71]:
X1_1_scaler = scaler.fit_transform(X1_1)

In [72]:
X1_1_sc = pd.DataFrame(X1_1_scaler)
X1_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1 = pd.concat([y1_1, X1_1_sc], axis = 1)

In [73]:
X1_test = test_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [74]:
y1_test = test_jan[['datetime', 'classification']]
X1_1_test = X1_test.drop(columns = ['datetime', 'classification'])
y1_1_test = X1_test.datetime

In [75]:
X1_1_test_scaler = scaler.fit_transform(X1_1_test)

In [76]:
X1_1_test_sc = pd.DataFrame(X1_1_test_scaler)
X1_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1_test = pd.concat([y1_1_test, X1_1_test_sc], axis = 1)
X1_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-01-01 00:00:00,0.833333,0.314393,0.935862,0.0,0.0,0.0,0.000000
1,2022-01-01 01:00:00,0.833333,0.345932,0.933907,0.0,0.0,0.0,0.043478
2,2022-01-01 02:00:00,0.833333,0.170213,0.980641,0.0,0.0,0.0,0.086957
3,2022-01-01 03:00:00,0.833333,0.095620,0.967149,0.0,0.0,0.0,0.130435
4,2022-01-01 04:00:00,0.833333,0.082103,0.932538,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-01-31 19:00:00,0.000000,0.679850,0.674423,0.0,0.0,1.0,0.826087
740,2022-01-31 20:00:00,0.000000,0.748436,0.694759,0.0,0.0,1.0,0.869565
741,2022-01-31 21:00:00,0.000000,0.661827,0.678530,0.0,0.0,1.0,0.913043
742,2022-01-31 22:00:00,0.000000,0.387735,0.727610,0.0,0.0,1.0,0.956522


## LightGBM

In [77]:
# optuna 설치
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [78]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error

In [79]:
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score

In [80]:
X1 = X1.drop(columns = ['datetime'])
y1 = y1.drop(columns = ['datetime'])
X1_test = X1_test.drop(columns = ['datetime'])
y1_test = y1_test.drop(columns = ['datetime'])

In [81]:
# LigthGBM 하이퍼파라미터 값 지정
def objectiveLGBM(trial: Trial, X, y):
    param = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'learning_rate': 0.01,
        'n_estimators': trial.suggest_int('n_estimators', 700, 3000),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'gpu_use_dp':True
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

     # 학습 모델 생성
    model = LGBMClassifier(**param)
    lgb_model = model.fit(X_train, y_train, verbose=True) # 학습 진행
    train_preds = lgb_model.predict(X_train)
    test_preds = lgb_model.predict(X_test)

    # 모델 성능 확인
    train_precision = precision_score(y_test, test_preds, average= "macro")
    
    return train_precision

In [82]:
'''
study1 = optuna.create_study(direction='maximize',sampler=TPESampler())
study1.optimize(lambda trial : objectiveLGBM(trial, X1, y1), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study1.best_trial.value,study1.best_trial.params))
'''

"\nstudy1 = optuna.create_study(direction='maximize',sampler=TPESampler())\nstudy1.optimize(lambda trial : objectiveLGBM(trial, X1, y1), n_trials=20)\nprint('Best trial: score {},\nparams {}'.format(study1.best_trial.value,study1.best_trial.params))\n"

In [83]:
'''
optuna.visualization.plot_param_importances(study1) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study1) # 최적화 과정 시각화
'''

'\noptuna.visualization.plot_param_importances(study1) # 파라미터 중요도 확인 그래프\noptuna.visualization.plot_optimization_history(study1) # 최적화 과정 시각화\n'

In [84]:
X1_train, X1_val, y1_train, y1_val = train_test_split(X1, y1, test_size = 0.2, random_state = 42)

In [85]:
X1_train.shape, X1_val.shape, y1_train.shape, y1_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [86]:
model = LGBMClassifier(
num_leaves = 271, 
n_estimators = 711, 
feature_fraction = 0.5721850715047968, 
bagging_fraction = 0.7857814996942327, 
bagging_freq = 1,
min_child_samples = 28,
objective =  "multiclass",
boosting_type =  "gbdt"
)

In [87]:
model1 = model.fit(X1_train, y1_train,
          eval_set = [(X1_train, y1_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.721264	training's multi_logloss: 0.721264
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.609195	training's multi_logloss: 0.609195
[3]	training's multi_logloss: 0.544243	training's multi_logloss: 0.544243
[4]	training's multi_logloss: 0.494918	training's multi_logloss: 0.494918
[5]	training's multi_logloss: 0.437854	training's multi_logloss: 0.437854
[6]	training's multi_logloss: 0.402985	training's multi_logloss: 0.402985
[7]	training's multi_logloss: 0.367397	training's multi_logloss: 0.367397
[8]	training's multi_logloss: 0.33951	training's multi_logloss: 0.33951
[9]	training's multi_logloss: 0.322243	training's multi_logloss: 0.322243
[10]	training's multi_logloss: 0.297031	training's multi_logloss: 0.297031
[11]	training's multi_logloss: 0.278101	training's multi_logloss: 0.278101
[12]	training's multi_logloss: 0.257532	training's multi_logloss: 0.257532
[13]	training's multi_logloss: 0.239419	training

In [88]:
train1_preds = model1.predict(X1_train)
val1_preds = model1.predict(X1_val)

In [89]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [90]:
get_clf_eval(y1_train, train1_preds)
get_clf_eval(y1_val, val1_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9868
재현율: 0.9959


In [91]:
preds_1 = model1.predict(X1_test)
preds_1

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 1., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 2., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 1., 2., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 1.

In [92]:
test_jan['classification'] = preds_1
test_jan

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-01-01,6,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1585,57.65,1.0,2022-01-01 00:00:00,2022,1,1,0
1,2022-01-01,6,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1711,57.55,1.0,2022-01-01 01:00:00,2022,1,1,1
2,2022-01-01,6,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1009,59.94,1.0,2022-01-01 02:00:00,2022,1,1,2
3,2022-01-01,6,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,711,59.25,1.0,2022-01-01 03:00:00,2022,1,1,3
4,2022-01-01,6,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,657,57.48,1.0,2022-01-01 04:00:00,2022,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-01-31,1,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3045,44.28,1.0,2022-01-31 19:00:00,2022,1,31,19
740,2022-01-31,1,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3319,45.32,1.0,2022-01-31 20:00:00,2022,1,31,20
741,2022-01-31,1,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,2973,44.49,1.0,2022-01-31 21:00:00,2022,1,31,21
742,2022-01-31,1,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1878,47.00,1.0,2022-01-31 22:00:00,2022,1,31,22


# 2월 데이터 머신러닝

## 데이터 가공

In [93]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [94]:
X2 = train_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [95]:
y2 = train_feb[['datetime', 'classification']]
X2_1 = X2.drop(columns = ['datetime', 'classification'])
y2_1 = X2.datetime

In [96]:
X2_1_scaler = scaler.fit_transform(X2_1)

In [97]:
X2_1_sc = pd.DataFrame(X2_1_scaler)
X2_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2 = pd.concat([y2_1, X2_1_sc], axis = 1)
X2

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-02-01 00:00:00,0.666667,0.508818,0.891959,0.0,0.0,0.000000,0.000000
1,2019-02-01 01:00:00,0.666667,0.339212,0.948247,0.0,0.0,0.000000,0.043478
2,2019-02-01 02:00:00,0.666667,0.221206,0.964742,0.0,0.0,0.000000,0.086957
3,2019-02-01 03:00:00,0.666667,0.142826,0.941443,0.0,0.0,0.000000,0.130435
4,2019-02-01 04:00:00,0.666667,0.199434,0.881856,0.0,0.0,0.000000,0.173913
...,...,...,...,...,...,...,...,...
2035,2021-02-28 19:00:00,1.000000,0.641411,0.443299,1.0,0.0,0.964286,0.826087
2036,2021-02-28 20:00:00,1.000000,0.576747,0.515258,1.0,0.0,0.964286,0.869565
2037,2021-02-28 21:00:00,1.000000,0.623993,0.512577,1.0,0.0,0.964286,0.913043
2038,2021-02-28 22:00:00,1.000000,0.588940,0.483505,1.0,0.0,0.964286,0.956522


In [98]:
X2_test = test_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [99]:
y2_test = test_feb[['datetime', 'classification']]
X2_1_test = X2_test.drop(columns = ['datetime', 'classification'])
y2_1_test = X2_test.datetime

In [100]:
X2_1_test_scaler = scaler.fit_transform(X2_1_test)

In [101]:
X2_1_test_sc = pd.DataFrame(X2_1_test_scaler)
X2_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2_test = pd.concat([y2_1_test, X2_1_test_sc], axis = 1)
X2_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-02-01 00:00:00,0.166667,0.112972,0.876391,0.0,0.0,0.0,0.000000
1,2022-02-01 01:00:00,0.166667,0.065094,0.900308,0.0,0.0,0.0,0.043478
2,2022-02-01 02:00:00,0.166667,0.030660,0.876154,0.0,0.0,0.0,0.086957
3,2022-02-01 03:00:00,0.166667,0.000000,0.798958,0.0,0.0,0.0,0.130435
4,2022-02-01 04:00:00,0.166667,0.013443,0.687426,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
667,2022-02-28 19:00:00,0.000000,0.833491,0.093535,0.0,0.0,1.0,0.826087
668,2022-02-28 20:00:00,0.000000,0.848585,0.376036,0.0,0.0,1.0,0.869565
669,2022-02-28 21:00:00,0.000000,0.785613,0.541321,0.0,0.0,1.0,0.913043
670,2022-02-28 22:00:00,0.000000,0.753302,0.658537,0.0,0.0,1.0,0.956522


## LightGBM

In [102]:
X2 = X2.drop(columns = ['datetime'])
y2 = y2.drop(columns = ['datetime'])
X2_test = X2_test.drop(columns = ['datetime'])
y2_test = y2_test.drop(columns = ['datetime'])

In [103]:
'''
study2 = optuna.create_study(direction='maximize',sampler=TPESampler())
study2.optimize(lambda trial : objectiveLGBM(trial, X2, y2), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study2.best_trial.value,study2.best_trial.params))
'''

"\nstudy2 = optuna.create_study(direction='maximize',sampler=TPESampler())\nstudy2.optimize(lambda trial : objectiveLGBM(trial, X2, y2), n_trials=20)\nprint('Best trial: score {},\nparams {}'.format(study2.best_trial.value,study2.best_trial.params))\n"

In [104]:
'''
optuna.visualization.plot_param_importances(study2) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study2) # 최적화 과정 시각화
'''

'\noptuna.visualization.plot_param_importances(study2) # 파라미터 중요도 확인 그래프\noptuna.visualization.plot_optimization_history(study2) # 최적화 과정 시각화\n'

In [105]:
X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [106]:
X2_train.shape, X2_val.shape, y2_train.shape, y2_val.shape

((1632, 7), (408, 7), (1632, 1), (408, 1))

In [107]:
model = LGBMClassifier(
num_leaves = 164, 
n_estimators = 713, 
feature_fraction = 0.6863700410208164, 
bagging_fraction = 0.7502349131140396, 
bagging_freq = 1,
min_child_samples = 27,
objective =  "multiclass",
boosting_type =  "gbdt"
)

In [108]:
model2 = model.fit(X2_train, y2_train,
          eval_set = [(X2_train, y2_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.666049	training's multi_logloss: 0.666049
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.581757	training's multi_logloss: 0.581757
[3]	training's multi_logloss: 0.502235	training's multi_logloss: 0.502235
[4]	training's multi_logloss: 0.441748	training's multi_logloss: 0.441748
[5]	training's multi_logloss: 0.391117	training's multi_logloss: 0.391117
[6]	training's multi_logloss: 0.348944	training's multi_logloss: 0.348944
[7]	training's multi_logloss: 0.317776	training's multi_logloss: 0.317776
[8]	training's multi_logloss: 0.290575	training's multi_logloss: 0.290575
[9]	training's multi_logloss: 0.26139	training's multi_logloss: 0.26139
[10]	training's multi_logloss: 0.241162	training's multi_logloss: 0.241162
[11]	training's multi_logloss: 0.216668	training's multi_logloss: 0.216668
[12]	training's multi_logloss: 0.198848	training's multi_logloss: 0.198848
[13]	training's multi_logloss: 0.183091	training

In [109]:
train2_preds = model2.predict(X2_train)
val2_preds = model2.predict(X2_val)

In [110]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [111]:
get_clf_eval(y2_train, train2_preds)
get_clf_eval(y2_val, val2_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9981
재현율: 0.9985


In [112]:
preds_2= model2.predict(X2_test)
preds_2

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 3., 2., 3., 3., 3., 2.,
       2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       3., 3., 2., 2., 2., 3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3.,
       3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2.

In [113]:
test_feb['classification'] = preds_2
test_feb

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-02-01,2,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,753,57.96,1.0,2022-02-01 00:00:00,2022,2,1,0
1,2022-02-01,2,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,550,58.97,1.0,2022-02-01 01:00:00,2022,2,1,1
2,2022-02-01,2,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,404,57.95,1.0,2022-02-01 02:00:00,2022,2,1,2
3,2022-02-01,2,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,274,54.69,1.0,2022-02-01 03:00:00,2022,2,1,3
4,2022-02-01,2,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,331,49.98,1.0,2022-02-01 04:00:00,2022,2,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-02-28,1,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3808,24.90,3.0,2022-02-28 19:00:00,2022,2,28,19
668,2022-02-28,1,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3872,36.83,2.0,2022-02-28 20:00:00,2022,2,28,20
669,2022-02-28,1,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3605,43.81,2.0,2022-02-28 21:00:00,2022,2,28,21
670,2022-02-28,1,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3468,48.76,1.0,2022-02-28 22:00:00,2022,2,28,22


# 3월 데이터 머신러닝

## 데이터 가공

In [114]:
X3 = train_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [115]:
y3 = train_mar[['datetime', 'classification']]
X3_1 = X3.drop(columns = ['datetime', 'classification'])
y3_1 = X3.datetime

In [116]:
X3_1_scaler = scaler.fit_transform(X3_1)

In [117]:
X3_1_sc = pd.DataFrame(X3_1_scaler)
X3_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3 = pd.concat([y3_1, X3_1_sc], axis = 1)
X3

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-03-01 00:00:00,0.666667,0.569626,0.870859,0.0,0.0,0.0,0.000000
1,2019-03-01 01:00:00,0.666667,0.445132,0.949092,0.0,0.0,0.0,0.043478
2,2019-03-01 02:00:00,0.666667,0.297786,0.974172,0.0,0.0,0.0,0.086957
3,2019-03-01 03:00:00,0.666667,0.219234,0.993824,0.0,0.0,0.0,0.130435
4,2019-03-01 04:00:00,0.666667,0.249226,0.951900,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-03-31 19:00:00,0.333333,0.802666,0.469774,1.0,0.0,1.0,0.826087
2228,2021-03-31 20:00:00,0.333333,0.818615,0.545199,1.0,0.0,1.0,0.869565
2229,2021-03-31 21:00:00,0.333333,0.813616,0.552124,1.0,0.0,1.0,0.913043
2230,2021-03-31 22:00:00,0.333333,0.814568,0.641961,1.0,0.0,1.0,0.956522


In [118]:
X3_test = test_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [119]:
y3_test = test_mar[['datetime', 'classification']]
X3_1_test = X3_test.drop(columns = ['datetime', 'classification'])
y3_1_test = X3_test.datetime

In [120]:
X3_1_test_scaler = scaler.fit_transform(X3_1_test)

In [121]:
X3_1_test_sc = pd.DataFrame(X3_1_test_scaler)
X3_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3_test = pd.concat([y3_1_test, X3_1_test_sc], axis = 1)
X3_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-03-01 00:00:00,0.166667,0.308335,0.920380,0.0,0.0,0.0,0.000000
1,2022-03-01 01:00:00,0.166667,0.195128,0.961589,0.0,0.0,0.0,0.043478
2,2022-03-01 02:00:00,0.166667,0.112491,0.930822,0.0,0.0,0.0,0.086957
3,2022-03-01 03:00:00,0.166667,0.078338,0.923177,0.0,0.0,0.0,0.130435
4,2022-03-01 04:00:00,0.166667,0.111775,0.852881,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-03-31 19:00:00,0.500000,0.771197,0.405370,0.0,0.0,1.0,0.826087
740,2022-03-31 20:00:00,0.500000,0.740387,0.629685,0.0,0.0,1.0,0.869565
741,2022-03-31 21:00:00,0.500000,0.741342,0.698863,0.0,0.0,1.0,0.913043
742,2022-03-31 22:00:00,0.500000,0.689276,0.763379,0.0,0.0,1.0,0.956522


## LightGBM

In [122]:
X3 = X3.drop(columns = ['datetime'])
y3 = y3.drop(columns = ['datetime'])
X3_test = X3_test.drop(columns = ['datetime'])
y3_test = y3_test.drop(columns = ['datetime'])

In [123]:
'''
study3 = optuna.create_study(direction='maximize',sampler=TPESampler())
study3.optimize(lambda trial : objectiveLGBM(trial, X3, y3), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study3.best_trial.value,study3.best_trial.params))
'''

"\nstudy3 = optuna.create_study(direction='maximize',sampler=TPESampler())\nstudy3.optimize(lambda trial : objectiveLGBM(trial, X3, y3), n_trials=20)\nprint('Best trial: score {},\nparams {}'.format(study3.best_trial.value,study3.best_trial.params))\n"

In [124]:
'''
optuna.visualization.plot_param_importances(study3) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study3) # 최적화 과정 시각화
'''

'\noptuna.visualization.plot_param_importances(study3) # 파라미터 중요도 확인 그래프\noptuna.visualization.plot_optimization_history(study3) # 최적화 과정 시각화\n'

In [125]:
X3_train, X3_val, y3_train, y3_val = train_test_split(X3, y3, test_size = 0.2, random_state = 42)

In [126]:
X3_train.shape, X3_val.shape, y3_train.shape, y3_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [127]:
model = LGBMClassifier(
num_leaves = 245, 
n_estimators = 1360, 
feature_fraction = 0.5525604237403039, 
bagging_fraction = 0.8334223646708685, 
bagging_freq = 1,
min_child_samples = 21,
objective =  "multiclass",
boosting_type =  "gbdt"
)

In [128]:
model3 = model.fit(X3_train, y3_train,
          eval_set = [(X3_train, y3_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.663221	training's multi_logloss: 0.663221
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.552986	training's multi_logloss: 0.552986
[3]	training's multi_logloss: 0.495394	training's multi_logloss: 0.495394
[4]	training's multi_logloss: 0.449406	training's multi_logloss: 0.449406
[5]	training's multi_logloss: 0.397058	training's multi_logloss: 0.397058
[6]	training's multi_logloss: 0.363707	training's multi_logloss: 0.363707
[7]	training's multi_logloss: 0.329926	training's multi_logloss: 0.329926
[8]	training's multi_logloss: 0.302529	training's multi_logloss: 0.302529
[9]	training's multi_logloss: 0.284943	training's multi_logloss: 0.284943
[10]	training's multi_logloss: 0.260326	training's multi_logloss: 0.260326
[11]	training's multi_logloss: 0.243462	training's multi_logloss: 0.243462
[12]	training's multi_logloss: 0.223014	training's multi_logloss: 0.223014
[13]	training's multi_logloss: 0.206034	traini

In [129]:
train3_preds = model3.predict(X3_train)
val3_preds = model3.predict(X3_val)

In [130]:
get_clf_eval(y3_train, train3_preds)
get_clf_eval(y3_val, val3_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [131]:
preds_3= model3.predict(X3_test)
preds_3

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2.

In [132]:
test_mar['classification'] = preds_3
test_mar

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-03-01,2,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1655,58.87,1.0,2022-03-01 00:00:00,2022,3,1,0
1,2022-03-01,2,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1181,61.08,1.0,2022-03-01 01:00:00,2022,3,1,1
2,2022-03-01,2,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,835,59.43,1.0,2022-03-01 02:00:00,2022,3,1,2
3,2022-03-01,2,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,692,59.02,1.0,2022-03-01 03:00:00,2022,3,1,3
4,2022-03-01,2,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,832,55.25,1.0,2022-03-01 04:00:00,2022,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-03-31,4,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3593,31.25,2.0,2022-03-31 19:00:00,2022,3,31,19
740,2022-03-31,4,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3464,43.28,2.0,2022-03-31 20:00:00,2022,3,31,20
741,2022-03-31,4,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3468,46.99,1.0,2022-03-31 21:00:00,2022,3,31,21
742,2022-03-31,4,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3250,50.45,1.0,2022-03-31 22:00:00,2022,3,31,22


# 4월 데이터 머신러닝

## 데이터 가공

In [133]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [134]:
X4 = train_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [135]:
y4 = train_apr[['datetime', 'classification']]
X4_1 = X4.drop(columns = ['datetime', 'classification'])
y4_1 = X4.datetime

In [136]:
X4_1_scaler = scaler.fit_transform(X4_1)

In [137]:
X4_1_sc = pd.DataFrame(X4_1_scaler)
X4_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4 = pd.concat([y4_1, X4_1_sc], axis = 1)
X4

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-04-01 00:00:00,0.000000,0.255092,0.919963,0.0,0.0,0.0,0.000000
1,2019-04-01 01:00:00,0.000000,0.123909,0.897575,0.0,0.0,0.0,0.043478
2,2019-04-01 02:00:00,0.000000,0.080262,0.880037,0.0,0.0,0.0,0.086957
3,2019-04-01 03:00:00,0.000000,0.059893,0.879478,0.0,0.0,0.0,0.130435
4,2019-04-01 04:00:00,0.000000,0.173375,0.856530,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-04-30 19:00:00,0.666667,0.869787,0.336194,1.0,0.0,1.0,0.826087
2156,2021-04-30 20:00:00,0.666667,0.880213,0.297948,1.0,0.0,1.0,0.869565
2157,2021-04-30 21:00:00,0.666667,0.921678,0.320149,1.0,0.0,1.0,0.913043
2158,2021-04-30 22:00:00,0.666667,0.872696,0.444590,1.0,0.0,1.0,0.956522


In [138]:
X4_test = test_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [139]:
y4_test = test_apr[['datetime', 'classification']]
X4_1_test = X4_test.drop(columns = ['datetime', 'classification'])
y4_1_test = X4_test.datetime

In [140]:
X4_1_test_scaler = scaler.fit_transform(X4_1_test)

In [141]:
X4_1_test_sc = pd.DataFrame(X4_1_test_scaler)
X4_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4_test = pd.concat([y4_1_test, X4_1_test_sc], axis = 1)
X4_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-04-01 00:00:00,0.666667,0.325082,0.942645,0.0,0.0,0.0,0.000000
1,2022-04-01 01:00:00,0.666667,0.193368,0.927046,0.0,0.0,0.0,0.043478
2,2022-04-01 02:00:00,0.666667,0.125175,0.896808,0.0,0.0,0.0,0.086957
3,2022-04-01 03:00:00,0.666667,0.101588,0.874250,0.0,0.0,0.0,0.130435
4,2022-04-01 04:00:00,0.666667,0.216721,0.855052,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-04-30 19:00:00,0.833333,0.817609,0.486441,0.0,0.0,1.0,0.826087
716,2022-04-30 20:00:00,0.833333,0.794722,0.352292,0.0,0.0,1.0,0.869565
717,2022-04-30 21:00:00,0.833333,0.712985,0.227502,0.0,0.0,1.0,0.913043
718,2022-04-30 22:00:00,0.833333,0.641990,0.399328,0.0,0.0,1.0,0.956522


## LightGBM

In [142]:
X4 = X4.drop(columns = ['datetime'])
y4 = y4.drop(columns = ['datetime'])
X4_test = X4_test.drop(columns = ['datetime'])
y4_test = y4_test.drop(columns = ['datetime'])

In [143]:
'''
study4 = optuna.create_study(direction='maximize',sampler=TPESampler())
study4.optimize(lambda trial : objectiveLGBM(trial, X4, y4), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study4.best_trial.value,study4.best_trial.params))
'''

"\nstudy4 = optuna.create_study(direction='maximize',sampler=TPESampler())\nstudy4.optimize(lambda trial : objectiveLGBM(trial, X4, y4), n_trials=20)\nprint('Best trial: score {},\nparams {}'.format(study4.best_trial.value,study4.best_trial.params))\n"

In [144]:
'''
optuna.visualization.plot_param_importances(study4) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study4) # 최적화 과정 시각화
'''

'\noptuna.visualization.plot_param_importances(study4) # 파라미터 중요도 확인 그래프\noptuna.visualization.plot_optimization_history(study4) # 최적화 과정 시각화\n'

In [145]:
X4_train, X4_val, y4_train, y4_val = train_test_split(X4, y4, test_size = 0.2, random_state = 42)

In [146]:
X4_train.shape, X4_val.shape, y4_train.shape, y4_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [147]:
model = LGBMClassifier(
num_leaves = 430, 
n_estimators = 2955, 
feature_fraction = 0.7515808824316346, 
bagging_fraction = 0.9064340565522693, 
bagging_freq = 6,
min_child_samples = 6,
objective =  "multiclass",
boosting_type =  "gbdt"
)

In [148]:
model4 = model.fit(X4_train, y4_train,
          eval_set = [(X4_train, y4_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.631648	training's multi_logloss: 0.631648
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.549629	training's multi_logloss: 0.549629
[3]	training's multi_logloss: 0.478834	training's multi_logloss: 0.478834
[4]	training's multi_logloss: 0.421187	training's multi_logloss: 0.421187
[5]	training's multi_logloss: 0.372214	training's multi_logloss: 0.372214
[6]	training's multi_logloss: 0.330505	training's multi_logloss: 0.330505
[7]	training's multi_logloss: 0.29693	training's multi_logloss: 0.29693
[8]	training's multi_logloss: 0.267945	training's multi_logloss: 0.267945
[9]	training's multi_logloss: 0.239398	training's multi_logloss: 0.239398
[10]	training's multi_logloss: 0.218318	training's multi_logloss: 0.218318
[11]	training's multi_logloss: 0.19562	training's multi_logloss: 0.19562
[12]	training's multi_logloss: 0.177301	training's multi_logloss: 0.177301
[13]	training's multi_logloss: 0.160764	training's

In [149]:
train4_preds = model4.predict(X4_train)
val4_preds = model4.predict(X4_val)

In [150]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [151]:
get_clf_eval(y4_train, train4_preds)
get_clf_eval(y4_val, val4_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9651
재현율: 0.9651


In [152]:
preds_4= model4.predict(X4_test)
preds_4

array([1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       3., 3., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 3., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 3., 2., 2., 2.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3.,
       3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 3., 2., 2.,
       2., 3., 2., 3., 2., 3., 3., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [153]:
test_apr['classification'] = preds_4
test_apr

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-04-01,5,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1602,60.71,1.0,2022-04-01 00:00:00,2022,4,1,0
1,2022-04-01,5,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1038,60.06,1.0,2022-04-01 01:00:00,2022,4,1,1
2,2022-04-01,5,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,746,58.80,1.0,2022-04-01 02:00:00,2022,4,1,2
3,2022-04-01,5,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,645,57.86,1.0,2022-04-01 03:00:00,2022,4,1,3
4,2022-04-01,5,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1138,57.06,1.0,2022-04-01 04:00:00,2022,4,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-04-30,6,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3711,41.70,2.0,2022-04-30 19:00:00,2022,4,30,19
716,2022-04-30,6,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3613,36.11,2.0,2022-04-30 20:00:00,2022,4,30,20
717,2022-04-30,6,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3263,30.91,2.0,2022-04-30 21:00:00,2022,4,30,21
718,2022-04-30,6,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,2959,38.07,2.0,2022-04-30 22:00:00,2022,4,30,22


# 5월 데이터 머신러닝

## 데이터 가공

In [154]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [155]:
X5 = train_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [156]:
y5 = train_may[['datetime', 'classification']]
X5_1 = X5.drop(columns = ['datetime', 'classification'])
y5_1 = X5.datetime

In [157]:
X5_1_scaler = scaler.fit_transform(X5_1)

In [158]:
X5_1_sc = pd.DataFrame(X5_1_scaler)
X5_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5 = pd.concat([y5_1, X5_1_sc], axis = 1)
X5

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-05-01 00:00:00,0.333333,0.574078,0.902803,0.0,0.0,0.0,0.000000
1,2019-05-01 01:00:00,0.333333,0.432182,0.975207,0.0,0.0,0.0,0.043478
2,2019-05-01 02:00:00,0.333333,0.313239,1.000000,0.0,0.0,0.0,0.086957
3,2019-05-01 03:00:00,0.333333,0.223974,0.955803,0.0,0.0,0.0,0.130435
4,2019-05-01 04:00:00,0.333333,0.238117,0.916457,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-05-31 19:00:00,0.000000,0.790401,0.269853,1.0,0.0,1.0,0.826087
2228,2021-05-31 20:00:00,0.000000,0.799444,0.383220,1.0,0.0,1.0,0.869565
2229,2021-05-31 21:00:00,0.000000,0.818456,0.598275,1.0,0.0,1.0,0.913043
2230,2021-05-31 22:00:00,0.000000,0.815210,0.642472,1.0,0.0,1.0,0.956522


In [159]:
X5_test = test_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [160]:
y5_test = test_may[['datetime', 'classification']]
X5_1_test = X5_test.drop(columns = ['datetime', 'classification'])
y5_1_test = X5_test.datetime

In [161]:
X5_1_test_scaler = scaler.fit_transform(X5_1_test)

In [162]:
X5_1_test_sc = pd.DataFrame(X5_1_test_scaler)
X5_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5_test = pd.concat([y5_1_test, X5_1_test_sc], axis = 1)
X5_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-05-01 00:00:00,1.000000,0.338473,0.917772,0.0,0.0,0.0,0.000000
1,2022-05-01 01:00:00,1.000000,0.223561,0.955791,0.0,0.0,0.0,0.043478
2,2022-05-01 02:00:00,1.000000,0.139966,0.974801,0.0,0.0,0.0,0.086957
3,2022-05-01 03:00:00,1.000000,0.089376,0.966844,0.0,0.0,0.0,0.130435
4,2022-05-01 04:00:00,1.000000,0.084076,0.928603,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-05-31 19:00:00,0.166667,0.882197,0.157825,0.0,0.0,1.0,0.826087
740,2022-05-31 20:00:00,0.166667,0.799085,0.221485,0.0,0.0,1.0,0.869565
741,2022-05-31 21:00:00,0.166667,0.799807,0.505747,0.0,0.0,1.0,0.913043
742,2022-05-31 22:00:00,0.166667,0.796194,0.458444,0.0,0.0,1.0,0.956522


## LightGBM

In [163]:
X5 = X5.drop(columns = ['datetime'])
y5 = y5.drop(columns = ['datetime'])
X5_test = X5_test.drop(columns = ['datetime'])
y5_test = y5_test.drop(columns = ['datetime'])

In [164]:
# study5 = optuna.create_study(direction='maximize',sampler=TPESampler())
# study5.optimize(lambda trial : objectiveLGBM(trial, X5, y5), n_trials=20)
# print('Best trial: score {},\nparams {}'.format(study5.best_trial.value,study5.best_trial.params))

In [165]:
# optuna.visualization.plot_param_importances(study5) # 파라미터 중요도 확인 그래프
# optuna.visualization.plot_optimization_history(study5) # 최적화 과정 시각화

In [166]:
X5_train, X5_val, y5_train, y5_val = train_test_split(X5, y5, test_size = 0.2, random_state = 42)

In [167]:
X5_train.shape, X5_val.shape, y5_train.shape, y5_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [168]:
model = LGBMClassifier(
num_leaves = 314, 
n_estimators = 2642, 
feature_fraction = 0.49438393519849194, 
bagging_fraction = 0.8502710723691365, 
bagging_freq = 6,
min_child_samples = 40,
objective =  "multiclass",
boosting_type =  "gbdt"
)

In [169]:
model5 = model.fit(X5_train, y5_train,
          eval_set = [(X5_train, y5_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.756551	training's multi_logloss: 0.756551
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.7069	training's multi_logloss: 0.7069
[3]	training's multi_logloss: 0.631773	training's multi_logloss: 0.631773
[4]	training's multi_logloss: 0.621038	training's multi_logloss: 0.621038
[5]	training's multi_logloss: 0.568768	training's multi_logloss: 0.568768
[6]	training's multi_logloss: 0.523486	training's multi_logloss: 0.523486
[7]	training's multi_logloss: 0.479612	training's multi_logloss: 0.479612
[8]	training's multi_logloss: 0.441282	training's multi_logloss: 0.441282
[9]	training's multi_logloss: 0.41038	training's multi_logloss: 0.41038
[10]	training's multi_logloss: 0.379743	training's multi_logloss: 0.379743
[11]	training's multi_logloss: 0.362235	training's multi_logloss: 0.362235
[12]	training's multi_logloss: 0.34001	training's multi_logloss: 0.34001
[13]	training's multi_logloss: 0.313785	training's mul

In [170]:
train5_preds = model5.predict(X5_train)
val5_preds = model5.predict(X5_val)

In [171]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [172]:
get_clf_eval(y5_train, train5_preds)
get_clf_eval(y5_val, val5_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9970
재현율: 0.9969


In [173]:
preds_5= model5.predict(X5_test)
preds_5

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 3., 3., 2., 2., 3., 2.,
       2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 3., 3., 3., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 3., 3., 3., 3., 3., 2., 2., 2., 2., 2., 2., 3., 2.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 3., 2., 2., 2., 2.,
       2., 3., 3., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 3., 3., 3., 3., 3., 3., 2., 2., 2., 3., 3., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 3., 3., 3., 3., 3., 3., 2., 2.,
       2., 2., 3., 2., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 3., 3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2.

In [174]:
test_may['classification'] = preds_5
test_may

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-05-01,7,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1861,59.11,1.0,2022-05-01 00:00:00,2022,5,1,0
1,2022-05-01,7,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1384,60.83,1.0,2022-05-01 01:00:00,2022,5,1,1
2,2022-05-01,7,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1037,61.69,1.0,2022-05-01 02:00:00,2022,5,1,2
3,2022-05-01,7,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,827,61.33,1.0,2022-05-01 03:00:00,2022,5,1,3
4,2022-05-01,7,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,805,59.60,1.0,2022-05-01 04:00:00,2022,5,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-05-31,2,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,4118,24.73,3.0,2022-05-31 19:00:00,2022,5,31,19
740,2022-05-31,2,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3773,27.61,3.0,2022-05-31 20:00:00,2022,5,31,20
741,2022-05-31,2,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3776,40.47,2.0,2022-05-31 21:00:00,2022,5,31,21
742,2022-05-31,2,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3761,38.33,2.0,2022-05-31 22:00:00,2022,5,31,22


# 6월 데이터 머신러닝

## 데이터 가공

In [175]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [176]:
X6 = train_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [177]:
y6 = train_jun[['datetime', 'classification']]
X6_1 = X6.drop(columns = ['datetime', 'classification'])
y6_1 = X6.datetime

In [178]:
X6_1_scaler = scaler.fit_transform(X6_1)

In [179]:
X6_1_sc = pd.DataFrame(X6_1_scaler)
X6_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6 = pd.concat([y6_1, X6_1_sc], axis = 1)
X6

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-06-01 00:00:00,0.833333,0.623562,0.879317,0.0,0.0,0.0,0.000000
1,2019-06-01 01:00:00,0.833333,0.459273,0.960432,0.0,0.0,0.0,0.043478
2,2019-06-01 02:00:00,0.833333,0.339162,0.985252,0.0,0.0,0.0,0.086957
3,2019-06-01 03:00:00,0.833333,0.254717,0.976079,0.0,0.0,0.0,0.130435
4,2019-06-01 04:00:00,0.833333,0.306259,0.940827,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-06-30 19:00:00,0.333333,0.869075,0.373741,1.0,0.0,1.0,0.826087
2156,2021-06-30 20:00:00,0.333333,0.838932,0.380935,1.0,0.0,1.0,0.869565
2157,2021-06-30 21:00:00,0.333333,0.868385,0.568705,1.0,0.0,1.0,0.913043
2158,2021-06-30 22:00:00,0.333333,0.890244,0.552158,1.0,0.0,1.0,0.956522


In [180]:
X6_test = test_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [181]:
y6_test = test_jun[['datetime', 'classification']]
X6_1_test = X6_test.drop(columns = ['datetime', 'classification'])
y6_1_test = X6_test.datetime

In [182]:
X6_1_test_scaler = scaler.fit_transform(X6_1_test)

In [183]:
X6_1_test_sc = pd.DataFrame(X6_1_test_scaler)
X6_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6_test = pd.concat([y6_1_test, X6_1_test_sc], axis = 1)
X6_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-06-01 00:00:00,0.333333,0.255185,0.784625,0.0,0.0,0.0,0.000000
1,2022-06-01 01:00:00,0.333333,0.115073,0.906141,0.0,0.0,0.0,0.043478
2,2022-06-01 02:00:00,0.333333,0.053869,0.971908,0.0,0.0,0.0,0.086957
3,2022-06-01 03:00:00,0.333333,0.046029,0.958406,0.0,0.0,0.0,0.130435
4,2022-06-01 04:00:00,0.333333,0.159838,0.925740,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-06-30 19:00:00,0.500000,0.939808,0.157448,0.0,0.0,1.0,0.826087
716,2022-06-30 20:00:00,0.500000,0.874305,0.098650,0.0,0.0,1.0,0.869565
717,2022-06-30 21:00:00,0.500000,0.895549,0.228005,0.0,0.0,1.0,0.913043
718,2022-06-30 22:00:00,0.500000,0.910470,0.137631,0.0,0.0,1.0,0.956522


## LightGBM

In [184]:
X6 = X6.drop(columns = ['datetime'])
y6 = y6.drop(columns = ['datetime'])
X6_test = X6_test.drop(columns = ['datetime'])
y6_test = y6_test.drop(columns = ['datetime'])

In [185]:
# study6 = optuna.create_study(direction='maximize',sampler=TPESampler())
# study6.optimize(lambda trial : objectiveLGBM(trial, X6, y6), n_trials=20)
# print('Best trial: score {},\nparams {}'.format(study6.best_trial.value,study6.best_trial.params))

In [186]:
# optuna.visualization.plot_param_importances(study6) # 파라미터 중요도 확인 그래프
# optuna.visualization.plot_optimization_history(study6) # 최적화 과정 시각화

In [187]:
X6_train, X6_val, y6_train, y6_val = train_test_split(X6, y6, test_size = 0.2, random_state = 42)

In [188]:
X6_train.shape, X6_val.shape, y6_train.shape, y6_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [189]:
params6 = {'num_leaves': 506, 'n_estimators': 1946, 'feature_fraction': 0.6922821375462858, 'bagging_fraction': 0.794758081958033, 'bagging_freq': 2, 'min_child_samples': 34}

In [190]:
model = LGBMClassifier(**params6)

In [191]:
model6 = model.fit(X6_train, y6_train,
          eval_set = [(X6_train, y6_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.687946	training's multi_logloss: 0.687946
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.605867	training's multi_logloss: 0.605867
[3]	training's multi_logloss: 0.516444	training's multi_logloss: 0.516444
[4]	training's multi_logloss: 0.452815	training's multi_logloss: 0.452815
[5]	training's multi_logloss: 0.400567	training's multi_logloss: 0.400567
[6]	training's multi_logloss: 0.358179	training's multi_logloss: 0.358179
[7]	training's multi_logloss: 0.324882	training's multi_logloss: 0.324882
[8]	training's multi_logloss: 0.297782	training's multi_logloss: 0.297782
[9]	training's multi_logloss: 0.268021	training's multi_logloss: 0.268021
[10]	training's multi_logloss: 0.247071	training's multi_logloss: 0.247071
[11]	training's multi_logloss: 0.221926	training's multi_logloss: 0.221926
[12]	training's multi_logloss: 0.203237	training's multi_logloss: 0.203237
[13]	training's multi_logloss: 0.186327	traini

In [192]:
train6_preds = model6.predict(X6_train)
val6_preds = model6.predict(X6_val)

In [193]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [194]:
get_clf_eval(y6_train, train6_preds)
get_clf_eval(y6_val, val6_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9985
재현율: 0.9697


In [195]:
preds_6= model6.predict(X6_test)
preds_6

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 3., 3., 2., 2., 2., 2.,
       2., 2., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 2., 3., 2., 2., 2., 2., 3., 3., 3., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 3., 3., 3., 2., 2., 3., 3., 3.,
       3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 3., 3.,
       3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 3., 2., 2., 2., 2., 2., 3., 3., 3., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 2., 2., 2., 2., 2., 3., 3.,
       3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 3., 3., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2.

In [196]:
test_jun['classification'] = preds_6
test_jun

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-06-01,3,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1430,53.94,1.0,2022-06-01 00:00:00,2022,6,1,0
1,2022-06-01,3,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,876,59.52,1.0,2022-06-01 01:00:00,2022,6,1,1
2,2022-06-01,3,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,634,62.54,1.0,2022-06-01 02:00:00,2022,6,1,2
3,2022-06-01,3,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,603,61.92,1.0,2022-06-01 03:00:00,2022,6,1,3
4,2022-06-01,3,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1053,60.42,1.0,2022-06-01 04:00:00,2022,6,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-06-30,4,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,4137,25.14,3.0,2022-06-30 19:00:00,2022,6,30,19
716,2022-06-30,4,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3878,22.44,3.0,2022-06-30 20:00:00,2022,6,30,20
717,2022-06-30,4,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3962,28.38,3.0,2022-06-30 21:00:00,2022,6,30,21
718,2022-06-30,4,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,4021,24.23,3.0,2022-06-30 22:00:00,2022,6,30,22


# 7월 데이터 머신러닝

## 데이터 가공

In [197]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [198]:
X7 = train_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [199]:
y7 = train_jul[['datetime', 'classification']]
X7_1 = X7.drop(columns = ['datetime', 'classification'])
y7_1 = X7.datetime

In [200]:
X7_1_scaler = scaler.fit_transform(X7_1)

In [201]:
X7_1_sc = pd.DataFrame(X7_1_scaler)
X7_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7 = pd.concat([y7_1, X7_1_sc], axis = 1)
X7

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-07-01 00:00:00,0.000000,0.257331,0.946602,0.0,0.0,0.0,0.000000
1,2019-07-01 01:00:00,0.000000,0.162023,0.965273,0.0,0.0,0.0,0.043478
2,2019-07-01 02:00:00,0.000000,0.085533,0.902353,0.0,0.0,0.0,0.086957
3,2019-07-01 03:00:00,0.000000,0.078446,0.883495,0.0,0.0,0.0,0.130435
4,2019-07-01 04:00:00,0.000000,0.202835,0.883869,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-07-31 19:00:00,0.833333,0.657625,0.599515,1.0,0.0,1.0,0.826087
2228,2021-07-31 20:00:00,0.833333,0.607283,0.638350,1.0,0.0,1.0,0.869565
2229,2021-07-31 21:00:00,0.833333,0.616080,0.636669,1.0,0.0,1.0,0.913043
2230,2021-07-31 22:00:00,0.833333,0.537879,0.663368,1.0,0.0,1.0,0.956522


In [202]:
X7_test = test_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [203]:
y7_test = test_jul[['datetime', 'classification']]
X7_1_test = X7_test.drop(columns = ['datetime', 'classification'])
y7_1_test = X7_test.datetime

In [204]:
X7_1_test_scaler = scaler.fit_transform(X7_1_test)

In [205]:
X7_1_test_sc = pd.DataFrame(X7_1_test_scaler)
X7_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7_test = pd.concat([y7_1_test, X7_1_test_sc], axis = 1)
X7_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-07-01 00:00:00,0.666667,0.097983,0.240599,0.0,0.0,0.0,0.000000
1,2022-07-01 01:00:00,0.666667,0.092554,0.871628,0.0,0.0,0.0,0.043478
2,2022-07-01 02:00:00,0.666667,0.044467,0.939752,0.0,0.0,0.0,0.086957
3,2022-07-01 03:00:00,0.666667,0.000000,0.929907,0.0,0.0,0.0,0.130435
4,2022-07-01 04:00:00,0.666667,0.037746,0.877535,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-07-31 19:00:00,1.000000,0.743020,0.543808,0.0,0.0,1.0,0.826087
740,2022-07-31 20:00:00,1.000000,0.709152,0.563694,0.0,0.0,1.0,0.869565
741,2022-07-31 21:00:00,1.000000,0.649948,0.588502,0.0,0.0,1.0,0.913043
742,2022-07-31 22:00:00,1.000000,0.611686,0.593424,0.0,0.0,1.0,0.956522


## LightGBM

In [206]:
X7 = X7.drop(columns = ['datetime'])
y7 = y7.drop(columns = ['datetime'])
X7_test = X7_test.drop(columns = ['datetime'])
y7_test = y7_test.drop(columns = ['datetime'])

In [207]:
# study7 = optuna.create_study(direction='maximize',sampler=TPESampler())
# study7.optimize(lambda trial : objectiveLGBM(trial, X7, y7), n_trials=20)
# print('Best trial: score {},\nparams {}'.format(study7.best_trial.value,study7.best_trial.params))

In [208]:
# optuna.visualization.plot_param_importances(study7) # 파라미터 중요도 확인 그래프
# optuna.visualization.plot_optimization_history(study7) # 최적화 과정 시각화

In [209]:
X7_train, X7_val, y7_train, y7_val = train_test_split(X7, y7, test_size = 0.2, random_state = 42)

In [210]:
X7_train.shape, X7_val.shape, y7_train.shape, y7_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [211]:
params7 = {'num_leaves': 305, 'n_estimators': 1032, 'feature_fraction': 0.5656551814579668, 'bagging_fraction': 0.5847336580981628, 'bagging_freq': 5, 'min_child_samples': 6}

In [212]:
model = LGBMClassifier(**params7)

In [213]:
model7 = model.fit(X7_train, y7_train,
          eval_set = [(X7_train, y7_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.66534	training's multi_logloss: 0.66534
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.56264	training's multi_logloss: 0.56264
[3]	training's multi_logloss: 0.501701	training's multi_logloss: 0.501701
[4]	training's multi_logloss: 0.454249	training's multi_logloss: 0.454249
[5]	training's multi_logloss: 0.400916	training's multi_logloss: 0.400916
[6]	training's multi_logloss: 0.364815	training's multi_logloss: 0.364815
[7]	training's multi_logloss: 0.329607	training's multi_logloss: 0.329607
[8]	training's multi_logloss: 0.301479	training's multi_logloss: 0.301479
[9]	training's multi_logloss: 0.281266	training's multi_logloss: 0.281266
[10]	training's multi_logloss: 0.256254	training's multi_logloss: 0.256254
[11]	training's multi_logloss: 0.23959	training's multi_logloss: 0.23959
[12]	training's multi_logloss: 0.219368	training's multi_logloss: 0.219368
[13]	training's multi_logloss: 0.202056	training's m

In [214]:
train7_preds = model7.predict(X7_train)
val7_preds = model7.predict(X7_val)

In [215]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [216]:
get_clf_eval(y7_train, train7_preds)
get_clf_eval(y7_val, val7_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9722
재현율: 0.9984


In [217]:
preds_7= model7.predict(X7_test)
preds_7

array([3., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 3., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 2., 2., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [218]:
test_jul['classification'] = preds_7
test_jul

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-07-01,5,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,891,24.97,3.0,2022-07-01 00:00:00,2022,7,1,0
1,2022-07-01,5,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,870,57.02,1.0,2022-07-01 01:00:00,2022,7,1,1
2,2022-07-01,5,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,684,60.48,1.0,2022-07-01 02:00:00,2022,7,1,2
3,2022-07-01,5,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,512,59.98,1.0,2022-07-01 03:00:00,2022,7,1,3
4,2022-07-01,5,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,658,57.32,1.0,2022-07-01 04:00:00,2022,7,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-07-31,7,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3386,40.37,2.0,2022-07-31 19:00:00,2022,7,31,19
740,2022-07-31,7,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3255,41.38,2.0,2022-07-31 20:00:00,2022,7,31,20
741,2022-07-31,7,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3026,42.64,2.0,2022-07-31 21:00:00,2022,7,31,21
742,2022-07-31,7,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,2878,42.89,2.0,2022-07-31 22:00:00,2022,7,31,22


# 8월 데이터 머신러닝

## 데이터 가공

In [219]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [220]:
X8 = train_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [221]:
y8 = train_aug[['datetime', 'classification']]
X8_1 = X8.drop(columns = ['datetime', 'classification'])
y8_1 = X8.datetime

In [222]:
X8_1_scaler = scaler.fit_transform(X8_1)

In [223]:
X8_1_sc = pd.DataFrame(X8_1_scaler)
X8_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8 = pd.concat([y8_1, X8_1_sc], axis = 1)
X8

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-08-01 00:00:00,0.500000,0.373988,0.831510,0.0,0.0,0.0,0.000000
1,2019-08-01 01:00:00,0.500000,0.228745,0.795842,0.0,0.0,0.0,0.043478
2,2019-08-01 02:00:00,0.500000,0.138411,0.851641,0.0,0.0,0.0,0.086957
3,2019-08-01 03:00:00,0.500000,0.095901,0.828884,0.0,0.0,0.0,0.130435
4,2019-08-01 04:00:00,0.500000,0.136134,0.838731,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-08-31 19:00:00,0.166667,0.864879,0.117943,1.0,0.0,1.0,0.826087
2228,2021-08-31 20:00:00,0.166667,0.944585,0.261707,1.0,0.0,1.0,0.869565
2229,2021-08-31 21:00:00,0.166667,0.742915,0.468709,1.0,0.0,1.0,0.913043
2230,2021-08-31 22:00:00,0.166667,0.816549,0.452516,1.0,0.0,1.0,0.956522


In [224]:
X8_test = test_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [225]:
y8_test = test_aug[['datetime', 'classification']]
X8_1_test = X8_test.drop(columns = ['datetime', 'classification'])
y8_1_test = X8_test.datetime

In [226]:
X8_1_test_scaler = scaler.fit_transform(X8_1_test)

In [227]:
X8_1_test_sc = pd.DataFrame(X8_1_test_scaler)
X8_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8_test = pd.concat([y8_1_test, X8_1_test_sc], axis = 1)
X8_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-08-01 00:00:00,0.000000,0.178213,0.832074,0.0,0.0,0.0,0.000000
1,2022-08-01 01:00:00,0.000000,0.057731,0.885387,0.0,0.0,0.0,0.043478
2,2022-08-01 02:00:00,0.000000,0.021586,0.878912,0.0,0.0,0.0,0.086957
3,2022-08-01 03:00:00,0.000000,0.000000,0.891863,0.0,0.0,0.0,0.130435
4,2022-08-01 04:00:00,0.000000,0.133534,0.786100,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-08-31 19:00:00,0.333333,0.834086,0.376646,0.0,0.0,1.0,0.826087
740,2022-08-31 20:00:00,0.333333,0.920181,0.336067,0.0,0.0,1.0,0.869565
741,2022-08-31 21:00:00,0.333333,0.841365,0.526225,0.0,0.0,1.0,0.913043
742,2022-08-31 22:00:00,0.333333,0.791165,0.669545,0.0,0.0,1.0,0.956522


## LightGBM

In [228]:
X8 = X8.drop(columns = ['datetime'])
y8 = y8.drop(columns = ['datetime'])
X8_test = X8_test.drop(columns = ['datetime'])
y8_test = y8_test.drop(columns = ['datetime'])

In [229]:
study8 = optuna.create_study(direction='maximize',sampler=TPESampler())
study8.optimize(lambda trial : objectiveLGBM(trial, X8, y8), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study8.best_trial.value,study8.best_trial.params))

[32m[I 2022-12-01 01:23:11,985][0m A new study created in memory with name: no-name-0dd39581-d4f1-4686-b71a-0bd56c30596c[0m
[32m[I 2022-12-01 01:23:13,633][0m Trial 0 finished with value: 0.9894419306184012 and parameters: {'num_leaves': 267, 'n_estimators': 884, 'feature_fraction': 0.9922581693211333, 'bagging_fraction': 0.5616244847929748, 'bagging_freq': 7, 'min_child_samples': 57}. Best is trial 0 with value: 0.9894419306184012.[0m
[32m[I 2022-12-01 01:23:18,783][0m Trial 1 finished with value: 0.7719298245614036 and parameters: {'num_leaves': 217, 'n_estimators': 2833, 'feature_fraction': 0.6518469339167597, 'bagging_fraction': 0.7230268514228073, 'bagging_freq': 7, 'min_child_samples': 59}. Best is trial 0 with value: 0.9894419306184012.[0m
[32m[I 2022-12-01 01:23:23,622][0m Trial 2 finished with value: 0.9909147236702092 and parameters: {'num_leaves': 236, 'n_estimators': 2043, 'feature_fraction': 0.6482804465300751, 'bagging_fraction': 0.874049700278427, 'bagging_fre

Best trial: score 1.0,
params {'num_leaves': 416, 'n_estimators': 1245, 'feature_fraction': 0.8321763167497026, 'bagging_fraction': 0.9871784777647619, 'bagging_freq': 4, 'min_child_samples': 13}


In [230]:
optuna.visualization.plot_param_importances(study8) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study8) # 최적화 과정 시각화

In [231]:
X8_train, X8_val, y8_train, y8_val = train_test_split(X8, y8, test_size = 0.2, random_state = 42)

In [232]:
X8_train.shape, X8_val.shape, y8_train.shape, y8_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [233]:
params8 = {'num_leaves': 458, 'n_estimators': 1283, 'feature_fraction': 0.8026019641266068, 'bagging_fraction': 0.9373292888765477, 'bagging_freq': 4, 'min_child_samples': 5}

In [234]:
model = LGBMClassifier(**params8)

In [235]:
model8 = model.fit(X8_train, y8_train,
          eval_set = [(X8_train, y8_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.621499	training's multi_logloss: 0.621499
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.540819	training's multi_logloss: 0.540819
[3]	training's multi_logloss: 0.473237	training's multi_logloss: 0.473237
[4]	training's multi_logloss: 0.416828	training's multi_logloss: 0.416828
[5]	training's multi_logloss: 0.368451	training's multi_logloss: 0.368451
[6]	training's multi_logloss: 0.326649	training's multi_logloss: 0.326649
[7]	training's multi_logloss: 0.293588	training's multi_logloss: 0.293588
[8]	training's multi_logloss: 0.264521	training's multi_logloss: 0.264521
[9]	training's multi_logloss: 0.236112	training's multi_logloss: 0.236112
[10]	training's multi_logloss: 0.214379	training's multi_logloss: 0.214379
[11]	training's multi_logloss: 0.19205	training's multi_logloss: 0.19205
[12]	training's multi_logloss: 0.173823	training's multi_logloss: 0.173823
[13]	training's multi_logloss: 0.15755	training'

In [236]:
train8_preds = model8.predict(X8_train)
val8_preds = model8.predict(X8_val)

In [237]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [238]:
get_clf_eval(y8_train, train8_preds)
get_clf_eval(y8_val, val8_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9984
재현율: 0.9985


In [239]:
preds_8= model8.predict(X8_test)
preds_8

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2.

In [253]:
test_aug['classification'] = preds_8
test_aug

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-08-01,1,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1235,54.32,1.0,2022-08-01 00:00:00,2022,8,1,0
1,2022-08-01,1,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,755,56.79,1.0,2022-08-01 01:00:00,2022,8,1,1
2,2022-08-01,1,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,611,56.49,1.0,2022-08-01 02:00:00,2022,8,1,2
3,2022-08-01,1,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,525,57.09,1.0,2022-08-01 03:00:00,2022,8,1,3
4,2022-08-01,1,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1057,52.19,1.0,2022-08-01 04:00:00,2022,8,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-08-31,3,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3848,33.22,2.0,2022-08-31 19:00:00,2022,8,31,19
740,2022-08-31,3,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,4191,31.34,2.0,2022-08-31 20:00:00,2022,8,31,20
741,2022-08-31,3,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3877,40.15,2.0,2022-08-31 21:00:00,2022,8,31,21
742,2022-08-31,3,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3677,46.79,1.0,2022-08-31 22:00:00,2022,8,31,22


# 9월 데이터 머신러닝

## 데이터 가공

In [240]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [241]:
X9 = train_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [242]:
y9 = train_sep[['datetime', 'classification']]
X9_1 = X9.drop(columns = ['datetime', 'classification'])
y9_1 = X9.datetime

In [243]:
X9_1_scaler = scaler.fit_transform(X9_1)

In [244]:
X9_1_sc = pd.DataFrame(X9_1_scaler)
X9_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9 = pd.concat([y9_1, X9_1_sc], axis = 1)
X9

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-09-01 00:00:00,1.0,0.448649,0.833643,0.0,0.0,0.0,0.000000
1,2019-09-01 01:00:00,1.0,0.329260,0.943447,0.0,0.0,0.0,0.043478
2,2019-09-01 02:00:00,1.0,0.212221,0.939319,0.0,0.0,0.0,0.086957
3,2019-09-01 03:00:00,1.0,0.169448,0.938906,0.0,0.0,0.0,0.130435
4,2019-09-01 04:00:00,1.0,0.173443,0.918060,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-09-30 19:00:00,0.5,0.788014,0.152735,1.0,0.0,1.0,0.826087
2156,2021-09-30 20:00:00,0.5,0.818096,0.383282,1.0,0.0,1.0,0.869565
2157,2021-09-30 21:00:00,0.5,0.829142,0.499897,1.0,0.0,1.0,0.913043
2158,2021-09-30 22:00:00,0.5,0.724794,0.616099,1.0,0.0,1.0,0.956522


In [245]:
X9_test = test_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [246]:
y9_test = test_sep[['datetime', 'classification']]
X9_1_test = X9_test.drop(columns = ['datetime', 'classification'])
y9_1_test = X9_test.datetime

In [247]:
X9_1_test_scaler = scaler.fit_transform(X9_1_test)

In [248]:
X9_1_test_sc = pd.DataFrame(X9_1_test_scaler)
X9_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9_test = pd.concat([y9_1_test, X9_1_test_sc], axis = 1)
X9_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-09-01 00:00:00,0.500000,0.365259,0.928294,0.0,0.0,0.0,0.000000
1,2022-09-01 01:00:00,0.500000,0.221971,0.966770,0.0,0.0,0.0,0.043478
2,2022-09-01 02:00:00,0.500000,0.128959,0.969880,0.0,0.0,0.0,0.086957
3,2022-09-01 03:00:00,0.500000,0.104575,0.913331,0.0,0.0,0.0,0.130435
4,2022-09-01 04:00:00,0.500000,0.232529,0.891955,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-09-30 19:00:00,0.666667,0.784816,0.223086,0.0,0.0,1.0,0.826087
716,2022-09-30 20:00:00,0.666667,0.677476,0.327439,0.0,0.0,1.0,0.869565
717,2022-09-30 21:00:00,0.666667,0.736551,0.521570,0.0,0.0,1.0,0.913043
718,2022-09-30 22:00:00,0.666667,0.852438,0.574038,0.0,0.0,1.0,0.956522


## LightGBM

In [249]:
X9 = X9.drop(columns = ['datetime'])
y9 = y9.drop(columns = ['datetime'])
X9_test = X9_test.drop(columns = ['datetime'])
y9_test = y9_test.drop(columns = ['datetime'])

In [251]:
study9 = optuna.create_study(direction='maximize',sampler=TPESampler())
study9.optimize(lambda trial : objectiveLGBM(trial, X9, y9), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study9.best_trial.value,study9.best_trial.params))

[32m[I 2022-12-01 01:30:22,702][0m A new study created in memory with name: no-name-fff8d16f-67dd-4414-ab5c-2f1704a8bbc1[0m
[32m[I 2022-12-01 01:30:27,646][0m Trial 0 finished with value: 0.8715283861619373 and parameters: {'num_leaves': 248, 'n_estimators': 2741, 'feature_fraction': 0.4850479619380983, 'bagging_fraction': 0.6495389760370698, 'bagging_freq': 7, 'min_child_samples': 51}. Best is trial 0 with value: 0.8715283861619373.[0m
[32m[I 2022-12-01 01:30:29,961][0m Trial 1 finished with value: 0.9824561403508771 and parameters: {'num_leaves': 259, 'n_estimators': 729, 'feature_fraction': 0.5703579778381733, 'bagging_fraction': 0.754508918311433, 'bagging_freq': 2, 'min_child_samples': 25}. Best is trial 1 with value: 0.9824561403508771.[0m
[32m[I 2022-12-01 01:30:33,831][0m Trial 2 finished with value: 0.9111111111111111 and parameters: {'num_leaves': 127, 'n_estimators': 2519, 'feature_fraction': 0.8923087994045691, 'bagging_fraction': 0.4718223923738958, 'bagging_fre

Best trial: score 1.0,
params {'num_leaves': 45, 'n_estimators': 1377, 'feature_fraction': 0.5964536804621, 'bagging_fraction': 0.8328653929259835, 'bagging_freq': 5, 'min_child_samples': 6}


In [252]:
optuna.visualization.plot_param_importances(study9) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study9) # 최적화 과정 시각화

In [254]:
X9_train, X9_val, y9_train, y9_val = train_test_split(X9, y9, test_size = 0.2, random_state = 42)

In [255]:
X9_train.shape, X9_val.shape, y9_train.shape, y9_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [256]:
params9 = {'num_leaves': 45, 'n_estimators': 1377, 'feature_fraction': 0.5964536804621, 'bagging_fraction': 0.8328653929259835, 'bagging_freq': 5, 'min_child_samples': 6}

In [257]:
model = LGBMClassifier(**params9)

In [258]:
model9 = model.fit(X9_train, y9_train,
          eval_set = [(X9_train, y9_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.736017	training's multi_logloss: 0.736017
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.617707	training's multi_logloss: 0.617707
[3]	training's multi_logloss: 0.544493	training's multi_logloss: 0.544493
[4]	training's multi_logloss: 0.488513	training's multi_logloss: 0.488513
[5]	training's multi_logloss: 0.431121	training's multi_logloss: 0.431121
[6]	training's multi_logloss: 0.39064	training's multi_logloss: 0.39064
[7]	training's multi_logloss: 0.351946	training's multi_logloss: 0.351946
[8]	training's multi_logloss: 0.320793	training's multi_logloss: 0.320793
[9]	training's multi_logloss: 0.298521	training's multi_logloss: 0.298521
[10]	training's multi_logloss: 0.271694	training's multi_logloss: 0.271694
[11]	training's multi_logloss: 0.253698	training's multi_logloss: 0.253698
[12]	training's multi_logloss: 0.232197	training's multi_logloss: 0.232197
[13]	training's multi_logloss: 0.213303	training

In [259]:
train9_preds = model9.predict(X9_train)
val9_preds = model9.predict(X9_val)

In [260]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [261]:
get_clf_eval(y9_train, train9_preds)
get_clf_eval(y9_val, val9_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9985
재현율: 0.9825


In [262]:
preds_9= model9.predict(X9_test)
preds_9

array([1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2.

In [263]:
test_sep['classification'] = preds_9
test_sep

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-09-01,4,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1991,58.76,1.0,2022-09-01 00:00:00,2022,9,1,0
1,2022-09-01,4,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1421,60.74,1.0,2022-09-01 01:00:00,2022,9,1,1
2,2022-09-01,4,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1051,60.90,1.0,2022-09-01 02:00:00,2022,9,1,2
3,2022-09-01,4,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,954,57.99,1.0,2022-09-01 03:00:00,2022,9,1,3
4,2022-09-01,4,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1463,56.89,1.0,2022-09-01 04:00:00,2022,9,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-09-30,5,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3660,22.47,2.0,2022-09-30 19:00:00,2022,9,30,19
716,2022-09-30,5,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3233,27.84,2.0,2022-09-30 20:00:00,2022,9,30,20
717,2022-09-30,5,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3468,37.83,2.0,2022-09-30 21:00:00,2022,9,30,21
718,2022-09-30,5,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3929,40.53,2.0,2022-09-30 22:00:00,2022,9,30,22


# 10월 데이터 머신러닝

## 데이터 가공

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [264]:
X10 = train_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [265]:
y10 = train_oct[['datetime', 'classification']]
X10_1 = X10.drop(columns = ['datetime', 'classification'])
y10_1 = X10.datetime

In [266]:
X10_1_scaler = scaler.fit_transform(X10_1)

In [267]:
X10_1_sc = pd.DataFrame(X10_1_scaler)
X10_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10 = pd.concat([y10_1, X10_1_sc], axis = 1)
X10

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-10-01 00:00:00,0.166667,0.362506,0.944316,0.0,0.0,0.0,0.000000
1,2019-10-01 01:00:00,0.166667,0.225256,0.975271,0.0,0.0,0.0,0.043478
2,2019-10-01 02:00:00,0.166667,0.110190,0.967977,0.0,0.0,0.0,0.086957
3,2019-10-01 03:00:00,0.166667,0.091663,0.916207,0.0,0.0,0.0,0.130435
4,2019-10-01 04:00:00,0.166667,0.171867,0.946807,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-10-31 19:00:00,0.833333,0.745490,0.509340,1.0,0.0,1.0,0.826087
1484,2020-10-31 20:00:00,0.833333,0.714286,0.601316,1.0,0.0,1.0,0.869565
1485,2020-10-31 21:00:00,0.833333,0.719405,0.610390,1.0,0.0,1.0,0.913043
1486,2020-10-31 22:00:00,0.833333,0.714773,0.653976,1.0,0.0,1.0,0.956522


In [268]:
X10_test = test_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [269]:
y10_test = test_oct[['datetime', 'classification']]
X10_1_test = X10_test.drop(columns = ['datetime', 'classification'])
y10_1_test = X10_test.datetime

In [270]:
X10_1_test_scaler = scaler.fit_transform(X10_1_test)

In [271]:
X10_1_test_sc = pd.DataFrame(X10_1_test_scaler)
X10_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10_test = pd.concat([y10_1_test, X10_1_test_sc], axis = 1)
X10_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-10-01 00:00:00,0.666667,0.209291,0.948775,0.0,0.0,0.0,0.000000
1,2021-10-01 01:00:00,0.666667,0.114735,0.958426,0.0,0.0,0.0,0.043478
2,2021-10-01 02:00:00,0.666667,0.063351,0.933185,0.0,0.0,0.0,0.086957
3,2021-10-01 03:00:00,0.666667,0.050680,0.890126,0.0,0.0,0.0,0.130435
4,2021-10-01 04:00:00,0.666667,0.171046,0.871566,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-10-31 19:00:00,1.000000,0.803144,0.400396,0.0,0.0,1.0,0.826087
740,2021-10-31 20:00:00,1.000000,0.678085,0.305122,0.0,0.0,1.0,0.869565
741,2021-10-31 21:00:00,1.000000,0.701314,0.360802,0.0,0.0,1.0,0.913043
742,2021-10-31 22:00:00,1.000000,0.809714,0.633259,0.0,0.0,1.0,0.956522


## LightGBM

In [272]:
X10 = X10.drop(columns = ['datetime'])
y10 = y10.drop(columns = ['datetime'])
X10_test = X10_test.drop(columns = ['datetime'])
y10_test = y10_test.drop(columns = ['datetime'])

In [273]:
study10 = optuna.create_study(direction='maximize',sampler=TPESampler())
study10.optimize(lambda trial : objectiveLGBM(trial, X10, y10), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study10.best_trial.value,study10.best_trial.params))

[32m[I 2022-12-01 01:37:37,275][0m A new study created in memory with name: no-name-36a4a45f-12fb-40a0-a238-88aae6bd4f31[0m
[32m[I 2022-12-01 01:37:39,969][0m Trial 0 finished with value: 0.9185797827903089 and parameters: {'num_leaves': 279, 'n_estimators': 2059, 'feature_fraction': 0.5437415842289153, 'bagging_fraction': 0.7447085977741574, 'bagging_freq': 4, 'min_child_samples': 75}. Best is trial 0 with value: 0.9185797827903089.[0m
[32m[I 2022-12-01 01:37:50,036][0m Trial 1 finished with value: 0.9374215528061681 and parameters: {'num_leaves': 186, 'n_estimators': 2057, 'feature_fraction': 0.8283158548910199, 'bagging_fraction': 0.7788695071165546, 'bagging_freq': 7, 'min_child_samples': 9}. Best is trial 1 with value: 0.9374215528061681.[0m
[32m[I 2022-12-01 01:37:56,075][0m Trial 2 finished with value: 0.938983488132095 and parameters: {'num_leaves': 493, 'n_estimators': 1428, 'feature_fraction': 0.6427069599603247, 'bagging_fraction': 0.9949223447875365, 'bagging_fre

Best trial: score 1.0,
params {'num_leaves': 366, 'n_estimators': 2999, 'feature_fraction': 0.8897902219990725, 'bagging_fraction': 0.8917496781058201, 'bagging_freq': 1, 'min_child_samples': 49}


In [274]:
optuna.visualization.plot_param_importances(study10) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study10) # 최적화 과정 시각화

In [278]:
X10_train, X10_val, y10_train, y10_val = train_test_split(X10, y10, test_size = 0.2, random_state = 42)

In [279]:
X10_train.shape, X10_val.shape, y10_train.shape, y10_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [280]:
params10 = {'num_leaves': 366, 'n_estimators': 2999, 'feature_fraction': 0.8897902219990725, 'bagging_fraction': 0.8917496781058201, 'bagging_freq': 1, 'min_child_samples': 49}

In [281]:
model = LGBMClassifier(**params10)

In [282]:
model10 = model.fit(X10_train, y10_train,
          eval_set = [(X10_train, y10_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.740744	training's multi_logloss: 0.740744
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.656285	training's multi_logloss: 0.656285
[3]	training's multi_logloss: 0.569795	training's multi_logloss: 0.569795
[4]	training's multi_logloss: 0.497086	training's multi_logloss: 0.497086
[5]	training's multi_logloss: 0.438638	training's multi_logloss: 0.438638
[6]	training's multi_logloss: 0.393685	training's multi_logloss: 0.393685
[7]	training's multi_logloss: 0.357724	training's multi_logloss: 0.357724
[8]	training's multi_logloss: 0.326061	training's multi_logloss: 0.326061
[9]	training's multi_logloss: 0.294601	training's multi_logloss: 0.294601
[10]	training's multi_logloss: 0.263869	training's multi_logloss: 0.263869
[11]	training's multi_logloss: 0.237186	training's multi_logloss: 0.237186
[12]	training's multi_logloss: 0.217738	training's multi_logloss: 0.217738
[13]	training's multi_logloss: 0.196309	traini

In [286]:
train10_preds = model10.predict(X10_train)
val10_preds = model10.predict(X10_val)

In [287]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [288]:
get_clf_eval(y10_train, train10_preds)
get_clf_eval(y10_val, val10_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9974
재현율: 0.9762


In [290]:
preds_10= model10.predict(X10_test)
preds_10

array([1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 3., 2., 2., 2., 2., 2.,
       3., 3., 3., 3., 2., 3., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       3., 3., 3., 3., 3., 3., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 2., 2., 3., 3., 3., 3., 3., 3., 2., 2., 3.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 2., 3., 3., 3., 3., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 3., 2., 2., 2., 2., 2., 2.,
       3., 3., 3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 3., 2., 2., 2., 2., 2., 2., 3., 3., 3., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 2., 2., 3., 3., 2., 3., 3., 3., 3., 3., 2.,
       3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 3., 3.,
       3., 3., 3., 3., 2., 2., 2., 2., 3., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2.

In [291]:
test_oct['classification'] = preds_10
test_oct

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1393,59.67,1.0,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,990,60.06,1.0,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,771,59.04,1.0,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,717,57.30,1.0,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1230,56.55,1.0,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-10-31,7,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3924,37.51,2.0,2021-10-31 19:00:00,2021,10,31,19
740,2021-10-31,7,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3391,33.66,2.0,2021-10-31 20:00:00,2021,10,31,20
741,2021-10-31,7,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3490,35.91,2.0,2021-10-31 21:00:00,2021,10,31,21
742,2021-10-31,7,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3952,46.92,2.0,2021-10-31 22:00:00,2021,10,31,22


# 11월 데이터 머신러닝

## 데이터 가공

In [292]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [293]:
X11 = train_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [294]:
y11 = train_nov[['datetime', 'classification']]
X11_1 = X11.drop(columns = ['datetime', 'classification'])
y11_1 = X11.datetime

In [295]:
X11_1_scaler = scaler.fit_transform(X11_1)

In [296]:
X11_1_sc = pd.DataFrame(X11_1_scaler)
X11_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11 = pd.concat([y11_1, X11_1_sc], axis = 1)
X11

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-11-01 00:00:00,0.666667,0.295366,0.471619,0.0,0.0,0.0,0.000000
1,2019-11-01 01:00:00,0.666667,0.183171,0.000000,0.0,0.0,0.0,0.043478
2,2019-11-01 02:00:00,0.666667,0.129512,0.493726,0.0,0.0,0.0,0.086957
3,2019-11-01 03:00:00,0.666667,0.074390,0.753635,0.0,0.0,0.0,0.130435
4,2019-11-01 04:00:00,0.666667,0.134878,0.855407,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1435,2020-11-30 19:00:00,0.000000,0.867073,0.246365,1.0,0.0,1.0,0.826087
1436,2020-11-30 20:00:00,0.000000,0.923171,0.375822,1.0,0.0,1.0,0.869565
1437,2020-11-30 21:00:00,0.000000,0.877317,0.478590,1.0,0.0,1.0,0.913043
1438,2020-11-30 22:00:00,0.000000,0.597073,0.618004,1.0,0.0,1.0,0.956522


In [297]:
X11_test = test_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [298]:
y11_test = test_nov[['datetime', 'classification']]
X11_1_test = X11_test.drop(columns = ['datetime', 'classification'])
y11_1_test = X11_test.datetime

In [299]:
X11_1_test_scaler = scaler.fit_transform(X11_1_test)

In [300]:
X11_1_test_sc = pd.DataFrame(X11_1_test_scaler)
X11_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11_test = pd.concat([y11_1_test, X11_1_test_sc], axis = 1)
X11_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-11-01 00:00:00,0.000000,0.156922,0.894522,0.0,0.0,0.0,0.000000
1,2021-11-01 01:00:00,0.000000,0.058972,0.896560,0.0,0.0,0.0,0.043478
2,2021-11-01 02:00:00,0.000000,0.008859,0.899049,0.0,0.0,0.0,0.086957
3,2021-11-01 03:00:00,0.000000,0.008099,0.877999,0.0,0.0,0.0,0.130435
4,2021-11-01 04:00:00,0.000000,0.133637,0.879357,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2021-11-30 19:00:00,0.166667,0.812959,0.316433,0.0,0.0,1.0,0.826087
716,2021-11-30 20:00:00,0.166667,0.890661,0.528067,0.0,0.0,1.0,0.869565
717,2021-11-30 21:00:00,0.166667,0.876993,0.647804,0.0,0.0,1.0,0.913043
718,2021-11-30 22:00:00,0.166667,0.602885,0.714124,0.0,0.0,1.0,0.956522


## LightGBM

In [301]:
X11 = X11.drop(columns = ['datetime'])
y11 = y11.drop(columns = ['datetime'])
X11_test = X11_test.drop(columns = ['datetime'])
y11_test = y11_test.drop(columns = ['datetime'])

In [302]:
study11 = optuna.create_study(direction='maximize',sampler=TPESampler())
study11.optimize(lambda trial : objectiveLGBM(trial, X11, y11), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study11.best_trial.value,study11.best_trial.params))

[32m[I 2022-12-01 01:43:39,944][0m A new study created in memory with name: no-name-0306e8ae-0565-4b89-a514-53f45ae06176[0m
[32m[I 2022-12-01 01:43:44,002][0m Trial 0 finished with value: 0.924066924066924 and parameters: {'num_leaves': 369, 'n_estimators': 2383, 'feature_fraction': 0.9423290691285752, 'bagging_fraction': 0.7343146475841473, 'bagging_freq': 2, 'min_child_samples': 52}. Best is trial 0 with value: 0.924066924066924.[0m
[32m[I 2022-12-01 01:43:52,032][0m Trial 1 finished with value: 0.8693660745384882 and parameters: {'num_leaves': 476, 'n_estimators': 2448, 'feature_fraction': 0.6398090140967945, 'bagging_fraction': 0.6690533161706156, 'bagging_freq': 1, 'min_child_samples': 76}. Best is trial 0 with value: 0.924066924066924.[0m
[32m[I 2022-12-01 01:43:55,633][0m Trial 2 finished with value: 0.8687546607009694 and parameters: {'num_leaves': 270, 'n_estimators': 1390, 'feature_fraction': 0.662154628022262, 'bagging_fraction': 0.7108583900595373, 'bagging_freq'

Best trial: score 1.0,
params {'num_leaves': 16, 'n_estimators': 853, 'feature_fraction': 0.42133408856644233, 'bagging_fraction': 0.5729709364685253, 'bagging_freq': 1, 'min_child_samples': 6}


In [303]:
optuna.visualization.plot_param_importances(study11) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study11) # 최적화 과정 시각화

In [304]:
X11_train, X11_val, y11_train, y11_val = train_test_split(X11, y11, test_size = 0.2, random_state = 42)

In [305]:
X11_train.shape, X11_val.shape, y11_train.shape, y11_val.shape

((1152, 7), (288, 7), (1152, 1), (288, 1))

In [306]:
params11 = {'num_leaves': 16, 'n_estimators': 853, 'feature_fraction': 0.42133408856644233, 'bagging_fraction': 0.5729709364685253, 'bagging_freq': 1, 'min_child_samples': 6}

In [307]:
model = LGBMClassifier(**params11)

In [308]:
model11 = model.fit(X11_train, y11_train,
          eval_set = [(X11_train, y11_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.756436	training's multi_logloss: 0.756436
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.720356	training's multi_logloss: 0.720356
[3]	training's multi_logloss: 0.640579	training's multi_logloss: 0.640579
[4]	training's multi_logloss: 0.631456	training's multi_logloss: 0.631456
[5]	training's multi_logloss: 0.575796	training's multi_logloss: 0.575796
[6]	training's multi_logloss: 0.524493	training's multi_logloss: 0.524493
[7]	training's multi_logloss: 0.476911	training's multi_logloss: 0.476911
[8]	training's multi_logloss: 0.435001	training's multi_logloss: 0.435001
[9]	training's multi_logloss: 0.402518	training's multi_logloss: 0.402518
[10]	training's multi_logloss: 0.370078	training's multi_logloss: 0.370078
[11]	training's multi_logloss: 0.350785	training's multi_logloss: 0.350785
[12]	training's multi_logloss: 0.32942	training's multi_logloss: 0.32942
[13]	training's multi_logloss: 0.302716	training

In [316]:
train11_preds = model11.predict(X11_train)
val11_preds = model11.predict(X11_val)

In [310]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [317]:
get_clf_eval(y11_train, train11_preds)
get_clf_eval(y11_val, val11_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [318]:
preds_11= model11.predict(X11_test)
preds_11

array([1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3.,
       3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       3., 3., 2., 2., 2., 2., 2., 2., 3., 3., 3., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 3., 3., 3., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 3., 3., 3., 3., 3.,
       2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 2., 2., 2., 2., 2., 2., 3.,
       3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2.

In [319]:
test_nov['classification'] = preds_11
test_nov

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-11-01,1,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1075,57.82,1.0,2021-11-01 00:00:00,2021,11,1,0
1,2021-11-01,1,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,688,57.91,1.0,2021-11-01 01:00:00,2021,11,1,1
2,2021-11-01,1,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,490,58.02,1.0,2021-11-01 02:00:00,2021,11,1,2
3,2021-11-01,1,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,487,57.09,1.0,2021-11-01 03:00:00,2021,11,1,3
4,2021-11-01,1,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,983,57.15,1.0,2021-11-01 04:00:00,2021,11,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2021-11-30,2,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3667,32.28,2.0,2021-11-30 19:00:00,2021,11,30,19
716,2021-11-30,2,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3974,41.63,2.0,2021-11-30 20:00:00,2021,11,30,20
717,2021-11-30,2,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3920,46.92,1.0,2021-11-30 21:00:00,2021,11,30,21
718,2021-11-30,2,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,2837,49.85,1.0,2021-11-30 22:00:00,2021,11,30,22


# 12월 데이터 머신러닝

## 데이터 가공

In [320]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [321]:
X12 = train_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [322]:
y12 = train_dec[['datetime', 'classification']]
X12_1 = X12.drop(columns = ['datetime', 'classification'])
y12_1 = X12.datetime

In [323]:
X12_1_scaler = scaler.fit_transform(X12_1)

In [324]:
X12_1_sc = pd.DataFrame(X12_1_scaler)
X12_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12 = pd.concat([y12_1, X12_1_sc], axis = 1)
X12

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-12-01 00:00:00,1.0,0.496891,0.842117,0.0,0.0,0.0,0.000000
1,2019-12-01 01:00:00,1.0,0.321377,0.936100,0.0,0.0,0.0,0.043478
2,2019-12-01 02:00:00,1.0,0.175514,0.983970,0.0,0.0,0.0,0.086957
3,2019-12-01 03:00:00,1.0,0.140363,0.946201,0.0,0.0,0.0,0.130435
4,2019-12-01 04:00:00,1.0,0.106648,0.938296,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-12-31 19:00:00,0.5,0.783835,0.057971,1.0,0.0,1.0,0.826087
1484,2020-12-31 20:00:00,0.5,0.777379,0.257356,1.0,0.0,1.0,0.869565
1485,2020-12-31 21:00:00,0.5,0.819464,0.476724,1.0,0.0,1.0,0.913043
1486,2020-12-31 22:00:00,0.5,0.640363,0.572025,1.0,0.0,1.0,0.956522


In [325]:
X12_test = test_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [326]:
y12_test = test_dec[['datetime', 'classification']]
X12_1_test = X12_test.drop(columns = ['datetime', 'classification'])
y12_1_test = X12_test.datetime

In [327]:
X12_1_test_scaler = scaler.fit_transform(X12_1_test)

In [328]:
X12_1_test_sc = pd.DataFrame(X12_1_test_scaler)
X12_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12_test = pd.concat([y12_1_test, X12_1_test_sc], axis = 1)
X12_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-12-01 00:00:00,0.333333,0.381201,0.912365,0.0,0.0,0.0,0.000000
1,2021-12-01 01:00:00,0.333333,0.238073,0.929800,0.0,0.0,0.0,0.043478
2,2021-12-01 02:00:00,0.333333,0.124377,0.916953,0.0,0.0,0.0,0.086957
3,2021-12-01 03:00:00,0.333333,0.077380,0.849966,0.0,0.0,0.0,0.130435
4,2021-12-01 04:00:00,0.333333,0.219084,0.849507,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-12-31 19:00:00,0.666667,0.722288,0.056664,0.0,0.0,1.0,0.826087
740,2021-12-31 20:00:00,0.666667,0.774745,0.264969,0.0,0.0,1.0,0.869565
741,2021-12-31 21:00:00,0.666667,0.837883,0.494150,0.0,0.0,1.0,0.913043
742,2021-12-31 22:00:00,0.666667,0.634465,0.593714,0.0,0.0,1.0,0.956522


## LightGBM

In [329]:
X12 = X12.drop(columns = ['datetime'])
y12 = y12.drop(columns = ['datetime'])
X12_test = X12_test.drop(columns = ['datetime'])
y12_test = y12_test.drop(columns = ['datetime'])

In [330]:
study12 = optuna.create_study(direction='maximize',sampler=TPESampler())
study12.optimize(lambda trial : objectiveLGBM(trial, X12, y12), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study12.best_trial.value,study12.best_trial.params))

[32m[I 2022-12-01 01:49:20,586][0m A new study created in memory with name: no-name-01af5b7a-dc30-49b4-8f50-0f0bcef20086[0m
[32m[I 2022-12-01 01:49:22,869][0m Trial 0 finished with value: 0.9743589743589745 and parameters: {'num_leaves': 421, 'n_estimators': 845, 'feature_fraction': 0.692533571092421, 'bagging_fraction': 0.5526888740902236, 'bagging_freq': 2, 'min_child_samples': 26}. Best is trial 0 with value: 0.9743589743589745.[0m
[32m[I 2022-12-01 01:49:29,625][0m Trial 1 finished with value: 0.9520933977455717 and parameters: {'num_leaves': 256, 'n_estimators': 2569, 'feature_fraction': 0.7862905639630257, 'bagging_fraction': 0.8545997887129484, 'bagging_freq': 4, 'min_child_samples': 59}. Best is trial 0 with value: 0.9743589743589745.[0m
[32m[I 2022-12-01 01:49:31,988][0m Trial 2 finished with value: 0.9544745702640439 and parameters: {'num_leaves': 198, 'n_estimators': 1295, 'feature_fraction': 0.688704103999282, 'bagging_fraction': 0.8261603750741302, 'bagging_freq

Best trial: score 1.0,
params {'num_leaves': 475, 'n_estimators': 1148, 'feature_fraction': 0.88160864826669, 'bagging_fraction': 0.8583465939456989, 'bagging_freq': 3, 'min_child_samples': 24}


In [331]:
optuna.visualization.plot_param_importances(study12) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study12) # 최적화 과정 시각화

In [332]:
X12_train, X12_val, y12_train, y12_val = train_test_split(X12, y12, test_size = 0.2, random_state = 42)

In [333]:
X12_train.shape, X12_val.shape, y12_train.shape, y12_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [347]:
params12 = {'num_leaves': 475, 'n_estimators': 1148, 'feature_fraction': 0.88160864826669, 'bagging_fraction': 0.8583465939456989, 'bagging_freq': 3, 'min_child_samples': 24}

In [348]:
model = LGBMClassifier(**params12)

In [349]:
model12 = model.fit(X12_train, y12_train,
          eval_set = [(X12_train, y12_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.682429	training's multi_logloss: 0.682429
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.599867	training's multi_logloss: 0.599867
[3]	training's multi_logloss: 0.51113	training's multi_logloss: 0.51113
[4]	training's multi_logloss: 0.445381	training's multi_logloss: 0.445381
[5]	training's multi_logloss: 0.391617	training's multi_logloss: 0.391617
[6]	training's multi_logloss: 0.349427	training's multi_logloss: 0.349427
[7]	training's multi_logloss: 0.318073	training's multi_logloss: 0.318073
[8]	training's multi_logloss: 0.290552	training's multi_logloss: 0.290552
[9]	training's multi_logloss: 0.260936	training's multi_logloss: 0.260936
[10]	training's multi_logloss: 0.233238	training's multi_logloss: 0.233238
[11]	training's multi_logloss: 0.208934	training's multi_logloss: 0.208934
[12]	training's multi_logloss: 0.191106	training's multi_logloss: 0.191106
[13]	training's multi_logloss: 0.171703	training

In [350]:
train12_preds = model12.predict(X12_train)
val12_preds = model12.predict(X12_val)

In [351]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [352]:
get_clf_eval(y12_train, train12_preds)
get_clf_eval(y12_val, val12_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9259
재현율: 0.9929


In [353]:
preds_12= model12.predict(X12_test)
preds_12

array([1., 1., 1., 1., 1., 1., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 3., 2., 3., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 3., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1.,
       2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2.

In [354]:
test_dec['classification'] = preds_12
test_dec

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-12-01,3,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,2236,58.94,1.0,2021-12-01 00:00:00,2021,12,1,0
1,2021-12-01,3,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1633,59.70,1.0,2021-12-01 01:00:00,2021,12,1,1
2,2021-12-01,3,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1154,59.14,1.0,2021-12-01 02:00:00,2021,12,1,2
3,2021-12-01,3,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,956,56.22,1.0,2021-12-01 03:00:00,2021,12,1,3
4,2021-12-01,3,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1553,56.20,1.0,2021-12-01 04:00:00,2021,12,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-12-31,5,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3673,21.64,3.0,2021-12-31 19:00:00,2021,12,31,19
740,2021-12-31,5,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3894,30.72,2.0,2021-12-31 20:00:00,2021,12,31,20
741,2021-12-31,5,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,4160,40.71,2.0,2021-12-31 21:00:00,2021,12,31,21
742,2021-12-31,5,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3303,45.05,1.0,2021-12-31 22:00:00,2021,12,31,22


In [355]:
result = pd.concat([test_jan,
                    test_feb,
                    test_mar,
                    test_apr,
                    test_may,
                    test_jun,
                    test_jul,
                    test_aug,
                    test_sep,
                    test_oct,
                    test_nov,
                    test_dec])
result = result.sort_values(by = 'datetime')
result = result.reset_index(drop = True)
result

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1393,59.67,1.0,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,990,60.06,1.0,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,771,59.04,1.0,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,717,57.30,1.0,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,1230,56.55,1.0,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2022-09-30,5,19:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3660,22.47,2.0,2022-09-30 19:00:00,2022,9,30,19
8756,2022-09-30,5,20:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3233,27.84,2.0,2022-09-30 20:00:00,2022,9,30,20
8757,2022-09-30,5,21:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3468,37.83,2.0,2022-09-30 21:00:00,2022,9,30,21
8758,2022-09-30,5,22:00:00,성산대교,영등포구,C-04,성산대교남단,성산대교북단,3,1177,3929,40.53,2.0,2022-09-30 22:00:00,2022,9,30,22


In [356]:
result = result.drop(columns = ['dow', 
                                'district_name', 
                                'branch_num', 
                                'arr_point', 
                                'lane', 
                                'distance', 
                                'volume', 
                                'speed', 
                                'datetime', 
                                'year', 
                                'month', 
                                'day', 
                                'hour'])
result

Unnamed: 0,date,time,branch_name,dep_point,classification
0,2021-10-01,0:00:00,성산대교,성산대교남단,1.0
1,2021-10-01,1:00:00,성산대교,성산대교남단,1.0
2,2021-10-01,2:00:00,성산대교,성산대교남단,1.0
3,2021-10-01,3:00:00,성산대교,성산대교남단,1.0
4,2021-10-01,4:00:00,성산대교,성산대교남단,1.0
...,...,...,...,...,...
8755,2022-09-30,19:00:00,성산대교,성산대교남단,2.0
8756,2022-09-30,20:00:00,성산대교,성산대교남단,2.0
8757,2022-09-30,21:00:00,성산대교,성산대교남단,2.0
8758,2022-09-30,22:00:00,성산대교,성산대교남단,2.0


In [357]:
result.to_csv('sungsan_depsouth_result.csv', index = False)