# 라이브러리

In [1]:
import pandas as pd
import random
import os
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# 한글 폰트 깨짐 현상 해결을 위한 나눔 폰트 설치
# 코드 1회 실행 후 주석 처리하고 런타임 재시작 및 모두 실행
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

## 데이터 로드

In [4]:
# 경로 설정
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# csv 파일 읽어오기
# root = '/content/drive/MyDrive/최종프로젝트/교통/분석/2nd_modified_data/'
root = '/content/drive/MyDrive/Project/'
C13_depsouth = pd.read_csv(root + 'Data_hannam_depsouth.csv', encoding='cp949')
C13_depsouth_test = pd.read_csv(root + 'hannam_depsouth_test.csv', encoding='cp949')

In [6]:
# 데이터 확인
print(C13_depsouth.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24096 entries, 0 to 24095
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            24096 non-null  object 
 1   dow             24096 non-null  int64  
 2   time            24096 non-null  object 
 3   branch_name     24096 non-null  object 
 4   district_name   24096 non-null  object 
 5   branch_num      24096 non-null  object 
 6   dep_point       24096 non-null  object 
 7   arr_point       24096 non-null  object 
 8   lane            24096 non-null  int64  
 9   distance        24096 non-null  int64  
 10  volume          24096 non-null  int64  
 11  speed           24096 non-null  float64
 12  classification  0 non-null      float64
dtypes: float64(2), int64(4), object(7)
memory usage: 2.4+ MB
None


In [7]:
# 결측치 확인
print(C13_depsouth.isnull().sum())

date                  0
dow                   0
time                  0
branch_name           0
district_name         0
branch_num            0
dep_point             0
arr_point             0
lane                  0
distance              0
volume                0
speed                 0
classification    24096
dtype: int64


In [8]:
# date 컬럼과 time 컬럼을 합쳐 datetime이라는 컬럼 만들기
C13_depsouth['datetime'] = C13_depsouth['date'] + ' ' + C13_depsouth['time']
C13_depsouth_test['datetime'] = C13_depsouth_test['date'] + ' ' + C13_depsouth_test['time']

In [9]:
# date 컬럼과 time 컬럼 제거
# C4_depsouth = C4_depsouth.drop(C4_depsouth[['date', 'time']], axis=1)

In [10]:
# datetime 문자형 컬럼을 datetime 자료형으로 변환
C13_depsouth['datetime'] = pd.to_datetime(C13_depsouth['datetime'])
C13_depsouth_test['datetime'] = pd.to_datetime(C13_depsouth_test['datetime'])

# classification 컬럼값 변경

In [11]:
C13_depsouth.describe()

Unnamed: 0,dow,lane,distance,volume,speed,classification
count,24096.0,24096.0,24096.0,24096.0,24096.0,0.0
mean,3.997012,6.0,822.0,3884.846863,53.664836,
std,1.998295,0.0,0.0,1670.640918,9.305516,
min,1.0,6.0,822.0,278.0,9.25,
25%,2.0,6.0,822.0,2336.0,51.49,
50%,4.0,6.0,822.0,4578.0,56.52,
75%,6.0,6.0,822.0,5217.0,59.45,
max,7.0,6.0,822.0,7213.0,71.78,


In [12]:
C13_depsouth.loc[C13_depsouth['speed'] >= C13_depsouth['speed'].mean(), 'classification'] = 1
C13_depsouth.loc[C13_depsouth['speed'] < 15, 'classification'] = 3
C13_depsouth.loc[(C13_depsouth['speed'] >= 15) 
                      & (C13_depsouth['speed'] < 25) 
                      & ((C13_depsouth['volume'] >= C13_depsouth['volume'].mean())), 'classification'] = 3
C13_depsouth.loc[(C13_depsouth['speed'] >= 15) 
                      & (C13_depsouth['speed'] < C13_depsouth['speed'].mean()) 
                      & ((C13_depsouth['volume'] < C13_depsouth['volume'].mean())), 'classification'] = 2
C13_depsouth.loc[(C13_depsouth['speed'] >= 25) 
                      & (C13_depsouth['speed'] < C13_depsouth['speed'].mean()) 
                      & ((C13_depsouth['volume'] >= C13_depsouth['volume'].mean())), 'classification'] = 2

In [13]:
C13_depsouth['classification']

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
24091    2.0
24092    2.0
24093    2.0
24094    1.0
24095    1.0
Name: classification, Length: 24096, dtype: float64

In [14]:
C13_depsouth['year'] = C13_depsouth['datetime'].dt.year
C13_depsouth['month'] = C13_depsouth['datetime'].dt.month
C13_depsouth['day'] = C13_depsouth['datetime'].dt.day
C13_depsouth['hour'] = C13_depsouth['datetime'].dt.hour

In [15]:
C13_depsouth

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2019-01-01,2,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,2815,62.37,1.0,2019-01-01 00:00:00,2019,1,1,0
1,2019-01-01,2,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,2730,64.06,1.0,2019-01-01 01:00:00,2019,1,1,1
2,2019-01-01,2,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1971,64.43,1.0,2019-01-01 02:00:00,2019,1,1,2
3,2019-01-01,2,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1498,63.31,1.0,2019-01-01 03:00:00,2019,1,1,3
4,2019-01-01,2,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1355,62.64,1.0,2019-01-01 04:00:00,2019,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24091,2021-09-30,4,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4404,48.05,2.0,2021-09-30 19:00:00,2021,9,30,19
24092,2021-09-30,4,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4267,49.70,2.0,2021-09-30 20:00:00,2021,9,30,20
24093,2021-09-30,4,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4308,53.22,2.0,2021-09-30 21:00:00,2021,9,30,21
24094,2021-09-30,4,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3779,55.69,1.0,2021-09-30 22:00:00,2021,9,30,22


# 월별로 데이터 나누기

In [16]:
C13_ds_month = C13_depsouth['month']
C13_ds_month_list  = sorted(set(C13_ds_month))
C13_ds_month_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [17]:
month_data = []
for i in range(0, len(C13_ds_month_list)):
  month = C13_depsouth[C13_depsouth['month'] == C13_ds_month_list[i]]
  month = month.reset_index(drop=True)
  month_data.append(month)

In [18]:
train_jan = month_data[0]
train_feb = month_data[1]
train_mar = month_data[2]
train_apr = month_data[3]
train_may = month_data[4]
train_jun = month_data[5]
train_jul = month_data[6]
train_aug = month_data[7]
train_sep = month_data[8]
train_oct = month_data[9]
train_nov = month_data[10]
train_dec = month_data[11]

In [19]:
C13_depsouth_test['year'] = C13_depsouth_test['datetime'].dt.year
C13_depsouth_test['month'] = C13_depsouth_test['datetime'].dt.month
C13_depsouth_test['day'] = C13_depsouth_test['datetime'].dt.day
C13_depsouth_test['hour'] = C13_depsouth_test['datetime'].dt.hour

In [20]:
C13_ds_test_mon = C13_depsouth_test['month']
C13_ds_test_mon_list  = sorted(set(C13_ds_test_mon))
C13_ds_test_mon_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [21]:
month_test_data = []
for i in range(0, len(C13_ds_month_list)):
  month = C13_depsouth_test[C13_depsouth_test['month'] == C13_ds_test_mon_list[i]]
  month = month.reset_index(drop=True)
  month_test_data.append(month)

In [22]:
test_jan = month_test_data[0]
test_feb = month_test_data[1]
test_mar = month_test_data[2]
test_apr = month_test_data[3]
test_may = month_test_data[4]
test_jun = month_test_data[5]
test_jul = month_test_data[6]
test_aug = month_test_data[7]
test_sep = month_test_data[8]
test_oct = month_test_data[9]
test_nov = month_test_data[10]
test_dec = month_test_data[11]

In [23]:
test_dec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            744 non-null    object        
 1   dow             744 non-null    int64         
 2   time            744 non-null    object        
 3   branch_name     744 non-null    object        
 4   district_name   744 non-null    object        
 5   branch_num      744 non-null    object        
 6   dep_point       744 non-null    object        
 7   arr_point       744 non-null    object        
 8   lane            744 non-null    int64         
 9   distance        744 non-null    int64         
 10  volume          744 non-null    int64         
 11  speed           744 non-null    float64       
 12  classification  0 non-null      float64       
 13  datetime        744 non-null    datetime64[ns]
 14  year            744 non-null    int64         
 15  month 

# 1월 데이터 머신러닝

## 데이터 가공

In [24]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [25]:
X1 = train_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [26]:
y1 = train_jan[['datetime', 'classification']]
X1_1 = X1.drop(columns = ['datetime', 'classification'])
y1_1 = X1.datetime

In [27]:
X1_1_scaler = scaler.fit_transform(X1_1)

In [28]:
X1_1_sc = pd.DataFrame(X1_1_scaler)
X1_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1 = pd.concat([y1_1, X1_1_sc], axis = 1)

In [29]:
X1_test = test_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [30]:
y1_test = test_jan[['datetime', 'classification']]
X1_1_test = X1_test.drop(columns = ['datetime', 'classification'])
y1_1_test = X1_test.datetime

In [31]:
X1_1_test_scaler = scaler.fit_transform(X1_1_test)

In [32]:
X1_1_test_sc = pd.DataFrame(X1_1_test_scaler)
X1_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1_test = pd.concat([y1_1_test, X1_1_test_sc], axis = 1)
X1_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-01-01 00:00:00,0.833333,0.210573,0.853897,0.0,0.0,0.0,0.000000
1,2022-01-01 01:00:00,0.833333,0.152051,0.863028,0.0,0.0,0.0,0.043478
2,2022-01-01 02:00:00,0.833333,0.069307,0.852198,0.0,0.0,0.0,0.086957
3,2022-01-01 03:00:00,0.833333,0.038720,0.853684,0.0,0.0,0.0,0.130435
4,2022-01-01 04:00:00,0.833333,0.031471,0.837545,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-01-31 19:00:00,0.000000,0.449257,0.809726,0.0,0.0,1.0,0.826087
740,2022-01-31 20:00:00,0.000000,0.449257,0.803143,0.0,0.0,1.0,0.869565
741,2022-01-31 21:00:00,0.000000,0.371464,0.797197,0.0,0.0,1.0,0.913043
742,2022-01-31 22:00:00,0.000000,0.270156,0.774687,0.0,0.0,1.0,0.956522


## LightGBM

In [33]:
# optuna 설치
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.3-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 4.1 MB/s 
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 8.2 MB/s 
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 50.0 MB/s 
[?25hCollecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 6.2 MB/s 
Collecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 54.6 MB/s 
[?25hCollecting stevedore>=2.0.1
  Downloading stevedore-4.1.1-py3-none-any.whl (5

In [34]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error

In [35]:
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score

In [36]:
X1 = X1.drop(columns = ['datetime'])
y1 = y1.drop(columns = ['datetime'])
X1_test = X1_test.drop(columns = ['datetime'])
y1_test = y1_test.drop(columns = ['datetime'])

In [37]:
# LigthGBM 하이퍼파라미터 값 지정
def objectiveLGBM(trial: Trial, X, y):
    param = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'learning_rate': 0.01,
        'n_estimators': trial.suggest_int('n_estimators', 700, 3000),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'gpu_use_dp':True
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

     # 학습 모델 생성
    model = LGBMClassifier(**param)
    lgb_model = model.fit(X_train, y_train, verbose=True) # 학습 진행
    train_preds = lgb_model.predict(X_train)
    test_preds = lgb_model.predict(X_test)

    # 모델 성능 확인
    train_precision = precision_score(y_test, test_preds, average= "macro")
    
    return train_precision

In [38]:
study1 = optuna.create_study(direction='maximize',sampler=TPESampler())
study1.optimize(lambda trial : objectiveLGBM(trial, X1, y1), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study1.best_trial.value,study1.best_trial.params))

[32m[I 2022-12-01 02:48:36,268][0m A new study created in memory with name: no-name-fb92ea86-cfcd-4020-a368-e40f44c4bdb1[0m
[32m[I 2022-12-01 02:48:39,388][0m Trial 0 finished with value: 0.9928887671733078 and parameters: {'num_leaves': 476, 'n_estimators': 1337, 'feature_fraction': 0.5808245176617153, 'bagging_fraction': 0.7283724386486068, 'bagging_freq': 3, 'min_child_samples': 72}. Best is trial 0 with value: 0.9928887671733078.[0m
[32m[I 2022-12-01 02:48:41,556][0m Trial 1 finished with value: 0.9879879879879879 and parameters: {'num_leaves': 53, 'n_estimators': 1392, 'feature_fraction': 0.8401680715963247, 'bagging_fraction': 0.7062801064042199, 'bagging_freq': 6, 'min_child_samples': 100}. Best is trial 0 with value: 0.9928887671733078.[0m
[32m[I 2022-12-01 02:48:43,883][0m Trial 2 finished with value: 0.7942528735632184 and parameters: {'num_leaves': 222, 'n_estimators': 1815, 'feature_fraction': 0.6420730543414961, 'bagging_fraction': 0.6242418438852158, 'bagging_f

Best trial: score 1.0,
params {'num_leaves': 236, 'n_estimators': 1200, 'feature_fraction': 0.8687822268375953, 'bagging_fraction': 0.9959053264230845, 'bagging_freq': 3, 'min_child_samples': 12}


In [39]:
optuna.visualization.plot_param_importances(study1) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study1) # 최적화 과정 시각화

In [40]:
X1_train, X1_val, y1_train, y1_val = train_test_split(X1, y1, test_size = 0.2, random_state = 42)

In [41]:
X1_train.shape, X1_val.shape, y1_train.shape, y1_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [42]:
model = LGBMClassifier(**study1.best_trial.params)

In [43]:
model1 = model.fit(X1_train, y1_train,
          eval_set = [(X1_train, y1_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.506124	training's multi_logloss: 0.506124
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.434881	training's multi_logloss: 0.434881
[3]	training's multi_logloss: 0.376958	training's multi_logloss: 0.376958
[4]	training's multi_logloss: 0.329973	training's multi_logloss: 0.329973
[5]	training's multi_logloss: 0.290812	training's multi_logloss: 0.290812
[6]	training's multi_logloss: 0.258024	training's multi_logloss: 0.258024
[7]	training's multi_logloss: 0.233376	training's multi_logloss: 0.233376
[8]	training's multi_logloss: 0.211691	training's multi_logloss: 0.211691
[9]	training's multi_logloss: 0.189407	training's multi_logloss: 0.189407
[10]	training's multi_logloss: 0.169589	training's multi_logloss: 0.169589
[11]	training's multi_logloss: 0.152155	training's multi_logloss: 0.152155
[12]	training's multi_logloss: 0.139052	training's multi_logloss: 0.139052
[13]	training's multi_logloss: 0.125131	traini

In [44]:
train1_preds = model1.predict(X1_train)
val1_preds = model1.predict(X1_val)

In [45]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [46]:
get_clf_eval(y1_train, train1_preds)
get_clf_eval(y1_val, val1_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9990
재현율: 0.9969


In [47]:
preds_1 = model1.predict(X1_test)
preds_1

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 3., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 1., 1., 1., 1., 2., 2.,
       1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 1., 2., 2., 1., 1., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 3., 2.

In [48]:
test_jan['classification'] = preds_1
test_jan

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-01-01,6,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1519,57.90,1.0,2022-01-01 00:00:00,2022,1,1,0
1,2022-01-01,6,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1188,58.33,1.0,2022-01-01 01:00:00,2022,1,1,1
2,2022-01-01,6,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,720,57.82,1.0,2022-01-01 02:00:00,2022,1,1,2
3,2022-01-01,6,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,547,57.89,1.0,2022-01-01 03:00:00,2022,1,1,3
4,2022-01-01,6,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,506,57.13,1.0,2022-01-01 04:00:00,2022,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-01-31,1,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,2869,55.82,1.0,2022-01-31 19:00:00,2022,1,31,19
740,2022-01-31,1,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,2869,55.51,1.0,2022-01-31 20:00:00,2022,1,31,20
741,2022-01-31,1,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,2429,55.23,1.0,2022-01-31 21:00:00,2022,1,31,21
742,2022-01-31,1,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1856,54.17,1.0,2022-01-31 22:00:00,2022,1,31,22


# 2월 데이터 머신러닝

## 데이터 가공

In [49]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [50]:
X2 = train_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [51]:
y2 = train_feb[['datetime', 'classification']]
X2_1 = X2.drop(columns = ['datetime', 'classification'])
y2_1 = X2.datetime

In [52]:
X2_1_scaler = scaler.fit_transform(X2_1)

In [53]:
X2_1_sc = pd.DataFrame(X2_1_scaler)
X2_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2 = pd.concat([y2_1, X2_1_sc], axis = 1)
X2

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-02-01 00:00:00,0.666667,0.406360,0.980837,0.0,0.0,0.000000,0.000000
1,2019-02-01 01:00:00,0.666667,0.309204,0.981815,0.0,0.0,0.000000,0.043478
2,2019-02-01 02:00:00,0.666667,0.216523,0.933320,0.0,0.0,0.000000,0.086957
3,2019-02-01 03:00:00,0.666667,0.163471,0.895581,0.0,0.0,0.000000,0.130435
4,2019-02-01 04:00:00,0.666667,0.156120,0.875636,0.0,0.0,0.000000,0.173913
...,...,...,...,...,...,...,...,...
2035,2021-02-28 19:00:00,1.000000,0.599872,0.794290,1.0,0.0,0.964286,0.826087
2036,2021-02-28 20:00:00,1.000000,0.541387,0.821470,1.0,0.0,0.964286,0.869565
2037,2021-02-28 21:00:00,1.000000,0.576382,0.805241,1.0,0.0,0.964286,0.913043
2038,2021-02-28 22:00:00,1.000000,0.576382,0.838678,1.0,0.0,0.964286,0.956522


In [54]:
X2_test = test_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [55]:
y2_test = test_feb[['datetime', 'classification']]
X2_1_test = X2_test.drop(columns = ['datetime', 'classification'])
y2_1_test = X2_test.datetime

In [56]:
X2_1_test_scaler = scaler.fit_transform(X2_1_test)

In [57]:
X2_1_test_sc = pd.DataFrame(X2_1_test_scaler)
X2_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2_test = pd.concat([y2_1_test, X2_1_test_sc], axis = 1)
X2_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-02-01 00:00:00,0.166667,0.090402,0.821273,0.0,0.0,0.0,0.000000
1,2022-02-01 01:00:00,0.166667,0.034396,0.790489,0.0,0.0,0.0,0.043478
2,2022-02-01 02:00:00,0.166667,0.010625,0.832989,0.0,0.0,0.0,0.086957
3,2022-02-01 03:00:00,0.166667,0.000000,0.838502,0.0,0.0,0.0,0.130435
4,2022-02-01 04:00:00,0.166667,0.007924,0.779692,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
667,2022-02-28 19:00:00,0.000000,0.761750,0.624627,0.0,0.0,1.0,0.826087
668,2022-02-28 20:00:00,0.000000,0.672789,0.752125,0.0,0.0,1.0,0.869565
669,2022-02-28 21:00:00,0.000000,0.644877,0.778773,0.0,0.0,1.0,0.913043
670,2022-02-28 22:00:00,0.000000,0.583108,0.813692,0.0,0.0,1.0,0.956522


## LightGBM

In [58]:
X2 = X2.drop(columns = ['datetime'])
y2 = y2.drop(columns = ['datetime'])
X2_test = X2_test.drop(columns = ['datetime'])
y2_test = y2_test.drop(columns = ['datetime'])

In [59]:
study2 = optuna.create_study(direction='maximize',sampler=TPESampler())
study2.optimize(lambda trial : objectiveLGBM(trial, X2, y2), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study2.best_trial.value,study2.best_trial.params))

[32m[I 2022-12-01 02:50:47,246][0m A new study created in memory with name: no-name-aa627b04-936a-4047-a099-6aa9ad89292e[0m
[32m[I 2022-12-01 02:50:51,089][0m Trial 0 finished with value: 0.93003300330033 and parameters: {'num_leaves': 362, 'n_estimators': 1538, 'feature_fraction': 0.4436192580441286, 'bagging_fraction': 0.7538025447514998, 'bagging_freq': 3, 'min_child_samples': 35}. Best is trial 0 with value: 0.93003300330033.[0m
[32m[I 2022-12-01 02:51:16,076][0m Trial 1 finished with value: 0.995664928292047 and parameters: {'num_leaves': 66, 'n_estimators': 1647, 'feature_fraction': 0.8089948626530594, 'bagging_fraction': 0.920767065405862, 'bagging_freq': 2, 'min_child_samples': 10}. Best is trial 1 with value: 0.995664928292047.[0m
[32m[I 2022-12-01 02:51:19,136][0m Trial 2 finished with value: 0.9429469901168015 and parameters: {'num_leaves': 100, 'n_estimators': 1698, 'feature_fraction': 0.5920969360490901, 'bagging_fraction': 0.5673384834440393, 'bagging_freq': 4,

Best trial: score 1.0,
params {'num_leaves': 331, 'n_estimators': 1095, 'feature_fraction': 0.44193037197887586, 'bagging_fraction': 0.824756913603333, 'bagging_freq': 4, 'min_child_samples': 14}


In [60]:
optuna.visualization.plot_param_importances(study2) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study2) # 최적화 과정 시각화

In [61]:
X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [62]:
X2_train.shape, X2_val.shape, y2_train.shape, y2_val.shape

((1632, 7), (408, 7), (1632, 1), (408, 1))

In [63]:
model = LGBMClassifier(**study2.best_trial.params)

In [64]:
model2 = model.fit(X2_train, y2_train,
          eval_set = [(X2_train, y2_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.589176	training's multi_logloss: 0.589176
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.561137	training's multi_logloss: 0.561137
[3]	training's multi_logloss: 0.494908	training's multi_logloss: 0.494908
[4]	training's multi_logloss: 0.48527	training's multi_logloss: 0.48527
[5]	training's multi_logloss: 0.44023	training's multi_logloss: 0.44023
[6]	training's multi_logloss: 0.398542	training's multi_logloss: 0.398542
[7]	training's multi_logloss: 0.362087	training's multi_logloss: 0.362087
[8]	training's multi_logloss: 0.330375	training's multi_logloss: 0.330375
[9]	training's multi_logloss: 0.302735	training's multi_logloss: 0.302735
[10]	training's multi_logloss: 0.277689	training's multi_logloss: 0.277689
[11]	training's multi_logloss: 0.265361	training's multi_logloss: 0.265361
[12]	training's multi_logloss: 0.247652	training's multi_logloss: 0.247652
[13]	training's multi_logloss: 0.227234	training's

In [65]:
train2_preds = model2.predict(X2_train)
val2_preds = model2.predict(X2_val)

In [66]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [67]:
get_clf_eval(y2_train, train2_preds)
get_clf_eval(y2_val, val2_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9929
재현율: 0.9686


In [68]:
preds_2= model2.predict(X2_test)
preds_2

array([1., 1., 1., 1., 1., 2., 1., 2., 2., 1., 1., 2., 1., 2., 2., 2., 2.,
       2., 2., 2., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 1., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       2., 2., 2., 2., 1., 1., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 3., 2., 2., 2., 1., 1., 1., 1., 1., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 2., 2.,
       2., 2., 1., 1., 1., 2., 2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 3., 3.

In [69]:
test_feb['classification'] = preds_2
test_feb

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-02-01,2,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,771,56.94,1.0,2022-02-01 00:00:00,2022,2,1,0
1,2022-02-01,2,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,460,55.60,1.0,2022-02-01 01:00:00,2022,2,1,1
2,2022-02-01,2,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,328,57.45,1.0,2022-02-01 02:00:00,2022,2,1,2
3,2022-02-01,2,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,269,57.69,1.0,2022-02-01 03:00:00,2022,2,1,3
4,2022-02-01,2,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,313,55.13,1.0,2022-02-01 04:00:00,2022,2,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-02-28,1,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4499,48.38,2.0,2022-02-28 19:00:00,2022,2,28,19
668,2022-02-28,1,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4005,53.93,2.0,2022-02-28 20:00:00,2022,2,28,20
669,2022-02-28,1,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3850,55.09,1.0,2022-02-28 21:00:00,2022,2,28,21
670,2022-02-28,1,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3507,56.61,1.0,2022-02-28 22:00:00,2022,2,28,22


# 3월 데이터 머신러닝

## 데이터 가공

In [70]:
X3 = train_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [71]:
y3 = train_mar[['datetime', 'classification']]
X3_1 = X3.drop(columns = ['datetime', 'classification'])
y3_1 = X3.datetime

In [72]:
X3_1_scaler = scaler.fit_transform(X3_1)

In [73]:
X3_1_sc = pd.DataFrame(X3_1_scaler)
X3_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3 = pd.concat([y3_1, X3_1_sc], axis = 1)
X3

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-03-01 00:00:00,0.666667,0.469150,0.971159,0.0,0.0,0.0,0.000000
1,2019-03-01 01:00:00,0.666667,0.305147,0.971159,0.0,0.0,0.0,0.043478
2,2019-03-01 02:00:00,0.666667,0.241528,0.941742,0.0,0.0,0.0,0.086957
3,2019-03-01 03:00:00,0.666667,0.210838,0.955778,0.0,0.0,0.0,0.130435
4,2019-03-01 04:00:00,0.666667,0.171675,0.938281,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-03-31 19:00:00,0.333333,0.768382,0.618343,1.0,0.0,1.0,0.826087
2228,2021-03-31 20:00:00,0.333333,0.675991,0.759854,1.0,0.0,1.0,0.869565
2229,2021-03-31 21:00:00,0.333333,0.662084,0.767929,1.0,0.0,1.0,0.913043
2230,2021-03-31 22:00:00,0.333333,0.567775,0.740242,1.0,0.0,1.0,0.956522


In [74]:
X3_test = test_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [75]:
y3_test = test_mar[['datetime', 'classification']]
X3_1_test = X3_test.drop(columns = ['datetime', 'classification'])
y3_1_test = X3_test.datetime

In [76]:
X3_1_test_scaler = scaler.fit_transform(X3_1_test)

In [77]:
X3_1_test_sc = pd.DataFrame(X3_1_test_scaler)
X3_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3_test = pd.concat([y3_1_test, X3_1_test_sc], axis = 1)
X3_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-03-01 00:00:00,0.166667,0.183670,0.877275,0.0,0.0,0.0,0.000000
1,2022-03-01 01:00:00,0.166667,0.116514,0.894774,0.0,0.0,0.0,0.043478
2,2022-03-01 02:00:00,0.166667,0.074312,0.861643,0.0,0.0,0.0,0.086957
3,2022-03-01 03:00:00,0.166667,0.044954,0.883341,0.0,0.0,0.0,0.130435
4,2022-03-01 04:00:00,0.166667,0.054862,0.861876,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-03-31 19:00:00,0.500000,0.724037,0.643257,0.0,0.0,1.0,0.826087
740,2022-03-31 20:00:00,0.500000,0.611009,0.797713,0.0,0.0,1.0,0.869565
741,2022-03-31 21:00:00,0.500000,0.603670,0.804246,0.0,0.0,1.0,0.913043
742,2022-03-31 22:00:00,0.500000,0.528257,0.833178,0.0,0.0,1.0,0.956522


## LightGBM

In [78]:
X3 = X3.drop(columns = ['datetime'])
y3 = y3.drop(columns = ['datetime'])
X3_test = X3_test.drop(columns = ['datetime'])
y3_test = y3_test.drop(columns = ['datetime'])

In [79]:
study3 = optuna.create_study(direction='maximize',sampler=TPESampler())
study3.optimize(lambda trial : objectiveLGBM(trial, X3, y3), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study3.best_trial.value,study3.best_trial.params))

[32m[I 2022-12-01 02:52:45,819][0m A new study created in memory with name: no-name-09ab1a45-3523-4310-b9c5-8532f1391f6f[0m
[32m[I 2022-12-01 02:52:48,813][0m Trial 0 finished with value: 0.8975155279503105 and parameters: {'num_leaves': 234, 'n_estimators': 1483, 'feature_fraction': 0.659991846951961, 'bagging_fraction': 0.6730011534134472, 'bagging_freq': 4, 'min_child_samples': 47}. Best is trial 0 with value: 0.8975155279503105.[0m
[32m[I 2022-12-01 02:52:51,497][0m Trial 1 finished with value: 0.9335459861775651 and parameters: {'num_leaves': 28, 'n_estimators': 1959, 'feature_fraction': 0.5160600085808515, 'bagging_fraction': 0.651917684646619, 'bagging_freq': 5, 'min_child_samples': 89}. Best is trial 1 with value: 0.9335459861775651.[0m
[32m[I 2022-12-01 02:52:53,216][0m Trial 2 finished with value: 0.972361339171684 and parameters: {'num_leaves': 32, 'n_estimators': 1353, 'feature_fraction': 0.7654293919242445, 'bagging_fraction': 0.6393797736043761, 'bagging_freq':

Best trial: score 1.0,
params {'num_leaves': 414, 'n_estimators': 2098, 'feature_fraction': 0.8176501510630552, 'bagging_fraction': 0.8304508127984596, 'bagging_freq': 7, 'min_child_samples': 29}


In [80]:
optuna.visualization.plot_param_importances(study3) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study3) # 최적화 과정 시각화

In [81]:
X3_train, X3_val, y3_train, y3_val = train_test_split(X3, y3, test_size = 0.2, random_state = 42)

In [82]:
X3_train.shape, X3_val.shape, y3_train.shape, y3_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [83]:
model = LGBMClassifier(**study3.best_trial.params)

In [84]:
model3 = model.fit(X3_train, y3_train,
          eval_set = [(X3_train, y3_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.599127	training's multi_logloss: 0.599127
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.515814	training's multi_logloss: 0.515814
[3]	training's multi_logloss: 0.44394	training's multi_logloss: 0.44394
[4]	training's multi_logloss: 0.387926	training's multi_logloss: 0.387926
[5]	training's multi_logloss: 0.341768	training's multi_logloss: 0.341768
[6]	training's multi_logloss: 0.304252	training's multi_logloss: 0.304252
[7]	training's multi_logloss: 0.276277	training's multi_logloss: 0.276277
[8]	training's multi_logloss: 0.252796	training's multi_logloss: 0.252796
[9]	training's multi_logloss: 0.22689	training's multi_logloss: 0.22689
[10]	training's multi_logloss: 0.207989	training's multi_logloss: 0.207989
[11]	training's multi_logloss: 0.186782	training's multi_logloss: 0.186782
[12]	training's multi_logloss: 0.171524	training's multi_logloss: 0.171524
[13]	training's multi_logloss: 0.157428	training's

In [85]:
train3_preds = model3.predict(X3_train)
val3_preds = model3.predict(X3_val)

In [86]:
get_clf_eval(y3_train, train3_preds)
get_clf_eval(y3_val, val3_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [87]:
preds_3= model3.predict(X3_test)
preds_3

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2.,
       2., 2., 1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 3., 2., 2., 2., 2., 2., 1., 1., 1., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 1., 2., 2.,
       1., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 3.,
       2., 2., 2., 1., 1., 1., 1., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 3., 2., 2., 2., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 3., 2.

In [88]:
test_mar['classification'] = preds_3
test_mar

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-03-01,2,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1377,58.98,1.0,2022-03-01 00:00:00,2022,3,1,0
1,2022-03-01,2,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1011,59.73,1.0,2022-03-01 01:00:00,2022,3,1,1
2,2022-03-01,2,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,781,58.31,1.0,2022-03-01 02:00:00,2022,3,1,2
3,2022-03-01,2,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,621,59.24,1.0,2022-03-01 03:00:00,2022,3,1,3
4,2022-03-01,2,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,675,58.32,1.0,2022-03-01 04:00:00,2022,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-03-31,4,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4322,48.95,2.0,2022-03-31 19:00:00,2022,3,31,19
740,2022-03-31,4,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3706,55.57,1.0,2022-03-31 20:00:00,2022,3,31,20
741,2022-03-31,4,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3666,55.85,1.0,2022-03-31 21:00:00,2022,3,31,21
742,2022-03-31,4,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3255,57.09,1.0,2022-03-31 22:00:00,2022,3,31,22


# 4월 데이터 머신러닝

## 데이터 가공

In [89]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [90]:
X4 = train_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [91]:
y4 = train_apr[['datetime', 'classification']]
X4_1 = X4.drop(columns = ['datetime', 'classification'])
y4_1 = X4.datetime

In [92]:
X4_1_scaler = scaler.fit_transform(X4_1)

In [93]:
X4_1_sc = pd.DataFrame(X4_1_scaler)
X4_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4 = pd.concat([y4_1, X4_1_sc], axis = 1)
X4

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-04-01 00:00:00,0.000000,0.224204,0.859127,0.0,0.0,0.0,0.000000
1,2019-04-01 01:00:00,0.000000,0.130709,0.860588,0.0,0.0,0.0,0.043478
2,2019-04-01 02:00:00,0.000000,0.074581,0.855107,0.0,0.0,0.0,0.086957
3,2019-04-01 03:00:00,0.000000,0.052591,0.880504,0.0,0.0,0.0,0.130435
4,2019-04-01 04:00:00,0.000000,0.092419,0.860588,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-04-30 19:00:00,0.666667,0.719207,0.505756,1.0,0.0,1.0,0.826087
2156,2021-04-30 20:00:00,0.666667,0.631093,0.630733,1.0,0.0,1.0,0.869565
2157,2021-04-30 21:00:00,0.666667,0.652929,0.676046,1.0,0.0,1.0,0.913043
2158,2021-04-30 22:00:00,0.666667,0.615101,0.625251,1.0,0.0,1.0,0.956522


In [94]:
X4_test = test_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [95]:
y4_test = test_apr[['datetime', 'classification']]
X4_1_test = X4_test.drop(columns = ['datetime', 'classification'])
y4_1_test = X4_test.datetime

In [96]:
X4_1_test_scaler = scaler.fit_transform(X4_1_test)

In [97]:
X4_1_test_sc = pd.DataFrame(X4_1_test_scaler)
X4_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4_test = pd.concat([y4_1_test, X4_1_test_sc], axis = 1)
X4_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-04-01 00:00:00,0.666667,0.196535,0.945438,0.0,0.0,0.0,0.000000
1,2022-04-01 01:00:00,0.666667,0.130032,0.919373,0.0,0.0,0.0,0.043478
2,2022-04-01 02:00:00,0.666667,0.090480,0.905235,0.0,0.0,0.0,0.086957
3,2022-04-01 03:00:00,0.666667,0.054253,0.913850,0.0,0.0,0.0,0.130435
4,2022-04-01 04:00:00,0.666667,0.071579,0.900596,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-04-30 19:00:00,0.833333,0.719636,0.738237,0.0,0.0,1.0,0.826087
716,2022-04-30 20:00:00,0.833333,0.717536,0.834769,0.0,0.0,1.0,0.869565
717,2022-04-30 21:00:00,0.833333,0.736437,0.840512,0.0,0.0,1.0,0.913043
718,2022-04-30 22:00:00,0.833333,0.676584,0.858405,0.0,0.0,1.0,0.956522


## LightGBM

In [98]:
X4 = X4.drop(columns = ['datetime'])
y4 = y4.drop(columns = ['datetime'])
X4_test = X4_test.drop(columns = ['datetime'])
y4_test = y4_test.drop(columns = ['datetime'])

In [99]:
study4 = optuna.create_study(direction='maximize',sampler=TPESampler())
study4.optimize(lambda trial : objectiveLGBM(trial, X4, y4), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study4.best_trial.value,study4.best_trial.params))

[32m[I 2022-12-01 02:54:41,209][0m A new study created in memory with name: no-name-9885aafd-ad98-4209-b594-9c1caecfeb14[0m
[32m[I 2022-12-01 02:54:43,374][0m Trial 0 finished with value: 0.9112466124661247 and parameters: {'num_leaves': 5, 'n_estimators': 2425, 'feature_fraction': 0.651126601648238, 'bagging_fraction': 0.46302181154739364, 'bagging_freq': 1, 'min_child_samples': 81}. Best is trial 0 with value: 0.9112466124661247.[0m
[32m[I 2022-12-01 02:54:46,929][0m Trial 1 finished with value: 1.0 and parameters: {'num_leaves': 216, 'n_estimators': 1941, 'feature_fraction': 0.9635993175757042, 'bagging_fraction': 0.5987908998560498, 'bagging_freq': 7, 'min_child_samples': 31}. Best is trial 1 with value: 1.0.[0m
[32m[I 2022-12-01 02:54:50,425][0m Trial 2 finished with value: 0.9529715762273901 and parameters: {'num_leaves': 146, 'n_estimators': 2736, 'feature_fraction': 0.4189329768883181, 'bagging_fraction': 0.6703961511709933, 'bagging_freq': 3, 'min_child_samples': 90

Best trial: score 1.0,
params {'num_leaves': 216, 'n_estimators': 1941, 'feature_fraction': 0.9635993175757042, 'bagging_fraction': 0.5987908998560498, 'bagging_freq': 7, 'min_child_samples': 31}


In [100]:
optuna.visualization.plot_param_importances(study4) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study4) # 최적화 과정 시각화

In [101]:
X4_train, X4_val, y4_train, y4_val = train_test_split(X4, y4, test_size = 0.2, random_state = 42)

In [102]:
X4_train.shape, X4_val.shape, y4_train.shape, y4_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [103]:
model = LGBMClassifier(**study4.best_trial.params)

In [104]:
model4 = model.fit(X4_train, y4_train,
          eval_set = [(X4_train, y4_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.635167	training's multi_logloss: 0.635167
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.549343	training's multi_logloss: 0.549343
[3]	training's multi_logloss: 0.473484	training's multi_logloss: 0.473484
[4]	training's multi_logloss: 0.41392	training's multi_logloss: 0.41392
[5]	training's multi_logloss: 0.364647	training's multi_logloss: 0.364647
[6]	training's multi_logloss: 0.324616	training's multi_logloss: 0.324616
[7]	training's multi_logloss: 0.296339	training's multi_logloss: 0.296339
[8]	training's multi_logloss: 0.270993	training's multi_logloss: 0.270993
[9]	training's multi_logloss: 0.242033	training's multi_logloss: 0.242033
[10]	training's multi_logloss: 0.21628	training's multi_logloss: 0.21628
[11]	training's multi_logloss: 0.193715	training's multi_logloss: 0.193715
[12]	training's multi_logloss: 0.177396	training's multi_logloss: 0.177396
[13]	training's multi_logloss: 0.15942	training's 

In [105]:
train4_preds = model4.predict(X4_train)
val4_preds = model4.predict(X4_val)

In [106]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [107]:
get_clf_eval(y4_train, train4_preds)
get_clf_eval(y4_val, val4_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9961
재현율: 0.9961


In [108]:
preds_4= model4.predict(X4_test)
preds_4

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1., 2., 1., 1., 1.,
       1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 2., 2., 1., 1., 1., 1.,
       1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       3., 2., 2., 1., 2., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       3., 2., 2., 1., 2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [109]:
test_apr['classification'] = preds_4
test_apr

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-04-01,5,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1449,59.81,1.0,2022-04-01 00:00:00,2022,4,1,0
1,2022-04-01,5,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1069,58.63,1.0,2022-04-01 01:00:00,2022,4,1,1
2,2022-04-01,5,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,843,57.99,1.0,2022-04-01 02:00:00,2022,4,1,2
3,2022-04-01,5,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,636,58.38,1.0,2022-04-01 03:00:00,2022,4,1,3
4,2022-04-01,5,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,735,57.78,1.0,2022-04-01 04:00:00,2022,4,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-04-30,6,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4438,50.43,1.0,2022-04-30 19:00:00,2022,4,30,19
716,2022-04-30,6,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4426,54.80,1.0,2022-04-30 20:00:00,2022,4,30,20
717,2022-04-30,6,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4534,55.06,1.0,2022-04-30 21:00:00,2022,4,30,21
718,2022-04-30,6,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4192,55.87,1.0,2022-04-30 22:00:00,2022,4,30,22


# 5월 데이터 머신러닝

## 데이터 가공

In [110]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [111]:
X5 = train_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [112]:
y5 = train_may[['datetime', 'classification']]
X5_1 = X5.drop(columns = ['datetime', 'classification'])
y5_1 = X5.datetime

In [113]:
X5_1_scaler = scaler.fit_transform(X5_1)

In [114]:
X5_1_sc = pd.DataFrame(X5_1_scaler)
X5_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5 = pd.concat([y5_1, X5_1_sc], axis = 1)
X5

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-05-01 00:00:00,0.333333,0.409563,0.960241,0.0,0.0,0.0,0.000000
1,2019-05-01 01:00:00,0.333333,0.288770,0.962612,0.0,0.0,0.0,0.043478
2,2019-05-01 02:00:00,0.333333,0.212803,0.947474,0.0,0.0,0.0,0.086957
3,2019-05-01 03:00:00,0.333333,0.198333,0.948021,0.0,0.0,0.0,0.130435
4,2019-05-01 04:00:00,0.333333,0.153507,0.925041,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-05-31 19:00:00,0.000000,0.698018,0.684297,1.0,0.0,1.0,0.826087
2228,2021-05-31 20:00:00,0.000000,0.516986,0.807222,1.0,0.0,1.0,0.869565
2229,2021-05-31 21:00:00,0.000000,0.606795,0.831844,1.0,0.0,1.0,0.913043
2230,2021-05-31 22:00:00,0.000000,0.542309,0.867773,1.0,0.0,1.0,0.956522


In [115]:
X5_test = test_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [116]:
y5_test = test_may[['datetime', 'classification']]
X5_1_test = X5_test.drop(columns = ['datetime', 'classification'])
y5_1_test = X5_test.datetime

In [117]:
X5_1_test_scaler = scaler.fit_transform(X5_1_test)

In [118]:
X5_1_test_sc = pd.DataFrame(X5_1_test_scaler)
X5_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5_test = pd.concat([y5_1_test, X5_1_test_sc], axis = 1)
X5_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-05-01 00:00:00,1.000000,0.258815,0.998223,0.0,0.0,0.0,0.000000
1,2022-05-01 01:00:00,1.000000,0.160353,0.979862,0.0,0.0,0.0,0.043478
2,2022-05-01 02:00:00,1.000000,0.088147,0.945114,0.0,0.0,0.0,0.086957
3,2022-05-01 03:00:00,1.000000,0.049700,0.948470,0.0,0.0,0.0,0.130435
4,2022-05-01 04:00:00,1.000000,0.030008,0.931885,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-05-31 19:00:00,0.166667,0.801388,0.808095,0.0,0.0,1.0,0.826087
740,2022-05-31 20:00:00,0.166667,0.710803,0.851530,0.0,0.0,1.0,0.869565
741,2022-05-31 21:00:00,0.166667,0.807014,0.832971,0.0,0.0,1.0,0.913043
742,2022-05-31 22:00:00,0.166667,0.780945,0.825271,0.0,0.0,1.0,0.956522


## LightGBM

In [119]:
X5 = X5.drop(columns = ['datetime'])
y5 = y5.drop(columns = ['datetime'])
X5_test = X5_test.drop(columns = ['datetime'])
y5_test = y5_test.drop(columns = ['datetime'])

In [120]:
study5 = optuna.create_study(direction='maximize',sampler=TPESampler())
study5.optimize(lambda trial : objectiveLGBM(trial, X5, y5), n_trials=20) 
print('Best trial: score {},\nparams {}'.format(study5.best_trial.value,study5.best_trial.params))

[32m[I 2022-12-01 02:56:08,172][0m A new study created in memory with name: no-name-380f292e-188b-4b4a-ae24-abb1010193ff[0m
[32m[I 2022-12-01 02:56:12,335][0m Trial 0 finished with value: 0.8411851851851851 and parameters: {'num_leaves': 191, 'n_estimators': 2389, 'feature_fraction': 0.8392149584929165, 'bagging_fraction': 0.926046659141158, 'bagging_freq': 1, 'min_child_samples': 74}. Best is trial 0 with value: 0.8411851851851851.[0m
[32m[I 2022-12-01 02:56:14,370][0m Trial 1 finished with value: 0.9928057553956835 and parameters: {'num_leaves': 461, 'n_estimators': 1179, 'feature_fraction': 0.8651957290366018, 'bagging_fraction': 0.5601300696127104, 'bagging_freq': 7, 'min_child_samples': 59}. Best is trial 1 with value: 0.9928057553956835.[0m
[32m[I 2022-12-01 02:56:16,332][0m Trial 2 finished with value: 0.8641791044776119 and parameters: {'num_leaves': 408, 'n_estimators': 1200, 'feature_fraction': 0.7656902687751889, 'bagging_fraction': 0.7070100454596715, 'bagging_fr

Best trial: score 1.0,
params {'num_leaves': 199, 'n_estimators': 2726, 'feature_fraction': 0.6223719745502604, 'bagging_fraction': 0.9903186775053701, 'bagging_freq': 2, 'min_child_samples': 29}


In [121]:
optuna.visualization.plot_param_importances(study5) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study5) # 최적화 과정 시각화

In [122]:
X5_train, X5_val, y5_train, y5_val = train_test_split(X5, y5, test_size = 0.2, random_state = 42)

In [123]:
X5_train.shape, X5_val.shape, y5_train.shape, y5_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [124]:
model = LGBMClassifier(**study5.best_trial.params)

In [125]:
model5 = model.fit(X5_train, y5_train,
          eval_set = [(X5_train, y5_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.657634	training's multi_logloss: 0.657634
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.538026	training's multi_logloss: 0.538026
[3]	training's multi_logloss: 0.482092	training's multi_logloss: 0.482092
[4]	training's multi_logloss: 0.433019	training's multi_logloss: 0.433019
[5]	training's multi_logloss: 0.380108	training's multi_logloss: 0.380108
[6]	training's multi_logloss: 0.346603	training's multi_logloss: 0.346603
[7]	training's multi_logloss: 0.316993	training's multi_logloss: 0.316993
[8]	training's multi_logloss: 0.289144	training's multi_logloss: 0.289144
[9]	training's multi_logloss: 0.273146	training's multi_logloss: 0.273146
[10]	training's multi_logloss: 0.250619	training's multi_logloss: 0.250619
[11]	training's multi_logloss: 0.23404	training's multi_logloss: 0.23404
[12]	training's multi_logloss: 0.215898	training's multi_logloss: 0.215898
[13]	training's multi_logloss: 0.198846	training

In [126]:
train5_preds = model5.predict(X5_train)
val5_preds = model5.predict(X5_val)

In [127]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [128]:
get_clf_eval(y5_train, train5_preds)
get_clf_eval(y5_val, val5_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [129]:
preds_5= model5.predict(X5_test)
preds_5

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3.,
       2., 1., 1., 1., 1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 3., 2., 2., 2., 1., 2., 1., 1., 1., 1., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 2., 2., 1.,
       2., 2., 1., 1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 2., 2., 2., 1., 1.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 1., 2.,
       1., 1., 1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 3., 2.

In [130]:
test_may['classification'] = preds_5
test_may

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-05-01,7,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,2095,61.80,1.0,2022-05-01 00:00:00,2022,5,1,0
1,2022-05-01,7,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1570,60.87,1.0,2022-05-01 01:00:00,2022,5,1,1
2,2022-05-01,7,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1185,59.11,1.0,2022-05-01 02:00:00,2022,5,1,2
3,2022-05-01,7,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,980,59.28,1.0,2022-05-01 03:00:00,2022,5,1,3
4,2022-05-01,7,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,875,58.44,1.0,2022-05-01 04:00:00,2022,5,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-05-31,2,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4988,52.17,1.0,2022-05-31 19:00:00,2022,5,31,19
740,2022-05-31,2,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4505,54.37,1.0,2022-05-31 20:00:00,2022,5,31,20
741,2022-05-31,2,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,5018,53.43,1.0,2022-05-31 21:00:00,2022,5,31,21
742,2022-05-31,2,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4879,53.04,1.0,2022-05-31 22:00:00,2022,5,31,22


# 6월 데이터 머신러닝

## 데이터 가공

In [131]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [132]:
X6 = train_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [133]:
y6 = train_jun[['datetime', 'classification']]
X6_1 = X6.drop(columns = ['datetime', 'classification'])
y6_1 = X6.datetime

In [134]:
X6_1_scaler = scaler.fit_transform(X6_1)

In [135]:
X6_1_sc = pd.DataFrame(X6_1_scaler)
X6_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6 = pd.concat([y6_1, X6_1_sc], axis = 1)
X6

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-06-01 00:00:00,0.833333,0.456177,0.989674,0.0,0.0,0.0,0.000000
1,2019-06-01 01:00:00,0.833333,0.304019,0.978753,0.0,0.0,0.0,0.043478
2,2019-06-01 02:00:00,0.833333,0.233724,0.979150,0.0,0.0,0.0,0.086957
3,2019-06-01 03:00:00,0.833333,0.220377,0.950357,0.0,0.0,0.0,0.130435
4,2019-06-01 04:00:00,0.833333,0.190123,0.916203,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-06-30 19:00:00,0.333333,0.744921,0.564138,1.0,0.0,1.0,0.826087
2156,2021-06-30 20:00:00,0.333333,0.535518,0.790508,1.0,0.0,1.0,0.869565
2157,2021-06-30 21:00:00,0.333333,0.565475,0.784948,1.0,0.0,1.0,0.913043
2158,2021-06-30 22:00:00,0.333333,0.544416,0.792097,1.0,0.0,1.0,0.956522


In [136]:
X6_test = test_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [137]:
y6_test = test_jun[['datetime', 'classification']]
X6_1_test = X6_test.drop(columns = ['datetime', 'classification'])
y6_1_test = X6_test.datetime

In [138]:
X6_1_test_scaler = scaler.fit_transform(X6_1_test)

In [139]:
X6_1_test_sc = pd.DataFrame(X6_1_test_scaler)
X6_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6_test = pd.concat([y6_1_test, X6_1_test_sc], axis = 1)
X6_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-06-01 00:00:00,0.333333,0.329951,0.969061,0.0,0.0,0.0,0.000000
1,2022-06-01 01:00:00,0.333333,0.194892,0.970515,0.0,0.0,0.0,0.043478
2,2022-06-01 02:00:00,0.333333,0.127537,0.920266,0.0,0.0,0.0,0.086957
3,2022-06-01 03:00:00,0.333333,0.083625,0.879983,0.0,0.0,0.0,0.130435
4,2022-06-01 04:00:00,0.333333,0.098495,0.900748,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-06-30 19:00:00,0.500000,1.000000,0.151786,0.0,0.0,1.0,0.826087
716,2022-06-30 20:00:00,0.500000,0.883485,0.266819,0.0,0.0,1.0,0.869565
717,2022-06-30 21:00:00,0.500000,0.753149,0.703073,0.0,0.0,1.0,0.913043
718,2022-06-30 22:00:00,0.500000,0.501749,0.856312,0.0,0.0,1.0,0.956522


## LightGBM

In [140]:
X6 = X6.drop(columns = ['datetime'])
y6 = y6.drop(columns = ['datetime'])
X6_test = X6_test.drop(columns = ['datetime'])
y6_test = y6_test.drop(columns = ['datetime'])

In [141]:
study6 = optuna.create_study(direction='maximize',sampler=TPESampler())
study6.optimize(lambda trial : objectiveLGBM(trial, X6, y6), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study6.best_trial.value,study6.best_trial.params))

[32m[I 2022-12-01 02:57:52,673][0m A new study created in memory with name: no-name-4ddcb0d6-211a-47e5-b7b6-98844d028b8d[0m
[32m[I 2022-12-01 02:58:00,065][0m Trial 0 finished with value: 1.0 and parameters: {'num_leaves': 125, 'n_estimators': 1200, 'feature_fraction': 0.8515731005160493, 'bagging_fraction': 0.9291283643848113, 'bagging_freq': 4, 'min_child_samples': 15}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-12-01 02:58:12,590][0m Trial 1 finished with value: 0.9914105594956659 and parameters: {'num_leaves': 118, 'n_estimators': 2125, 'feature_fraction': 0.45278468021467827, 'bagging_fraction': 0.9627977614767762, 'bagging_freq': 4, 'min_child_samples': 16}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-12-01 02:58:16,158][0m Trial 2 finished with value: 0.8888888888888888 and parameters: {'num_leaves': 193, 'n_estimators': 2411, 'feature_fraction': 0.4803020598805949, 'bagging_fraction': 0.49088540002708736, 'bagging_freq': 1, 'min_child_samples': 50}. Best is 

Best trial: score 1.0,
params {'num_leaves': 125, 'n_estimators': 1200, 'feature_fraction': 0.8515731005160493, 'bagging_fraction': 0.9291283643848113, 'bagging_freq': 4, 'min_child_samples': 15}


In [142]:
optuna.visualization.plot_param_importances(study6) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study6) # 최적화 과정 시각화

In [143]:
X6_train, X6_val, y6_train, y6_val = train_test_split(X6, y6, test_size = 0.2, random_state = 42)

In [144]:
X6_train.shape, X6_val.shape, y6_train.shape, y6_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [145]:
model = LGBMClassifier(**study6.best_trial.params)

In [146]:
model6 = model.fit(X6_train, y6_train,
          eval_set = [(X6_train, y6_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.651415	training's multi_logloss: 0.651415
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.563432	training's multi_logloss: 0.563432
[3]	training's multi_logloss: 0.487106	training's multi_logloss: 0.487106
[4]	training's multi_logloss: 0.425849	training's multi_logloss: 0.425849
[5]	training's multi_logloss: 0.375091	training's multi_logloss: 0.375091
[6]	training's multi_logloss: 0.333453	training's multi_logloss: 0.333453
[7]	training's multi_logloss: 0.301954	training's multi_logloss: 0.301954
[8]	training's multi_logloss: 0.27434	training's multi_logloss: 0.27434
[9]	training's multi_logloss: 0.245718	training's multi_logloss: 0.245718
[10]	training's multi_logloss: 0.219809	training's multi_logloss: 0.219809
[11]	training's multi_logloss: 0.197009	training's multi_logloss: 0.197009
[12]	training's multi_logloss: 0.179405	training's multi_logloss: 0.179405
[13]	training's multi_logloss: 0.161275	training

In [147]:
train6_preds = model6.predict(X6_train)
val6_preds = model6.predict(X6_val)

In [148]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [149]:
get_clf_eval(y6_train, train6_preds)
get_clf_eval(y6_val, val6_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [150]:
preds_6= model6.predict(X6_test)
preds_6

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2.,
       2., 2., 2., 1., 2., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 3., 2., 2., 2., 2., 1., 1., 1., 1., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 1., 1., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       2., 1., 1., 2., 1., 1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 3., 3., 2., 2., 2., 2., 1., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 2., 2.,
       2., 2., 2., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 3., 3.

In [151]:
test_jun['classification'] = preds_6
test_jun

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-06-01,3,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,2587,61.13,1.0,2022-06-01 00:00:00,2022,6,1,0
1,2022-06-01,3,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1815,61.20,1.0,2022-06-01 01:00:00,2022,6,1,1
2,2022-06-01,3,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1430,58.78,1.0,2022-06-01 02:00:00,2022,6,1,2
3,2022-06-01,3,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1179,56.84,1.0,2022-06-01 03:00:00,2022,6,1,3
4,2022-06-01,3,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1264,57.84,1.0,2022-06-01 04:00:00,2022,6,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-06-30,4,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,6417,21.77,3.0,2022-06-30 19:00:00,2022,6,30,19
716,2022-06-30,4,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,5751,27.31,2.0,2022-06-30 20:00:00,2022,6,30,20
717,2022-06-30,4,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,5006,48.32,2.0,2022-06-30 21:00:00,2022,6,30,21
718,2022-06-30,4,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3569,55.70,1.0,2022-06-30 22:00:00,2022,6,30,22


# 7월 데이터 머신러닝

## 데이터 가공

In [152]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [153]:
X7 = train_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [154]:
y7 = train_jul[['datetime', 'classification']]
X7_1 = X7.drop(columns = ['datetime', 'classification'])
y7_1 = X7.datetime

In [155]:
X7_1_scaler = scaler.fit_transform(X7_1)

In [156]:
X7_1_sc = pd.DataFrame(X7_1_scaler)
X7_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7 = pd.concat([y7_1, X7_1_sc], axis = 1)
X7

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-07-01 00:00:00,0.000000,0.224769,0.928647,0.0,0.0,0.0,0.000000
1,2019-07-01 01:00:00,0.000000,0.132124,0.921229,0.0,0.0,0.0,0.043478
2,2019-07-01 02:00:00,0.000000,0.083254,0.886436,0.0,0.0,0.0,0.086957
3,2019-07-01 03:00:00,0.000000,0.056511,0.909219,0.0,0.0,0.0,0.130435
4,2019-07-01 04:00:00,0.000000,0.098695,0.930590,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-07-31 19:00:00,0.833333,0.545049,0.826740,1.0,0.0,1.0,0.826087
2228,2021-07-31 20:00:00,0.833333,0.440942,0.841575,1.0,0.0,1.0,0.869565
2229,2021-07-31 21:00:00,0.833333,0.458294,0.837866,1.0,0.0,1.0,0.913043
2230,2021-07-31 22:00:00,0.833333,0.439828,0.839986,1.0,0.0,1.0,0.956522


In [157]:
X7_test = test_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [158]:
y7_test = test_jul[['datetime', 'classification']]
X7_1_test = X7_test.drop(columns = ['datetime', 'classification'])
y7_1_test = X7_test.datetime

In [159]:
X7_1_test_scaler = scaler.fit_transform(X7_1_test)

In [160]:
X7_1_test_sc = pd.DataFrame(X7_1_test_scaler)
X7_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7_test = pd.concat([y7_1_test, X7_1_test_sc], axis = 1)
X7_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-07-01 00:00:00,0.666667,0.197414,0.882118,0.0,0.0,0.0,0.000000
1,2022-07-01 01:00:00,0.666667,0.147414,0.857542,0.0,0.0,0.0,0.043478
2,2022-07-01 02:00:00,0.666667,0.087931,0.846553,0.0,0.0,0.0,0.086957
3,2022-07-01 03:00:00,0.666667,0.060862,0.817982,0.0,0.0,0.0,0.130435
4,2022-07-01 04:00:00,0.666667,0.088966,0.850150,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-07-31 19:00:00,1.000000,0.681207,0.761638,0.0,0.0,1.0,0.826087
740,2022-07-31 20:00:00,1.000000,0.634828,0.669930,0.0,0.0,1.0,0.869565
741,2022-07-31 21:00:00,1.000000,0.630517,0.677722,0.0,0.0,1.0,0.913043
742,2022-07-31 22:00:00,1.000000,0.551379,0.685315,0.0,0.0,1.0,0.956522


## LightGBM

In [161]:
X7 = X7.drop(columns = ['datetime'])
y7 = y7.drop(columns = ['datetime'])
X7_test = X7_test.drop(columns = ['datetime'])
y7_test = y7_test.drop(columns = ['datetime'])

In [162]:
study7 = optuna.create_study(direction='maximize',sampler=TPESampler())
study7.optimize(lambda trial : objectiveLGBM(trial, X7, y7), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study7.best_trial.value,study7.best_trial.params))

[32m[I 2022-12-01 02:59:23,404][0m A new study created in memory with name: no-name-573bd964-e527-431d-950b-641649d27c9a[0m
[32m[I 2022-12-01 02:59:31,250][0m Trial 0 finished with value: 1.0 and parameters: {'num_leaves': 153, 'n_estimators': 2766, 'feature_fraction': 0.9752180482771103, 'bagging_fraction': 0.5402277489340669, 'bagging_freq': 7, 'min_child_samples': 9}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-12-01 02:59:33,813][0m Trial 1 finished with value: 0.9169773800646284 and parameters: {'num_leaves': 309, 'n_estimators': 1477, 'feature_fraction': 0.6347590178215078, 'bagging_fraction': 0.5887141218894618, 'bagging_freq': 7, 'min_child_samples': 56}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-12-01 02:59:38,186][0m Trial 2 finished with value: 0.7281836631151699 and parameters: {'num_leaves': 292, 'n_estimators': 2515, 'feature_fraction': 0.702878471643956, 'bagging_fraction': 0.8900830544770595, 'bagging_freq': 3, 'min_child_samples': 93}. Best is tria

Best trial: score 1.0,
params {'num_leaves': 153, 'n_estimators': 2766, 'feature_fraction': 0.9752180482771103, 'bagging_fraction': 0.5402277489340669, 'bagging_freq': 7, 'min_child_samples': 9}


In [163]:
optuna.visualization.plot_param_importances(study7) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study7) # 최적화 과정 시각화

In [164]:
X7_train, X7_val, y7_train, y7_val = train_test_split(X7, y7, test_size = 0.2, random_state = 42)

In [165]:
X7_train.shape, X7_val.shape, y7_train.shape, y7_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [166]:
model = LGBMClassifier(**study7.best_trial.params)

In [167]:
model7 = model.fit(X7_train, y7_train,
          eval_set = [(X7_train, y7_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.63098	training's multi_logloss: 0.63098
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.547364	training's multi_logloss: 0.547364
[3]	training's multi_logloss: 0.472166	training's multi_logloss: 0.472166
[4]	training's multi_logloss: 0.413201	training's multi_logloss: 0.413201
[5]	training's multi_logloss: 0.364064	training's multi_logloss: 0.364064
[6]	training's multi_logloss: 0.323789	training's multi_logloss: 0.323789
[7]	training's multi_logloss: 0.293508	training's multi_logloss: 0.293508
[8]	training's multi_logloss: 0.26695	training's multi_logloss: 0.26695
[9]	training's multi_logloss: 0.239023	training's multi_logloss: 0.239023
[10]	training's multi_logloss: 0.213687	training's multi_logloss: 0.213687
[11]	training's multi_logloss: 0.191369	training's multi_logloss: 0.191369
[12]	training's multi_logloss: 0.174851	training's multi_logloss: 0.174851
[13]	training's multi_logloss: 0.157043	training's

In [168]:
train7_preds = model7.predict(X7_train)
val7_preds = model7.predict(X7_val)

In [169]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [170]:
get_clf_eval(y7_train, train7_preds)
get_clf_eval(y7_val, val7_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [171]:
preds_7= model7.predict(X7_test)
preds_7

array([1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 3., 3., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 3., 3., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 3., 2., 1., 2., 2., 2., 2., 1., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 2., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [172]:
test_jul['classification'] = preds_7
test_jul

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-07-01,5,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1714,59.49,1.0,2022-07-01 00:00:00,2022,7,1,0
1,2022-07-01,5,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1424,58.26,1.0,2022-07-01 01:00:00,2022,7,1,1
2,2022-07-01,5,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1079,57.71,1.0,2022-07-01 02:00:00,2022,7,1,2
3,2022-07-01,5,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,922,56.28,1.0,2022-07-01 03:00:00,2022,7,1,3
4,2022-07-01,5,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1085,57.89,1.0,2022-07-01 04:00:00,2022,7,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-07-31,7,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4520,53.46,2.0,2022-07-31 19:00:00,2022,7,31,19
740,2022-07-31,7,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4251,48.87,2.0,2022-07-31 20:00:00,2022,7,31,20
741,2022-07-31,7,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4226,49.26,2.0,2022-07-31 21:00:00,2022,7,31,21
742,2022-07-31,7,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3767,49.64,2.0,2022-07-31 22:00:00,2022,7,31,22


# 8월 데이터 머신러닝

## 데이터 가공

In [173]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [174]:
X8 = train_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [175]:
y8 = train_aug[['datetime', 'classification']]
X8_1 = X8.drop(columns = ['datetime', 'classification'])
y8_1 = X8.datetime

In [176]:
X8_1_scaler = scaler.fit_transform(X8_1)

In [177]:
X8_1_sc = pd.DataFrame(X8_1_scaler)
X8_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8 = pd.concat([y8_1, X8_1_sc], axis = 1)
X8

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-08-01 00:00:00,0.500000,0.329257,0.959933,0.0,0.0,0.0,0.000000
1,2019-08-01 01:00:00,0.500000,0.234450,0.893466,0.0,0.0,0.0,0.043478
2,2019-08-01 02:00:00,0.500000,0.192180,0.898895,0.0,0.0,0.0,0.086957
3,2019-08-01 03:00:00,0.500000,0.147796,0.894963,0.0,0.0,0.0,0.130435
4,2019-08-01 04:00:00,0.500000,0.110507,0.887287,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-08-31 19:00:00,0.166667,0.727053,0.575922,1.0,0.0,1.0,0.826087
2228,2021-08-31 20:00:00,0.166667,0.657156,0.621794,1.0,0.0,1.0,0.869565
2229,2021-08-31 21:00:00,0.166667,0.637983,0.659053,1.0,0.0,1.0,0.913043
2230,2021-08-31 22:00:00,0.166667,0.441123,0.729077,1.0,0.0,1.0,0.956522


In [178]:
X8_test = test_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [179]:
y8_test = test_aug[['datetime', 'classification']]
X8_1_test = X8_test.drop(columns = ['datetime', 'classification'])
y8_1_test = X8_test.datetime

In [180]:
X8_1_test_scaler = scaler.fit_transform(X8_1_test)

In [181]:
X8_1_test_sc = pd.DataFrame(X8_1_test_scaler)
X8_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8_test = pd.concat([y8_1_test, X8_1_test_sc], axis = 1)
X8_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-08-01 00:00:00,0.000000,0.218750,0.862318,0.0,0.0,0.0,0.000000
1,2022-08-01 01:00:00,0.000000,0.057910,0.865408,0.0,0.0,0.0,0.043478
2,2022-08-01 02:00:00,0.000000,0.042903,0.906953,0.0,0.0,0.0,0.086957
3,2022-08-01 03:00:00,0.000000,0.016243,0.862318,0.0,0.0,0.0,0.130435
4,2022-08-01 04:00:00,0.000000,0.058792,0.808584,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-08-31 19:00:00,0.333333,0.798905,0.616824,0.0,0.0,1.0,0.826087
740,2022-08-31 20:00:00,0.333333,0.756532,0.747468,0.0,0.0,1.0,0.869565
741,2022-08-31 21:00:00,0.333333,0.728460,0.799485,0.0,0.0,1.0,0.913043
742,2022-08-31 22:00:00,0.333333,0.640890,0.846009,0.0,0.0,1.0,0.956522


## LightGBM

In [182]:
X8 = X8.drop(columns = ['datetime'])
y8 = y8.drop(columns = ['datetime'])
X8_test = X8_test.drop(columns = ['datetime'])
y8_test = y8_test.drop(columns = ['datetime'])

In [183]:
study8 = optuna.create_study(direction='maximize',sampler=TPESampler())
study8.optimize(lambda trial : objectiveLGBM(trial, X8, y8), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study8.best_trial.value,study8.best_trial.params))

[32m[I 2022-12-01 03:00:49,142][0m A new study created in memory with name: no-name-b18439bb-ba2e-43e3-b1e3-5bb9a3983384[0m
[32m[I 2022-12-01 03:00:53,081][0m Trial 0 finished with value: 0.9334069168506255 and parameters: {'num_leaves': 510, 'n_estimators': 1933, 'feature_fraction': 0.9113382047410976, 'bagging_fraction': 0.6171413599810309, 'bagging_freq': 2, 'min_child_samples': 49}. Best is trial 0 with value: 0.9334069168506255.[0m
[32m[I 2022-12-01 03:00:55,067][0m Trial 1 finished with value: 0.9853330839490009 and parameters: {'num_leaves': 133, 'n_estimators': 1593, 'feature_fraction': 0.7961926393170367, 'bagging_fraction': 0.48566043881122967, 'bagging_freq': 5, 'min_child_samples': 83}. Best is trial 1 with value: 0.9853330839490009.[0m
[32m[I 2022-12-01 03:00:57,020][0m Trial 2 finished with value: 0.8680555555555555 and parameters: {'num_leaves': 446, 'n_estimators': 1438, 'feature_fraction': 0.4633172938255464, 'bagging_fraction': 0.5195662797909983, 'bagging_

Best trial: score 1.0,
params {'num_leaves': 252, 'n_estimators': 2136, 'feature_fraction': 0.7743422108862248, 'bagging_fraction': 0.40340941175517736, 'bagging_freq': 7, 'min_child_samples': 6}


In [184]:
optuna.visualization.plot_param_importances(study8) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study8) # 최적화 과정 시각화

In [185]:
X8_train, X8_val, y8_train, y8_val = train_test_split(X8, y8, test_size = 0.2, random_state = 42)

In [186]:
X8_train.shape, X8_val.shape, y8_train.shape, y8_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [187]:
model = LGBMClassifier(**study8.best_trial.params)

In [188]:
model8 = model.fit(X8_train, y8_train,
          eval_set = [(X8_train, y8_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.61766	training's multi_logloss: 0.61766
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.53906	training's multi_logloss: 0.53906
[3]	training's multi_logloss: 0.466984	training's multi_logloss: 0.466984
[4]	training's multi_logloss: 0.410113	training's multi_logloss: 0.410113
[5]	training's multi_logloss: 0.362184	training's multi_logloss: 0.362184
[6]	training's multi_logloss: 0.322935	training's multi_logloss: 0.322935
[7]	training's multi_logloss: 0.29422	training's multi_logloss: 0.29422
[8]	training's multi_logloss: 0.26981	training's multi_logloss: 0.26981
[9]	training's multi_logloss: 0.241747	training's multi_logloss: 0.241747
[10]	training's multi_logloss: 0.221785	training's multi_logloss: 0.221785
[11]	training's multi_logloss: 0.198791	training's multi_logloss: 0.198791
[12]	training's multi_logloss: 0.182271	training's multi_logloss: 0.182271
[13]	training's multi_logloss: 0.167234	training's mul

In [189]:
train8_preds = model8.predict(X8_train)
val8_preds = model8.predict(X8_val)

In [190]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [191]:
get_clf_eval(y8_train, train8_preds)
get_clf_eval(y8_val, val8_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9988
재현율: 0.9978


In [192]:
preds_8= model8.predict(X8_test)
preds_8

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 1., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 1., 2., 2., 1., 2., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 3., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 2., 2., 3., 3., 3., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 3., 3.

In [193]:
test_aug['classification'] = preds_8
test_aug

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-08-01,1,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1836,56.27,1.0,2022-08-01 00:00:00,2022,8,1,0
1,2022-08-01,1,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,925,56.45,1.0,2022-08-01 01:00:00,2022,8,1,1
2,2022-08-01,1,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,840,58.87,1.0,2022-08-01 02:00:00,2022,8,1,2
3,2022-08-01,1,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,689,56.27,1.0,2022-08-01 03:00:00,2022,8,1,3
4,2022-08-01,1,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,930,53.14,1.0,2022-08-01 04:00:00,2022,8,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-08-31,3,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,5122,41.97,2.0,2022-08-31 19:00:00,2022,8,31,19
740,2022-08-31,3,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4882,49.58,2.0,2022-08-31 20:00:00,2022,8,31,20
741,2022-08-31,3,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4723,52.61,1.0,2022-08-31 21:00:00,2022,8,31,21
742,2022-08-31,3,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4227,55.32,1.0,2022-08-31 22:00:00,2022,8,31,22


# 9월 데이터 머신러닝

## 데이터 가공

In [194]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [195]:
X9 = train_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [196]:
y9 = train_sep[['datetime', 'classification']]
X9_1 = X9.drop(columns = ['datetime', 'classification'])
y9_1 = X9.datetime

In [197]:
X9_1_scaler = scaler.fit_transform(X9_1)

In [198]:
X9_1_sc = pd.DataFrame(X9_1_scaler)
X9_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9 = pd.concat([y9_1, X9_1_sc], axis = 1)
X9

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-09-01 00:00:00,1.0,0.459187,0.961768,0.0,0.0,0.0,0.000000
1,2019-09-01 01:00:00,1.0,0.310145,0.971093,0.0,0.0,0.0,0.043478
2,2019-09-01 02:00:00,1.0,0.230502,0.964938,0.0,0.0,0.0,0.086957
3,2019-09-01 03:00:00,1.0,0.177462,0.917755,0.0,0.0,0.0,0.130435
4,2019-09-01 04:00:00,1.0,0.135658,0.903394,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-09-30 19:00:00,0.5,0.674653,0.659828,1.0,0.0,1.0,0.826087
2156,2021-09-30 20:00:00,0.5,0.652016,0.690601,1.0,0.0,1.0,0.869565
2157,2021-09-30 21:00:00,0.5,0.658790,0.756248,1.0,0.0,1.0,0.913043
2158,2021-09-30 22:00:00,0.5,0.571381,0.802313,1.0,0.0,1.0,0.956522


In [199]:
X9_test = test_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [200]:
y9_test = test_sep[['datetime', 'classification']]
X9_1_test = X9_test.drop(columns = ['datetime', 'classification'])
y9_1_test = X9_test.datetime

In [201]:
X9_1_test_scaler = scaler.fit_transform(X9_1_test)

In [202]:
X9_1_test_sc = pd.DataFrame(X9_1_test_scaler)
X9_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9_test = pd.concat([y9_1_test, X9_1_test_sc], axis = 1)
X9_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-09-01 00:00:00,0.500000,0.271607,0.963984,0.0,0.0,0.0,0.000000
1,2022-09-01 01:00:00,0.500000,0.145987,0.950401,0.0,0.0,0.0,0.043478
2,2022-09-01 02:00:00,0.500000,0.104056,0.947520,0.0,0.0,0.0,0.086957
3,2022-09-01 03:00:00,0.500000,0.060414,0.942581,0.0,0.0,0.0,0.130435
4,2022-09-01 04:00:00,0.500000,0.091220,0.912122,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-09-30 19:00:00,0.666667,0.842204,0.347602,0.0,0.0,1.0,0.826087
716,2022-09-30 20:00:00,0.666667,0.740715,0.692941,0.0,0.0,1.0,0.869565
717,2022-09-30 21:00:00,0.666667,0.712134,0.800782,0.0,0.0,1.0,0.913043
718,2022-09-30 22:00:00,0.666667,0.608420,0.487549,0.0,0.0,1.0,0.956522


## LightGBM

In [203]:
X9 = X9.drop(columns = ['datetime'])
y9 = y9.drop(columns = ['datetime'])
X9_test = X9_test.drop(columns = ['datetime'])
y9_test = y9_test.drop(columns = ['datetime'])

In [204]:
study9 = optuna.create_study(direction='maximize',sampler=TPESampler())
study9.optimize(lambda trial : objectiveLGBM(trial, X9, y9), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study9.best_trial.value,study9.best_trial.params))

[32m[I 2022-12-01 03:02:14,332][0m A new study created in memory with name: no-name-1eae2a98-1fd1-48b9-a2fe-5d5d2cd8d856[0m
[32m[I 2022-12-01 03:02:19,548][0m Trial 0 finished with value: 0.9649122807017544 and parameters: {'num_leaves': 359, 'n_estimators': 2564, 'feature_fraction': 0.8533277920921385, 'bagging_fraction': 0.8690380106614539, 'bagging_freq': 7, 'min_child_samples': 62}. Best is trial 0 with value: 0.9649122807017544.[0m
[32m[I 2022-12-01 03:02:31,148][0m Trial 1 finished with value: 1.0 and parameters: {'num_leaves': 500, 'n_estimators': 1020, 'feature_fraction': 0.670468444131839, 'bagging_fraction': 0.9580660229199991, 'bagging_freq': 2, 'min_child_samples': 8}. Best is trial 1 with value: 1.0.[0m
[32m[I 2022-12-01 03:02:34,520][0m Trial 2 finished with value: 0.9067901234567901 and parameters: {'num_leaves': 107, 'n_estimators': 1909, 'feature_fraction': 0.6853175505572235, 'bagging_fraction': 0.7690761018573167, 'bagging_freq': 6, 'min_child_samples': 77

Best trial: score 1.0,
params {'num_leaves': 500, 'n_estimators': 1020, 'feature_fraction': 0.670468444131839, 'bagging_fraction': 0.9580660229199991, 'bagging_freq': 2, 'min_child_samples': 8}


In [205]:
optuna.visualization.plot_param_importances(study9) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study9) # 최적화 과정 시각화

In [206]:
X9_train, X9_val, y9_train, y9_val = train_test_split(X9, y9, test_size = 0.2, random_state = 42)

In [207]:
X9_train.shape, X9_val.shape, y9_train.shape, y9_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [208]:
model = LGBMClassifier(**study9.best_trial.params)

In [209]:
model9 = model.fit(X9_train, y9_train,
          eval_set = [(X9_train, y9_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.639829	training's multi_logloss: 0.639829
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.552578	training's multi_logloss: 0.552578
[3]	training's multi_logloss: 0.47919	training's multi_logloss: 0.47919
[4]	training's multi_logloss: 0.419691	training's multi_logloss: 0.419691
[5]	training's multi_logloss: 0.369889	training's multi_logloss: 0.369889
[6]	training's multi_logloss: 0.328442	training's multi_logloss: 0.328442
[7]	training's multi_logloss: 0.296686	training's multi_logloss: 0.296686
[8]	training's multi_logloss: 0.269309	training's multi_logloss: 0.269309
[9]	training's multi_logloss: 0.240877	training's multi_logloss: 0.240877
[10]	training's multi_logloss: 0.219142	training's multi_logloss: 0.219142
[11]	training's multi_logloss: 0.196265	training's multi_logloss: 0.196265
[12]	training's multi_logloss: 0.178436	training's multi_logloss: 0.178436
[13]	training's multi_logloss: 0.162515	training

In [210]:
train9_preds = model9.predict(X9_train)
val9_preds = model9.predict(X9_val)

In [211]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [212]:
get_clf_eval(y9_train, train9_preds)
get_clf_eval(y9_val, val9_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [213]:
preds_9= model9.predict(X9_test)
preds_9

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 1., 1., 2.,
       2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 3., 2., 2., 2., 2., 2., 1., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 3., 3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       3., 2., 2., 2., 2., 3., 2., 1., 1., 2., 2., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [214]:
test_sep['classification'] = preds_9
test_sep

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-09-01,4,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,2234,59.59,1.0,2022-09-01 00:00:00,2022,9,1,0
1,2022-09-01,4,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1500,58.93,1.0,2022-09-01 01:00:00,2022,9,1,1
2,2022-09-01,4,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1255,58.79,1.0,2022-09-01 02:00:00,2022,9,1,2
3,2022-09-01,4,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1000,58.55,1.0,2022-09-01 03:00:00,2022,9,1,3
4,2022-09-01,4,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1180,57.07,1.0,2022-09-01 04:00:00,2022,9,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-09-30,5,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,5568,29.64,2.0,2022-09-30 19:00:00,2022,9,30,19
716,2022-09-30,5,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4975,46.42,2.0,2022-09-30 20:00:00,2022,9,30,20
717,2022-09-30,5,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4808,51.66,1.0,2022-09-30 21:00:00,2022,9,30,21
718,2022-09-30,5,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4202,36.44,2.0,2022-09-30 22:00:00,2022,9,30,22


# 10월 데이터 머신러닝

## 데이터 가공

In [215]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [216]:
X10 = train_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [217]:
y10 = train_oct[['datetime', 'classification']]
X10_1 = X10.drop(columns = ['datetime', 'classification'])
y10_1 = X10.datetime

In [218]:
X10_1_scaler = scaler.fit_transform(X10_1)

In [219]:
X10_1_sc = pd.DataFrame(X10_1_scaler)
X10_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10 = pd.concat([y10_1, X10_1_sc], axis = 1)
X10

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-10-01 00:00:00,0.166667,0.362345,0.970887,0.0,0.0,0.0,0.000000
1,2019-10-01 01:00:00,0.166667,0.224190,0.924385,0.0,0.0,0.0,0.043478
2,2019-10-01 02:00:00,0.166667,0.159706,0.880422,0.0,0.0,0.0,0.086957
3,2019-10-01 03:00:00,0.166667,0.102907,0.880031,0.0,0.0,0.0,0.130435
4,2019-10-01 04:00:00,0.166667,0.136151,0.901915,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-10-31 19:00:00,0.833333,0.782325,0.705354,1.0,0.0,1.0,0.826087
1484,2020-10-31 20:00:00,0.833333,0.675409,0.778038,1.0,0.0,1.0,0.869565
1485,2020-10-31 21:00:00,0.833333,0.681757,0.814185,1.0,0.0,1.0,0.913043
1486,2020-10-31 22:00:00,0.833333,0.733044,0.523251,1.0,0.0,1.0,0.956522


In [220]:
X10_test = test_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [221]:
y10_test = test_oct[['datetime', 'classification']]
X10_1_test = X10_test.drop(columns = ['datetime', 'classification'])
y10_1_test = X10_test.datetime

In [222]:
X10_1_test_scaler = scaler.fit_transform(X10_1_test)

In [223]:
X10_1_test_sc = pd.DataFrame(X10_1_test_scaler)
X10_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10_test = pd.concat([y10_1_test, X10_1_test_sc], axis = 1)
X10_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-10-01 00:00:00,0.666667,0.196008,0.944902,0.0,0.0,0.0,0.000000
1,2021-10-01 01:00:00,0.666667,0.107727,0.911125,0.0,0.0,0.0,0.043478
2,2021-10-01 02:00:00,0.666667,0.051110,0.916192,0.0,0.0,0.0,0.086957
3,2021-10-01 03:00:00,0.666667,0.031836,0.932024,0.0,0.0,0.0,0.130435
4,2021-10-01 04:00:00,0.666667,0.056961,0.914925,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-10-31 19:00:00,1.000000,0.631217,0.855183,0.0,0.0,1.0,0.826087
740,2021-10-31 20:00:00,1.000000,0.671313,0.775174,0.0,0.0,1.0,0.869565
741,2021-10-31 21:00:00,1.000000,0.618998,0.809584,0.0,0.0,1.0,0.913043
742,2021-10-31 22:00:00,1.000000,0.781105,0.832383,0.0,0.0,1.0,0.956522


## LightGBM

In [224]:
X10 = X10.drop(columns = ['datetime'])
y10 = y10.drop(columns = ['datetime'])
X10_test = X10_test.drop(columns = ['datetime'])
y10_test = y10_test.drop(columns = ['datetime'])

In [225]:
study10 = optuna.create_study(direction='maximize',sampler=TPESampler())
study10.optimize(lambda trial : objectiveLGBM(trial, X10, y10), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study10.best_trial.value,study10.best_trial.params))

[32m[I 2022-12-01 03:04:27,072][0m A new study created in memory with name: no-name-a8884775-46d4-48cf-8a42-b9619f5953da[0m
[32m[I 2022-12-01 03:04:30,356][0m Trial 0 finished with value: 0.9850187265917603 and parameters: {'num_leaves': 507, 'n_estimators': 2939, 'feature_fraction': 0.7716760223835386, 'bagging_fraction': 0.7724885122017178, 'bagging_freq': 3, 'min_child_samples': 94}. Best is trial 0 with value: 0.9850187265917603.[0m
[32m[I 2022-12-01 03:04:31,546][0m Trial 1 finished with value: 0.9036620890553474 and parameters: {'num_leaves': 317, 'n_estimators': 1464, 'feature_fraction': 0.9199375271029108, 'bagging_fraction': 0.5054853263637675, 'bagging_freq': 3, 'min_child_samples': 89}. Best is trial 0 with value: 0.9850187265917603.[0m
[32m[I 2022-12-01 03:04:34,975][0m Trial 2 finished with value: 0.9964539007092199 and parameters: {'num_leaves': 271, 'n_estimators': 1745, 'feature_fraction': 0.8861077815969529, 'bagging_fraction': 0.9721092084528694, 'bagging_f

Best trial: score 1.0,
params {'num_leaves': 419, 'n_estimators': 762, 'feature_fraction': 0.694532801921868, 'bagging_fraction': 0.8666806760107508, 'bagging_freq': 3, 'min_child_samples': 8}


In [226]:
optuna.visualization.plot_param_importances(study10) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study10) # 최적화 과정 시각화

In [227]:
X10_train, X10_val, y10_train, y10_val = train_test_split(X10, y10, test_size = 0.2, random_state = 42)

In [228]:
X10_train.shape, X10_val.shape, y10_train.shape, y10_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [229]:
model = LGBMClassifier(**study10.best_trial.params)

In [230]:
model10 = model.fit(X10_train, y10_train,
          eval_set = [(X10_train, y10_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.651059	training's multi_logloss: 0.651059
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.563863	training's multi_logloss: 0.563863
[3]	training's multi_logloss: 0.488743	training's multi_logloss: 0.488743
[4]	training's multi_logloss: 0.428096	training's multi_logloss: 0.428096
[5]	training's multi_logloss: 0.377665	training's multi_logloss: 0.377665
[6]	training's multi_logloss: 0.335737	training's multi_logloss: 0.335737
[7]	training's multi_logloss: 0.304173	training's multi_logloss: 0.304173
[8]	training's multi_logloss: 0.277129	training's multi_logloss: 0.277129
[9]	training's multi_logloss: 0.248129	training's multi_logloss: 0.248129
[10]	training's multi_logloss: 0.226425	training's multi_logloss: 0.226425
[11]	training's multi_logloss: 0.203036	training's multi_logloss: 0.203036
[12]	training's multi_logloss: 0.185629	training's multi_logloss: 0.185629
[13]	training's multi_logloss: 0.169876	traini

In [231]:
train10_preds = model10.predict(X10_train)
val10_preds = model10.predict(X10_val)

In [232]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [233]:
get_clf_eval(y10_train, train10_preds)
get_clf_eval(y10_val, val10_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [234]:
preds_10= model10.predict(X10_test)
preds_10

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 2., 1., 2., 2., 1., 1.,
       2., 2., 2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 1., 1., 1., 1., 2., 2., 1., 1., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 3., 3., 2., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 2., 2., 2., 2., 1., 1.,
       1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       3., 2., 2., 2., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 3., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 2., 1., 1., 1., 1., 1., 1.,
       1., 2., 1., 1., 1.

In [235]:
test_oct['classification'] = preds_10
test_oct

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1513,60.09,1.0,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1000,58.49,1.0,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,671,58.73,1.0,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,559,59.48,1.0,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,705,58.67,1.0,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-10-31,7,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4042,55.84,1.0,2021-10-31 19:00:00,2021,10,31,19
740,2021-10-31,7,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4275,52.05,1.0,2021-10-31 20:00:00,2021,10,31,20
741,2021-10-31,7,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3971,53.68,1.0,2021-10-31 21:00:00,2021,10,31,21
742,2021-10-31,7,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4913,54.76,1.0,2021-10-31 22:00:00,2021,10,31,22


# 11월 데이터 머신러닝

## 데이터 가공

In [236]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [237]:
X11 = train_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [238]:
y11 = train_nov[['datetime', 'classification']]
X11_1 = X11.drop(columns = ['datetime', 'classification'])
y11_1 = X11.datetime

In [239]:
X11_1_scaler = scaler.fit_transform(X11_1)

In [240]:
X11_1_sc = pd.DataFrame(X11_1_scaler)
X11_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11 = pd.concat([y11_1, X11_1_sc], axis = 1)
X11

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-11-01 00:00:00,0.666667,0.395630,0.877049,0.0,0.0,0.0,0.000000
1,2019-11-01 01:00:00,0.666667,0.283025,0.849281,0.0,0.0,0.0,0.043478
2,2019-11-01 02:00:00,0.666667,0.266723,0.861994,0.0,0.0,0.0,0.086957
3,2019-11-01 03:00:00,0.666667,0.191261,0.853797,0.0,0.0,0.0,0.130435
4,2019-11-01 04:00:00,0.666667,0.169244,0.806457,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1435,2020-11-30 19:00:00,0.000000,0.702017,0.618936,1.0,0.0,1.0,0.826087
1436,2020-11-30 20:00:00,0.000000,0.584370,0.699063,1.0,0.0,1.0,0.869565
1437,2020-11-30 21:00:00,0.000000,0.694286,0.747407,1.0,0.0,1.0,0.913043
1438,2020-11-30 22:00:00,0.000000,0.451261,0.784543,1.0,0.0,1.0,0.956522


In [241]:
X11_test = test_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [242]:
y11_test = test_nov[['datetime', 'classification']]
X11_1_test = X11_test.drop(columns = ['datetime', 'classification'])
y11_1_test = X11_test.datetime

In [243]:
X11_1_test_scaler = scaler.fit_transform(X11_1_test)

In [244]:
X11_1_test_sc = pd.DataFrame(X11_1_test_scaler)
X11_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11_test = pd.concat([y11_1_test, X11_1_test_sc], axis = 1)
X11_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-11-01 00:00:00,0.000000,0.163076,0.883652,0.0,0.0,0.0,0.000000
1,2021-11-01 01:00:00,0.000000,0.077078,0.901486,0.0,0.0,0.0,0.043478
2,2021-11-01 02:00:00,0.000000,0.048973,0.915924,0.0,0.0,0.0,0.086957
3,2021-11-01 03:00:00,0.000000,0.033322,0.875796,0.0,0.0,0.0,0.130435
4,2021-11-01 04:00:00,0.000000,0.073039,0.887049,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2021-11-30 19:00:00,0.166667,0.750084,0.727813,0.0,0.0,1.0,0.826087
716,2021-11-30 20:00:00,0.166667,0.645910,0.846921,0.0,0.0,1.0,0.869565
717,2021-11-30 21:00:00,0.166667,0.719286,0.861571,0.0,0.0,1.0,0.913043
718,2021-11-30 22:00:00,0.166667,0.473410,0.908917,0.0,0.0,1.0,0.956522


## LightGBM

In [245]:
X11 = X11.drop(columns = ['datetime'])
y11 = y11.drop(columns = ['datetime'])
X11_test = X11_test.drop(columns = ['datetime'])
y11_test = y11_test.drop(columns = ['datetime'])

In [246]:
study11 = optuna.create_study(direction='maximize',sampler=TPESampler())
study11.optimize(lambda trial : objectiveLGBM(trial, X11, y11), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study11.best_trial.value,study11.best_trial.params))

[32m[I 2022-12-01 03:05:45,071][0m A new study created in memory with name: no-name-475138f1-f2d0-4c9c-9c2f-edddc9ff4688[0m
[32m[I 2022-12-01 03:05:49,314][0m Trial 0 finished with value: 0.9904761904761905 and parameters: {'num_leaves': 445, 'n_estimators': 2678, 'feature_fraction': 0.48189468048917794, 'bagging_fraction': 0.9637287018310157, 'bagging_freq': 5, 'min_child_samples': 51}. Best is trial 0 with value: 0.9904761904761905.[0m
[32m[I 2022-12-01 03:05:51,798][0m Trial 1 finished with value: 0.9929824561403509 and parameters: {'num_leaves': 202, 'n_estimators': 1525, 'feature_fraction': 0.9404873880832906, 'bagging_fraction': 0.7702642655636422, 'bagging_freq': 5, 'min_child_samples': 43}. Best is trial 1 with value: 0.9929824561403509.[0m
[32m[I 2022-12-01 03:05:54,150][0m Trial 2 finished with value: 0.9384920634920636 and parameters: {'num_leaves': 234, 'n_estimators': 2294, 'feature_fraction': 0.7102755546108693, 'bagging_fraction': 0.8327029305149367, 'bagging_

Best trial: score 1.0,
params {'num_leaves': 383, 'n_estimators': 1351, 'feature_fraction': 0.6676684354797257, 'bagging_fraction': 0.6448754387727802, 'bagging_freq': 7, 'min_child_samples': 23}


In [247]:
optuna.visualization.plot_param_importances(study11) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study11) # 최적화 과정 시각화

In [248]:
X11_train, X11_val, y11_train, y11_val = train_test_split(X11, y11, test_size = 0.2, random_state = 42)

In [249]:
X11_train.shape, X11_val.shape, y11_train.shape, y11_val.shape

((1152, 7), (288, 7), (1152, 1), (288, 1))

In [250]:
model = LGBMClassifier(**study11.best_trial.params)

In [251]:
model11 = model.fit(X11_train, y11_train,
          eval_set = [(X11_train, y11_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.662388	training's multi_logloss: 0.662388
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.576016	training's multi_logloss: 0.576016
[3]	training's multi_logloss: 0.501178	training's multi_logloss: 0.501178
[4]	training's multi_logloss: 0.440051	training's multi_logloss: 0.440051
[5]	training's multi_logloss: 0.388713	training's multi_logloss: 0.388713
[6]	training's multi_logloss: 0.345939	training's multi_logloss: 0.345939
[7]	training's multi_logloss: 0.315116	training's multi_logloss: 0.315116
[8]	training's multi_logloss: 0.288866	training's multi_logloss: 0.288866
[9]	training's multi_logloss: 0.258889	training's multi_logloss: 0.258889
[10]	training's multi_logloss: 0.237587	training's multi_logloss: 0.237587
[11]	training's multi_logloss: 0.213163	training's multi_logloss: 0.213163
[12]	training's multi_logloss: 0.195574	training's multi_logloss: 0.195574
[13]	training's multi_logloss: 0.179286	traini

In [252]:
train11_preds = model11.predict(X11_train)
val11_preds = model11.predict(X11_val)

In [253]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [254]:
get_clf_eval(y11_train, train11_preds)
get_clf_eval(y11_val, val11_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9964
재현율: 0.9444


In [255]:
preds_11= model11.predict(X11_test)
preds_11

array([1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 1., 2., 1., 1., 1., 1., 1.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3.,
       2., 2., 1., 1., 1., 1., 2., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 3., 3., 2., 2., 1., 2., 2., 1., 1., 1., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 2., 2., 1.,
       1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2., 2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 1., 2., 1., 1., 1., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 2., 2.,
       2., 2., 2., 1., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 3., 3.

In [256]:
test_nov['classification'] = preds_11
test_nov

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-11-01,1,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1439,56.77,1.0,2021-11-01 00:00:00,2021,11,1,0
1,2021-11-01,1,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,928,57.61,1.0,2021-11-01 01:00:00,2021,11,1,1
2,2021-11-01,1,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,761,58.29,1.0,2021-11-01 02:00:00,2021,11,1,2
3,2021-11-01,1,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,668,56.40,1.0,2021-11-01 03:00:00,2021,11,1,3
4,2021-11-01,1,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,904,56.93,1.0,2021-11-01 04:00:00,2021,11,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2021-11-30,2,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4927,49.43,1.0,2021-11-30 19:00:00,2021,11,30,19
716,2021-11-30,2,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4308,55.04,1.0,2021-11-30 20:00:00,2021,11,30,20
717,2021-11-30,2,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4744,55.73,1.0,2021-11-30 21:00:00,2021,11,30,21
718,2021-11-30,2,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3283,57.96,1.0,2021-11-30 22:00:00,2021,11,30,22


# 12월 데이터 머신러닝

## 데이터 가공

In [257]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [258]:
X12 = train_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [259]:
y12 = train_dec[['datetime', 'classification']]
X12_1 = X12.drop(columns = ['datetime', 'classification'])
y12_1 = X12.datetime

In [260]:
X12_1_scaler = scaler.fit_transform(X12_1)

In [261]:
X12_1_sc = pd.DataFrame(X12_1_scaler)
X12_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12 = pd.concat([y12_1, X12_1_sc], axis = 1)
X12

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-12-01 00:00:00,1.0,0.396992,0.900867,0.0,0.0,0.0,0.000000
1,2019-12-01 01:00:00,1.0,0.273851,0.922546,0.0,0.0,0.0,0.043478
2,2019-12-01 02:00:00,1.0,0.210860,0.900867,0.0,0.0,0.0,0.086957
3,2019-12-01 03:00:00,1.0,0.183459,0.873276,0.0,0.0,0.0,0.130435
4,2019-12-01 04:00:00,1.0,0.136174,0.856326,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-12-31 19:00:00,0.5,0.732498,0.294836,1.0,0.0,1.0,0.826087
1484,2020-12-31 20:00:00,0.5,0.585129,0.448364,1.0,0.0,1.0,0.869565
1485,2020-12-31 21:00:00,0.5,0.653467,0.683484,1.0,0.0,1.0,0.913043
1486,2020-12-31 22:00:00,0.5,0.403342,0.757982,1.0,0.0,1.0,0.956522


In [262]:
X12_test = test_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [263]:
y12_test = test_dec[['datetime', 'classification']]
X12_1_test = X12_test.drop(columns = ['datetime', 'classification'])
y12_1_test = X12_test.datetime

In [264]:
X12_1_test_scaler = scaler.fit_transform(X12_1_test)

In [265]:
X12_1_test_sc = pd.DataFrame(X12_1_test_scaler)
X12_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12_test = pd.concat([y12_1_test, X12_1_test_sc], axis = 1)
X12_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-12-01 00:00:00,0.333333,0.252992,0.944209,0.0,0.0,0.0,0.000000
1,2021-12-01 01:00:00,0.333333,0.178062,0.923263,0.0,0.0,0.0,0.043478
2,2021-12-01 02:00:00,0.333333,0.149041,0.932326,0.0,0.0,0.0,0.086957
3,2021-12-01 03:00:00,0.333333,0.095753,0.916012,0.0,0.0,0.0,0.130435
4,2021-12-01 04:00:00,0.333333,0.119528,0.890030,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-12-31 19:00:00,0.666667,0.555173,0.383283,0.0,0.0,1.0,0.826087
740,2021-12-31 20:00:00,0.666667,0.466798,0.540181,0.0,0.0,1.0,0.869565
741,2021-12-31 21:00:00,0.666667,0.619774,0.780463,0.0,0.0,1.0,0.913043
742,2021-12-31 22:00:00,0.666667,0.518282,0.856596,0.0,0.0,1.0,0.956522


## LightGBM

In [266]:
X12 = X12.drop(columns = ['datetime'])
y12 = y12.drop(columns = ['datetime'])
X12_test = X12_test.drop(columns = ['datetime'])
y12_test = y12_test.drop(columns = ['datetime'])

In [267]:
study12 = optuna.create_study(direction='maximize',sampler=TPESampler())
study12.optimize(lambda trial : objectiveLGBM(trial, X12, y12), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study12.best_trial.value,study12.best_trial.params))

[32m[I 2022-12-01 03:06:55,111][0m A new study created in memory with name: no-name-7134f836-47cc-4f91-954a-ba083e8c309a[0m
[32m[I 2022-12-01 03:06:57,653][0m Trial 0 finished with value: 0.8888888888888888 and parameters: {'num_leaves': 91, 'n_estimators': 1303, 'feature_fraction': 0.8590500548066573, 'bagging_fraction': 0.975659074862604, 'bagging_freq': 4, 'min_child_samples': 46}. Best is trial 0 with value: 0.8888888888888888.[0m
[32m[I 2022-12-01 03:07:02,277][0m Trial 1 finished with value: 1.0 and parameters: {'num_leaves': 75, 'n_estimators': 2549, 'feature_fraction': 0.46430614390509556, 'bagging_fraction': 0.7817078130575377, 'bagging_freq': 3, 'min_child_samples': 29}. Best is trial 1 with value: 1.0.[0m
[32m[I 2022-12-01 03:07:03,969][0m Trial 2 finished with value: 0.9222222222222222 and parameters: {'num_leaves': 61, 'n_estimators': 1672, 'feature_fraction': 0.46767536004036453, 'bagging_fraction': 0.8542689022453422, 'bagging_freq': 3, 'min_child_samples': 92

Best trial: score 1.0,
params {'num_leaves': 75, 'n_estimators': 2549, 'feature_fraction': 0.46430614390509556, 'bagging_fraction': 0.7817078130575377, 'bagging_freq': 3, 'min_child_samples': 29}


In [268]:
optuna.visualization.plot_param_importances(study12) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study12) # 최적화 과정 시각화

In [269]:
X12_train, X12_val, y12_train, y12_val = train_test_split(X12, y12, test_size = 0.2, random_state = 42)

In [270]:
X12_train.shape, X12_val.shape, y12_train.shape, y12_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [271]:
model = LGBMClassifier(**study12.best_trial.params)

In [272]:
model12 = model.fit(X12_train, y12_train,
          eval_set = [(X12_train, y12_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.603689	training's multi_logloss: 0.603689
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.588935	training's multi_logloss: 0.588935
[3]	training's multi_logloss: 0.523087	training's multi_logloss: 0.523087
[4]	training's multi_logloss: 0.518339	training's multi_logloss: 0.518339
[5]	training's multi_logloss: 0.468495	training's multi_logloss: 0.468495
[6]	training's multi_logloss: 0.427807	training's multi_logloss: 0.427807
[7]	training's multi_logloss: 0.391929	training's multi_logloss: 0.391929
[8]	training's multi_logloss: 0.36063	training's multi_logloss: 0.36063
[9]	training's multi_logloss: 0.332903	training's multi_logloss: 0.332903
[10]	training's multi_logloss: 0.308117	training's multi_logloss: 0.308117
[11]	training's multi_logloss: 0.293619	training's multi_logloss: 0.293619
[12]	training's multi_logloss: 0.27659	training's multi_logloss: 0.27659
[13]	training's multi_logloss: 0.255009	training's

In [273]:
train12_preds = model12.predict(X12_train)
val12_preds = model12.predict(X12_val)

In [274]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [275]:
get_clf_eval(y12_train, train12_preds)
get_clf_eval(y12_val, val12_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.8799
재현율: 0.8799


In [276]:
preds_12= model12.predict(X12_test)
preds_12

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1., 2., 1., 1., 1.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 2., 2., 1., 1., 1., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 2., 2., 1., 1., 1.,
       1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 2.,
       2., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2.

In [277]:
test_dec['classification'] = preds_12
test_dec

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-12-01,3,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1896,59.63,1.0,2021-12-01 00:00:00,2021,12,1,0
1,2021-12-01,3,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1439,58.59,1.0,2021-12-01 01:00:00,2021,12,1,1
2,2021-12-01,3,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1262,59.04,1.0,2021-12-01 02:00:00,2021,12,1,2
3,2021-12-01,3,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,937,58.23,1.0,2021-12-01 03:00:00,2021,12,1,3
4,2021-12-01,3,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1082,56.94,1.0,2021-12-01 04:00:00,2021,12,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-12-31,5,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3739,31.78,2.0,2021-12-31 19:00:00,2021,12,31,19
740,2021-12-31,5,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3200,39.57,2.0,2021-12-31 20:00:00,2021,12,31,20
741,2021-12-31,5,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4133,51.50,1.0,2021-12-31 21:00:00,2021,12,31,21
742,2021-12-31,5,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,3514,55.28,1.0,2021-12-31 22:00:00,2021,12,31,22


# 월별 데이터 합치기

In [278]:
result = pd.concat([test_jan,
                    test_feb,
                    test_mar,
                    test_apr,
                    test_may,
                    test_jun,
                    test_jul,
                    test_aug,
                    test_sep,
                    test_oct,
                    test_nov,
                    test_dec])
result = result.sort_values(by = 'datetime')
result = result.reset_index(drop = True)
result

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1513,60.09,1.0,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,1000,58.49,1.0,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,671,58.73,1.0,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,559,59.48,1.0,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,705,58.67,1.0,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2022-09-30,5,19:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,5568,29.64,2.0,2022-09-30 19:00:00,2022,9,30,19
8756,2022-09-30,5,20:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4975,46.42,2.0,2022-09-30 20:00:00,2022,9,30,20
8757,2022-09-30,5,21:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4808,51.66,1.0,2022-09-30 21:00:00,2022,9,30,21
8758,2022-09-30,5,22:00:00,한남대교,서초구,C-13,한남대교남단,한남대교북단,6,822,4202,36.44,2.0,2022-09-30 22:00:00,2022,9,30,22


In [279]:
result = result.drop(columns = ['dow', 
                                'district_name', 
                                'branch_num', 
                                'arr_point', 
                                'lane', 
                                'distance', 
                                'volume', 
                                'speed', 
                                'datetime', 
                                'year', 
                                'month', 
                                'day', 
                                'hour'])
result

Unnamed: 0,date,time,branch_name,dep_point,classification
0,2021-10-01,0:00:00,한남대교,한남대교남단,1.0
1,2021-10-01,1:00:00,한남대교,한남대교남단,1.0
2,2021-10-01,2:00:00,한남대교,한남대교남단,1.0
3,2021-10-01,3:00:00,한남대교,한남대교남단,1.0
4,2021-10-01,4:00:00,한남대교,한남대교남단,1.0
...,...,...,...,...,...
8755,2022-09-30,19:00:00,한남대교,한남대교남단,2.0
8756,2022-09-30,20:00:00,한남대교,한남대교남단,2.0
8757,2022-09-30,21:00:00,한남대교,한남대교남단,1.0
8758,2022-09-30,22:00:00,한남대교,한남대교남단,2.0


# csv 파일 만들기

In [280]:
result.to_csv('hannam_depsouth_result.csv', index = False)