# 라이브러리

In [1]:
import pandas as pd
import random
import os
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# 한글 폰트 깨짐 현상 해결을 위한 나눔 폰트 설치
# 코드 1회 실행 후 주석 처리하고 런타임 재시작 및 모두 실행
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

## 데이터 로드

In [4]:
# 경로 설정
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# csv 파일 읽어오기
root = '/content/drive/MyDrive/최종프로젝트/교통/분석/3rd_modified_data/'
# root = '/content/drive/MyDrive/Project/'
C13_depnorth = pd.read_csv(root + 'Data_hannam_depnorth.csv', encoding='cp949')
C13_depnorth_test = pd.read_csv(root + 'hannam_depnorth_test.csv', encoding='cp949')

In [6]:
# 데이터 확인
print(C13_depnorth.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24096 entries, 0 to 24095
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            24096 non-null  object 
 1   dow             24096 non-null  int64  
 2   time            24096 non-null  object 
 3   branch_name     24096 non-null  object 
 4   district_name   24096 non-null  object 
 5   branch_num      24096 non-null  object 
 6   dep_point       24096 non-null  object 
 7   arr_point       24096 non-null  object 
 8   lane            24096 non-null  int64  
 9   distance        24096 non-null  int64  
 10  volume          24096 non-null  int64  
 11  speed           24096 non-null  float64
 12  classification  24096 non-null  int64  
dtypes: float64(1), int64(5), object(7)
memory usage: 2.4+ MB
None


In [7]:
# 결측치 확인
print(C13_depnorth.isnull().sum())

date              0
dow               0
time              0
branch_name       0
district_name     0
branch_num        0
dep_point         0
arr_point         0
lane              0
distance          0
volume            0
speed             0
classification    0
dtype: int64


In [8]:
# date 컬럼과 time 컬럼을 합쳐 datetime이라는 컬럼 만들기
C13_depnorth['datetime'] = C13_depnorth['date'] + ' ' + C13_depnorth['time']
C13_depnorth_test['datetime'] = C13_depnorth_test['date'] + ' ' + C13_depnorth_test['time']

In [9]:
# date 컬럼과 time 컬럼 제거
# C4_depsouth = C4_depsouth.drop(C4_depsouth[['date', 'time']], axis=1)

In [10]:
# datetime 문자형 컬럼을 datetime 자료형으로 변환
C13_depnorth['datetime'] = pd.to_datetime(C13_depnorth['datetime'])
C13_depnorth_test['datetime'] = pd.to_datetime(C13_depnorth_test['datetime'])

# classification 컬럼값 변경

In [11]:
C13_depnorth.describe()

Unnamed: 0,dow,lane,distance,volume,speed,classification
count,24096.0,24096.0,24096.0,24096.0,24096.0,24096.0
mean,3.997012,6.0,822.0,3782.938704,51.893802,0.0
std,1.998295,0.0,0.0,1601.560139,8.71234,0.0
min,1.0,6.0,822.0,254.0,6.64,0.0
25%,2.0,6.0,822.0,2545.0,48.49,0.0
50%,4.0,6.0,822.0,4260.0,54.27,0.0
75%,6.0,6.0,822.0,5028.0,57.64,0.0
max,7.0,6.0,822.0,6678.0,68.54,0.0


In [12]:
C13_depnorth.loc[C13_depnorth['speed'] >= C13_depnorth['speed'].mean(), 'classification'] = 1
C13_depnorth.loc[C13_depnorth['speed'] < 15, 'classification'] = 3
C13_depnorth.loc[(C13_depnorth['speed'] >= 15) 
                & (C13_depnorth['speed'] < 25) 
                & ((C13_depnorth['volume'] >= C13_depnorth['volume'].mean())), 'classification'] = 3
C13_depnorth.loc[(C13_depnorth['speed'] >= 15) 
                & (C13_depnorth['speed'] < C13_depnorth['speed'].mean()) 
                & ((C13_depnorth['volume'] < C13_depnorth['volume'].mean())), 'classification'] = 2
C13_depnorth.loc[(C13_depnorth['speed'] >= 25) 
                & (C13_depnorth['speed'] < C13_depnorth['speed'].mean()) 
                & ((C13_depnorth['volume'] >= C13_depnorth['volume'].mean())), 'classification'] = 2

In [13]:
C13_depnorth['classification']

0        1
1        1
2        1
3        1
4        1
        ..
24091    2
24092    2
24093    2
24094    2
24095    1
Name: classification, Length: 24096, dtype: int64

In [14]:
C13_depnorth['year'] = C13_depnorth['datetime'].dt.year
C13_depnorth['month'] = C13_depnorth['datetime'].dt.month
C13_depnorth['day'] = C13_depnorth['datetime'].dt.day
C13_depnorth['hour'] = C13_depnorth['datetime'].dt.hour

In [15]:
C13_depnorth

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2019-01-01,2,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3881,60.75,1,2019-01-01 00:00:00,2019,1,1,0
1,2019-01-01,2,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3873,60.01,1,2019-01-01 01:00:00,2019,1,1,1
2,2019-01-01,2,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,2516,63.12,1,2019-01-01 02:00:00,2019,1,1,2
3,2019-01-01,2,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1792,63.03,1,2019-01-01 03:00:00,2019,1,1,3
4,2019-01-01,2,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1428,62.32,1,2019-01-01 04:00:00,2019,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24091,2021-09-30,4,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3755,44.79,2,2021-09-30 19:00:00,2021,9,30,19
24092,2021-09-30,4,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3815,49.87,2,2021-09-30 20:00:00,2021,9,30,20
24093,2021-09-30,4,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4615,43.42,2,2021-09-30 21:00:00,2021,9,30,21
24094,2021-09-30,4,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3939,45.61,2,2021-09-30 22:00:00,2021,9,30,22


# 월별로 데이터 나누기

In [16]:
C13_dn_month = C13_depnorth['month']
C13_dn_month_list  = sorted(set(C13_dn_month))
C13_dn_month_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [17]:
month_data = []
for i in range(0, len(C13_dn_month_list)):
  month = C13_depnorth[C13_depnorth['month'] == C13_dn_month_list[i]]
  month = month.reset_index(drop=True)
  month_data.append(month)

In [18]:
train_jan = month_data[0]
train_feb = month_data[1]
train_mar = month_data[2]
train_apr = month_data[3]
train_may = month_data[4]
train_jun = month_data[5]
train_jul = month_data[6]
train_aug = month_data[7]
train_sep = month_data[8]
train_oct = month_data[9]
train_nov = month_data[10]
train_dec = month_data[11]

In [19]:
C13_depnorth_test['year'] = C13_depnorth_test['datetime'].dt.year
C13_depnorth_test['month'] = C13_depnorth_test['datetime'].dt.month
C13_depnorth_test['day'] = C13_depnorth_test['datetime'].dt.day
C13_depnorth_test['hour'] = C13_depnorth_test['datetime'].dt.hour

In [20]:
C13_dn_test_mon = C13_depnorth_test['month']
C13_dn_test_mon_list  = sorted(set(C13_dn_test_mon))
C13_dn_test_mon_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [21]:
month_test_data = []
for i in range(0, len(C13_dn_month_list)):
  month = C13_depnorth_test[C13_depnorth_test['month'] == C13_dn_test_mon_list[i]]
  month = month.reset_index(drop=True)
  month_test_data.append(month)

In [22]:
test_jan = month_test_data[0]
test_feb = month_test_data[1]
test_mar = month_test_data[2]
test_apr = month_test_data[3]
test_may = month_test_data[4]
test_jun = month_test_data[5]
test_jul = month_test_data[6]
test_aug = month_test_data[7]
test_sep = month_test_data[8]
test_oct = month_test_data[9]
test_nov = month_test_data[10]
test_dec = month_test_data[11]

In [23]:
test_dec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            744 non-null    object        
 1   dow             744 non-null    int64         
 2   time            744 non-null    object        
 3   branch_name     744 non-null    object        
 4   district_name   744 non-null    object        
 5   branch_num      744 non-null    object        
 6   dep_point       744 non-null    object        
 7   arr_point       744 non-null    object        
 8   lane            744 non-null    int64         
 9   distance        744 non-null    int64         
 10  volume          744 non-null    int64         
 11  speed           744 non-null    float64       
 12  classification  0 non-null      float64       
 13  datetime        744 non-null    datetime64[ns]
 14  year            744 non-null    int64         
 15  month 

# 1월 데이터 머신러닝

## 데이터 가공

In [24]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [25]:
X1 = train_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [26]:
y1 = train_jan[['datetime', 'classification']]
X1_1 = X1.drop(columns = ['datetime', 'classification'])
y1_1 = X1.datetime

In [27]:
X1_1_scaler = scaler.fit_transform(X1_1)

In [28]:
X1_1_sc = pd.DataFrame(X1_1_scaler)
X1_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1 = pd.concat([y1_1, X1_1_sc], axis = 1)

In [29]:
X1_test = test_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [30]:
y1_test = test_jan[['datetime', 'classification']]
X1_1_test = X1_test.drop(columns = ['datetime', 'classification'])
y1_1_test = X1_test.datetime

In [31]:
X1_1_test_scaler = scaler.fit_transform(X1_1_test)

In [32]:
X1_1_test_sc = pd.DataFrame(X1_1_test_scaler)
X1_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1_test = pd.concat([y1_1_test, X1_1_test_sc], axis = 1)
X1_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-01-01 00:00:00,0.833333,0.254096,0.899824,0.0,0.0,0.0,0.000000
1,2022-01-01 01:00:00,0.833333,0.174109,0.956173,0.0,0.0,0.0,0.043478
2,2022-01-01 02:00:00,0.833333,0.097013,0.974760,0.0,0.0,0.0,0.086957
3,2022-01-01 03:00:00,0.833333,0.047543,0.970065,0.0,0.0,0.0,0.130435
4,2022-01-01 04:00:00,0.833333,0.041921,0.942281,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-01-31 19:00:00,0.000000,0.406039,0.835844,0.0,0.0,1.0,0.826087
740,2022-01-31 20:00:00,0.000000,0.395117,0.836627,0.0,0.0,1.0,0.869565
741,2022-01-31 21:00:00,0.000000,0.358336,0.836431,0.0,0.0,1.0,0.913043
742,2022-01-31 22:00:00,0.000000,0.175554,0.848562,0.0,0.0,1.0,0.956522


## LightGBM

In [33]:
# optuna 설치
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.3-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 16.1 MB/s 
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 9.1 MB/s 
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 65.6 MB/s 
[?25hCollecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 2.5 MB/s 
Collecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 56.1 MB/s 
[?25hCollecting stevedore>=2.0.1
  Downloading stevedore-4.1.1-py3-none-any.whl (

In [34]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error

In [35]:
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score

In [36]:
X1 = X1.drop(columns = ['datetime'])
y1 = y1.drop(columns = ['datetime'])
X1_test = X1_test.drop(columns = ['datetime'])
y1_test = y1_test.drop(columns = ['datetime'])

In [37]:
# LigthGBM 하이퍼파라미터 값 지정
def objectiveLGBM(trial: Trial, X, y):
    param = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'learning_rate': 0.01,
        'n_estimators': trial.suggest_int('n_estimators', 700, 3000),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'gpu_use_dp':True
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

     # 학습 모델 생성
    model = LGBMClassifier(**param)
    lgb_model = model.fit(X_train, y_train, verbose=True) # 학습 진행
    train_preds = lgb_model.predict(X_train)
    test_preds = lgb_model.predict(X_test)

    # 모델 성능 확인
    train_precision = precision_score(y_test, test_preds, average= "macro")
    
    return train_precision

In [38]:
study1 = optuna.create_study(direction='maximize',sampler=TPESampler())
study1.optimize(lambda trial : objectiveLGBM(trial, X1, y1), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study1.best_trial.value,study1.best_trial.params))

[32m[I 2022-12-01 02:57:22,119][0m A new study created in memory with name: no-name-a991e7e8-b81a-4648-9207-c8d45563eb8e[0m
[32m[I 2022-12-01 02:57:23,093][0m Trial 0 finished with value: 0.8807826311865748 and parameters: {'num_leaves': 125, 'n_estimators': 802, 'feature_fraction': 0.9630920814402957, 'bagging_fraction': 0.4060599880577359, 'bagging_freq': 2, 'min_child_samples': 86}. Best is trial 0 with value: 0.8807826311865748.[0m
[32m[I 2022-12-01 02:57:26,512][0m Trial 1 finished with value: 0.6572327044025158 and parameters: {'num_leaves': 103, 'n_estimators': 1904, 'feature_fraction': 0.8326161250726695, 'bagging_fraction': 0.7464146091977584, 'bagging_freq': 5, 'min_child_samples': 83}. Best is trial 0 with value: 0.8807826311865748.[0m
[32m[I 2022-12-01 02:57:40,491][0m Trial 2 finished with value: 0.9835758377425045 and parameters: {'num_leaves': 32, 'n_estimators': 2580, 'feature_fraction': 0.6054472282063135, 'bagging_fraction': 0.8453734549789431, 'bagging_fre

Best trial: score 0.9968253968253968,
params {'num_leaves': 10, 'n_estimators': 2191, 'feature_fraction': 0.7463105437630405, 'bagging_fraction': 0.9790350582865355, 'bagging_freq': 4, 'min_child_samples': 5}


In [39]:
optuna.visualization.plot_param_importances(study1) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study1) # 최적화 과정 시각화

In [40]:
X1_train, X1_val, y1_train, y1_val = train_test_split(X1, y1, test_size = 0.2, random_state = 42)

In [41]:
X1_train.shape, X1_val.shape, y1_train.shape, y1_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [42]:
model = LGBMClassifier(**study1.best_trial.params)

In [43]:
model1 = model.fit(X1_train, y1_train,
          eval_set = [(X1_train, y1_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.515109	training's multi_logloss: 0.515109
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.442052	training's multi_logloss: 0.442052
[3]	training's multi_logloss: 0.381179	training's multi_logloss: 0.381179
[4]	training's multi_logloss: 0.332627	training's multi_logloss: 0.332627
[5]	training's multi_logloss: 0.292779	training's multi_logloss: 0.292779
[6]	training's multi_logloss: 0.2598	training's multi_logloss: 0.2598
[7]	training's multi_logloss: 0.237067	training's multi_logloss: 0.237067
[8]	training's multi_logloss: 0.217745	training's multi_logloss: 0.217745
[9]	training's multi_logloss: 0.194845	training's multi_logloss: 0.194845
[10]	training's multi_logloss: 0.18023	training's multi_logloss: 0.18023
[11]	training's multi_logloss: 0.161547	training's multi_logloss: 0.161547
[12]	training's multi_logloss: 0.14882	training's multi_logloss: 0.14882
[13]	training's multi_logloss: 0.137375	training's mul

In [44]:
train1_preds = model1.predict(X1_train)
val1_preds = model1.predict(X1_val)

In [45]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [46]:
get_clf_eval(y1_train, train1_preds)
get_clf_eval(y1_val, val1_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9126
재현율: 0.9106


In [47]:
preds_1 = model1.predict(X1_test)
preds_1

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2,
       2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1,
       2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1,

In [48]:
test_jan['classification'] = preds_1
test_jan

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-01-01,6,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1882,55.21,1,2022-01-01 00:00:00,2022,1,1,0
1,2022-01-01,6,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1384,58.09,1,2022-01-01 01:00:00,2022,1,1,1
2,2022-01-01,6,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,904,59.04,1,2022-01-01 02:00:00,2022,1,1,2
3,2022-01-01,6,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,596,58.80,1,2022-01-01 03:00:00,2022,1,1,3
4,2022-01-01,6,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,561,57.38,1,2022-01-01 04:00:00,2022,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-01-31,1,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,2828,51.94,1,2022-01-31 19:00:00,2022,1,31,19
740,2022-01-31,1,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,2760,51.98,1,2022-01-31 20:00:00,2022,1,31,20
741,2022-01-31,1,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,2531,51.97,1,2022-01-31 21:00:00,2022,1,31,21
742,2022-01-31,1,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1393,52.59,1,2022-01-31 22:00:00,2022,1,31,22


# 2월 데이터 머신러닝

## 데이터 가공

In [49]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [50]:
X2 = train_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [51]:
y2 = train_feb[['datetime', 'classification']]
X2_1 = X2.drop(columns = ['datetime', 'classification'])
y2_1 = X2.datetime

In [52]:
X2_1_scaler = scaler.fit_transform(X2_1)

In [53]:
X2_1_sc = pd.DataFrame(X2_1_scaler)
X2_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2 = pd.concat([y2_1, X2_1_sc], axis = 1)
X2

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-02-01 00:00:00,0.666667,0.472218,0.961625,0.0,0.0,0.000000,0.000000
1,2019-02-01 01:00:00,0.666667,0.308499,1.000000,0.0,0.0,0.000000,0.043478
2,2019-02-01 02:00:00,0.666667,0.224918,0.957881,0.0,0.0,0.000000,0.086957
3,2019-02-01 03:00:00,0.666667,0.164188,0.926619,0.0,0.0,0.000000,0.130435
4,2019-02-01 04:00:00,0.666667,0.136328,0.892550,0.0,0.0,0.000000,0.173913
...,...,...,...,...,...,...,...,...
2035,2021-02-28 19:00:00,1.000000,0.488652,0.747286,1.0,0.0,0.964286,0.826087
2036,2021-02-28 20:00:00,1.000000,0.490531,0.756084,1.0,0.0,0.964286,0.869565
2037,2021-02-28 21:00:00,1.000000,0.508530,0.763197,1.0,0.0,0.964286,0.913043
2038,2021-02-28 22:00:00,1.000000,0.438723,0.797267,1.0,0.0,0.964286,0.956522


In [54]:
X2_test = test_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [55]:
y2_test = test_feb[['datetime', 'classification']]
X2_1_test = X2_test.drop(columns = ['datetime', 'classification'])
y2_1_test = X2_test.datetime

In [56]:
X2_1_test_scaler = scaler.fit_transform(X2_1_test)

In [57]:
X2_1_test_sc = pd.DataFrame(X2_1_test_scaler)
X2_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2_test = pd.concat([y2_1_test, X2_1_test_sc], axis = 1)
X2_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-02-01 00:00:00,0.166667,0.042553,0.861926,0.0,0.0,0.0,0.000000
1,2022-02-01 01:00:00,0.166667,0.013977,0.842375,0.0,0.0,0.0,0.043478
2,2022-02-01 02:00:00,0.166667,0.000621,0.862659,0.0,0.0,0.0,0.086957
3,2022-02-01 03:00:00,0.166667,0.001864,0.822092,0.0,0.0,0.0,0.130435
4,2022-02-01 04:00:00,0.166667,0.000000,0.775415,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
667,2022-02-28 19:00:00,0.000000,0.440130,0.692082,0.0,0.0,1.0,0.826087
668,2022-02-28 20:00:00,0.000000,0.482839,0.785435,0.0,0.0,1.0,0.869565
669,2022-02-28 21:00:00,0.000000,0.599783,0.693548,0.0,0.0,1.0,0.913043
670,2022-02-28 22:00:00,0.000000,0.547601,0.722385,0.0,0.0,1.0,0.956522


## LightGBM

In [58]:
X2 = X2.drop(columns = ['datetime'])
y2 = y2.drop(columns = ['datetime'])
X2_test = X2_test.drop(columns = ['datetime'])
y2_test = y2_test.drop(columns = ['datetime'])

In [59]:
study2 = optuna.create_study(direction='maximize',sampler=TPESampler())
study2.optimize(lambda trial : objectiveLGBM(trial, X2, y2), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study2.best_trial.value,study2.best_trial.params))

[32m[I 2022-12-01 02:59:27,771][0m A new study created in memory with name: no-name-7bd04ec1-bf36-477b-ac80-4701724a21bb[0m
[32m[I 2022-12-01 02:59:29,140][0m Trial 0 finished with value: 0.8572769953051643 and parameters: {'num_leaves': 376, 'n_estimators': 1082, 'feature_fraction': 0.8662094693593421, 'bagging_fraction': 0.513750777115205, 'bagging_freq': 6, 'min_child_samples': 82}. Best is trial 0 with value: 0.8572769953051643.[0m
[32m[I 2022-12-01 02:59:54,325][0m Trial 1 finished with value: 0.9120689655172414 and parameters: {'num_leaves': 489, 'n_estimators': 1098, 'feature_fraction': 0.522429688858273, 'bagging_fraction': 0.7609853793584809, 'bagging_freq': 7, 'min_child_samples': 9}. Best is trial 1 with value: 0.9120689655172414.[0m
[32m[I 2022-12-01 02:59:57,196][0m Trial 2 finished with value: 0.8215130023640662 and parameters: {'num_leaves': 326, 'n_estimators': 2926, 'feature_fraction': 0.7330354770303449, 'bagging_fraction': 0.45082001052902315, 'bagging_fre

Best trial: score 0.9927536231884058,
params {'num_leaves': 275, 'n_estimators': 817, 'feature_fraction': 0.990241640298787, 'bagging_fraction': 0.9734391077787886, 'bagging_freq': 4, 'min_child_samples': 61}


In [60]:
optuna.visualization.plot_param_importances(study2) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study2) # 최적화 과정 시각화

In [61]:
X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [62]:
X2_train.shape, X2_val.shape, y2_train.shape, y2_val.shape

((1632, 7), (408, 7), (1632, 1), (408, 1))

In [63]:
model = LGBMClassifier(**study2.best_trial.params)

In [64]:
model2 = model.fit(X2_train, y2_train,
          eval_set = [(X2_train, y2_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.641358	training's multi_logloss: 0.641358
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.560667	training's multi_logloss: 0.560667
[3]	training's multi_logloss: 0.487742	training's multi_logloss: 0.487742
[4]	training's multi_logloss: 0.43041	training's multi_logloss: 0.43041
[5]	training's multi_logloss: 0.382582	training's multi_logloss: 0.382582
[6]	training's multi_logloss: 0.343157	training's multi_logloss: 0.343157
[7]	training's multi_logloss: 0.313528	training's multi_logloss: 0.313528
[8]	training's multi_logloss: 0.28757	training's multi_logloss: 0.28757
[9]	training's multi_logloss: 0.260003	training's multi_logloss: 0.260003
[10]	training's multi_logloss: 0.235333	training's multi_logloss: 0.235333
[11]	training's multi_logloss: 0.213654	training's multi_logloss: 0.213654
[12]	training's multi_logloss: 0.198174	training's multi_logloss: 0.198174
[13]	training's multi_logloss: 0.180814	training's

In [65]:
train2_preds = model2.predict(X2_train)
val2_preds = model2.predict(X2_val)

In [66]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [67]:
get_clf_eval(y2_train, train2_preds)
get_clf_eval(y2_val, val2_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.8962
재현율: 0.9484


In [68]:
preds_2= model2.predict(X2_test)
preds_2

array([1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1,
       2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1,
       2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [69]:
test_feb['classification'] = preds_2
test_feb

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-02-01,2,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,536,55.73,1,2022-02-01 00:00:00,2022,2,1,0
1,2022-02-01,2,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,352,54.93,1,2022-02-01 01:00:00,2022,2,1,1
2,2022-02-01,2,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,266,55.76,1,2022-02-01 02:00:00,2022,2,1,2
3,2022-02-01,2,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,274,54.10,1,2022-02-01 03:00:00,2022,2,1,3
4,2022-02-01,2,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,262,52.19,1,2022-02-01 04:00:00,2022,2,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-02-28,1,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3096,48.78,2,2022-02-28 19:00:00,2022,2,28,19
668,2022-02-28,1,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3371,52.60,1,2022-02-28 20:00:00,2022,2,28,20
669,2022-02-28,1,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4124,48.84,2,2022-02-28 21:00:00,2022,2,28,21
670,2022-02-28,1,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3788,50.02,2,2022-02-28 22:00:00,2022,2,28,22


# 3월 데이터 머신러닝

## 데이터 가공

In [70]:
X3 = train_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [71]:
y3 = train_mar[['datetime', 'classification']]
X3_1 = X3.drop(columns = ['datetime', 'classification'])
y3_1 = X3.datetime

In [72]:
X3_1_scaler = scaler.fit_transform(X3_1)

In [73]:
X3_1_sc = pd.DataFrame(X3_1_scaler)
X3_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3 = pd.concat([y3_1, X3_1_sc], axis = 1)
X3

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-03-01 00:00:00,0.666667,0.552172,0.931612,0.0,0.0,0.0,0.000000
1,2019-03-01 01:00:00,0.666667,0.396538,0.980736,0.0,0.0,0.0,0.043478
2,2019-03-01 02:00:00,0.666667,0.290431,0.996340,0.0,0.0,0.0,0.086957
3,2019-03-01 03:00:00,0.666667,0.242186,0.997881,0.0,0.0,0.0,0.130435
4,2019-03-01 04:00:00,0.666667,0.208527,0.962628,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-03-31 19:00:00,0.333333,0.542074,0.757272,1.0,0.0,1.0,0.826087
2228,2021-03-31 20:00:00,0.333333,0.557782,0.750530,1.0,0.0,1.0,0.869565
2229,2021-03-31 21:00:00,0.333333,0.710370,0.639568,1.0,0.0,1.0,0.913043
2230,2021-03-31 22:00:00,0.333333,0.632633,0.604700,1.0,0.0,1.0,0.956522


In [74]:
X3_test = test_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [75]:
y3_test = test_mar[['datetime', 'classification']]
X3_1_test = X3_test.drop(columns = ['datetime', 'classification'])
y3_1_test = X3_test.datetime

In [76]:
X3_1_test_scaler = scaler.fit_transform(X3_1_test)

In [77]:
X3_1_test_sc = pd.DataFrame(X3_1_test_scaler)
X3_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3_test = pd.concat([y3_1_test, X3_1_test_sc], axis = 1)
X3_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-03-01 00:00:00,0.166667,0.192074,0.883230,0.0,0.0,0.0,0.000000
1,2022-03-01 01:00:00,0.166667,0.120375,0.953623,0.0,0.0,0.0,0.043478
2,2022-03-01 02:00:00,0.166667,0.071205,0.915528,0.0,0.0,0.0,0.086957
3,2022-03-01 03:00:00,0.166667,0.043907,0.847205,0.0,0.0,0.0,0.130435
4,2022-03-01 04:00:00,0.166667,0.043578,0.726294,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-03-31 19:00:00,0.500000,0.559119,0.609938,0.0,0.0,1.0,0.826087
740,2022-03-31 20:00:00,0.500000,0.551718,0.590476,0.0,0.0,1.0,0.869565
741,2022-03-31 21:00:00,0.500000,0.660911,0.320497,0.0,0.0,1.0,0.913043
742,2022-03-31 22:00:00,0.500000,0.593159,0.452588,0.0,0.0,1.0,0.956522


## LightGBM

In [78]:
X3 = X3.drop(columns = ['datetime'])
y3 = y3.drop(columns = ['datetime'])
X3_test = X3_test.drop(columns = ['datetime'])
y3_test = y3_test.drop(columns = ['datetime'])

In [79]:
study3 = optuna.create_study(direction='maximize',sampler=TPESampler())
study3.optimize(lambda trial : objectiveLGBM(trial, X3, y3), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study3.best_trial.value,study3.best_trial.params))

[32m[I 2022-12-01 03:01:00,062][0m A new study created in memory with name: no-name-2bc8d44f-7605-48f2-868b-eee9e8f18645[0m
[32m[I 2022-12-01 03:01:01,148][0m Trial 0 finished with value: 0.6525252525252525 and parameters: {'num_leaves': 393, 'n_estimators': 866, 'feature_fraction': 0.5655236282005291, 'bagging_fraction': 0.5249117935463029, 'bagging_freq': 7, 'min_child_samples': 98}. Best is trial 0 with value: 0.6525252525252525.[0m
[32m[I 2022-12-01 03:01:02,023][0m Trial 1 finished with value: 0.6602564102564102 and parameters: {'num_leaves': 363, 'n_estimators': 704, 'feature_fraction': 0.6396704365246377, 'bagging_fraction': 0.48636655967287634, 'bagging_freq': 3, 'min_child_samples': 99}. Best is trial 1 with value: 0.6602564102564102.[0m
[32m[I 2022-12-01 03:01:04,481][0m Trial 2 finished with value: 0.6600877192982456 and parameters: {'num_leaves': 18, 'n_estimators': 1274, 'feature_fraction': 0.5305497356124664, 'bagging_fraction': 0.8177727838937872, 'bagging_fre

Best trial: score 0.9980506822612085,
params {'num_leaves': 334, 'n_estimators': 2700, 'feature_fraction': 0.9992081720528703, 'bagging_fraction': 0.7030138338897365, 'bagging_freq': 6, 'min_child_samples': 41}


In [80]:
optuna.visualization.plot_param_importances(study3) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study3) # 최적화 과정 시각화

In [81]:
X3_train, X3_val, y3_train, y3_val = train_test_split(X3, y3, test_size = 0.2, random_state = 42)

In [82]:
X3_train.shape, X3_val.shape, y3_train.shape, y3_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [83]:
model = LGBMClassifier(**study3.best_trial.params)

In [84]:
model3 = model.fit(X3_train, y3_train,
          eval_set = [(X3_train, y3_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.572494	training's multi_logloss: 0.572494
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.497385	training's multi_logloss: 0.497385
[3]	training's multi_logloss: 0.433276	training's multi_logloss: 0.433276
[4]	training's multi_logloss: 0.381736	training's multi_logloss: 0.381736
[5]	training's multi_logloss: 0.338476	training's multi_logloss: 0.338476
[6]	training's multi_logloss: 0.301603	training's multi_logloss: 0.301603
[7]	training's multi_logloss: 0.275289	training's multi_logloss: 0.275289
[8]	training's multi_logloss: 0.252044	training's multi_logloss: 0.252044
[9]	training's multi_logloss: 0.225809	training's multi_logloss: 0.225809
[10]	training's multi_logloss: 0.203267	training's multi_logloss: 0.203267
[11]	training's multi_logloss: 0.183437	training's multi_logloss: 0.183437
[12]	training's multi_logloss: 0.16947	training's multi_logloss: 0.16947
[13]	training's multi_logloss: 0.153415	training

In [85]:
train3_preds = model3.predict(X3_train)
val3_preds = model3.predict(X3_val)

In [86]:
get_clf_eval(y3_train, train3_preds)
get_clf_eval(y3_val, val3_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.6562
재현율: 0.6632


In [87]:
preds_3= model3.predict(X3_test)
preds_3

array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,

In [88]:
test_mar['classification'] = preds_3
test_mar

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-03-01,2,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1526,58.59,1,2022-03-01 00:00:00,2022,3,1,0
1,2022-03-01,2,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1090,60.29,1,2022-03-01 01:00:00,2022,3,1,1
2,2022-03-01,2,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,791,59.37,1,2022-03-01 02:00:00,2022,3,1,2
3,2022-03-01,2,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,625,57.72,1,2022-03-01 03:00:00,2022,3,1,3
4,2022-03-01,2,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,623,54.80,2,2022-03-01 04:00:00,2022,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-03-31,4,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3758,51.99,2,2022-03-31 19:00:00,2022,3,31,19
740,2022-03-31,4,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3713,51.52,2,2022-03-31 20:00:00,2022,3,31,20
741,2022-03-31,4,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4377,45.00,2,2022-03-31 21:00:00,2022,3,31,21
742,2022-03-31,4,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3965,48.19,2,2022-03-31 22:00:00,2022,3,31,22


# 4월 데이터 머신러닝

## 데이터 가공

In [89]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [90]:
X4 = train_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [91]:
y4 = train_apr[['datetime', 'classification']]
X4_1 = X4.drop(columns = ['datetime', 'classification'])
y4_1 = X4.datetime

In [92]:
X4_1_scaler = scaler.fit_transform(X4_1)

In [93]:
X4_1_sc = pd.DataFrame(X4_1_scaler)
X4_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4 = pd.concat([y4_1, X4_1_sc], axis = 1)
X4

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-04-01 00:00:00,0.000000,0.230183,0.882712,0.0,0.0,0.0,0.000000
1,2019-04-01 01:00:00,0.000000,0.130491,0.857442,0.0,0.0,0.0,0.043478
2,2019-04-01 02:00:00,0.000000,0.077970,0.820146,0.0,0.0,0.0,0.086957
3,2019-04-01 03:00:00,0.000000,0.064840,0.846114,0.0,0.0,0.0,0.130435
4,2019-04-01 04:00:00,0.000000,0.123359,0.841060,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-04-30 19:00:00,0.666667,0.602853,0.290345,1.0,0.0,1.0,0.826087
2156,2021-04-30 20:00:00,0.666667,0.519047,0.686999,1.0,0.0,1.0,0.869565
2157,2021-04-30 21:00:00,0.666667,0.625547,0.673057,1.0,0.0,1.0,0.913043
2158,2021-04-30 22:00:00,0.666667,0.599773,0.653886,1.0,0.0,1.0,0.956522


In [94]:
X4_test = test_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [95]:
y4_test = test_apr[['datetime', 'classification']]
X4_1_test = X4_test.drop(columns = ['datetime', 'classification'])
y4_1_test = X4_test.datetime

In [96]:
X4_1_test_scaler = scaler.fit_transform(X4_1_test)

In [97]:
X4_1_test_sc = pd.DataFrame(X4_1_test_scaler)
X4_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4_test = pd.concat([y4_1_test, X4_1_test_sc], axis = 1)
X4_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-04-01 00:00:00,0.666667,0.200602,0.933942,0.0,0.0,0.0,0.000000
1,2022-04-01 01:00:00,0.666667,0.117352,0.965667,0.0,0.0,0.0,0.043478
2,2022-04-01 02:00:00,0.666667,0.075226,0.968057,0.0,0.0,0.0,0.086957
3,2022-04-01 03:00:00,0.666667,0.043798,0.928075,0.0,0.0,0.0,0.130435
4,2022-04-01 04:00:00,0.666667,0.068873,0.891786,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-04-30 19:00:00,0.833333,0.620695,0.580617,0.0,0.0,1.0,0.826087
716,2022-04-30 20:00:00,0.833333,0.671849,0.552586,0.0,0.0,1.0,0.869565
717,2022-04-30 21:00:00,0.833333,0.717319,0.661886,0.0,0.0,1.0,0.913043
718,2022-04-30 22:00:00,0.833333,0.674022,0.594959,0.0,0.0,1.0,0.956522


## LightGBM

In [98]:
X4 = X4.drop(columns = ['datetime'])
y4 = y4.drop(columns = ['datetime'])
X4_test = X4_test.drop(columns = ['datetime'])
y4_test = y4_test.drop(columns = ['datetime'])

In [99]:
study4 = optuna.create_study(direction='maximize',sampler=TPESampler())
study4.optimize(lambda trial : objectiveLGBM(trial, X4, y4), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study4.best_trial.value,study4.best_trial.params))

[32m[I 2022-12-01 03:02:41,463][0m A new study created in memory with name: no-name-3a669a33-9173-4dfd-9475-a3f7601f6e84[0m
[32m[I 2022-12-01 03:02:45,194][0m Trial 0 finished with value: 0.9897750511247443 and parameters: {'num_leaves': 103, 'n_estimators': 2532, 'feature_fraction': 0.5781559062798796, 'bagging_fraction': 0.7198133937353208, 'bagging_freq': 5, 'min_child_samples': 87}. Best is trial 0 with value: 0.9897750511247443.[0m
[32m[I 2022-12-01 03:02:48,555][0m Trial 1 finished with value: 0.9203393337471549 and parameters: {'num_leaves': 296, 'n_estimators': 777, 'feature_fraction': 0.8301859554968429, 'bagging_fraction': 0.8678983771863087, 'bagging_freq': 3, 'min_child_samples': 21}. Best is trial 0 with value: 0.9897750511247443.[0m
[32m[I 2022-12-01 03:02:52,022][0m Trial 2 finished with value: 0.8742368742368742 and parameters: {'num_leaves': 322, 'n_estimators': 1621, 'feature_fraction': 0.4150453939492909, 'bagging_fraction': 0.8028268473830663, 'bagging_fr

Best trial: score 0.9980039920159681,
params {'num_leaves': 223, 'n_estimators': 725, 'feature_fraction': 0.4920075247300208, 'bagging_fraction': 0.7658404053565132, 'bagging_freq': 7, 'min_child_samples': 41}


In [100]:
optuna.visualization.plot_param_importances(study4) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study4) # 최적화 과정 시각화

In [101]:
X4_train, X4_val, y4_train, y4_val = train_test_split(X4, y4, test_size = 0.2, random_state = 42)

In [102]:
X4_train.shape, X4_val.shape, y4_train.shape, y4_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [103]:
model = LGBMClassifier(**study4.best_trial.params)

In [104]:
model4 = model.fit(X4_train, y4_train,
          eval_set = [(X4_train, y4_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.697045	training's multi_logloss: 0.697045
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.665031	training's multi_logloss: 0.665031
[3]	training's multi_logloss: 0.595734	training's multi_logloss: 0.595734
[4]	training's multi_logloss: 0.586698	training's multi_logloss: 0.586698
[5]	training's multi_logloss: 0.535131	training's multi_logloss: 0.535131
[6]	training's multi_logloss: 0.492245	training's multi_logloss: 0.492245
[7]	training's multi_logloss: 0.452661	training's multi_logloss: 0.452661
[8]	training's multi_logloss: 0.417763	training's multi_logloss: 0.417763
[9]	training's multi_logloss: 0.388065	training's multi_logloss: 0.388065
[10]	training's multi_logloss: 0.359894	training's multi_logloss: 0.359894
[11]	training's multi_logloss: 0.343721	training's multi_logloss: 0.343721
[12]	training's multi_logloss: 0.323021	training's multi_logloss: 0.323021
[13]	training's multi_logloss: 0.299014	traini

In [105]:
train4_preds = model4.predict(X4_train)
val4_preds = model4.predict(X4_val)

In [106]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [107]:
get_clf_eval(y4_train, train4_preds)
get_clf_eval(y4_val, val4_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9419
재현율: 0.8829


In [108]:
preds_4= model4.predict(X4_test)
preds_4

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2,
       2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1,
       1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 3, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,

In [109]:
test_apr['classification'] = preds_4
test_apr

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-04-01,5,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1622,59.05,1,2022-04-01 00:00:00,2022,4,1,0
1,2022-04-01,5,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1124,60.51,1,2022-04-01 01:00:00,2022,4,1,1
2,2022-04-01,5,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,872,60.62,1,2022-04-01 02:00:00,2022,4,1,2
3,2022-04-01,5,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,684,58.78,1,2022-04-01 03:00:00,2022,4,1,3
4,2022-04-01,5,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,834,57.11,1,2022-04-01 04:00:00,2022,4,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-04-30,6,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4135,42.79,2,2022-04-30 19:00:00,2022,4,30,19
716,2022-04-30,6,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4441,41.50,2,2022-04-30 20:00:00,2022,4,30,20
717,2022-04-30,6,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4713,46.53,2,2022-04-30 21:00:00,2022,4,30,21
718,2022-04-30,6,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4454,43.45,2,2022-04-30 22:00:00,2022,4,30,22


# 5월 데이터 머신러닝

## 데이터 가공

In [110]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [111]:
X5 = train_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [112]:
y5 = train_may[['datetime', 'classification']]
X5_1 = X5.drop(columns = ['datetime', 'classification'])
y5_1 = X5.datetime

In [113]:
X5_1_scaler = scaler.fit_transform(X5_1)

In [114]:
X5_1_sc = pd.DataFrame(X5_1_scaler)
X5_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5 = pd.concat([y5_1, X5_1_sc], axis = 1)
X5

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-05-01 00:00:00,0.333333,0.471356,0.962362,0.0,0.0,0.0,0.000000
1,2019-05-01 01:00:00,0.333333,0.335074,0.974908,0.0,0.0,0.0,0.043478
2,2019-05-01 02:00:00,0.333333,0.284152,0.989770,0.0,0.0,0.0,0.086957
3,2019-05-01 03:00:00,0.333333,0.219031,0.950010,0.0,0.0,0.0,0.130435
4,2019-05-01 04:00:00,0.333333,0.209401,0.968346,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-05-31 19:00:00,0.000000,0.567162,0.765489,1.0,0.0,1.0,0.826087
2228,2021-05-31 20:00:00,0.000000,0.606006,0.723412,1.0,0.0,1.0,0.869565
2229,2021-05-31 21:00:00,0.000000,0.693325,0.598147,1.0,0.0,1.0,0.913043
2230,2021-05-31 22:00:00,0.000000,0.609270,0.671492,1.0,0.0,1.0,0.956522


In [115]:
X5_test = test_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [116]:
y5_test = test_may[['datetime', 'classification']]
X5_1_test = X5_test.drop(columns = ['datetime', 'classification'])
y5_1_test = X5_test.datetime

In [117]:
X5_1_test_scaler = scaler.fit_transform(X5_1_test)

In [118]:
X5_1_test_sc = pd.DataFrame(X5_1_test_scaler)
X5_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5_test = pd.concat([y5_1_test, X5_1_test_sc], axis = 1)
X5_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-05-01 00:00:00,1.000000,0.289192,0.894222,0.0,0.0,0.0,0.000000
1,2022-05-01 01:00:00,1.000000,0.178522,0.948957,0.0,0.0,0.0,0.043478
2,2022-05-01 02:00:00,1.000000,0.099620,0.975673,0.0,0.0,0.0,0.086957
3,2022-05-01 03:00:00,1.000000,0.059047,0.995222,0.0,0.0,0.0,0.130435
4,2022-05-01 04:00:00,1.000000,0.060946,0.959383,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-05-31 19:00:00,0.166667,0.632079,0.350130,0.0,0.0,1.0,0.826087
740,2022-05-31 20:00:00,0.166667,0.612224,0.684188,0.0,0.0,1.0,0.869565
741,2022-05-31 21:00:00,0.166667,0.752935,0.599696,0.0,0.0,1.0,0.913043
742,2022-05-31 22:00:00,0.166667,0.715642,0.653128,0.0,0.0,1.0,0.956522


## LightGBM

In [119]:
X5 = X5.drop(columns = ['datetime'])
y5 = y5.drop(columns = ['datetime'])
X5_test = X5_test.drop(columns = ['datetime'])
y5_test = y5_test.drop(columns = ['datetime'])

In [120]:
study5 = optuna.create_study(direction='maximize',sampler=TPESampler())
study5.optimize(lambda trial : objectiveLGBM(trial, X5, y5), n_trials=20) 
print('Best trial: score {},\nparams {}'.format(study5.best_trial.value,study5.best_trial.params))

[32m[I 2022-12-01 03:04:03,198][0m A new study created in memory with name: no-name-31ecff5a-1aca-4a8e-b0cc-fc4a9c77586d[0m
[32m[I 2022-12-01 03:04:11,537][0m Trial 0 finished with value: 0.9743589743589745 and parameters: {'num_leaves': 422, 'n_estimators': 1841, 'feature_fraction': 0.9139038144782395, 'bagging_fraction': 0.9071188183005525, 'bagging_freq': 4, 'min_child_samples': 14}. Best is trial 0 with value: 0.9743589743589745.[0m
[32m[I 2022-12-01 03:04:14,333][0m Trial 1 finished with value: 0.9231707317073171 and parameters: {'num_leaves': 29, 'n_estimators': 1927, 'feature_fraction': 0.8498866082579222, 'bagging_fraction': 0.823007582231998, 'bagging_freq': 1, 'min_child_samples': 99}. Best is trial 0 with value: 0.9743589743589745.[0m
[32m[I 2022-12-01 03:04:16,528][0m Trial 2 finished with value: 0.918207811517127 and parameters: {'num_leaves': 8, 'n_estimators': 1721, 'feature_fraction': 0.8125727757897896, 'bagging_fraction': 0.5977887962251721, 'bagging_freq':

Best trial: score 1.0,
params {'num_leaves': 409, 'n_estimators': 2739, 'feature_fraction': 0.9457622457611966, 'bagging_fraction': 0.9768122044071985, 'bagging_freq': 5, 'min_child_samples': 16}


In [121]:
optuna.visualization.plot_param_importances(study5) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study5) # 최적화 과정 시각화

In [122]:
X5_train, X5_val, y5_train, y5_val = train_test_split(X5, y5, test_size = 0.2, random_state = 42)

In [123]:
X5_train.shape, X5_val.shape, y5_train.shape, y5_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [124]:
model = LGBMClassifier(**study5.best_trial.params)

In [125]:
model5 = model.fit(X5_train, y5_train,
          eval_set = [(X5_train, y5_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.641743	training's multi_logloss: 0.641743
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.557479	training's multi_logloss: 0.557479
[3]	training's multi_logloss: 0.484545	training's multi_logloss: 0.484545
[4]	training's multi_logloss: 0.425552	training's multi_logloss: 0.425552
[5]	training's multi_logloss: 0.375995	training's multi_logloss: 0.375995
[6]	training's multi_logloss: 0.334715	training's multi_logloss: 0.334715
[7]	training's multi_logloss: 0.302756	training's multi_logloss: 0.302756
[8]	training's multi_logloss: 0.274634	training's multi_logloss: 0.274634
[9]	training's multi_logloss: 0.246188	training's multi_logloss: 0.246188
[10]	training's multi_logloss: 0.220703	training's multi_logloss: 0.220703
[11]	training's multi_logloss: 0.198258	training's multi_logloss: 0.198258
[12]	training's multi_logloss: 0.180716	training's multi_logloss: 0.180716
[13]	training's multi_logloss: 0.162866	traini

In [126]:
train5_preds = model5.predict(X5_train)
val5_preds = model5.predict(X5_val)

In [127]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [128]:
get_clf_eval(y5_train, train5_preds)
get_clf_eval(y5_val, val5_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9967
재현율: 0.9502


In [129]:
preds_5= model5.predict(X5_test)
preds_5

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 3, 3, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,

In [130]:
test_may['classification'] = preds_5
test_may

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-05-01,7,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,2420,57.23,1,2022-05-01 00:00:00,2022,5,1,0
1,2022-05-01,7,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1779,59.75,1,2022-05-01 01:00:00,2022,5,1,1
2,2022-05-01,7,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1322,60.98,1,2022-05-01 02:00:00,2022,5,1,2
3,2022-05-01,7,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1087,61.88,1,2022-05-01 03:00:00,2022,5,1,3
4,2022-05-01,7,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1098,60.23,1,2022-05-01 04:00:00,2022,5,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-05-31,2,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4406,32.18,2,2022-05-31 19:00:00,2022,5,31,19
740,2022-05-31,2,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4291,47.56,2,2022-05-31 20:00:00,2022,5,31,20
741,2022-05-31,2,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,5106,43.67,2,2022-05-31 21:00:00,2022,5,31,21
742,2022-05-31,2,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4890,46.13,2,2022-05-31 22:00:00,2022,5,31,22


# 6월 데이터 머신러닝

## 데이터 가공

In [131]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [132]:
X6 = train_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [133]:
y6 = train_jun[['datetime', 'classification']]
X6_1 = X6.drop(columns = ['datetime', 'classification'])
y6_1 = X6.datetime

In [134]:
X6_1_scaler = scaler.fit_transform(X6_1)

In [135]:
X6_1_sc = pd.DataFrame(X6_1_scaler)
X6_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6 = pd.concat([y6_1, X6_1_sc], axis = 1)
X6

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-06-01 00:00:00,0.833333,0.558182,0.940670,0.0,0.0,0.0,0.000000
1,2019-06-01 01:00:00,0.833333,0.424628,0.981244,0.0,0.0,0.0,0.043478
2,2019-06-01 02:00:00,0.833333,0.324132,0.996172,0.0,0.0,0.0,0.086957
3,2019-06-01 03:00:00,0.833333,0.253388,0.985837,0.0,0.0,0.0,0.130435
4,2019-06-01 04:00:00,0.833333,0.263802,0.965933,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-06-30 19:00:00,0.333333,0.540826,0.726507,1.0,0.0,1.0,0.826087
2156,2021-06-30 20:00:00,0.333333,0.574380,0.679426,1.0,0.0,1.0,0.869565
2157,2021-06-30 21:00:00,0.333333,0.648926,0.521914,1.0,0.0,1.0,0.913043
2158,2021-06-30 22:00:00,0.333333,0.561488,0.654163,1.0,0.0,1.0,0.956522


In [136]:
X6_test = test_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [137]:
y6_test = test_jun[['datetime', 'classification']]
X6_1_test = X6_test.drop(columns = ['datetime', 'classification'])
y6_1_test = X6_test.datetime

In [138]:
X6_1_test_scaler = scaler.fit_transform(X6_1_test)

In [139]:
X6_1_test_sc = pd.DataFrame(X6_1_test_scaler)
X6_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6_test = pd.concat([y6_1_test, X6_1_test_sc], axis = 1)
X6_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-06-01 00:00:00,0.333333,0.411573,0.651728,0.0,0.0,0.0,0.000000
1,2022-06-01 01:00:00,0.333333,0.259100,0.774838,0.0,0.0,0.0,0.043478
2,2022-06-01 02:00:00,0.333333,0.154018,0.818305,0.0,0.0,0.0,0.086957
3,2022-06-01 03:00:00,0.333333,0.105598,0.775378,0.0,0.0,0.0,0.130435
4,2022-06-01 04:00:00,0.333333,0.138736,0.676836,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-06-30 19:00:00,0.500000,0.710165,0.068844,0.0,0.0,1.0,0.826087
716,2022-06-30 20:00:00,0.500000,0.509787,0.152808,0.0,0.0,1.0,0.869565
717,2022-06-30 21:00:00,0.500000,0.644574,0.296976,0.0,0.0,1.0,0.913043
718,2022-06-30 22:00:00,0.500000,0.525240,0.559395,0.0,0.0,1.0,0.956522


## LightGBM

In [140]:
X6 = X6.drop(columns = ['datetime'])
y6 = y6.drop(columns = ['datetime'])
X6_test = X6_test.drop(columns = ['datetime'])
y6_test = y6_test.drop(columns = ['datetime'])

In [141]:
study6 = optuna.create_study(direction='maximize',sampler=TPESampler())
study6.optimize(lambda trial : objectiveLGBM(trial, X6, y6), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study6.best_trial.value,study6.best_trial.params))

[32m[I 2022-12-01 03:06:03,960][0m A new study created in memory with name: no-name-32abf8c9-1223-47f9-bb1c-082aea730ad3[0m
[32m[I 2022-12-01 03:06:08,363][0m Trial 0 finished with value: 0.9467312348668281 and parameters: {'num_leaves': 41, 'n_estimators': 2358, 'feature_fraction': 0.4826137306305811, 'bagging_fraction': 0.9348884980370435, 'bagging_freq': 7, 'min_child_samples': 76}. Best is trial 0 with value: 0.9467312348668281.[0m
[32m[I 2022-12-01 03:06:09,768][0m Trial 1 finished with value: 0.8962756052141527 and parameters: {'num_leaves': 252, 'n_estimators': 747, 'feature_fraction': 0.8054238360339605, 'bagging_fraction': 0.8101500094905969, 'bagging_freq': 1, 'min_child_samples': 64}. Best is trial 0 with value: 0.9467312348668281.[0m
[32m[I 2022-12-01 03:06:13,488][0m Trial 2 finished with value: 0.806207047383518 and parameters: {'num_leaves': 350, 'n_estimators': 2177, 'feature_fraction': 0.4687479968066467, 'bagging_fraction': 0.9230154807063076, 'bagging_freq

Best trial: score 1.0,
params {'num_leaves': 61, 'n_estimators': 769, 'feature_fraction': 0.6840579597480481, 'bagging_fraction': 0.9345313663112116, 'bagging_freq': 4, 'min_child_samples': 39}


In [142]:
optuna.visualization.plot_param_importances(study6) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study6) # 최적화 과정 시각화

In [143]:
X6_train, X6_val, y6_train, y6_val = train_test_split(X6, y6, test_size = 0.2, random_state = 42)

In [144]:
X6_train.shape, X6_val.shape, y6_train.shape, y6_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [145]:
model = LGBMClassifier(**study6.best_trial.params)

In [146]:
model6 = model.fit(X6_train, y6_train,
          eval_set = [(X6_train, y6_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.668356	training's multi_logloss: 0.668356
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.583076	training's multi_logloss: 0.583076
[3]	training's multi_logloss: 0.505145	training's multi_logloss: 0.505145
[4]	training's multi_logloss: 0.442923	training's multi_logloss: 0.442923
[5]	training's multi_logloss: 0.390833	training's multi_logloss: 0.390833
[6]	training's multi_logloss: 0.347854	training's multi_logloss: 0.347854
[7]	training's multi_logloss: 0.315233	training's multi_logloss: 0.315233
[8]	training's multi_logloss: 0.287355	training's multi_logloss: 0.287355
[9]	training's multi_logloss: 0.257699	training's multi_logloss: 0.257699
[10]	training's multi_logloss: 0.236197	training's multi_logloss: 0.236197
[11]	training's multi_logloss: 0.211673	training's multi_logloss: 0.211673
[12]	training's multi_logloss: 0.192928	training's multi_logloss: 0.192928
[13]	training's multi_logloss: 0.176048	traini

In [147]:
train6_preds = model6.predict(X6_train)
val6_preds = model6.predict(X6_val)

In [148]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [149]:
get_clf_eval(y6_train, train6_preds)
get_clf_eval(y6_val, val6_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [150]:
preds_6= model6.predict(X6_test)
preds_6

array([2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
       2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 3, 2, 2, 3,
       3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 3, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2,
       3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1,
       1, 2, 3, 3, 3, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       1, 1, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 2, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 3, 2, 2, 2, 2, 2,
       2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 3, 2, 2, 2, 2,

In [151]:
test_jun['classification'] = preds_6
test_jun

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-06-01,3,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3255,41.89,2,2022-06-01 00:00:00,2022,6,1,0
1,2022-06-01,3,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,2367,46.45,1,2022-06-01 01:00:00,2022,6,1,1
2,2022-06-01,3,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1755,48.06,1,2022-06-01 02:00:00,2022,6,1,2
3,2022-06-01,3,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1473,46.47,1,2022-06-01 03:00:00,2022,6,1,3
4,2022-06-01,3,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1666,42.82,2,2022-06-01 04:00:00,2022,6,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-06-30,4,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4994,20.30,3,2022-06-30 19:00:00,2022,6,30,19
716,2022-06-30,4,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3827,23.41,3,2022-06-30 20:00:00,2022,6,30,20
717,2022-06-30,4,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4612,28.75,2,2022-06-30 21:00:00,2022,6,30,21
718,2022-06-30,4,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3917,38.47,2,2022-06-30 22:00:00,2022,6,30,22


# 7월 데이터 머신러닝

## 데이터 가공

In [152]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [153]:
X7 = train_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [154]:
y7 = train_jul[['datetime', 'classification']]
X7_1 = X7.drop(columns = ['datetime', 'classification'])
y7_1 = X7.datetime

In [155]:
X7_1_scaler = scaler.fit_transform(X7_1)

In [156]:
X7_1_sc = pd.DataFrame(X7_1_scaler)
X7_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7 = pd.concat([y7_1, X7_1_sc], axis = 1)
X7

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-07-01 00:00:00,0.000000,0.272520,0.922846,0.0,0.0,0.0,0.000000
1,2019-07-01 01:00:00,0.000000,0.173140,0.931481,0.0,0.0,0.0,0.043478
2,2019-07-01 02:00:00,0.000000,0.122389,0.888117,0.0,0.0,0.0,0.086957
3,2019-07-01 03:00:00,0.000000,0.094648,0.909518,0.0,0.0,0.0,0.130435
4,2019-07-01 04:00:00,0.000000,0.193701,0.890745,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-07-31 19:00:00,0.833333,0.499021,0.822039,1.0,0.0,1.0,0.826087
2228,2021-07-31 20:00:00,0.833333,0.460509,0.785433,1.0,0.0,1.0,0.869565
2229,2021-07-31 21:00:00,0.833333,0.501958,0.754458,1.0,0.0,1.0,0.913043
2230,2021-07-31 22:00:00,0.833333,0.445333,0.754646,1.0,0.0,1.0,0.956522


In [157]:
X7_test = test_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [158]:
y7_test = test_jul[['datetime', 'classification']]
X7_1_test = X7_test.drop(columns = ['datetime', 'classification'])
y7_1_test = X7_test.datetime

In [159]:
X7_1_test_scaler = scaler.fit_transform(X7_1_test)

In [160]:
X7_1_test_sc = pd.DataFrame(X7_1_test_scaler)
X7_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7_test = pd.concat([y7_1_test, X7_1_test_sc], axis = 1)
X7_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-07-01 00:00:00,0.666667,0.268637,0.907614,0.0,0.0,0.0,0.000000
1,2022-07-01 01:00:00,0.666667,0.155920,0.944412,0.0,0.0,0.0,0.043478
2,2022-07-01 02:00:00,0.666667,0.099886,0.960070,0.0,0.0,0.0,0.086957
3,2022-07-01 03:00:00,0.666667,0.079259,0.952437,0.0,0.0,0.0,0.130435
4,2022-07-01 04:00:00,0.666667,0.129121,0.913290,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-07-31 19:00:00,1.000000,0.617671,0.719906,0.0,0.0,1.0,0.826087
740,2022-07-31 20:00:00,1.000000,0.617671,0.650421,0.0,0.0,1.0,0.869565
741,2022-07-31 21:00:00,1.000000,0.619620,0.680172,0.0,0.0,1.0,0.913043
742,2022-07-31 22:00:00,1.000000,0.499269,0.692699,0.0,0.0,1.0,0.956522


## LightGBM

In [161]:
X7 = X7.drop(columns = ['datetime'])
y7 = y7.drop(columns = ['datetime'])
X7_test = X7_test.drop(columns = ['datetime'])
y7_test = y7_test.drop(columns = ['datetime'])

In [162]:
study7 = optuna.create_study(direction='maximize',sampler=TPESampler())
study7.optimize(lambda trial : objectiveLGBM(trial, X7, y7), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study7.best_trial.value,study7.best_trial.params))

[32m[I 2022-12-01 03:07:44,409][0m A new study created in memory with name: no-name-16e811cb-87bc-40a7-927c-27c1316438a3[0m
[32m[I 2022-12-01 03:07:47,743][0m Trial 0 finished with value: 0.7803030303030303 and parameters: {'num_leaves': 177, 'n_estimators': 2544, 'feature_fraction': 0.9529571424143153, 'bagging_fraction': 0.5316362408125753, 'bagging_freq': 5, 'min_child_samples': 94}. Best is trial 0 with value: 0.7803030303030303.[0m
[32m[I 2022-12-01 03:07:51,046][0m Trial 1 finished with value: 0.8237639553429027 and parameters: {'num_leaves': 6, 'n_estimators': 2916, 'feature_fraction': 0.8621391709119113, 'bagging_fraction': 0.6233949968637856, 'bagging_freq': 4, 'min_child_samples': 75}. Best is trial 1 with value: 0.8237639553429027.[0m
[32m[I 2022-12-01 03:07:52,636][0m Trial 2 finished with value: 0.8768613974799542 and parameters: {'num_leaves': 206, 'n_estimators': 1070, 'feature_fraction': 0.40208045264815234, 'bagging_fraction': 0.6912775272022106, 'bagging_fr

Best trial: score 1.0,
params {'num_leaves': 446, 'n_estimators': 2897, 'feature_fraction': 0.9396120276960382, 'bagging_fraction': 0.851353346936795, 'bagging_freq': 3, 'min_child_samples': 29}


In [163]:
optuna.visualization.plot_param_importances(study7) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study7) # 최적화 과정 시각화

In [164]:
X7_train, X7_val, y7_train, y7_val = train_test_split(X7, y7, test_size = 0.2, random_state = 42)

In [165]:
X7_train.shape, X7_val.shape, y7_train.shape, y7_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [166]:
model = LGBMClassifier(**study7.best_trial.params)

In [167]:
model7 = model.fit(X7_train, y7_train,
          eval_set = [(X7_train, y7_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.658436	training's multi_logloss: 0.658436
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.574993	training's multi_logloss: 0.574993
[3]	training's multi_logloss: 0.498946	training's multi_logloss: 0.498946
[4]	training's multi_logloss: 0.437495	training's multi_logloss: 0.437495
[5]	training's multi_logloss: 0.385894	training's multi_logloss: 0.385894
[6]	training's multi_logloss: 0.343449	training's multi_logloss: 0.343449
[7]	training's multi_logloss: 0.310764	training's multi_logloss: 0.310764
[8]	training's multi_logloss: 0.282031	training's multi_logloss: 0.282031
[9]	training's multi_logloss: 0.252515	training's multi_logloss: 0.252515
[10]	training's multi_logloss: 0.225681	training's multi_logloss: 0.225681
[11]	training's multi_logloss: 0.202062	training's multi_logloss: 0.202062
[12]	training's multi_logloss: 0.183785	training's multi_logloss: 0.183785
[13]	training's multi_logloss: 0.165104	traini

In [168]:
train7_preds = model7.predict(X7_train)
val7_preds = model7.predict(X7_val)

In [169]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [170]:
get_clf_eval(y7_train, train7_preds)
get_clf_eval(y7_val, val7_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9394
재현율: 0.9964


In [171]:
preds_7= model7.predict(X7_test)
preds_7

array([1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
       3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,

In [172]:
test_jul['classification'] = preds_7
test_jul

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-07-01,5,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,2156,58.63,1,2022-07-01 00:00:00,2022,7,1,0
1,2022-07-01,5,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1462,60.51,1,2022-07-01 01:00:00,2022,7,1,1
2,2022-07-01,5,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1117,61.31,1,2022-07-01 02:00:00,2022,7,1,2
3,2022-07-01,5,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,990,60.92,1,2022-07-01 03:00:00,2022,7,1,3
4,2022-07-01,5,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1297,58.92,1,2022-07-01 04:00:00,2022,7,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-07-31,7,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4305,49.04,2,2022-07-31 19:00:00,2022,7,31,19
740,2022-07-31,7,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4305,45.49,2,2022-07-31 20:00:00,2022,7,31,20
741,2022-07-31,7,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4317,47.01,2,2022-07-31 21:00:00,2022,7,31,21
742,2022-07-31,7,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3576,47.65,2,2022-07-31 22:00:00,2022,7,31,22


# 8월 데이터 머신러닝

## 데이터 가공

In [173]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [174]:
X8 = train_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [175]:
y8 = train_aug[['datetime', 'classification']]
X8_1 = X8.drop(columns = ['datetime', 'classification'])
y8_1 = X8.datetime

In [176]:
X8_1_scaler = scaler.fit_transform(X8_1)

In [177]:
X8_1_sc = pd.DataFrame(X8_1_scaler)
X8_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8 = pd.concat([y8_1, X8_1_sc], axis = 1)
X8

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-08-01 00:00:00,0.500000,0.406650,0.912847,0.0,0.0,0.0,0.000000
1,2019-08-01 01:00:00,0.500000,0.307232,0.903037,0.0,0.0,0.0,0.043478
2,2019-08-01 02:00:00,0.500000,0.263009,0.900773,0.0,0.0,0.0,0.086957
3,2019-08-01 03:00:00,0.500000,0.194846,0.921524,0.0,0.0,0.0,0.130435
4,2019-08-01 04:00:00,0.500000,0.174896,0.905867,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-08-31 19:00:00,0.166667,0.580382,0.040747,1.0,0.0,1.0,0.826087
2228,2021-08-31 20:00:00,0.166667,0.521696,0.259385,1.0,0.0,1.0,0.869565
2229,2021-08-31 21:00:00,0.166667,0.618454,0.480664,1.0,0.0,1.0,0.913043
2230,2021-08-31 22:00:00,0.166667,0.443059,0.678551,1.0,0.0,1.0,0.956522


In [178]:
X8_test = test_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [179]:
y8_test = test_aug[['datetime', 'classification']]
X8_1_test = X8_test.drop(columns = ['datetime', 'classification'])
y8_1_test = X8_test.datetime

In [180]:
X8_1_test_scaler = scaler.fit_transform(X8_1_test)

In [181]:
X8_1_test_sc = pd.DataFrame(X8_1_test_scaler)
X8_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8_test = pd.concat([y8_1_test, X8_1_test_sc], axis = 1)
X8_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-08-01 00:00:00,0.000000,0.168706,0.900221,0.0,0.0,0.0,0.000000
1,2022-08-01 01:00:00,0.000000,0.086124,0.920137,0.0,0.0,0.0,0.043478
2,2022-08-01 02:00:00,0.000000,0.056825,0.929189,0.0,0.0,0.0,0.086957
3,2022-08-01 03:00:00,0.000000,0.075016,0.849930,0.0,0.0,0.0,0.130435
4,2022-08-01 04:00:00,0.000000,0.172086,0.793402,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-08-31 19:00:00,0.333333,0.631681,0.530678,0.0,0.0,1.0,0.826087
740,2022-08-31 20:00:00,0.333333,0.586285,0.741903,0.0,0.0,1.0,0.869565
741,2022-08-31 21:00:00,0.333333,0.723921,0.629451,0.0,0.0,1.0,0.913043
742,2022-08-31 22:00:00,0.333333,0.657115,0.689399,0.0,0.0,1.0,0.956522


## LightGBM

In [182]:
X8 = X8.drop(columns = ['datetime'])
y8 = y8.drop(columns = ['datetime'])
X8_test = X8_test.drop(columns = ['datetime'])
y8_test = y8_test.drop(columns = ['datetime'])

In [183]:
study8 = optuna.create_study(direction='maximize',sampler=TPESampler())
study8.optimize(lambda trial : objectiveLGBM(trial, X8, y8), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study8.best_trial.value,study8.best_trial.params))

[32m[I 2022-12-01 03:09:40,015][0m A new study created in memory with name: no-name-941e4d84-65f6-4e00-af3e-af7ecf23f6b0[0m
[32m[I 2022-12-01 03:09:42,970][0m Trial 0 finished with value: 0.8393741334917806 and parameters: {'num_leaves': 264, 'n_estimators': 2081, 'feature_fraction': 0.6545203823488741, 'bagging_fraction': 0.4234193355238419, 'bagging_freq': 2, 'min_child_samples': 63}. Best is trial 0 with value: 0.8393741334917806.[0m
[32m[I 2022-12-01 03:09:47,102][0m Trial 1 finished with value: 0.851421188630491 and parameters: {'num_leaves': 154, 'n_estimators': 2300, 'feature_fraction': 0.7433122969856627, 'bagging_fraction': 0.7743766771748354, 'bagging_freq': 5, 'min_child_samples': 72}. Best is trial 1 with value: 0.851421188630491.[0m
[32m[I 2022-12-01 03:09:50,815][0m Trial 2 finished with value: 0.9247863247863247 and parameters: {'num_leaves': 459, 'n_estimators': 1852, 'feature_fraction': 0.4264483310667276, 'bagging_fraction': 0.9240851987757519, 'bagging_fre

Best trial: score 1.0,
params {'num_leaves': 506, 'n_estimators': 730, 'feature_fraction': 0.6063944808192714, 'bagging_fraction': 0.9844109772288705, 'bagging_freq': 3, 'min_child_samples': 35}


In [184]:
optuna.visualization.plot_param_importances(study8) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study8) # 최적화 과정 시각화

In [185]:
X8_train, X8_val, y8_train, y8_val = train_test_split(X8, y8, test_size = 0.2, random_state = 42)

In [186]:
X8_train.shape, X8_val.shape, y8_train.shape, y8_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [187]:
model = LGBMClassifier(**study8.best_trial.params)

In [188]:
model8 = model.fit(X8_train, y8_train,
          eval_set = [(X8_train, y8_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.686136	training's multi_logloss: 0.686136
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.576297	training's multi_logloss: 0.576297
[3]	training's multi_logloss: 0.515989	training's multi_logloss: 0.515989
[4]	training's multi_logloss: 0.465545	training's multi_logloss: 0.465545
[5]	training's multi_logloss: 0.411246	training's multi_logloss: 0.411246
[6]	training's multi_logloss: 0.377883	training's multi_logloss: 0.377883
[7]	training's multi_logloss: 0.344222	training's multi_logloss: 0.344222
[8]	training's multi_logloss: 0.314656	training's multi_logloss: 0.314656
[9]	training's multi_logloss: 0.298373	training's multi_logloss: 0.298373
[10]	training's multi_logloss: 0.275252	training's multi_logloss: 0.275252
[11]	training's multi_logloss: 0.258359	training's multi_logloss: 0.258359
[12]	training's multi_logloss: 0.239327	training's multi_logloss: 0.239327
[13]	training's multi_logloss: 0.220829	traini

In [189]:
train8_preds = model8.predict(X8_train)
val8_preds = model8.predict(X8_val)

In [190]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [191]:
get_clf_eval(y8_train, train8_preds)
get_clf_eval(y8_val, val8_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9591
재현율: 0.9320


In [192]:
preds_8= model8.predict(X8_test)
preds_8

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,

In [193]:
test_aug['classification'] = preds_8
test_aug

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-08-01,1,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1461,57.60,1,2022-08-01 00:00:00,2022,8,1,0
1,2022-08-01,1,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,948,58.59,1,2022-08-01 01:00:00,2022,8,1,1
2,2022-08-01,1,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,766,59.04,1,2022-08-01 02:00:00,2022,8,1,2
3,2022-08-01,1,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,879,55.10,1,2022-08-01 03:00:00,2022,8,1,3
4,2022-08-01,1,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1482,52.29,1,2022-08-01 04:00:00,2022,8,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-08-31,3,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4337,39.23,2,2022-08-31 19:00:00,2022,8,31,19
740,2022-08-31,3,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4055,49.73,2,2022-08-31 20:00:00,2022,8,31,20
741,2022-08-31,3,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4910,44.14,2,2022-08-31 21:00:00,2022,8,31,21
742,2022-08-31,3,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4495,47.12,2,2022-08-31 22:00:00,2022,8,31,22


# 9월 데이터 머신러닝

## 데이터 가공

In [194]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [195]:
X9 = train_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [196]:
y9 = train_sep[['datetime', 'classification']]
X9_1 = X9.drop(columns = ['datetime', 'classification'])
y9_1 = X9.datetime

In [197]:
X9_1_scaler = scaler.fit_transform(X9_1)

In [198]:
X9_1_sc = pd.DataFrame(X9_1_scaler)
X9_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9 = pd.concat([y9_1, X9_1_sc], axis = 1)
X9

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-09-01 00:00:00,1.0,0.481021,0.928198,0.0,0.0,0.0,0.000000
1,2019-09-01 01:00:00,1.0,0.385635,0.977807,0.0,0.0,0.0,0.043478
2,2019-09-01 02:00:00,1.0,0.288285,0.972398,0.0,0.0,0.0,0.086957
3,2019-09-01 03:00:00,1.0,0.246891,0.978180,0.0,0.0,0.0,0.130435
4,2019-09-01 04:00:00,1.0,0.270288,0.953935,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-09-30 19:00:00,0.5,0.568390,0.634838,1.0,0.0,1.0,0.826087
2156,2021-09-30 20:00:00,0.5,0.578207,0.729579,1.0,0.0,1.0,0.869565
2157,2021-09-30 21:00:00,0.5,0.709097,0.609288,1.0,0.0,1.0,0.913043
2158,2021-09-30 22:00:00,0.5,0.598495,0.650131,1.0,0.0,1.0,0.956522


In [199]:
X9_test = test_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [200]:
y9_test = test_sep[['datetime', 'classification']]
X9_1_test = X9_test.drop(columns = ['datetime', 'classification'])
y9_1_test = X9_test.datetime

In [201]:
X9_1_test_scaler = scaler.fit_transform(X9_1_test)

In [202]:
X9_1_test_sc = pd.DataFrame(X9_1_test_scaler)
X9_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9_test = pd.concat([y9_1_test, X9_1_test_sc], axis = 1)
X9_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-09-01 00:00:00,0.500000,0.271607,0.917680,0.0,0.0,0.0,0.000000
1,2022-09-01 01:00:00,0.500000,0.145987,0.958232,0.0,0.0,0.0,0.043478
2,2022-09-01 02:00:00,0.500000,0.104056,1.000000,0.0,0.0,0.0,0.086957
3,2022-09-01 03:00:00,0.500000,0.060414,0.962693,0.0,0.0,0.0,0.130435
4,2022-09-01 04:00:00,0.500000,0.091220,0.913220,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-09-30 19:00:00,0.666667,0.842204,0.596715,0.0,0.0,1.0,0.826087
716,2022-09-30 20:00:00,0.666667,0.740715,0.743512,0.0,0.0,1.0,0.869565
717,2022-09-30 21:00:00,0.666667,0.712134,0.330900,0.0,0.0,1.0,0.913043
718,2022-09-30 22:00:00,0.666667,0.608420,0.101176,0.0,0.0,1.0,0.956522


## LightGBM

In [203]:
X9 = X9.drop(columns = ['datetime'])
y9 = y9.drop(columns = ['datetime'])
X9_test = X9_test.drop(columns = ['datetime'])
y9_test = y9_test.drop(columns = ['datetime'])

In [204]:
study9 = optuna.create_study(direction='maximize',sampler=TPESampler())
study9.optimize(lambda trial : objectiveLGBM(trial, X9, y9), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study9.best_trial.value,study9.best_trial.params))

[32m[I 2022-12-01 03:11:22,832][0m A new study created in memory with name: no-name-a56897c0-423a-4080-aa51-3943480f67b8[0m
[32m[I 2022-12-01 03:11:27,222][0m Trial 0 finished with value: 0.9231549337478718 and parameters: {'num_leaves': 293, 'n_estimators': 1585, 'feature_fraction': 0.4823995943877381, 'bagging_fraction': 0.9591524011799643, 'bagging_freq': 6, 'min_child_samples': 40}. Best is trial 0 with value: 0.9231549337478718.[0m
[32m[I 2022-12-01 03:11:29,587][0m Trial 1 finished with value: 0.8597222222222222 and parameters: {'num_leaves': 277, 'n_estimators': 1170, 'feature_fraction': 0.7041627257210239, 'bagging_fraction': 0.7874258425720484, 'bagging_freq': 1, 'min_child_samples': 57}. Best is trial 0 with value: 0.9231549337478718.[0m
[32m[I 2022-12-01 03:11:35,791][0m Trial 2 finished with value: 0.9265505174596084 and parameters: {'num_leaves': 277, 'n_estimators': 2728, 'feature_fraction': 0.9716770270173266, 'bagging_fraction': 0.6424819455025217, 'bagging_f

Best trial: score 1.0,
params {'num_leaves': 349, 'n_estimators': 1149, 'feature_fraction': 0.7679746499268665, 'bagging_fraction': 0.945819112368792, 'bagging_freq': 7, 'min_child_samples': 43}


In [205]:
optuna.visualization.plot_param_importances(study9) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study9) # 최적화 과정 시각화

In [206]:
X9_train, X9_val, y9_train, y9_val = train_test_split(X9, y9, test_size = 0.2, random_state = 42)

In [207]:
X9_train.shape, X9_val.shape, y9_train.shape, y9_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [208]:
model = LGBMClassifier(**study9.best_trial.params)

In [209]:
model9 = model.fit(X9_train, y9_train,
          eval_set = [(X9_train, y9_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.6377	training's multi_logloss: 0.6377
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.557534	training's multi_logloss: 0.557534
[3]	training's multi_logloss: 0.486405	training's multi_logloss: 0.486405
[4]	training's multi_logloss: 0.429062	training's multi_logloss: 0.429062
[5]	training's multi_logloss: 0.380745	training's multi_logloss: 0.380745
[6]	training's multi_logloss: 0.339812	training's multi_logloss: 0.339812
[7]	training's multi_logloss: 0.309348	training's multi_logloss: 0.309348
[8]	training's multi_logloss: 0.281974	training's multi_logloss: 0.281974
[9]	training's multi_logloss: 0.253815	training's multi_logloss: 0.253815
[10]	training's multi_logloss: 0.233829	training's multi_logloss: 0.233829
[11]	training's multi_logloss: 0.211028	training's multi_logloss: 0.211028
[12]	training's multi_logloss: 0.19411	training's multi_logloss: 0.19411
[13]	training's multi_logloss: 0.178972	training's m

In [210]:
train9_preds = model9.predict(X9_train)
val9_preds = model9.predict(X9_val)

In [211]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [212]:
get_clf_eval(y9_train, train9_preds)
get_clf_eval(y9_val, val9_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9931
재현율: 0.8788


In [213]:
preds_9= model9.predict(X9_test)
preds_9

array([1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2,
       2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1,

In [214]:
test_sep['classification'] = preds_9
test_sep

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-09-01,4,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,2234,57.85,1,2022-09-01 00:00:00,2022,9,1,0
1,2022-09-01,4,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1500,59.85,1,2022-09-01 01:00:00,2022,9,1,1
2,2022-09-01,4,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1255,61.91,1,2022-09-01 02:00:00,2022,9,1,2
3,2022-09-01,4,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1000,60.07,1,2022-09-01 03:00:00,2022,9,1,3
4,2022-09-01,4,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1180,57.63,1,2022-09-01 04:00:00,2022,9,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-09-30,5,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,5568,42.02,2,2022-09-30 19:00:00,2022,9,30,19
716,2022-09-30,5,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4975,49.26,2,2022-09-30 20:00:00,2022,9,30,20
717,2022-09-30,5,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4808,28.91,2,2022-09-30 21:00:00,2022,9,30,21
718,2022-09-30,5,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4202,17.58,3,2022-09-30 22:00:00,2022,9,30,22


# 10월 데이터 머신러닝

## 데이터 가공

In [215]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [216]:
X10 = train_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [217]:
y10 = train_oct[['datetime', 'classification']]
X10_1 = X10.drop(columns = ['datetime', 'classification'])
y10_1 = X10.datetime

In [218]:
X10_1_scaler = scaler.fit_transform(X10_1)

In [219]:
X10_1_sc = pd.DataFrame(X10_1_scaler)
X10_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10 = pd.concat([y10_1, X10_1_sc], axis = 1)
X10

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-10-01 00:00:00,0.166667,0.401456,0.940891,0.0,0.0,0.0,0.000000
1,2019-10-01 01:00:00,0.166667,0.234677,0.964386,0.0,0.0,0.0,0.043478
2,2019-10-01 02:00:00,0.166667,0.147816,0.927839,0.0,0.0,0.0,0.086957
3,2019-10-01 03:00:00,0.166667,0.125635,0.899869,0.0,0.0,0.0,0.130435
4,2019-10-01 04:00:00,0.166667,0.134101,0.881037,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-10-31 19:00:00,0.833333,0.553166,0.680962,1.0,0.0,1.0,0.826087
1484,2020-10-31 20:00:00,0.833333,0.605147,0.612530,1.0,0.0,1.0,0.869565
1485,2020-10-31 21:00:00,0.833333,0.666102,0.687302,1.0,0.0,1.0,0.913043
1486,2020-10-31 22:00:00,0.833333,0.645107,0.756853,1.0,0.0,1.0,0.956522


In [220]:
X10_test = test_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [221]:
y10_test = test_oct[['datetime', 'classification']]
X10_1_test = X10_test.drop(columns = ['datetime', 'classification'])
y10_1_test = X10_test.datetime

In [222]:
X10_1_test_scaler = scaler.fit_transform(X10_1_test)

In [223]:
X10_1_test_sc = pd.DataFrame(X10_1_test_scaler)
X10_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10_test = pd.concat([y10_1_test, X10_1_test_sc], axis = 1)
X10_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-10-01 00:00:00,0.666667,0.182326,0.951050,0.0,0.0,0.0,0.000000
1,2021-10-01 01:00:00,0.666667,0.097733,0.975092,0.0,0.0,0.0,0.043478
2,2021-10-01 02:00:00,0.666667,0.052398,0.950401,0.0,0.0,0.0,0.086957
3,2021-10-01 03:00:00,0.666667,0.034001,0.908165,0.0,0.0,0.0,0.130435
4,2021-10-01 04:00:00,0.666667,0.071288,0.884774,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-10-31 19:00:00,1.000000,0.579501,0.779944,0.0,0.0,1.0,0.826087
740,2021-10-31 20:00:00,1.000000,0.667707,0.672515,0.0,0.0,1.0,0.869565
741,2021-10-31 21:00:00,1.000000,0.622372,0.704137,0.0,0.0,1.0,0.913043
742,2021-10-31 22:00:00,1.000000,0.819481,0.786658,0.0,0.0,1.0,0.956522


## LightGBM

In [224]:
X10 = X10.drop(columns = ['datetime'])
y10 = y10.drop(columns = ['datetime'])
X10_test = X10_test.drop(columns = ['datetime'])
y10_test = y10_test.drop(columns = ['datetime'])

In [225]:
study10 = optuna.create_study(direction='maximize',sampler=TPESampler())
study10.optimize(lambda trial : objectiveLGBM(trial, X10, y10), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study10.best_trial.value,study10.best_trial.params))

[32m[I 2022-12-01 03:12:52,577][0m A new study created in memory with name: no-name-6709723b-e6db-4fcc-b9cb-d2d74eea7c85[0m
[32m[I 2022-12-01 03:12:56,337][0m Trial 0 finished with value: 1.0 and parameters: {'num_leaves': 509, 'n_estimators': 1908, 'feature_fraction': 0.4567147016505342, 'bagging_fraction': 0.7243067662877212, 'bagging_freq': 1, 'min_child_samples': 26}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-12-01 03:12:58,074][0m Trial 1 finished with value: 0.988795518207283 and parameters: {'num_leaves': 444, 'n_estimators': 1689, 'feature_fraction': 0.6327491993719735, 'bagging_fraction': 0.6558102018577903, 'bagging_freq': 7, 'min_child_samples': 91}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-12-01 03:12:59,829][0m Trial 2 finished with value: 0.6616656563615545 and parameters: {'num_leaves': 262, 'n_estimators': 1351, 'feature_fraction': 0.7143372012724931, 'bagging_fraction': 0.8597078507249493, 'bagging_freq': 5, 'min_child_samples': 82}. Best is tri

Best trial: score 1.0,
params {'num_leaves': 509, 'n_estimators': 1908, 'feature_fraction': 0.4567147016505342, 'bagging_fraction': 0.7243067662877212, 'bagging_freq': 1, 'min_child_samples': 26}


In [226]:
optuna.visualization.plot_param_importances(study10) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study10) # 최적화 과정 시각화

In [227]:
X10_train, X10_val, y10_train, y10_val = train_test_split(X10, y10, test_size = 0.2, random_state = 42)

In [228]:
X10_train.shape, X10_val.shape, y10_train.shape, y10_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [229]:
model = LGBMClassifier(**study10.best_trial.params)

In [230]:
model10 = model.fit(X10_train, y10_train,
          eval_set = [(X10_train, y10_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.655045	training's multi_logloss: 0.655045
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.63412	training's multi_logloss: 0.63412
[3]	training's multi_logloss: 0.571463	training's multi_logloss: 0.571463
[4]	training's multi_logloss: 0.561389	training's multi_logloss: 0.561389
[5]	training's multi_logloss: 0.509613	training's multi_logloss: 0.509613
[6]	training's multi_logloss: 0.468714	training's multi_logloss: 0.468714
[7]	training's multi_logloss: 0.432081	training's multi_logloss: 0.432081
[8]	training's multi_logloss: 0.399232	training's multi_logloss: 0.399232
[9]	training's multi_logloss: 0.370528	training's multi_logloss: 0.370528
[10]	training's multi_logloss: 0.344207	training's multi_logloss: 0.344207
[11]	training's multi_logloss: 0.327821	training's multi_logloss: 0.327821
[12]	training's multi_logloss: 0.306933	training's multi_logloss: 0.306933
[13]	training's multi_logloss: 0.283955	training

In [231]:
train10_preds = model10.predict(X10_train)
val10_preds = model10.predict(X10_val)

In [232]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [233]:
get_clf_eval(y10_train, train10_preds)
get_clf_eval(y10_val, val10_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9938
재현율: 0.8333


In [234]:
preds_10= model10.predict(X10_test)
preds_10

array([1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,

In [235]:
test_oct['classification'] = preds_10
test_oct

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1543,59.40,1,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1028,60.51,1,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,752,59.37,1,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,640,57.42,1,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,867,56.34,1,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-10-31,7,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3961,51.50,1,2021-10-31 19:00:00,2021,10,31,19
740,2021-10-31,7,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4498,46.54,2,2021-10-31 20:00:00,2021,10,31,20
741,2021-10-31,7,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4222,48.00,2,2021-10-31 21:00:00,2021,10,31,21
742,2021-10-31,7,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,5422,51.81,1,2021-10-31 22:00:00,2021,10,31,22


# 11월 데이터 머신러닝

## 데이터 가공

In [236]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [237]:
X11 = train_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [238]:
y11 = train_nov[['datetime', 'classification']]
X11_1 = X11.drop(columns = ['datetime', 'classification'])
y11_1 = X11.datetime

In [239]:
X11_1_scaler = scaler.fit_transform(X11_1)

In [240]:
X11_1_sc = pd.DataFrame(X11_1_scaler)
X11_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11 = pd.concat([y11_1, X11_1_sc], axis = 1)
X11

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-11-01 00:00:00,0.666667,0.449061,0.920548,0.0,0.0,0.0,0.000000
1,2019-11-01 01:00:00,0.666667,0.309955,0.944110,0.0,0.0,0.0,0.043478
2,2019-11-01 02:00:00,0.666667,0.246468,0.954703,0.0,0.0,0.0,0.086957
3,2019-11-01 03:00:00,0.666667,0.185973,0.946484,0.0,0.0,0.0,0.130435
4,2019-11-01 04:00:00,0.666667,0.194615,0.928584,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1435,2020-11-30 19:00:00,0.000000,0.477314,0.673607,1.0,0.0,1.0,0.826087
1436,2020-11-30 20:00:00,0.000000,0.597640,0.743927,1.0,0.0,1.0,0.869565
1437,2020-11-30 21:00:00,0.000000,0.640685,0.602740,1.0,0.0,1.0,0.913043
1438,2020-11-30 22:00:00,0.000000,0.436430,0.768767,1.0,0.0,1.0,0.956522


In [241]:
X11_test = test_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [242]:
y11_test = test_nov[['datetime', 'classification']]
X11_1_test = X11_test.drop(columns = ['datetime', 'classification'])
y11_1_test = X11_test.datetime

In [243]:
X11_1_test_scaler = scaler.fit_transform(X11_1_test)

In [244]:
X11_1_test_sc = pd.DataFrame(X11_1_test_scaler)
X11_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11_test = pd.concat([y11_1_test, X11_1_test_sc], axis = 1)
X11_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-11-01 00:00:00,0.000000,0.191675,0.903245,0.0,0.0,0.0,0.000000
1,2021-11-01 01:00:00,0.000000,0.105740,0.931342,0.0,0.0,0.0,0.043478
2,2021-11-01 02:00:00,0.000000,0.052367,0.908785,0.0,0.0,0.0,0.086957
3,2021-11-01 03:00:00,0.000000,0.050688,0.893550,0.0,0.0,0.0,0.130435
4,2021-11-01 04:00:00,0.000000,0.171031,0.903047,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2021-11-30 19:00:00,0.166667,0.520477,0.292244,0.0,0.0,1.0,0.826087
716,2021-11-30 20:00:00,0.166667,0.547163,0.747131,0.0,0.0,1.0,0.869565
717,2021-11-30 21:00:00,0.166667,0.649883,0.573209,0.0,0.0,1.0,0.913043
718,2021-11-30 22:00:00,0.166667,0.708963,0.732093,0.0,0.0,1.0,0.956522


## LightGBM

In [245]:
X11 = X11.drop(columns = ['datetime'])
y11 = y11.drop(columns = ['datetime'])
X11_test = X11_test.drop(columns = ['datetime'])
y11_test = y11_test.drop(columns = ['datetime'])

In [246]:
study11 = optuna.create_study(direction='maximize',sampler=TPESampler())
study11.optimize(lambda trial : objectiveLGBM(trial, X11, y11), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study11.best_trial.value,study11.best_trial.params))

[32m[I 2022-12-01 03:13:55,487][0m A new study created in memory with name: no-name-86b27fef-383e-4c68-8564-db43a8bbf152[0m
[32m[I 2022-12-01 03:13:57,124][0m Trial 0 finished with value: 0.9938837920489297 and parameters: {'num_leaves': 345, 'n_estimators': 1217, 'feature_fraction': 0.9522069735468015, 'bagging_fraction': 0.4879604505648162, 'bagging_freq': 4, 'min_child_samples': 41}. Best is trial 0 with value: 0.9938837920489297.[0m
[32m[I 2022-12-01 03:14:04,421][0m Trial 1 finished with value: 0.9931972789115647 and parameters: {'num_leaves': 401, 'n_estimators': 2785, 'feature_fraction': 0.425039102486813, 'bagging_fraction': 0.7770918270913905, 'bagging_freq': 1, 'min_child_samples': 14}. Best is trial 0 with value: 0.9938837920489297.[0m
[32m[I 2022-12-01 03:14:06,717][0m Trial 2 finished with value: 0.7740324594257179 and parameters: {'num_leaves': 397, 'n_estimators': 2632, 'feature_fraction': 0.4218550655102967, 'bagging_fraction': 0.5382522076502952, 'bagging_fr

Best trial: score 0.9938837920489297,
params {'num_leaves': 345, 'n_estimators': 1217, 'feature_fraction': 0.9522069735468015, 'bagging_fraction': 0.4879604505648162, 'bagging_freq': 4, 'min_child_samples': 41}


In [247]:
optuna.visualization.plot_param_importances(study11) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study11) # 최적화 과정 시각화

In [248]:
X11_train, X11_val, y11_train, y11_val = train_test_split(X11, y11, test_size = 0.2, random_state = 42)

In [249]:
X11_train.shape, X11_val.shape, y11_train.shape, y11_val.shape

((1152, 7), (288, 7), (1152, 1), (288, 1))

In [250]:
model = LGBMClassifier(**study11.best_trial.params)

In [251]:
model11 = model.fit(X11_train, y11_train,
          eval_set = [(X11_train, y11_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.608559	training's multi_logloss: 0.608559
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.532332	training's multi_logloss: 0.532332
[3]	training's multi_logloss: 0.467454	training's multi_logloss: 0.467454
[4]	training's multi_logloss: 0.414536	training's multi_logloss: 0.414536
[5]	training's multi_logloss: 0.369523	training's multi_logloss: 0.369523
[6]	training's multi_logloss: 0.331762	training's multi_logloss: 0.331762
[7]	training's multi_logloss: 0.303754	training's multi_logloss: 0.303754
[8]	training's multi_logloss: 0.279148	training's multi_logloss: 0.279148
[9]	training's multi_logloss: 0.252565	training's multi_logloss: 0.252565
[10]	training's multi_logloss: 0.2292	training's multi_logloss: 0.2292
[11]	training's multi_logloss: 0.208654	training's multi_logloss: 0.208654
[12]	training's multi_logloss: 0.194084	training's multi_logloss: 0.194084
[13]	training's multi_logloss: 0.177214	training's

In [252]:
train11_preds = model11.predict(X11_train)
val11_preds = model11.predict(X11_val)

In [253]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [254]:
get_clf_eval(y11_train, train11_preds)
get_clf_eval(y11_val, val11_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9112
재현율: 0.9093


In [255]:
preds_11= model11.predict(X11_test)
preds_11

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2,
       2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2,
       2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,

In [256]:
test_nov['classification'] = preds_11
test_nov

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-11-01,1,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1757,57.92,1,2021-11-01 00:00:00,2021,11,1,0
1,2021-11-01,1,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1245,59.34,1,2021-11-01 01:00:00,2021,11,1,1
2,2021-11-01,1,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,927,58.20,1,2021-11-01 02:00:00,2021,11,1,2
3,2021-11-01,1,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,917,57.43,1,2021-11-01 03:00:00,2021,11,1,3
4,2021-11-01,1,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1634,57.91,1,2021-11-01 04:00:00,2021,11,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2021-11-30,2,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3716,27.04,2,2021-11-30 19:00:00,2021,11,30,19
716,2021-11-30,2,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3875,50.03,1,2021-11-30 20:00:00,2021,11,30,20
717,2021-11-30,2,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4487,41.24,2,2021-11-30 21:00:00,2021,11,30,21
718,2021-11-30,2,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4839,49.27,1,2021-11-30 22:00:00,2021,11,30,22


# 12월 데이터 머신러닝

## 데이터 가공

In [257]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [258]:
X12 = train_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [259]:
y12 = train_dec[['datetime', 'classification']]
X12_1 = X12.drop(columns = ['datetime', 'classification'])
y12_1 = X12.datetime

In [260]:
X12_1_scaler = scaler.fit_transform(X12_1)

In [261]:
X12_1_sc = pd.DataFrame(X12_1_scaler)
X12_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12 = pd.concat([y12_1, X12_1_sc], axis = 1)
X12

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-12-01 00:00:00,1.0,0.414495,0.929391,0.0,0.0,0.0,0.000000
1,2019-12-01 01:00:00,1.0,0.296542,0.948180,0.0,0.0,0.0,0.043478
2,2019-12-01 02:00:00,1.0,0.207485,0.955498,0.0,0.0,0.0,0.086957
3,2019-12-01 03:00:00,1.0,0.171641,0.949960,0.0,0.0,0.0,0.130435
4,2019-12-01 04:00:00,1.0,0.152534,0.936511,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-12-31 19:00:00,0.5,0.452866,0.393196,1.0,0.0,1.0,0.826087
1484,2020-12-31 20:00:00,0.5,0.479709,0.743473,1.0,0.0,1.0,0.869565
1485,2020-12-31 21:00:00,0.5,0.540028,0.649328,1.0,0.0,1.0,0.913043
1486,2020-12-31 22:00:00,0.5,0.359387,0.715585,1.0,0.0,1.0,0.956522


In [262]:
X12_test = test_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [263]:
y12_test = test_dec[['datetime', 'classification']]
X12_1_test = X12_test.drop(columns = ['datetime', 'classification'])
y12_1_test = X12_test.datetime

In [264]:
X12_1_test_scaler = scaler.fit_transform(X12_1_test)

In [265]:
X12_1_test_sc = pd.DataFrame(X12_1_test_scaler)
X12_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12_test = pd.concat([y12_1_test, X12_1_test_sc], axis = 1)
X12_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-12-01 00:00:00,0.333333,0.354839,0.945496,0.0,0.0,0.0,0.000000
1,2021-12-01 01:00:00,0.333333,0.207301,1.000000,0.0,0.0,0.0,0.043478
2,2021-12-01 02:00:00,0.333333,0.135314,0.978581,0.0,0.0,0.0,0.086957
3,2021-12-01 03:00:00,0.333333,0.094567,0.917958,0.0,0.0,0.0,0.130435
4,2021-12-01 04:00:00,0.333333,0.111715,0.908204,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-12-31 19:00:00,0.666667,0.519015,0.450564,0.0,0.0,1.0,0.826087
740,2021-12-31 20:00:00,0.666667,0.459932,0.789252,0.0,0.0,1.0,0.869565
741,2021-12-31 21:00:00,0.666667,0.657046,0.698221,0.0,0.0,1.0,0.913043
742,2021-12-31 22:00:00,0.666667,0.533277,0.762287,0.0,0.0,1.0,0.956522


## LightGBM

In [266]:
X12 = X12.drop(columns = ['datetime'])
y12 = y12.drop(columns = ['datetime'])
X12_test = X12_test.drop(columns = ['datetime'])
y12_test = y12_test.drop(columns = ['datetime'])

In [267]:
study12 = optuna.create_study(direction='maximize',sampler=TPESampler())
study12.optimize(lambda trial : objectiveLGBM(trial, X12, y12), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study12.best_trial.value,study12.best_trial.params))

[32m[I 2022-12-01 03:15:08,540][0m A new study created in memory with name: no-name-b7bc7cb6-f6b4-4454-926e-23358af38795[0m
[32m[I 2022-12-01 03:15:11,674][0m Trial 0 finished with value: 0.9926470588235294 and parameters: {'num_leaves': 200, 'n_estimators': 1108, 'feature_fraction': 0.5994639501750756, 'bagging_fraction': 0.9282402328771046, 'bagging_freq': 4, 'min_child_samples': 21}. Best is trial 0 with value: 0.9926470588235294.[0m
[32m[I 2022-12-01 03:15:14,392][0m Trial 1 finished with value: 0.993421052631579 and parameters: {'num_leaves': 88, 'n_estimators': 2696, 'feature_fraction': 0.7771543570993434, 'bagging_fraction': 0.5493260230347945, 'bagging_freq': 3, 'min_child_samples': 74}. Best is trial 1 with value: 0.993421052631579.[0m
[32m[I 2022-12-01 03:15:19,265][0m Trial 2 finished with value: 0.6574074074074074 and parameters: {'num_leaves': 445, 'n_estimators': 1864, 'feature_fraction': 0.7649159529637595, 'bagging_fraction': 0.4247707555334929, 'bagging_freq

Best trial: score 1.0,
params {'num_leaves': 493, 'n_estimators': 2699, 'feature_fraction': 0.7988506753938667, 'bagging_fraction': 0.8455121189903851, 'bagging_freq': 7, 'min_child_samples': 66}


In [268]:
optuna.visualization.plot_param_importances(study12) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study12) # 최적화 과정 시각화

In [269]:
X12_train, X12_val, y12_train, y12_val = train_test_split(X12, y12, test_size = 0.2, random_state = 42)

In [270]:
X12_train.shape, X12_val.shape, y12_train.shape, y12_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [271]:
model = LGBMClassifier(**study12.best_trial.params)

In [272]:
model12 = model.fit(X12_train, y12_train,
          eval_set = [(X12_train, y12_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.487947	training's multi_logloss: 0.487947
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.419207	training's multi_logloss: 0.419207
[3]	training's multi_logloss: 0.364736	training's multi_logloss: 0.364736
[4]	training's multi_logloss: 0.320571	training's multi_logloss: 0.320571
[5]	training's multi_logloss: 0.283744	training's multi_logloss: 0.283744
[6]	training's multi_logloss: 0.252169	training's multi_logloss: 0.252169
[7]	training's multi_logloss: 0.231256	training's multi_logloss: 0.231256
[8]	training's multi_logloss: 0.212801	training's multi_logloss: 0.212801
[9]	training's multi_logloss: 0.191059	training's multi_logloss: 0.191059
[10]	training's multi_logloss: 0.176717	training's multi_logloss: 0.176717
[11]	training's multi_logloss: 0.159496	training's multi_logloss: 0.159496
[12]	training's multi_logloss: 0.148165	training's multi_logloss: 0.148165
[13]	training's multi_logloss: 0.137589	traini

In [273]:
train12_preds = model12.predict(X12_train)
val12_preds = model12.predict(X12_val)

In [274]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [275]:
get_clf_eval(y12_train, train12_preds)
get_clf_eval(y12_val, val12_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9979
재현율: 0.9911


In [276]:
preds_12= model12.predict(X12_test)
preds_12

array([1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2,
       1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2,
       1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 3, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1,

In [277]:
test_dec['classification'] = preds_12
test_dec

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-12-01,3,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,2640,59.80,1,2021-12-01 00:00:00,2021,12,1,0
1,2021-12-01,3,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1771,62.65,1,2021-12-01 01:00:00,2021,12,1,1
2,2021-12-01,3,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1347,61.53,1,2021-12-01 02:00:00,2021,12,1,2
3,2021-12-01,3,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1107,58.36,1,2021-12-01 03:00:00,2021,12,1,3
4,2021-12-01,3,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1208,57.85,1,2021-12-01 04:00:00,2021,12,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-12-31,5,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3607,33.92,2,2021-12-31 19:00:00,2021,12,31,19
740,2021-12-31,5,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3259,51.63,1,2021-12-31 20:00:00,2021,12,31,20
741,2021-12-31,5,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4420,46.87,2,2021-12-31 21:00:00,2021,12,31,21
742,2021-12-31,5,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,3691,50.22,1,2021-12-31 22:00:00,2021,12,31,22


# 월별 데이터 합치기

In [278]:
result = pd.concat([test_jan,
                    test_feb,
                    test_mar,
                    test_apr,
                    test_may,
                    test_jun,
                    test_jul,
                    test_aug,
                    test_sep,
                    test_oct,
                    test_nov,
                    test_dec])
result = result.sort_values(by = 'datetime')
result = result.reset_index(drop = True)
result

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1543,59.40,1,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,1028,60.51,1,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,752,59.37,1,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,640,57.42,1,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,867,56.34,1,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2022-09-30,5,19:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,5568,42.02,2,2022-09-30 19:00:00,2022,9,30,19
8756,2022-09-30,5,20:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4975,49.26,2,2022-09-30 20:00:00,2022,9,30,20
8757,2022-09-30,5,21:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4808,28.91,2,2022-09-30 21:00:00,2022,9,30,21
8758,2022-09-30,5,22:00:00,한남대교,용산구,C-13,한남대교북단,한남대교남단,6,822,4202,17.58,3,2022-09-30 22:00:00,2022,9,30,22


In [279]:
result = result.drop(columns = ['dow', 
                                'district_name', 
                                'branch_num', 
                                'arr_point', 
                                'lane', 
                                'distance', 
                                'volume', 
                                'speed', 
                                'datetime', 
                                'year', 
                                'month', 
                                'day', 
                                'hour'])
result

Unnamed: 0,date,time,branch_name,dep_point,classification
0,2021-10-01,0:00:00,한남대교,한남대교북단,1
1,2021-10-01,1:00:00,한남대교,한남대교북단,1
2,2021-10-01,2:00:00,한남대교,한남대교북단,1
3,2021-10-01,3:00:00,한남대교,한남대교북단,1
4,2021-10-01,4:00:00,한남대교,한남대교북단,1
...,...,...,...,...,...
8755,2022-09-30,19:00:00,한남대교,한남대교북단,2
8756,2022-09-30,20:00:00,한남대교,한남대교북단,2
8757,2022-09-30,21:00:00,한남대교,한남대교북단,2
8758,2022-09-30,22:00:00,한남대교,한남대교북단,3


# csv 파일 만들기

In [280]:
result.to_csv('hannam_depnorth_result.csv', index = False)