# 라이브러리

In [29]:
import pandas as pd
import random
import os
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

In [30]:
# 한글 폰트 깨짐 현상 해결을 위한 나눔 폰트 설치
# 코드 1회 실행 후 주석 처리하고 런타임 재시작 및 모두 실행
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

In [31]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

## 데이터 로드

In [32]:
# 경로 설정
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
# csv 파일 읽어오기
# root = '/content/drive/MyDrive/최종프로젝트/교통/분석/2nd_modified_data/'
root = '/content/drive/MyDrive/Project/'
C4_depnorth = pd.read_csv(root + 'Data_sungsan_depnorth.csv', encoding='cp949')
C4_depnorth_test = pd.read_csv(root + 'sungsan_depnorth_test.csv', encoding='cp949')

In [34]:
# 데이터 확인
print(C4_depnorth.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24096 entries, 0 to 24095
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            24096 non-null  object 
 1   dow             24096 non-null  int64  
 2   time            24096 non-null  object 
 3   branch_name     24096 non-null  object 
 4   district_name   24096 non-null  object 
 5   branch_num      24096 non-null  object 
 6   dep_point       24096 non-null  object 
 7   arr_point       24096 non-null  object 
 8   lane            24096 non-null  int64  
 9   distance        24096 non-null  int64  
 10  volume          24096 non-null  int64  
 11  speed           24096 non-null  float64
 12  classification  0 non-null      float64
dtypes: float64(2), int64(4), object(7)
memory usage: 2.4+ MB
None


In [35]:
# 결측치 확인
print(C4_depnorth.isnull().sum())

date                  0
dow                   0
time                  0
branch_name           0
district_name         0
branch_num            0
dep_point             0
arr_point             0
lane                  0
distance              0
volume                0
speed                 0
classification    24096
dtype: int64


In [36]:
# date 컬럼과 time 컬럼을 합쳐 datetime이라는 컬럼 만들기
C4_depnorth['datetime'] = C4_depnorth['date'] + ' ' + C4_depnorth['time']
C4_depnorth_test['datetime'] = C4_depnorth_test['date'] + ' ' + C4_depnorth_test['time']

In [37]:
# date 컬럼과 time 컬럼 제거
# C4_depsouth = C4_depsouth.drop(C4_depsouth[['date', 'time']], axis=1)

In [38]:
# datetime 문자형 컬럼을 datetime 자료형으로 변환
C4_depnorth['datetime'] = pd.to_datetime(C4_depnorth['datetime'])
C4_depnorth_test['datetime'] = pd.to_datetime(C4_depnorth_test['datetime'])

# classification 컬럼값 변경

In [39]:
C4_depnorth.describe()

Unnamed: 0,dow,lane,distance,volume,speed,classification
count,24096.0,24096.0,24096.0,24096.0,24096.0,0.0
mean,3.997012,3.0,1176.0,3268.06167,42.139455,
std,1.998295,0.0,0.0,1347.367306,10.487994,
min,1.0,3.0,1176.0,205.0,8.46,
25%,2.0,3.0,1176.0,2295.75,34.1,
50%,4.0,3.0,1176.0,3743.0,42.15,
75%,6.0,3.0,1176.0,4336.0,51.27,
max,7.0,3.0,1176.0,5487.0,62.62,


In [40]:
C4_depnorth.loc[C4_depnorth['speed'] >= C4_depnorth['speed'].mean(), 'classification'] = 1
C4_depnorth.loc[C4_depnorth['speed'] < 15, 'classification'] = 3
C4_depnorth.loc[(C4_depnorth['speed'] >= 15) 
                & (C4_depnorth['speed'] < 25) 
                & ((C4_depnorth['volume'] >= C4_depnorth['volume'].mean())), 'classification'] = 3
C4_depnorth.loc[(C4_depnorth['speed'] >= 15) 
                & (C4_depnorth['speed'] < C4_depnorth['speed'].mean()) 
                & ((C4_depnorth['volume'] < C4_depnorth['volume'].mean())), 'classification'] = 2
C4_depnorth.loc[(C4_depnorth['speed'] >= 25) 
                & (C4_depnorth['speed'] < C4_depnorth['speed'].mean()) 
                & ((C4_depnorth['volume'] >= C4_depnorth['volume'].mean())), 'classification'] = 2

In [41]:
C4_depnorth['classification']

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
24091    2.0
24092    2.0
24093    1.0
24094    1.0
24095    1.0
Name: classification, Length: 24096, dtype: float64

In [42]:
C4_depnorth['year'] = C4_depnorth['datetime'].dt.year
C4_depnorth['month'] = C4_depnorth['datetime'].dt.month
C4_depnorth['day'] = C4_depnorth['datetime'].dt.day
C4_depnorth['hour'] = C4_depnorth['datetime'].dt.hour

In [43]:
C4_depnorth

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2019-01-01,2,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2408,52.41,1.0,2019-01-01 00:00:00,2019,1,1,0
1,2019-01-01,2,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3406,49.32,1.0,2019-01-01 01:00:00,2019,1,1,1
2,2019-01-01,2,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2147,54.63,1.0,2019-01-01 02:00:00,2019,1,1,2
3,2019-01-01,2,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1421,58.31,1.0,2019-01-01 03:00:00,2019,1,1,3
4,2019-01-01,2,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1158,59.63,1.0,2019-01-01 04:00:00,2019,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24091,2021-09-30,4,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,4050,31.29,2.0,2021-09-30 19:00:00,2021,9,30,19
24092,2021-09-30,4,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3162,32.11,2.0,2021-09-30 20:00:00,2021,9,30,20
24093,2021-09-30,4,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3317,45.28,1.0,2021-09-30 21:00:00,2021,9,30,21
24094,2021-09-30,4,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3000,49.40,1.0,2021-09-30 22:00:00,2021,9,30,22


# 월별로 데이터 나누기

In [44]:
C4_dn_month = C4_depnorth['month']
C4_dn_month_list  = sorted(set(C4_dn_month))
C4_dn_month_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [45]:
month_data = []
for i in range(0, len(C4_dn_month_list)):
  month = C4_depnorth[C4_depnorth['month'] == C4_dn_month_list[i]]
  month = month.reset_index(drop=True)
  month_data.append(month)

In [46]:
train_jan = month_data[0]
train_feb = month_data[1]
train_mar = month_data[2]
train_apr = month_data[3]
train_may = month_data[4]
train_jun = month_data[5]
train_jul = month_data[6]
train_aug = month_data[7]
train_sep = month_data[8]
train_oct = month_data[9]
train_nov = month_data[10]
train_dec = month_data[11]

In [47]:
C4_depnorth_test['year'] = C4_depnorth_test['datetime'].dt.year
C4_depnorth_test['month'] = C4_depnorth_test['datetime'].dt.month
C4_depnorth_test['day'] = C4_depnorth_test['datetime'].dt.day
C4_depnorth_test['hour'] = C4_depnorth_test['datetime'].dt.hour

In [48]:
C4_dn_test_mon = C4_depnorth_test['month']
C4_dn_test_mon_list  = sorted(set(C4_dn_test_mon))
C4_dn_test_mon_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [49]:
month_test_data = []
for i in range(0, len(C4_dn_month_list)):
  month = C4_depnorth_test[C4_depnorth_test['month'] == C4_dn_test_mon_list[i]]
  month = month.reset_index(drop=True)
  month_test_data.append(month)

In [50]:
test_jan = month_test_data[0]
test_feb = month_test_data[1]
test_mar = month_test_data[2]
test_apr = month_test_data[3]
test_may = month_test_data[4]
test_jun = month_test_data[5]
test_jul = month_test_data[6]
test_aug = month_test_data[7]
test_sep = month_test_data[8]
test_oct = month_test_data[9]
test_nov = month_test_data[10]
test_dec = month_test_data[11]

In [51]:
test_dec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            744 non-null    object        
 1   dow             744 non-null    int64         
 2   time            744 non-null    object        
 3   branch_name     744 non-null    object        
 4   district_name   744 non-null    object        
 5   branch_num      744 non-null    object        
 6   dep_point       744 non-null    object        
 7   arr_point       744 non-null    object        
 8   lane            744 non-null    int64         
 9   distance        744 non-null    int64         
 10  volume          744 non-null    int64         
 11  speed           744 non-null    float64       
 12  classification  0 non-null      float64       
 13  datetime        744 non-null    datetime64[ns]
 14  year            744 non-null    int64         
 15  month 

# 1월 데이터 머신러닝

## 데이터 가공

In [52]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [53]:
X1 = train_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [54]:
y1 = train_jan[['datetime', 'classification']]
X1_1 = X1.drop(columns = ['datetime', 'classification'])
y1_1 = X1.datetime

In [55]:
X1_1_scaler = scaler.fit_transform(X1_1)

In [56]:
X1_1_sc = pd.DataFrame(X1_1_scaler)
X1_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1 = pd.concat([y1_1, X1_1_sc], axis = 1)

In [57]:
X1_test = test_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [58]:
y1_test = test_jan[['datetime', 'classification']]
X1_1_test = X1_test.drop(columns = ['datetime', 'classification'])
y1_1_test = X1_test.datetime

In [59]:
X1_1_test_scaler = scaler.fit_transform(X1_1_test)

In [60]:
X1_1_test_sc = pd.DataFrame(X1_1_test_scaler)
X1_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1_test = pd.concat([y1_1_test, X1_1_test_sc], axis = 1)
X1_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-01-01 00:00:00,0.833333,0.308015,0.907001,0.0,0.0,0.0,0.000000
1,2022-01-01 01:00:00,0.833333,0.335394,0.904661,0.0,0.0,0.0,0.043478
2,2022-01-01 02:00:00,0.833333,0.163612,0.949989,0.0,0.0,0.0,0.086957
3,2022-01-01 03:00:00,0.833333,0.108192,0.935731,0.0,0.0,0.0,0.130435
4,2022-01-01 04:00:00,0.833333,0.098697,0.946584,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-01-31 19:00:00,0.000000,0.679399,0.672484,0.0,0.0,1.0,0.826087
740,2022-01-31 20:00:00,0.000000,0.685361,0.687593,0.0,0.0,1.0,0.869565
741,2022-01-31 21:00:00,0.000000,0.561272,0.667163,0.0,0.0,1.0,0.913043
742,2022-01-31 22:00:00,0.000000,0.369397,0.707597,0.0,0.0,1.0,0.956522


## LightGBM

In [61]:
# optuna 설치
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.3-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 7.5 MB/s 
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 8.9 MB/s 
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 59.8 MB/s 
[?25hCollecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 8.5 MB/s 
Collecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 61.7 MB/s 
[?25hCollecting stevedore>=2.0.1
  Downloading stevedore-4.1.1-py3-none-any.whl (5

In [62]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error

In [63]:
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score

In [64]:
X1 = X1.drop(columns = ['datetime'])
y1 = y1.drop(columns = ['datetime'])
X1_test = X1_test.drop(columns = ['datetime'])
y1_test = y1_test.drop(columns = ['datetime'])

In [65]:
# LigthGBM 하이퍼파라미터 값 지정
def objectiveLGBM(trial: Trial, X, y):
    param = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'learning_rate': 0.01,
        'n_estimators': trial.suggest_int('n_estimators', 700, 3000),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'gpu_use_dp':True
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

     # 학습 모델 생성
    model = LGBMClassifier(**param)
    lgb_model = model.fit(X_train, y_train, verbose=True) # 학습 진행
    train_preds = lgb_model.predict(X_train)
    test_preds = lgb_model.predict(X_test)

    # 모델 성능 확인
    train_precision = precision_score(y_test, test_preds, average= "macro")
    
    return train_precision

In [66]:
study1 = optuna.create_study(direction='maximize',sampler=TPESampler())
study1.optimize(lambda trial : objectiveLGBM(trial, X1, y1), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study1.best_trial.value,study1.best_trial.params))

[32m[I 2022-12-01 02:13:46,040][0m A new study created in memory with name: no-name-1d1e5205-e4b8-4777-87c9-d4d34a8a03d5[0m
[32m[I 2022-12-01 02:13:50,028][0m Trial 0 finished with value: 0.9968841354723708 and parameters: {'num_leaves': 282, 'n_estimators': 1212, 'feature_fraction': 0.460850860424796, 'bagging_fraction': 0.7833116492683841, 'bagging_freq': 7, 'min_child_samples': 38}. Best is trial 0 with value: 0.9968841354723708.[0m
[32m[I 2022-12-01 02:13:52,593][0m Trial 1 finished with value: 0.8760683760683761 and parameters: {'num_leaves': 316, 'n_estimators': 2097, 'feature_fraction': 0.6013489834136801, 'bagging_fraction': 0.49338551547427156, 'bagging_freq': 2, 'min_child_samples': 93}. Best is trial 0 with value: 0.9968841354723708.[0m
[32m[I 2022-12-01 02:14:20,212][0m Trial 2 finished with value: 0.9583747927031508 and parameters: {'num_leaves': 338, 'n_estimators': 2182, 'feature_fraction': 0.6247259238647702, 'bagging_fraction': 0.46901950244065876, 'bagging_

Best trial: score 1.0,
params {'num_leaves': 164, 'n_estimators': 1620, 'feature_fraction': 0.4347119727212799, 'bagging_fraction': 0.9409637712476044, 'bagging_freq': 7, 'min_child_samples': 39}


In [67]:
optuna.visualization.plot_param_importances(study1) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study1) # 최적화 과정 시각화

In [69]:
X1_train, X1_val, y1_train, y1_val = train_test_split(X1, y1, test_size = 0.2, random_state = 42)

In [70]:
X1_train.shape, X1_val.shape, y1_train.shape, y1_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [73]:
model = LGBMClassifier(**study1.best_trial.params)

In [74]:
model1 = model.fit(X1_train, y1_train,
          eval_set = [(X1_train, y1_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.729716	training's multi_logloss: 0.729716
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.69651	training's multi_logloss: 0.69651
[3]	training's multi_logloss: 0.619436	training's multi_logloss: 0.619436
[4]	training's multi_logloss: 0.612602	training's multi_logloss: 0.612602
[5]	training's multi_logloss: 0.555431	training's multi_logloss: 0.555431
[6]	training's multi_logloss: 0.508109	training's multi_logloss: 0.508109
[7]	training's multi_logloss: 0.464154	training's multi_logloss: 0.464154
[8]	training's multi_logloss: 0.425833	training's multi_logloss: 0.425833
[9]	training's multi_logloss: 0.393025	training's multi_logloss: 0.393025
[10]	training's multi_logloss: 0.362689	training's multi_logloss: 0.362689
[11]	training's multi_logloss: 0.342571	training's multi_logloss: 0.342571
[12]	training's multi_logloss: 0.323359	training's multi_logloss: 0.323359
[13]	training's multi_logloss: 0.297236	training

In [75]:
train1_preds = model1.predict(X1_train)
val1_preds = model1.predict(X1_val)

In [76]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [77]:
get_clf_eval(y1_train, train1_preds)
get_clf_eval(y1_val, val1_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9969
재현율: 0.9679


In [78]:
preds_1 = model1.predict(X1_test)
preds_1

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 1., 1., 1., 1., 1., 2., 3., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 1., 1.,
       1., 1., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 1., 1., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2.

In [79]:
test_jan['classification'] = preds_1
test_jan

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-01-01,6,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1627,55.55,1.0,2022-01-01 00:00:00,2022,1,1,0
1,2022-01-01,6,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1751,55.44,1.0,2022-01-01 01:00:00,2022,1,1,1
2,2022-01-01,6,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,973,57.57,1.0,2022-01-01 02:00:00,2022,1,1,2
3,2022-01-01,6,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,722,56.90,1.0,2022-01-01 03:00:00,2022,1,1,3
4,2022-01-01,6,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,679,57.41,1.0,2022-01-01 04:00:00,2022,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-01-31,1,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3309,44.53,1.0,2022-01-31 19:00:00,2022,1,31,19
740,2022-01-31,1,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3336,45.24,1.0,2022-01-31 20:00:00,2022,1,31,20
741,2022-01-31,1,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2774,44.28,1.0,2022-01-31 21:00:00,2022,1,31,21
742,2022-01-31,1,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1905,46.18,1.0,2022-01-31 22:00:00,2022,1,31,22


# 2월 데이터 머신러닝

## 데이터 가공

In [80]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [81]:
X2 = train_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [82]:
y2 = train_feb[['datetime', 'classification']]
X2_1 = X2.drop(columns = ['datetime', 'classification'])
y2_1 = X2.datetime

In [83]:
X2_1_scaler = scaler.fit_transform(X2_1)

In [84]:
X2_1_sc = pd.DataFrame(X2_1_scaler)
X2_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2 = pd.concat([y2_1, X2_1_sc], axis = 1)
X2

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-02-01 00:00:00,0.666667,0.516946,0.836152,0.0,0.0,0.000000,0.000000
1,2019-02-01 01:00:00,0.666667,0.333134,0.920492,0.0,0.0,0.000000,0.043478
2,2019-02-01 02:00:00,0.666667,0.208134,0.962223,0.0,0.0,0.000000,0.086957
3,2019-02-01 03:00:00,0.666667,0.160885,0.969031,0.0,0.0,0.000000,0.130435
4,2019-02-01 04:00:00,0.666667,0.192384,0.934549,0.0,0.0,0.000000,0.173913
...,...,...,...,...,...,...,...,...
2035,2021-02-28 19:00:00,1.000000,0.580144,0.557215,1.0,0.0,0.964286,0.826087
2036,2021-02-28 20:00:00,1.000000,0.640949,0.614320,1.0,0.0,0.964286,0.869565
2037,2021-02-28 21:00:00,1.000000,0.564793,0.659345,1.0,0.0,0.964286,0.913043
2038,2021-02-28 22:00:00,1.000000,0.562799,0.714694,1.0,0.0,0.964286,0.956522


In [85]:
X2_test = test_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [86]:
y2_test = test_feb[['datetime', 'classification']]
X2_1_test = X2_test.drop(columns = ['datetime', 'classification'])
y2_1_test = X2_test.datetime

In [87]:
X2_1_test_scaler = scaler.fit_transform(X2_1_test)

In [88]:
X2_1_test_sc = pd.DataFrame(X2_1_test_scaler)
X2_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2_test = pd.concat([y2_1_test, X2_1_test_sc], axis = 1)
X2_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-02-01 00:00:00,0.166667,0.105813,0.854026,0.0,0.0,0.0,0.000000
1,2022-02-01 01:00:00,0.166667,0.044732,0.881017,0.0,0.0,0.0,0.043478
2,2022-02-01 02:00:00,0.166667,0.012489,0.861898,0.0,0.0,0.0,0.086957
3,2022-02-01 03:00:00,0.166667,0.000000,0.819163,0.0,0.0,0.0,0.130435
4,2022-02-01 04:00:00,0.166667,0.009764,0.643275,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
667,2022-02-28 19:00:00,0.000000,0.825386,0.424202,0.0,0.0,1.0,0.826087
668,2022-02-28 20:00:00,0.000000,0.688919,0.580747,0.0,0.0,1.0,0.869565
669,2022-02-28 21:00:00,0.000000,0.694142,0.656095,0.0,0.0,1.0,0.913043
670,2022-02-28 22:00:00,0.000000,0.611035,0.706928,0.0,0.0,1.0,0.956522


## LightGBM

In [89]:
X2 = X2.drop(columns = ['datetime'])
y2 = y2.drop(columns = ['datetime'])
X2_test = X2_test.drop(columns = ['datetime'])
y2_test = y2_test.drop(columns = ['datetime'])

In [90]:
study2 = optuna.create_study(direction='maximize',sampler=TPESampler())
study2.optimize(lambda trial : objectiveLGBM(trial, X2, y2), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study2.best_trial.value,study2.best_trial.params))

[32m[I 2022-12-01 02:24:00,154][0m A new study created in memory with name: no-name-d34f20bb-218f-4bf8-b888-4df8de1b92dd[0m
[32m[I 2022-12-01 02:24:01,084][0m Trial 0 finished with value: 0.9204419889502763 and parameters: {'num_leaves': 433, 'n_estimators': 787, 'feature_fraction': 0.9404003490167663, 'bagging_fraction': 0.4279380918681003, 'bagging_freq': 1, 'min_child_samples': 84}. Best is trial 0 with value: 0.9204419889502763.[0m
[32m[I 2022-12-01 02:24:05,556][0m Trial 1 finished with value: 0.9953271028037384 and parameters: {'num_leaves': 296, 'n_estimators': 2360, 'feature_fraction': 0.6471984293265005, 'bagging_fraction': 0.7864426503281745, 'bagging_freq': 6, 'min_child_samples': 57}. Best is trial 1 with value: 0.9953271028037384.[0m
[32m[I 2022-12-01 02:24:10,874][0m Trial 2 finished with value: 0.956876456876457 and parameters: {'num_leaves': 185, 'n_estimators': 2841, 'feature_fraction': 0.6780079456589383, 'bagging_fraction': 0.43188584235174216, 'bagging_fr

Best trial: score 1.0,
params {'num_leaves': 507, 'n_estimators': 1996, 'feature_fraction': 0.7881205520450134, 'bagging_fraction': 0.7966932674835542, 'bagging_freq': 3, 'min_child_samples': 36}


In [91]:
optuna.visualization.plot_param_importances(study2) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study2) # 최적화 과정 시각화

In [92]:
X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [93]:
X2_train.shape, X2_val.shape, y2_train.shape, y2_val.shape

((1632, 7), (408, 7), (1632, 1), (408, 1))

In [94]:
model = LGBMClassifier(**study2.best_trial.params)

In [95]:
model2 = model.fit(X2_train, y2_train,
          eval_set = [(X2_train, y2_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.692392	training's multi_logloss: 0.692392
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.609817	training's multi_logloss: 0.609817
[3]	training's multi_logloss: 0.530282	training's multi_logloss: 0.530282
[4]	training's multi_logloss: 0.46637	training's multi_logloss: 0.46637
[5]	training's multi_logloss: 0.413432	training's multi_logloss: 0.413432
[6]	training's multi_logloss: 0.370408	training's multi_logloss: 0.370408
[7]	training's multi_logloss: 0.33737	training's multi_logloss: 0.33737
[8]	training's multi_logloss: 0.308926	training's multi_logloss: 0.308926
[9]	training's multi_logloss: 0.27921	training's multi_logloss: 0.27921
[10]	training's multi_logloss: 0.255733	training's multi_logloss: 0.255733
[11]	training's multi_logloss: 0.23072	training's multi_logloss: 0.23072
[12]	training's multi_logloss: 0.21221	training's multi_logloss: 0.21221
[13]	training's multi_logloss: 0.195547	training's multi

In [96]:
train2_preds = model2.predict(X2_train)
val2_preds = model2.predict(X2_val)

In [97]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [98]:
get_clf_eval(y2_train, train2_preds)
get_clf_eval(y2_val, val2_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [99]:
preds_2= model2.predict(X2_test)
preds_2

array([1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2.

In [100]:
test_feb['classification'] = preds_2
test_feb

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-02-01,2,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,713,55.66,1.0,2022-02-01 00:00:00,2022,2,1,0
1,2022-02-01,2,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,444,56.86,1.0,2022-02-01 01:00:00,2022,2,1,1
2,2022-02-01,2,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,302,56.01,1.0,2022-02-01 02:00:00,2022,2,1,2
3,2022-02-01,2,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,247,54.11,1.0,2022-02-01 03:00:00,2022,2,1,3
4,2022-02-01,2,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,290,46.29,1.0,2022-02-01 04:00:00,2022,2,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-02-28,1,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3882,36.55,2.0,2022-02-28 19:00:00,2022,2,28,19
668,2022-02-28,1,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3281,43.51,2.0,2022-02-28 20:00:00,2022,2,28,20
669,2022-02-28,1,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3304,46.86,1.0,2022-02-28 21:00:00,2022,2,28,21
670,2022-02-28,1,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2938,49.12,1.0,2022-02-28 22:00:00,2022,2,28,22


# 3월 데이터 머신러닝

## 데이터 가공

In [101]:
X3 = train_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [102]:
y3 = train_mar[['datetime', 'classification']]
X3_1 = X3.drop(columns = ['datetime', 'classification'])
y3_1 = X3.datetime

In [103]:
X3_1_scaler = scaler.fit_transform(X3_1)

In [104]:
X3_1_sc = pd.DataFrame(X3_1_scaler)
X3_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3 = pd.concat([y3_1, X3_1_sc], axis = 1)
X3

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-03-01 00:00:00,0.666667,0.520095,0.793000,0.0,0.0,0.0,0.000000
1,2019-03-01 01:00:00,0.666667,0.385540,0.884977,0.0,0.0,0.0,0.043478
2,2019-03-01 02:00:00,0.666667,0.283491,0.943662,0.0,0.0,0.0,0.086957
3,2019-03-01 03:00:00,0.666667,0.204492,0.976099,0.0,0.0,0.0,0.130435
4,2019-03-01 04:00:00,0.666667,0.226556,0.954545,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-03-31 19:00:00,0.333333,0.629630,0.367478,1.0,0.0,1.0,0.826087
2228,2021-03-31 20:00:00,0.333333,0.557723,0.590909,1.0,0.0,1.0,0.869565
2229,2021-03-31 21:00:00,0.333333,0.562648,0.670294,1.0,0.0,1.0,0.913043
2230,2021-03-31 22:00:00,0.333333,0.570528,0.670081,1.0,0.0,1.0,0.956522


In [105]:
X3_test = test_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [106]:
y3_test = test_mar[['datetime', 'classification']]
X3_1_test = X3_test.drop(columns = ['datetime', 'classification'])
y3_1_test = X3_test.datetime

In [107]:
X3_1_test_scaler = scaler.fit_transform(X3_1_test)

In [108]:
X3_1_test_sc = pd.DataFrame(X3_1_test_scaler)
X3_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3_test = pd.concat([y3_1_test, X3_1_test_sc], axis = 1)
X3_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-03-01 00:00:00,0.166667,0.239919,0.912002,0.0,0.0,0.0,0.000000
1,2022-03-01 01:00:00,0.166667,0.133065,0.967109,0.0,0.0,0.0,0.043478
2,2022-03-01 02:00:00,0.166667,0.081093,0.946047,0.0,0.0,0.0,0.086957
3,2022-03-01 03:00:00,0.166667,0.053539,0.871321,0.0,0.0,0.0,0.130435
4,2022-03-01 04:00:00,0.166667,0.080421,0.798615,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-03-31 19:00:00,0.500000,0.685260,0.469994,0.0,0.0,1.0,0.826087
740,2022-03-31 20:00:00,0.500000,0.638217,0.601269,0.0,0.0,1.0,0.869565
741,2022-03-31 21:00:00,0.500000,0.668235,0.624062,0.0,0.0,1.0,0.913043
742,2022-03-31 22:00:00,0.500000,0.564740,0.731679,0.0,0.0,1.0,0.956522


## LightGBM

In [109]:
X3 = X3.drop(columns = ['datetime'])
y3 = y3.drop(columns = ['datetime'])
X3_test = X3_test.drop(columns = ['datetime'])
y3_test = y3_test.drop(columns = ['datetime'])

In [110]:
study3 = optuna.create_study(direction='maximize',sampler=TPESampler())
study3.optimize(lambda trial : objectiveLGBM(trial, X3, y3), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study3.best_trial.value,study3.best_trial.params))

[32m[I 2022-12-01 02:25:30,941][0m A new study created in memory with name: no-name-82bab878-c6f1-4df5-a0c4-9d0e380bf5ae[0m
[32m[I 2022-12-01 02:25:36,430][0m Trial 0 finished with value: 0.8898587933247754 and parameters: {'num_leaves': 414, 'n_estimators': 2841, 'feature_fraction': 0.49594467072878434, 'bagging_fraction': 0.9409813791506237, 'bagging_freq': 7, 'min_child_samples': 75}. Best is trial 0 with value: 0.8898587933247754.[0m
[32m[I 2022-12-01 02:25:38,981][0m Trial 1 finished with value: 0.9965635738831615 and parameters: {'num_leaves': 157, 'n_estimators': 1016, 'feature_fraction': 0.7883190282511638, 'bagging_fraction': 0.50856104222589, 'bagging_freq': 5, 'min_child_samples': 27}. Best is trial 1 with value: 0.9965635738831615.[0m
[32m[I 2022-12-01 02:25:43,813][0m Trial 2 finished with value: 0.9983249581239532 and parameters: {'num_leaves': 219, 'n_estimators': 2365, 'feature_fraction': 0.8817142137614234, 'bagging_fraction': 0.7602315128112245, 'bagging_fr

Best trial: score 0.9985693848354793,
params {'num_leaves': 420, 'n_estimators': 805, 'feature_fraction': 0.6695299865307907, 'bagging_fraction': 0.8061558910140447, 'bagging_freq': 4, 'min_child_samples': 36}


In [111]:
optuna.visualization.plot_param_importances(study3) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study3) # 최적화 과정 시각화

In [112]:
X3_train, X3_val, y3_train, y3_val = train_test_split(X3, y3, test_size = 0.2, random_state = 42)

In [113]:
X3_train.shape, X3_val.shape, y3_train.shape, y3_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [114]:
model = LGBMClassifier(**study3.best_trial.params)

In [115]:
model3 = model.fit(X3_train, y3_train,
          eval_set = [(X3_train, y3_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.687369	training's multi_logloss: 0.687369
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.605591	training's multi_logloss: 0.605591
[3]	training's multi_logloss: 0.520515	training's multi_logloss: 0.520515
[4]	training's multi_logloss: 0.456926	training's multi_logloss: 0.456926
[5]	training's multi_logloss: 0.404106	training's multi_logloss: 0.404106
[6]	training's multi_logloss: 0.361117	training's multi_logloss: 0.361117
[7]	training's multi_logloss: 0.327695	training's multi_logloss: 0.327695
[8]	training's multi_logloss: 0.29972	training's multi_logloss: 0.29972
[9]	training's multi_logloss: 0.269779	training's multi_logloss: 0.269779
[10]	training's multi_logloss: 0.247065	training's multi_logloss: 0.247065
[11]	training's multi_logloss: 0.222171	training's multi_logloss: 0.222171
[12]	training's multi_logloss: 0.202992	training's multi_logloss: 0.202992
[13]	training's multi_logloss: 0.186323	training

In [116]:
train3_preds = model3.predict(X3_train)
val3_preds = model3.predict(X3_val)

In [117]:
get_clf_eval(y3_train, train3_preds)
get_clf_eval(y3_val, val3_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9966
재현율: 0.9972


In [118]:
preds_3= model3.predict(X3_test)
preds_3

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 3., 3., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 3.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 2.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 3., 2.

In [119]:
test_mar['classification'] = preds_3
test_mar

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-03-01,2,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1463,57.25,1.0,2022-03-01 00:00:00,2022,3,1,0
1,2022-03-01,2,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,986,59.16,1.0,2022-03-01 01:00:00,2022,3,1,1
2,2022-03-01,2,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,754,58.43,1.0,2022-03-01 02:00:00,2022,3,1,2
3,2022-03-01,2,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,631,55.84,1.0,2022-03-01 03:00:00,2022,3,1,3
4,2022-03-01,2,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,751,53.32,1.0,2022-03-01 04:00:00,2022,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-03-31,4,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3451,41.93,2.0,2022-03-31 19:00:00,2022,3,31,19
740,2022-03-31,4,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3241,46.48,1.0,2022-03-31 20:00:00,2022,3,31,20
741,2022-03-31,4,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3375,47.27,1.0,2022-03-31 21:00:00,2022,3,31,21
742,2022-03-31,4,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2913,51.00,1.0,2022-03-31 22:00:00,2022,3,31,22


# 4월 데이터 머신러닝

## 데이터 가공

In [120]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [121]:
X4 = train_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [122]:
y4 = train_apr[['datetime', 'classification']]
X4_1 = X4.drop(columns = ['datetime', 'classification'])
y4_1 = X4.datetime

In [123]:
X4_1_scaler = scaler.fit_transform(X4_1)

In [124]:
X4_1_sc = pd.DataFrame(X4_1_scaler)
X4_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4 = pd.concat([y4_1, X4_1_sc], axis = 1)
X4

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-04-01 00:00:00,0.000000,0.288155,0.856908,0.0,0.0,0.0,0.000000
1,2019-04-01 01:00:00,0.000000,0.155146,0.915623,0.0,0.0,0.0,0.043478
2,2019-04-01 02:00:00,0.000000,0.078447,0.898583,0.0,0.0,0.0,0.086957
3,2019-04-01 03:00:00,0.000000,0.094369,0.904332,0.0,0.0,0.0,0.130435
4,2019-04-01 04:00:00,0.000000,0.230097,0.883802,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-04-30 19:00:00,0.666667,0.554757,0.210634,1.0,0.0,1.0,0.826087
2156,2021-04-30 20:00:00,0.666667,0.566214,0.492301,1.0,0.0,1.0,0.869565
2157,2021-04-30 21:00:00,0.666667,0.614757,0.599877,1.0,0.0,1.0,0.913043
2158,2021-04-30 22:00:00,0.666667,0.641165,0.614863,1.0,0.0,1.0,0.956522


In [125]:
X4_test = test_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [126]:
y4_test = test_apr[['datetime', 'classification']]
X4_1_test = X4_test.drop(columns = ['datetime', 'classification'])
y4_1_test = X4_test.datetime

In [127]:
X4_1_test_scaler = scaler.fit_transform(X4_1_test)

In [128]:
X4_1_test_sc = pd.DataFrame(X4_1_test_scaler)
X4_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4_test = pd.concat([y4_1_test, X4_1_test_sc], axis = 1)
X4_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-04-01 00:00:00,0.666667,0.237230,0.902208,0.0,0.0,0.0,0.000000
1,2022-04-01 01:00:00,0.666667,0.113438,0.958728,0.0,0.0,0.0,0.043478
2,2022-04-01 02:00:00,0.666667,0.045329,0.895110,0.0,0.0,0.0,0.086957
3,2022-04-01 03:00:00,0.666667,0.037046,0.884595,0.0,0.0,0.0,0.130435
4,2022-04-01 04:00:00,0.666667,0.125633,0.865931,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-04-30 19:00:00,0.833333,0.670732,0.465563,0.0,0.0,1.0,0.826087
716,2022-04-30 20:00:00,0.833333,0.615048,0.333596,0.0,0.0,1.0,0.869565
717,2022-04-30 21:00:00,0.833333,0.674643,0.563354,0.0,0.0,1.0,0.913043
718,2022-04-30 22:00:00,0.833333,0.610216,0.634069,0.0,0.0,1.0,0.956522


## LightGBM

In [129]:
X4 = X4.drop(columns = ['datetime'])
y4 = y4.drop(columns = ['datetime'])
X4_test = X4_test.drop(columns = ['datetime'])
y4_test = y4_test.drop(columns = ['datetime'])

In [130]:
study4 = optuna.create_study(direction='maximize',sampler=TPESampler())
study4.optimize(lambda trial : objectiveLGBM(trial, X4, y4), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study4.best_trial.value,study4.best_trial.params))

[32m[I 2022-12-01 02:27:03,352][0m A new study created in memory with name: no-name-4f52c7d0-b4d6-4c66-8b3e-54bc1e4269c7[0m
[32m[I 2022-12-01 02:27:06,760][0m Trial 0 finished with value: 0.9754890551854499 and parameters: {'num_leaves': 175, 'n_estimators': 1523, 'feature_fraction': 0.5789107984255903, 'bagging_fraction': 0.9293801885017837, 'bagging_freq': 7, 'min_child_samples': 60}. Best is trial 0 with value: 0.9754890551854499.[0m
[32m[I 2022-12-01 02:27:11,066][0m Trial 1 finished with value: 0.8825995807127883 and parameters: {'num_leaves': 299, 'n_estimators': 2694, 'feature_fraction': 0.9849927615302115, 'bagging_fraction': 0.7759389593751099, 'bagging_freq': 3, 'min_child_samples': 94}. Best is trial 0 with value: 0.9754890551854499.[0m
[32m[I 2022-12-01 02:27:18,164][0m Trial 2 finished with value: 0.9968992248062015 and parameters: {'num_leaves': 157, 'n_estimators': 1480, 'feature_fraction': 0.4297566143417588, 'bagging_fraction': 0.5565324488339808, 'bagging_f

Best trial: score 1.0,
params {'num_leaves': 415, 'n_estimators': 1072, 'feature_fraction': 0.5194803621173848, 'bagging_fraction': 0.7787457034830918, 'bagging_freq': 1, 'min_child_samples': 25}


In [131]:
optuna.visualization.plot_param_importances(study4) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study4) # 최적화 과정 시각화

In [132]:
X4_train, X4_val, y4_train, y4_val = train_test_split(X4, y4, test_size = 0.2, random_state = 42)

In [133]:
X4_train.shape, X4_val.shape, y4_train.shape, y4_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [134]:
model = LGBMClassifier(**study4.best_trial.params)

In [135]:
model4 = model.fit(X4_train, y4_train,
          eval_set = [(X4_train, y4_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.739097	training's multi_logloss: 0.739097
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.624407	training's multi_logloss: 0.624407
[3]	training's multi_logloss: 0.555	training's multi_logloss: 0.555
[4]	training's multi_logloss: 0.496639	training's multi_logloss: 0.496639
[5]	training's multi_logloss: 0.439427	training's multi_logloss: 0.439427
[6]	training's multi_logloss: 0.400977	training's multi_logloss: 0.400977
[7]	training's multi_logloss: 0.363989	training's multi_logloss: 0.363989
[8]	training's multi_logloss: 0.330892	training's multi_logloss: 0.330892
[9]	training's multi_logloss: 0.310494	training's multi_logloss: 0.310494
[10]	training's multi_logloss: 0.285716	training's multi_logloss: 0.285716
[11]	training's multi_logloss: 0.269416	training's multi_logloss: 0.269416
[12]	training's multi_logloss: 0.249124	training's multi_logloss: 0.249124
[13]	training's multi_logloss: 0.228765	training's m

In [136]:
train4_preds = model4.predict(X4_train)
val4_preds = model4.predict(X4_val)

In [137]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [138]:
get_clf_eval(y4_train, train4_preds)
get_clf_eval(y4_val, val4_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9833
재현율: 0.9984


In [139]:
preds_4= model4.predict(X4_test)
preds_4

array([1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 3., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 3., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 3., 2., 2., 2., 2., 2., 2., 2., 3.,
       3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [140]:
test_apr['classification'] = preds_4
test_apr

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-04-01,5,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1508,57.76,1.0,2022-04-01 00:00:00,2022,4,1,0
1,2022-04-01,5,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,970,59.91,1.0,2022-04-01 01:00:00,2022,4,1,1
2,2022-04-01,5,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,674,57.49,1.0,2022-04-01 02:00:00,2022,4,1,2
3,2022-04-01,5,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,638,57.09,1.0,2022-04-01 03:00:00,2022,4,1,3
4,2022-04-01,5,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1023,56.38,1.0,2022-04-01 04:00:00,2022,4,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-04-30,6,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3392,41.15,2.0,2022-04-30 19:00:00,2022,4,30,19
716,2022-04-30,6,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3150,36.13,2.0,2022-04-30 20:00:00,2022,4,30,20
717,2022-04-30,6,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3409,44.87,2.0,2022-04-30 21:00:00,2022,4,30,21
718,2022-04-30,6,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3129,47.56,1.0,2022-04-30 22:00:00,2022,4,30,22


# 5월 데이터 머신러닝

## 데이터 가공

In [141]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [142]:
X5 = train_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [143]:
y5 = train_may[['datetime', 'classification']]
X5_1 = X5.drop(columns = ['datetime', 'classification'])
y5_1 = X5.datetime

In [144]:
X5_1_scaler = scaler.fit_transform(X5_1)

In [145]:
X5_1_sc = pd.DataFrame(X5_1_scaler)
X5_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5 = pd.concat([y5_1, X5_1_sc], axis = 1)
X5

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-05-01 00:00:00,0.333333,0.530073,0.823176,0.0,0.0,0.0,0.000000
1,2019-05-01 01:00:00,0.333333,0.403668,0.895315,0.0,0.0,0.0,0.043478
2,2019-05-01 02:00:00,0.333333,0.287714,0.943823,0.0,0.0,0.0,0.086957
3,2019-05-01 03:00:00,0.333333,0.224019,0.973673,0.0,0.0,0.0,0.130435
4,2019-05-01 04:00:00,0.333333,0.228949,0.950456,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-05-31 19:00:00,0.000000,0.606981,0.507255,1.0,0.0,1.0,0.826087
2228,2021-05-31 20:00:00,0.000000,0.556301,0.632048,1.0,0.0,1.0,0.869565
2229,2021-05-31 21:00:00,0.000000,0.590219,0.667289,1.0,0.0,1.0,0.913043
2230,2021-05-31 22:00:00,0.000000,0.572668,0.717247,1.0,0.0,1.0,0.956522


In [146]:
X5_test = test_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [147]:
y5_test = test_may[['datetime', 'classification']]
X5_1_test = X5_test.drop(columns = ['datetime', 'classification'])
y5_1_test = X5_test.datetime

In [148]:
X5_1_test_scaler = scaler.fit_transform(X5_1_test)

In [149]:
X5_1_test_sc = pd.DataFrame(X5_1_test_scaler)
X5_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5_test = pd.concat([y5_1_test, X5_1_test_sc], axis = 1)
X5_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-05-01 00:00:00,1.000000,0.384842,0.843604,0.0,0.0,0.0,0.000000
1,2022-05-01 01:00:00,1.000000,0.251131,0.919990,0.0,0.0,0.0,0.043478
2,2022-05-01 02:00:00,1.000000,0.151357,0.967633,0.0,0.0,0.0,0.086957
3,2022-05-01 03:00:00,1.000000,0.102715,0.967115,0.0,0.0,0.0,0.130435
4,2022-05-01 04:00:00,1.000000,0.087783,0.939151,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-05-31 19:00:00,0.166667,0.680317,0.309684,0.0,0.0,1.0,0.826087
740,2022-05-31 20:00:00,0.166667,0.582353,0.298032,0.0,0.0,1.0,0.869565
741,2022-05-31 21:00:00,0.166667,0.621493,0.485759,0.0,0.0,1.0,0.913043
742,2022-05-31 22:00:00,0.166667,0.629412,0.620145,0.0,0.0,1.0,0.956522


## LightGBM

In [150]:
X5 = X5.drop(columns = ['datetime'])
y5 = y5.drop(columns = ['datetime'])
X5_test = X5_test.drop(columns = ['datetime'])
y5_test = y5_test.drop(columns = ['datetime'])

In [151]:
study5 = optuna.create_study(direction='maximize',sampler=TPESampler())
study5.optimize(lambda trial : objectiveLGBM(trial, X5, y5), n_trials=20) 
print('Best trial: score {},\nparams {}'.format(study5.best_trial.value,study5.best_trial.params))

[32m[I 2022-12-01 02:28:51,292][0m A new study created in memory with name: no-name-83681fd3-e1dd-48c9-adcd-f08dab9c8357[0m
[32m[I 2022-12-01 02:28:54,246][0m Trial 0 finished with value: 0.9984779299847792 and parameters: {'num_leaves': 334, 'n_estimators': 977, 'feature_fraction': 0.6412549458039347, 'bagging_fraction': 0.9712672304074385, 'bagging_freq': 4, 'min_child_samples': 54}. Best is trial 0 with value: 0.9984779299847792.[0m
[32m[I 2022-12-01 02:28:58,123][0m Trial 1 finished with value: 0.9449071108263934 and parameters: {'num_leaves': 440, 'n_estimators': 2007, 'feature_fraction': 0.6744395158603274, 'bagging_fraction': 0.9274803753970717, 'bagging_freq': 1, 'min_child_samples': 68}. Best is trial 0 with value: 0.9984779299847792.[0m
[32m[I 2022-12-01 02:28:59,618][0m Trial 2 finished with value: 0.965266106442577 and parameters: {'num_leaves': 56, 'n_estimators': 772, 'feature_fraction': 0.6219063055456774, 'bagging_fraction': 0.4667044761840363, 'bagging_freq'

Best trial: score 0.9984779299847792,
params {'num_leaves': 334, 'n_estimators': 977, 'feature_fraction': 0.6412549458039347, 'bagging_fraction': 0.9712672304074385, 'bagging_freq': 4, 'min_child_samples': 54}


In [152]:
optuna.visualization.plot_param_importances(study5) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study5) # 최적화 과정 시각화

In [153]:
X5_train, X5_val, y5_train, y5_val = train_test_split(X5, y5, test_size = 0.2, random_state = 42)

In [154]:
X5_train.shape, X5_val.shape, y5_train.shape, y5_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [155]:
model = LGBMClassifier(**study5.best_trial.params)

In [156]:
model5 = model.fit(X5_train, y5_train,
          eval_set = [(X5_train, y5_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.738043	training's multi_logloss: 0.738043
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.626501	training's multi_logloss: 0.626501
[3]	training's multi_logloss: 0.56069	training's multi_logloss: 0.56069
[4]	training's multi_logloss: 0.504948	training's multi_logloss: 0.504948
[5]	training's multi_logloss: 0.447503	training's multi_logloss: 0.447503
[6]	training's multi_logloss: 0.409752	training's multi_logloss: 0.409752
[7]	training's multi_logloss: 0.374358	training's multi_logloss: 0.374358
[8]	training's multi_logloss: 0.342801	training's multi_logloss: 0.342801
[9]	training's multi_logloss: 0.324035	training's multi_logloss: 0.324035
[10]	training's multi_logloss: 0.298809	training's multi_logloss: 0.298809
[11]	training's multi_logloss: 0.282338	training's multi_logloss: 0.282338
[12]	training's multi_logloss: 0.261605	training's multi_logloss: 0.261605
[13]	training's multi_logloss: 0.242074	training

In [157]:
train5_preds = model5.predict(X5_train)
val5_preds = model5.predict(X5_val)

In [158]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [159]:
get_clf_eval(y5_train, train5_preds)
get_clf_eval(y5_val, val5_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [160]:
preds_5= model5.predict(X5_test)
preds_5

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 3., 2., 2.,
       2., 2., 2., 3., 3., 3., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       3., 3., 3., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 3., 3., 2.,
       3., 2., 3., 2., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 3., 2.

In [161]:
test_may['classification'] = preds_5
test_may

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-05-01,7,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2095,55.97,1.0,2022-05-01 00:00:00,2022,5,1,0
1,2022-05-01,7,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1504,58.92,1.0,2022-05-01 01:00:00,2022,5,1,1
2,2022-05-01,7,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1063,60.76,1.0,2022-05-01 02:00:00,2022,5,1,2
3,2022-05-01,7,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,848,60.74,1.0,2022-05-01 03:00:00,2022,5,1,3
4,2022-05-01,7,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,782,59.66,1.0,2022-05-01 04:00:00,2022,5,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-05-31,2,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3401,35.35,2.0,2022-05-31 19:00:00,2022,5,31,19
740,2022-05-31,2,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2968,34.90,2.0,2022-05-31 20:00:00,2022,5,31,20
741,2022-05-31,2,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3141,42.15,2.0,2022-05-31 21:00:00,2022,5,31,21
742,2022-05-31,2,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3176,47.34,1.0,2022-05-31 22:00:00,2022,5,31,22


# 6월 데이터 머신러닝

## 데이터 가공

In [162]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [163]:
X6 = train_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [164]:
y6 = train_jun[['datetime', 'classification']]
X6_1 = X6.drop(columns = ['datetime', 'classification'])
y6_1 = X6.datetime

In [165]:
X6_1_scaler = scaler.fit_transform(X6_1)

In [166]:
X6_1_sc = pd.DataFrame(X6_1_scaler)
X6_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6 = pd.concat([y6_1, X6_1_sc], axis = 1)
X6

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-06-01 00:00:00,0.833333,0.579880,0.796230,0.0,0.0,0.0,0.000000
1,2019-06-01 01:00:00,0.833333,0.433267,0.870512,0.0,0.0,0.0,0.043478
2,2019-06-01 02:00:00,0.833333,0.353586,0.921230,0.0,0.0,0.0,0.086957
3,2019-06-01 03:00:00,0.833333,0.244622,0.954219,0.0,0.0,0.0,0.130435
4,2019-06-01 04:00:00,0.833333,0.269721,0.945018,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-06-30 19:00:00,0.333333,0.719522,0.426391,1.0,0.0,1.0,0.826087
2156,2021-06-30 20:00:00,0.333333,0.648606,0.585727,1.0,0.0,1.0,0.869565
2157,2021-06-30 21:00:00,0.333333,0.709562,0.630835,1.0,0.0,1.0,0.913043
2158,2021-06-30 22:00:00,0.333333,0.712948,0.647217,1.0,0.0,1.0,0.956522


In [167]:
X6_test = test_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [168]:
y6_test = test_jun[['datetime', 'classification']]
X6_1_test = X6_test.drop(columns = ['datetime', 'classification'])
y6_1_test = X6_test.datetime

In [169]:
X6_1_test_scaler = scaler.fit_transform(X6_1_test)

In [170]:
X6_1_test_sc = pd.DataFrame(X6_1_test_scaler)
X6_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6_test = pd.concat([y6_1_test, X6_1_test_sc], axis = 1)
X6_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-06-01 00:00:00,0.333333,0.160554,0.832735,0.0,0.0,0.0,0.000000
1,2022-06-01 01:00:00,0.333333,0.078252,0.905001,0.0,0.0,0.0,0.043478
2,2022-06-01 02:00:00,0.333333,0.030277,0.961713,0.0,0.0,0.0,0.086957
3,2022-06-01 03:00:00,0.333333,0.026226,0.995214,0.0,0.0,0.0,0.130435
4,2022-06-01 04:00:00,0.333333,0.105757,0.946877,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-06-30 19:00:00,0.500000,0.746482,0.189040,0.0,0.0,1.0,0.826087
716,2022-06-30 20:00:00,0.500000,0.701493,0.464226,0.0,0.0,1.0,0.869565
717,2022-06-30 21:00:00,0.500000,0.755864,0.567839,0.0,0.0,1.0,0.913043
718,2022-06-30 22:00:00,0.500000,0.775906,0.635080,0.0,0.0,1.0,0.956522


## LightGBM

In [171]:
X6 = X6.drop(columns = ['datetime'])
y6 = y6.drop(columns = ['datetime'])
X6_test = X6_test.drop(columns = ['datetime'])
y6_test = y6_test.drop(columns = ['datetime'])

In [179]:
study6 = optuna.create_study(direction='maximize',sampler=TPESampler())
study6.optimize(lambda trial : objectiveLGBM(trial, X6, y6), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study6.best_trial.value,study6.best_trial.params))

[32m[I 2022-12-01 02:30:51,916][0m A new study created in memory with name: no-name-3b6c8ea7-3467-40ba-819e-e817c1c400a0[0m
[32m[I 2022-12-01 02:30:53,935][0m Trial 0 finished with value: 0.9329527155614112 and parameters: {'num_leaves': 85, 'n_estimators': 1217, 'feature_fraction': 0.9047328277152328, 'bagging_fraction': 0.676283651104707, 'bagging_freq': 4, 'min_child_samples': 84}. Best is trial 0 with value: 0.9329527155614112.[0m
[32m[I 2022-12-01 02:30:57,272][0m Trial 1 finished with value: 0.9968404423380727 and parameters: {'num_leaves': 510, 'n_estimators': 777, 'feature_fraction': 0.9477292107374771, 'bagging_fraction': 0.9797331634848033, 'bagging_freq': 2, 'min_child_samples': 29}. Best is trial 1 with value: 0.9968404423380727.[0m
[32m[I 2022-12-01 02:30:58,344][0m Trial 2 finished with value: 0.9088785046728972 and parameters: {'num_leaves': 73, 'n_estimators': 864, 'feature_fraction': 0.9023652116438328, 'bagging_fraction': 0.5773394434588109, 'bagging_freq':

Best trial: score 1.0,
params {'num_leaves': 37, 'n_estimators': 2853, 'feature_fraction': 0.8511151996593942, 'bagging_fraction': 0.837817200460008, 'bagging_freq': 7, 'min_child_samples': 21}


In [180]:
optuna.visualization.plot_param_importances(study6) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study6) # 최적화 과정 시각화

In [181]:
X6_train, X6_val, y6_train, y6_val = train_test_split(X6, y6, test_size = 0.2, random_state = 42)

In [182]:
X6_train.shape, X6_val.shape, y6_train.shape, y6_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [183]:
model = LGBMClassifier(**study6.best_trial.params)

In [184]:
model6 = model.fit(X6_train, y6_train,
          eval_set = [(X6_train, y6_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.695629	training's multi_logloss: 0.695629
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.609576	training's multi_logloss: 0.609576
[3]	training's multi_logloss: 0.531103	training's multi_logloss: 0.531103
[4]	training's multi_logloss: 0.466943	training's multi_logloss: 0.466943
[5]	training's multi_logloss: 0.412883	training's multi_logloss: 0.412883
[6]	training's multi_logloss: 0.368551	training's multi_logloss: 0.368551
[7]	training's multi_logloss: 0.333399	training's multi_logloss: 0.333399
[8]	training's multi_logloss: 0.302398	training's multi_logloss: 0.302398
[9]	training's multi_logloss: 0.271507	training's multi_logloss: 0.271507
[10]	training's multi_logloss: 0.243542	training's multi_logloss: 0.243542
[11]	training's multi_logloss: 0.218885	training's multi_logloss: 0.218885
[12]	training's multi_logloss: 0.199385	training's multi_logloss: 0.199385
[13]	training's multi_logloss: 0.179785	traini

In [185]:
train6_preds = model6.predict(X6_train)
val6_preds = model6.predict(X6_val)

In [186]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [187]:
get_clf_eval(y6_train, train6_preds)
get_clf_eval(y6_val, val6_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9953
재현율: 0.9690


In [188]:
preds_6= model6.predict(X6_test)
preds_6

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 1., 1., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2.

In [189]:
test_jun['classification'] = preds_6
test_jun

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-06-01,3,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1071,54.91,1.0,2022-06-01 00:00:00,2022,6,1,0
1,2022-06-01,3,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,685,57.93,1.0,2022-06-01 01:00:00,2022,6,1,1
2,2022-06-01,3,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,460,60.30,1.0,2022-06-01 02:00:00,2022,6,1,2
3,2022-06-01,3,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,441,61.70,1.0,2022-06-01 03:00:00,2022,6,1,3
4,2022-06-01,3,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,814,59.68,1.0,2022-06-01 04:00:00,2022,6,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-06-30,4,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3819,28.01,3.0,2022-06-30 19:00:00,2022,6,30,19
716,2022-06-30,4,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3608,39.51,2.0,2022-06-30 20:00:00,2022,6,30,20
717,2022-06-30,4,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3863,43.84,2.0,2022-06-30 21:00:00,2022,6,30,21
718,2022-06-30,4,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3957,46.65,1.0,2022-06-30 22:00:00,2022,6,30,22


# 7월 데이터 머신러닝

## 데이터 가공

In [190]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [191]:
X7 = train_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [192]:
y7 = train_jul[['datetime', 'classification']]
X7_1 = X7.drop(columns = ['datetime', 'classification'])
y7_1 = X7.datetime

In [193]:
X7_1_scaler = scaler.fit_transform(X7_1)

In [194]:
X7_1_sc = pd.DataFrame(X7_1_scaler)
X7_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7 = pd.concat([y7_1, X7_1_sc], axis = 1)
X7

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-07-01 00:00:00,0.000000,0.287232,0.861008,0.0,0.0,0.0,0.000000
1,2019-07-01 01:00:00,0.000000,0.155669,0.913084,0.0,0.0,0.0,0.043478
2,2019-07-01 02:00:00,0.000000,0.075587,0.920682,0.0,0.0,0.0,0.086957
3,2019-07-01 03:00:00,0.000000,0.077835,0.886953,0.0,0.0,0.0,0.130435
4,2019-07-01 04:00:00,0.000000,0.244331,0.868792,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-07-31 19:00:00,0.833333,0.662104,0.628984,1.0,0.0,1.0,0.826087
2228,2021-07-31 20:00:00,0.833333,0.654341,0.629726,1.0,0.0,1.0,0.869565
2229,2021-07-31 21:00:00,0.833333,0.644535,0.694033,1.0,0.0,1.0,0.913043
2230,2021-07-31 22:00:00,0.833333,0.629826,0.711453,1.0,0.0,1.0,0.956522


In [195]:
X7_test = test_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [196]:
y7_test = test_jul[['datetime', 'classification']]
X7_1_test = X7_test.drop(columns = ['datetime', 'classification'])
y7_1_test = X7_test.datetime

In [197]:
X7_1_test_scaler = scaler.fit_transform(X7_1_test)

In [198]:
X7_1_test_sc = pd.DataFrame(X7_1_test_scaler)
X7_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7_test = pd.concat([y7_1_test, X7_1_test_sc], axis = 1)
X7_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-07-01 00:00:00,0.666667,0.171027,0.511345,0.0,0.0,0.0,0.000000
1,2022-07-01 01:00:00,0.666667,0.111676,0.862456,0.0,0.0,0.0,0.043478
2,2022-07-01 02:00:00,0.666667,0.041667,0.946199,0.0,0.0,0.0,0.086957
3,2022-07-01 03:00:00,0.666667,0.032461,0.950643,0.0,0.0,0.0,0.130435
4,2022-07-01 04:00:00,0.666667,0.101502,0.878129,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-07-31 19:00:00,1.000000,0.676599,0.564678,0.0,0.0,1.0,0.826087
740,2022-07-31 20:00:00,1.000000,0.703731,0.505731,0.0,0.0,1.0,0.869565
741,2022-07-31 21:00:00,1.000000,0.675388,0.547368,0.0,0.0,1.0,0.913043
742,2022-07-31 22:00:00,1.000000,0.492975,0.591813,0.0,0.0,1.0,0.956522


## LightGBM

In [199]:
X7 = X7.drop(columns = ['datetime'])
y7 = y7.drop(columns = ['datetime'])
X7_test = X7_test.drop(columns = ['datetime'])
y7_test = y7_test.drop(columns = ['datetime'])

In [200]:
study7 = optuna.create_study(direction='maximize',sampler=TPESampler())
study7.optimize(lambda trial : objectiveLGBM(trial, X7, y7), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study7.best_trial.value,study7.best_trial.params))

[32m[I 2022-12-01 02:32:22,021][0m A new study created in memory with name: no-name-fd2b9c17-3079-4556-b505-195e6805ce5c[0m
[32m[I 2022-12-01 02:32:26,751][0m Trial 0 finished with value: 0.9598377812663527 and parameters: {'num_leaves': 509, 'n_estimators': 2758, 'feature_fraction': 0.98752092775718, 'bagging_fraction': 0.47131758795197065, 'bagging_freq': 6, 'min_child_samples': 47}. Best is trial 0 with value: 0.9598377812663527.[0m
[32m[I 2022-12-01 02:32:31,486][0m Trial 1 finished with value: 0.9863902179691654 and parameters: {'num_leaves': 225, 'n_estimators': 1699, 'feature_fraction': 0.7090155265411059, 'bagging_fraction': 0.8441002586724591, 'bagging_freq': 6, 'min_child_samples': 38}. Best is trial 1 with value: 0.9863902179691654.[0m
[32m[I 2022-12-01 02:32:37,491][0m Trial 2 finished with value: 0.9866666666666667 and parameters: {'num_leaves': 335, 'n_estimators': 2805, 'feature_fraction': 0.8624947906848137, 'bagging_fraction': 0.7708047992862945, 'bagging_fr

Best trial: score 1.0,
params {'num_leaves': 211, 'n_estimators': 1202, 'feature_fraction': 0.7595562783057159, 'bagging_fraction': 0.8649905457777085, 'bagging_freq': 6, 'min_child_samples': 58}


In [201]:
optuna.visualization.plot_param_importances(study7) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study7) # 최적화 과정 시각화

In [202]:
X7_train, X7_val, y7_train, y7_val = train_test_split(X7, y7, test_size = 0.2, random_state = 42)

In [203]:
X7_train.shape, X7_val.shape, y7_train.shape, y7_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [204]:
model = LGBMClassifier(**study7.best_trial.params)

In [205]:
model7 = model.fit(X7_train, y7_train,
          eval_set = [(X7_train, y7_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.767771	training's multi_logloss: 0.767771
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.678211	training's multi_logloss: 0.678211
[3]	training's multi_logloss: 0.587292	training's multi_logloss: 0.587292
[4]	training's multi_logloss: 0.515181	training's multi_logloss: 0.515181
[5]	training's multi_logloss: 0.455163	training's multi_logloss: 0.455163
[6]	training's multi_logloss: 0.408598	training's multi_logloss: 0.408598
[7]	training's multi_logloss: 0.37224	training's multi_logloss: 0.37224
[8]	training's multi_logloss: 0.342964	training's multi_logloss: 0.342964
[9]	training's multi_logloss: 0.310215	training's multi_logloss: 0.310215
[10]	training's multi_logloss: 0.284567	training's multi_logloss: 0.284567
[11]	training's multi_logloss: 0.255978	training's multi_logloss: 0.255978
[12]	training's multi_logloss: 0.234341	training's multi_logloss: 0.234341
[13]	training's multi_logloss: 0.214739	training

In [206]:
train7_preds = model7.predict(X7_train)
val7_preds = model7.predict(X7_val)

In [207]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [208]:
get_clf_eval(y7_train, train7_preds)
get_clf_eval(y7_val, val7_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9835
재현율: 0.9832


In [209]:
preds_7= model7.predict(X7_test)
preds_7

array([2., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 3., 3., 3., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 3., 3., 2., 2.,
       2., 2., 2., 3., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 3., 3., 3., 3., 2., 2., 2., 2., 2., 3., 3., 3., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 3., 2., 2., 2., 2., 3.,
       3., 3., 3., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       3., 2., 2., 2., 2., 2., 2., 3., 3., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 3., 2., 2., 2., 2., 2., 2., 3., 3., 2., 2.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 3., 2., 2., 2., 2., 1., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [210]:
test_jul['classification'] = preds_7
test_jul

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-07-01,5,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1289,40.84,2.0,2022-07-01 00:00:00,2022,7,1,0
1,2022-07-01,5,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1044,55.85,1.0,2022-07-01 01:00:00,2022,7,1,1
2,2022-07-01,5,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,755,59.43,1.0,2022-07-01 02:00:00,2022,7,1,2
3,2022-07-01,5,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,717,59.62,1.0,2022-07-01 03:00:00,2022,7,1,3
4,2022-07-01,5,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1002,56.52,1.0,2022-07-01 04:00:00,2022,7,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-07-31,7,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3376,43.12,2.0,2022-07-31 19:00:00,2022,7,31,19
740,2022-07-31,7,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3488,40.60,2.0,2022-07-31 20:00:00,2022,7,31,20
741,2022-07-31,7,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3371,42.38,2.0,2022-07-31 21:00:00,2022,7,31,21
742,2022-07-31,7,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2618,44.28,2.0,2022-07-31 22:00:00,2022,7,31,22


# 8월 데이터 머신러닝

## 데이터 가공

In [211]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [212]:
X8 = train_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [213]:
y8 = train_aug[['datetime', 'classification']]
X8_1 = X8.drop(columns = ['datetime', 'classification'])
y8_1 = X8.datetime

In [214]:
X8_1_scaler = scaler.fit_transform(X8_1)

In [215]:
X8_1_sc = pd.DataFrame(X8_1_scaler)
X8_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8 = pd.concat([y8_1, X8_1_sc], axis = 1)
X8

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-08-01 00:00:00,0.500000,0.357444,0.807144,0.0,0.0,0.0,0.000000
1,2019-08-01 01:00:00,0.500000,0.242114,0.796124,0.0,0.0,0.0,0.043478
2,2019-08-01 02:00:00,0.500000,0.166566,0.818735,0.0,0.0,0.0,0.086957
3,2019-08-01 03:00:00,0.500000,0.124975,0.837355,0.0,0.0,0.0,0.130435
4,2019-08-01 04:00:00,0.500000,0.173398,0.856736,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-08-31 19:00:00,0.166667,0.530038,0.131864,1.0,0.0,1.0,0.826087
2228,2021-08-31 20:00:00,0.166667,0.524814,0.133004,1.0,0.0,1.0,0.869565
2229,2021-08-31 21:00:00,0.166667,0.491662,0.081892,1.0,0.0,1.0,0.913043
2230,2021-08-31 22:00:00,0.166667,0.478802,0.349040,1.0,0.0,1.0,0.956522


In [216]:
X8_test = test_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [217]:
y8_test = test_aug[['datetime', 'classification']]
X8_1_test = X8_test.drop(columns = ['datetime', 'classification'])
y8_1_test = X8_test.datetime

In [218]:
X8_1_test_scaler = scaler.fit_transform(X8_1_test)

In [219]:
X8_1_test_sc = pd.DataFrame(X8_1_test_scaler)
X8_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8_test = pd.concat([y8_1_test, X8_1_test_sc], axis = 1)
X8_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-08-01 00:00:00,0.000000,0.156228,0.794366,0.0,0.0,0.0,0.000000
1,2022-08-01 01:00:00,0.000000,0.051761,0.861801,0.0,0.0,0.0,0.043478
2,2022-08-01 02:00:00,0.000000,0.000000,0.841393,0.0,0.0,0.0,0.086957
3,2022-08-01 03:00:00,0.000000,0.001654,0.805457,0.0,0.0,0.0,0.130435
4,2022-08-01 04:00:00,0.000000,0.161900,0.738909,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-08-31 19:00:00,0.333333,0.845899,0.254658,0.0,0.0,1.0,0.826087
740,2022-08-31 20:00:00,0.333333,0.726306,0.509982,0.0,0.0,1.0,0.869565
741,2022-08-31 21:00:00,0.333333,0.648546,0.622005,0.0,0.0,1.0,0.913043
742,2022-08-31 22:00:00,0.333333,0.636020,0.693656,0.0,0.0,1.0,0.956522


## LightGBM

In [220]:
X8 = X8.drop(columns = ['datetime'])
y8 = y8.drop(columns = ['datetime'])
X8_test = X8_test.drop(columns = ['datetime'])
y8_test = y8_test.drop(columns = ['datetime'])

In [221]:
study8 = optuna.create_study(direction='maximize',sampler=TPESampler())
study8.optimize(lambda trial : objectiveLGBM(trial, X8, y8), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study8.best_trial.value,study8.best_trial.params))

[32m[I 2022-12-01 02:33:52,458][0m A new study created in memory with name: no-name-f54ed4f4-0f71-4a72-b3ba-e5ed93ca168d[0m
[32m[I 2022-12-01 02:33:58,314][0m Trial 0 finished with value: 0.972985347985348 and parameters: {'num_leaves': 42, 'n_estimators': 1842, 'feature_fraction': 0.6036270620251024, 'bagging_fraction': 0.9901266285352807, 'bagging_freq': 1, 'min_child_samples': 29}. Best is trial 0 with value: 0.972985347985348.[0m
[32m[I 2022-12-01 02:34:00,805][0m Trial 1 finished with value: 0.9556346103051353 and parameters: {'num_leaves': 342, 'n_estimators': 2330, 'feature_fraction': 0.40005798052086605, 'bagging_fraction': 0.4170550597830917, 'bagging_freq': 6, 'min_child_samples': 100}. Best is trial 0 with value: 0.972985347985348.[0m
[32m[I 2022-12-01 02:34:04,594][0m Trial 2 finished with value: 0.9729698642742122 and parameters: {'num_leaves': 483, 'n_estimators': 1970, 'feature_fraction': 0.7316871700643693, 'bagging_fraction': 0.7164381515185693, 'bagging_fre

Best trial: score 1.0,
params {'num_leaves': 434, 'n_estimators': 2581, 'feature_fraction': 0.8098502671224883, 'bagging_fraction': 0.8244033609453337, 'bagging_freq': 6, 'min_child_samples': 5}


In [222]:
optuna.visualization.plot_param_importances(study8) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study8) # 최적화 과정 시각화

In [223]:
X8_train, X8_val, y8_train, y8_val = train_test_split(X8, y8, test_size = 0.2, random_state = 42)

In [224]:
X8_train.shape, X8_val.shape, y8_train.shape, y8_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [225]:
model = LGBMClassifier(**study8.best_trial.params)

In [226]:
model8 = model.fit(X8_train, y8_train,
          eval_set = [(X8_train, y8_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.829153	training's multi_logloss: 0.829153
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.728369	training's multi_logloss: 0.728369
[3]	training's multi_logloss: 0.637873	training's multi_logloss: 0.637873
[4]	training's multi_logloss: 0.563359	training's multi_logloss: 0.563359
[5]	training's multi_logloss: 0.499282	training's multi_logloss: 0.499282
[6]	training's multi_logloss: 0.447975	training's multi_logloss: 0.447975
[7]	training's multi_logloss: 0.404374	training's multi_logloss: 0.404374
[8]	training's multi_logloss: 0.367567	training's multi_logloss: 0.367567
[9]	training's multi_logloss: 0.33074	training's multi_logloss: 0.33074
[10]	training's multi_logloss: 0.300367	training's multi_logloss: 0.300367
[11]	training's multi_logloss: 0.270258	training's multi_logloss: 0.270258
[12]	training's multi_logloss: 0.244796	training's multi_logloss: 0.244796
[13]	training's multi_logloss: 0.221857	training

In [227]:
train8_preds = model8.predict(X8_train)
val8_preds = model8.predict(X8_val)

In [228]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [229]:
get_clf_eval(y8_train, train8_preds)
get_clf_eval(y8_val, val8_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9767
재현율: 0.9808


In [230]:
preds_8= model8.predict(X8_test)
preds_8

array([1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 3., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 3., 3., 3., 3.,
       3., 2., 2., 2., 2., 3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2.

In [231]:
test_aug['classification'] = preds_8
test_aug

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-08-01,1,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1179,52.91,1.0,2022-08-01 00:00:00,2022,8,1,0
1,2022-08-01,1,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,737,55.95,1.0,2022-08-01 01:00:00,2022,8,1,1
2,2022-08-01,1,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,518,55.03,1.0,2022-08-01 02:00:00,2022,8,1,2
3,2022-08-01,1,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,525,53.41,1.0,2022-08-01 03:00:00,2022,8,1,3
4,2022-08-01,1,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1203,50.41,1.0,2022-08-01 04:00:00,2022,8,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-08-31,3,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,4097,28.58,3.0,2022-08-31 19:00:00,2022,8,31,19
740,2022-08-31,3,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3591,40.09,2.0,2022-08-31 20:00:00,2022,8,31,20
741,2022-08-31,3,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3262,45.14,2.0,2022-08-31 21:00:00,2022,8,31,21
742,2022-08-31,3,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3209,48.37,1.0,2022-08-31 22:00:00,2022,8,31,22


# 9월 데이터 머신러닝

## 데이터 가공

In [232]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [233]:
X9 = train_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [234]:
y9 = train_sep[['datetime', 'classification']]
X9_1 = X9.drop(columns = ['datetime', 'classification'])
y9_1 = X9.datetime

In [235]:
X9_1_scaler = scaler.fit_transform(X9_1)

In [236]:
X9_1_sc = pd.DataFrame(X9_1_scaler)
X9_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9 = pd.concat([y9_1, X9_1_sc], axis = 1)
X9

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-09-01 00:00:00,1.0,0.517062,0.806823,0.0,0.0,0.0,0.000000
1,2019-09-01 01:00:00,1.0,0.353191,0.891115,0.0,0.0,0.0,0.043478
2,2019-09-01 02:00:00,1.0,0.251012,0.947441,0.0,0.0,0.0,0.086957
3,2019-09-01 03:00:00,1.0,0.184693,0.971242,0.0,0.0,0.0,0.130435
4,2019-09-01 04:00:00,1.0,0.218431,0.957755,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-09-30 19:00:00,0.5,0.741276,0.423443,1.0,0.0,1.0,0.826087
2156,2021-09-30 20:00:00,0.5,0.570079,0.439706,1.0,0.0,1.0,0.869565
2157,2021-09-30 21:00:00,0.5,0.599961,0.700912,1.0,0.0,1.0,0.913043
2158,2021-09-30 22:00:00,0.5,0.538847,0.782626,1.0,0.0,1.0,0.956522


In [237]:
X9_test = test_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [238]:
y9_test = test_sep[['datetime', 'classification']]
X9_1_test = X9_test.drop(columns = ['datetime', 'classification'])
y9_1_test = X9_test.datetime

In [239]:
X9_1_test_scaler = scaler.fit_transform(X9_1_test)

In [240]:
X9_1_test_sc = pd.DataFrame(X9_1_test_scaler)
X9_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9_test = pd.concat([y9_1_test, X9_1_test_sc], axis = 1)
X9_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-09-01 00:00:00,0.500000,0.337947,0.909781,0.0,0.0,0.0,0.000000
1,2022-09-01 01:00:00,0.500000,0.225836,0.956842,0.0,0.0,0.0,0.043478
2,2022-09-01 02:00:00,0.500000,0.131027,0.954457,0.0,0.0,0.0,0.086957
3,2022-09-01 03:00:00,0.500000,0.083045,0.940577,0.0,0.0,0.0,0.130435
4,2022-09-01 04:00:00,0.500000,0.186621,0.872696,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-09-30 19:00:00,0.666667,0.774625,0.273910,0.0,0.0,1.0,0.826087
716,2022-09-30 20:00:00,0.666667,0.739792,0.488831,0.0,0.0,1.0,0.869565
717,2022-09-30 21:00:00,0.666667,0.677047,0.630449,0.0,0.0,1.0,0.913043
718,2022-09-30 22:00:00,0.666667,0.615225,0.664064,0.0,0.0,1.0,0.956522


## LightGBM

In [241]:
X9 = X9.drop(columns = ['datetime'])
y9 = y9.drop(columns = ['datetime'])
X9_test = X9_test.drop(columns = ['datetime'])
y9_test = y9_test.drop(columns = ['datetime'])

In [242]:
study9 = optuna.create_study(direction='maximize',sampler=TPESampler())
study9.optimize(lambda trial : objectiveLGBM(trial, X9, y9), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study9.best_trial.value,study9.best_trial.params))

[32m[I 2022-12-01 02:36:48,334][0m A new study created in memory with name: no-name-d4d30845-2c71-4d9e-9ea7-67e843561f91[0m
[32m[I 2022-12-01 02:36:56,909][0m Trial 0 finished with value: 0.9675493250259605 and parameters: {'num_leaves': 227, 'n_estimators': 1510, 'feature_fraction': 0.9866826359715529, 'bagging_fraction': 0.9663052866223928, 'bagging_freq': 2, 'min_child_samples': 14}. Best is trial 0 with value: 0.9675493250259605.[0m
[32m[I 2022-12-01 02:36:59,165][0m Trial 1 finished with value: 0.8380903839619437 and parameters: {'num_leaves': 196, 'n_estimators': 2314, 'feature_fraction': 0.9510195566689571, 'bagging_fraction': 0.43753099125005246, 'bagging_freq': 4, 'min_child_samples': 100}. Best is trial 0 with value: 0.9675493250259605.[0m
[32m[I 2022-12-01 02:37:02,228][0m Trial 2 finished with value: 0.9694084793094694 and parameters: {'num_leaves': 276, 'n_estimators': 1619, 'feature_fraction': 0.6544946052342788, 'bagging_fraction': 0.9914906356258266, 'bagging

Best trial: score 0.9984917043740573,
params {'num_leaves': 328, 'n_estimators': 1042, 'feature_fraction': 0.4119564025962276, 'bagging_fraction': 0.5628196160155913, 'bagging_freq': 5, 'min_child_samples': 38}


In [243]:
optuna.visualization.plot_param_importances(study9) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study9) # 최적화 과정 시각화

In [244]:
X9_train, X9_val, y9_train, y9_val = train_test_split(X9, y9, test_size = 0.2, random_state = 42)

In [245]:
X9_train.shape, X9_val.shape, y9_train.shape, y9_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [246]:
model = LGBMClassifier(**study9.best_trial.params)

In [247]:
model9 = model.fit(X9_train, y9_train,
          eval_set = [(X9_train, y9_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.732335	training's multi_logloss: 0.732335
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.707907	training's multi_logloss: 0.707907
[3]	training's multi_logloss: 0.629886	training's multi_logloss: 0.629886
[4]	training's multi_logloss: 0.626671	training's multi_logloss: 0.626671
[5]	training's multi_logloss: 0.568848	training's multi_logloss: 0.568848
[6]	training's multi_logloss: 0.519924	training's multi_logloss: 0.519924
[7]	training's multi_logloss: 0.473986	training's multi_logloss: 0.473986
[8]	training's multi_logloss: 0.434155	training's multi_logloss: 0.434155
[9]	training's multi_logloss: 0.401077	training's multi_logloss: 0.401077
[10]	training's multi_logloss: 0.370028	training's multi_logloss: 0.370028
[11]	training's multi_logloss: 0.349457	training's multi_logloss: 0.349457
[12]	training's multi_logloss: 0.331085	training's multi_logloss: 0.331085
[13]	training's multi_logloss: 0.304183	traini

In [248]:
train9_preds = model9.predict(X9_train)
val9_preds = model9.predict(X9_val)

In [249]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [250]:
get_clf_eval(y9_train, train9_preds)
get_clf_eval(y9_val, val9_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9969
재현율: 0.9487


In [251]:
preds_9= model9.predict(X9_test)
preds_9

array([1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       3., 3., 2., 2., 3., 3., 3., 3., 2., 2., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 3., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 3., 3., 2., 2., 2., 3., 3., 3., 3., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 3., 2., 2., 2., 2., 3., 2., 2., 2., 3., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2.

In [252]:
test_sep['classification'] = preds_9
test_sep

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-09-01,4,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2001,58.03,1.0,2022-09-01 00:00:00,2022,9,1,0
1,2022-09-01,4,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1515,60.20,1.0,2022-09-01 01:00:00,2022,9,1,1
2,2022-09-01,4,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1104,60.09,1.0,2022-09-01 02:00:00,2022,9,1,2
3,2022-09-01,4,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,896,59.45,1.0,2022-09-01 03:00:00,2022,9,1,3
4,2022-09-01,4,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1345,56.32,1.0,2022-09-01 04:00:00,2022,9,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-09-30,5,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3894,28.71,3.0,2022-09-30 19:00:00,2022,9,30,19
716,2022-09-30,5,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3743,38.62,2.0,2022-09-30 20:00:00,2022,9,30,20
717,2022-09-30,5,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3471,45.15,2.0,2022-09-30 21:00:00,2022,9,30,21
718,2022-09-30,5,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3203,46.70,1.0,2022-09-30 22:00:00,2022,9,30,22


# 10월 데이터 머신러닝

## 데이터 가공

In [253]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [254]:
X10 = train_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [255]:
y10 = train_oct[['datetime', 'classification']]
X10_1 = X10.drop(columns = ['datetime', 'classification'])
y10_1 = X10.datetime

In [256]:
X10_1_scaler = scaler.fit_transform(X10_1)

In [257]:
X10_1_sc = pd.DataFrame(X10_1_scaler)
X10_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10 = pd.concat([y10_1, X10_1_sc], axis = 1)
X10

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-10-01 00:00:00,0.166667,0.313970,0.883526,0.0,0.0,0.0,0.000000
1,2019-10-01 01:00:00,0.166667,0.189548,0.935782,0.0,0.0,0.0,0.043478
2,2019-10-01 02:00:00,0.166667,0.137286,0.949843,0.0,0.0,0.0,0.086957
3,2019-10-01 03:00:00,0.166667,0.090653,0.955929,0.0,0.0,0.0,0.130435
4,2019-10-01 04:00:00,0.166667,0.183719,0.889402,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-10-31 19:00:00,0.833333,0.626734,0.293179,1.0,0.0,1.0,0.826087
1484,2020-10-31 20:00:00,0.833333,0.668744,0.465687,1.0,0.0,1.0,0.869565
1485,2020-10-31 21:00:00,0.833333,0.609849,0.611123,1.0,0.0,1.0,0.913043
1486,2020-10-31 22:00:00,0.833333,0.524824,0.728646,1.0,0.0,1.0,0.956522


In [258]:
X10_test = test_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [259]:
y10_test = test_oct[['datetime', 'classification']]
X10_1_test = X10_test.drop(columns = ['datetime', 'classification'])
y10_1_test = X10_test.datetime

In [260]:
X10_1_test_scaler = scaler.fit_transform(X10_1_test)

In [261]:
X10_1_test_sc = pd.DataFrame(X10_1_test_scaler)
X10_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10_test = pd.concat([y10_1_test, X10_1_test_sc], axis = 1)
X10_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-10-01 00:00:00,0.666667,0.189677,0.985725,0.0,0.0,0.0,0.000000
1,2021-10-01 01:00:00,0.666667,0.099570,0.991592,0.0,0.0,0.0,0.043478
2,2021-10-01 02:00:00,0.666667,0.046452,0.982010,0.0,0.0,0.0,0.086957
3,2021-10-01 03:00:00,0.666667,0.035699,0.947008,0.0,0.0,0.0,0.130435
4,2021-10-01 04:00:00,0.666667,0.171398,0.904576,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-10-31 19:00:00,1.000000,0.730108,0.573719,0.0,0.0,1.0,0.826087
740,2021-10-31 20:00:00,1.000000,0.604301,0.601095,0.0,0.0,1.0,0.869565
741,2021-10-31 21:00:00,1.000000,0.637419,0.670512,0.0,0.0,1.0,0.913043
742,2021-10-31 22:00:00,1.000000,0.383441,0.758506,0.0,0.0,1.0,0.956522


## LightGBM

In [262]:
X10 = X10.drop(columns = ['datetime'])
y10 = y10.drop(columns = ['datetime'])
X10_test = X10_test.drop(columns = ['datetime'])
y10_test = y10_test.drop(columns = ['datetime'])

In [263]:
study10 = optuna.create_study(direction='maximize',sampler=TPESampler())
study10.optimize(lambda trial : objectiveLGBM(trial, X10, y10), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study10.best_trial.value,study10.best_trial.params))

[32m[I 2022-12-01 02:37:59,999][0m A new study created in memory with name: no-name-722dc128-c2c0-445e-8123-249c68d891ba[0m
[32m[I 2022-12-01 02:38:00,875][0m Trial 0 finished with value: 0.6567164179104478 and parameters: {'num_leaves': 481, 'n_estimators': 764, 'feature_fraction': 0.9189866981446871, 'bagging_fraction': 0.5328478715492649, 'bagging_freq': 4, 'min_child_samples': 71}. Best is trial 0 with value: 0.6567164179104478.[0m
[32m[I 2022-12-01 02:38:01,465][0m Trial 1 finished with value: 0.6505747126436782 and parameters: {'num_leaves': 203, 'n_estimators': 768, 'feature_fraction': 0.4832435880723114, 'bagging_fraction': 0.4428196141889123, 'bagging_freq': 4, 'min_child_samples': 80}. Best is trial 0 with value: 0.6567164179104478.[0m
[32m[I 2022-12-01 02:38:03,578][0m Trial 2 finished with value: 0.7950248756218906 and parameters: {'num_leaves': 393, 'n_estimators': 1898, 'feature_fraction': 0.773607183069077, 'bagging_fraction': 0.929293078073467, 'bagging_freq'

Best trial: score 0.9978070175438596,
params {'num_leaves': 9, 'n_estimators': 2940, 'feature_fraction': 0.6448454256371672, 'bagging_fraction': 0.8135582194911887, 'bagging_freq': 1, 'min_child_samples': 41}


In [264]:
optuna.visualization.plot_param_importances(study10) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study10) # 최적화 과정 시각화

In [265]:
X10_train, X10_val, y10_train, y10_val = train_test_split(X10, y10, test_size = 0.2, random_state = 42)

In [266]:
X10_train.shape, X10_val.shape, y10_train.shape, y10_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [267]:
model = LGBMClassifier(**study10.best_trial.params)

In [268]:
model10 = model.fit(X10_train, y10_train,
          eval_set = [(X10_train, y10_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.700181	training's multi_logloss: 0.700181
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.603915	training's multi_logloss: 0.603915
[3]	training's multi_logloss: 0.544712	training's multi_logloss: 0.544712
[4]	training's multi_logloss: 0.495522	training's multi_logloss: 0.495522
[5]	training's multi_logloss: 0.440472	training's multi_logloss: 0.440472
[6]	training's multi_logloss: 0.405909	training's multi_logloss: 0.405909
[7]	training's multi_logloss: 0.374011	training's multi_logloss: 0.374011
[8]	training's multi_logloss: 0.345031	training's multi_logloss: 0.345031
[9]	training's multi_logloss: 0.329615	training's multi_logloss: 0.329615
[10]	training's multi_logloss: 0.306887	training's multi_logloss: 0.306887
[11]	training's multi_logloss: 0.290622	training's multi_logloss: 0.290622
[12]	training's multi_logloss: 0.270219	training's multi_logloss: 0.270219
[13]	training's multi_logloss: 0.25227	trainin

In [269]:
train10_preds = model10.predict(X10_train)
val10_preds = model10.predict(X10_val)

In [270]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [271]:
get_clf_eval(y10_train, train10_preds)
get_clf_eval(y10_val, val10_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [272]:
preds_10= model10.predict(X10_test)
preds_10

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 1., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 3., 2., 2., 2., 1., 1., 2., 2., 2., 2., 2., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 1., 2., 1., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [273]:
test_oct['classification'] = preds_10
test_oct

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1384,59.28,1.0,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,965,59.58,1.0,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,718,59.09,1.0,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,668,57.30,1.0,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1299,55.13,1.0,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-10-31,7,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3897,38.21,2.0,2021-10-31 19:00:00,2021,10,31,19
740,2021-10-31,7,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3312,39.61,2.0,2021-10-31 20:00:00,2021,10,31,20
741,2021-10-31,7,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3466,43.16,1.0,2021-10-31 21:00:00,2021,10,31,21
742,2021-10-31,7,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2285,47.66,1.0,2021-10-31 22:00:00,2021,10,31,22


# 11월 데이터 머신러닝

## 데이터 가공

In [274]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [275]:
X11 = train_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [276]:
y11 = train_nov[['datetime', 'classification']]
X11_1 = X11.drop(columns = ['datetime', 'classification'])
y11_1 = X11.datetime

In [277]:
X11_1_scaler = scaler.fit_transform(X11_1)

In [278]:
X11_1_sc = pd.DataFrame(X11_1_scaler)
X11_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11 = pd.concat([y11_1, X11_1_sc], axis = 1)
X11

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-11-01 00:00:00,0.666667,0.366995,0.773166,0.0,0.0,0.0,0.000000
1,2019-11-01 01:00:00,0.666667,0.222085,0.831796,0.0,0.0,0.0,0.043478
2,2019-11-01 02:00:00,0.666667,0.132800,0.944005,0.0,0.0,0.0,0.086957
3,2019-11-01 03:00:00,0.666667,0.100575,0.951471,0.0,0.0,0.0,0.130435
4,2019-11-01 04:00:00,0.666667,0.124795,0.916996,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1435,2020-11-30 19:00:00,0.000000,0.570197,0.288098,1.0,0.0,1.0,0.826087
1436,2020-11-30 20:00:00,0.000000,0.635057,0.558849,1.0,0.0,1.0,0.869565
1437,2020-11-30 21:00:00,0.000000,0.418514,0.330479,1.0,0.0,1.0,0.913043
1438,2020-11-30 22:00:00,0.000000,0.375205,0.598155,1.0,0.0,1.0,0.956522


In [279]:
X11_test = test_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [280]:
y11_test = test_nov[['datetime', 'classification']]
X11_1_test = X11_test.drop(columns = ['datetime', 'classification'])
y11_1_test = X11_test.datetime

In [281]:
X11_1_test_scaler = scaler.fit_transform(X11_1_test)

In [282]:
X11_1_test_sc = pd.DataFrame(X11_1_test_scaler)
X11_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11_test = pd.concat([y11_1_test, X11_1_test_sc], axis = 1)
X11_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-11-01 00:00:00,0.000000,0.115081,0.861529,0.0,0.0,0.0,0.000000
1,2021-11-01 01:00:00,0.000000,0.033757,0.922316,0.0,0.0,0.0,0.043478
2,2021-11-01 02:00:00,0.000000,0.001754,0.908510,0.0,0.0,0.0,0.086957
3,2021-11-01 03:00:00,0.000000,0.006138,0.904389,0.0,0.0,0.0,0.130435
4,2021-11-01 04:00:00,0.000000,0.153661,0.869359,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2021-11-30 19:00:00,0.166667,0.588996,0.456831,0.0,0.0,1.0,0.826087
716,2021-11-30 20:00:00,0.166667,0.617492,0.636720,0.0,0.0,1.0,0.869565
717,2021-11-30 21:00:00,0.166667,0.644454,0.678961,0.0,0.0,1.0,0.913043
718,2021-11-30 22:00:00,0.166667,0.477203,0.751494,0.0,0.0,1.0,0.956522


## LightGBM

In [283]:
X11 = X11.drop(columns = ['datetime'])
y11 = y11.drop(columns = ['datetime'])
X11_test = X11_test.drop(columns = ['datetime'])
y11_test = y11_test.drop(columns = ['datetime'])

In [284]:
study11 = optuna.create_study(direction='maximize',sampler=TPESampler())
study11.optimize(lambda trial : objectiveLGBM(trial, X11, y11), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study11.best_trial.value,study11.best_trial.params))

[32m[I 2022-12-01 02:38:58,075][0m A new study created in memory with name: no-name-656db4d9-e1bc-4188-90ae-f8b27df7a931[0m
[32m[I 2022-12-01 02:39:05,601][0m Trial 0 finished with value: 0.9978354978354979 and parameters: {'num_leaves': 364, 'n_estimators': 2623, 'feature_fraction': 0.8250478780529311, 'bagging_fraction': 0.860405357921086, 'bagging_freq': 4, 'min_child_samples': 10}. Best is trial 0 with value: 0.9978354978354979.[0m
[32m[I 2022-12-01 02:39:07,282][0m Trial 1 finished with value: 0.6666666666666666 and parameters: {'num_leaves': 341, 'n_estimators': 1485, 'feature_fraction': 0.9536157774859224, 'bagging_fraction': 0.7952773970213655, 'bagging_freq': 6, 'min_child_samples': 93}. Best is trial 0 with value: 0.9978354978354979.[0m
[32m[I 2022-12-01 02:39:09,661][0m Trial 2 finished with value: 0.9569663076498204 and parameters: {'num_leaves': 382, 'n_estimators': 1290, 'feature_fraction': 0.9978460064136986, 'bagging_fraction': 0.7174453859621595, 'bagging_fr

Best trial: score 1.0,
params {'num_leaves': 113, 'n_estimators': 2126, 'feature_fraction': 0.6942474122105831, 'bagging_fraction': 0.7595312773133115, 'bagging_freq': 4, 'min_child_samples': 5}


In [285]:
optuna.visualization.plot_param_importances(study11) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study11) # 최적화 과정 시각화

In [286]:
X11_train, X11_val, y11_train, y11_val = train_test_split(X11, y11, test_size = 0.2, random_state = 42)

In [287]:
X11_train.shape, X11_val.shape, y11_train.shape, y11_val.shape

((1152, 7), (288, 7), (1152, 1), (288, 1))

In [288]:
model = LGBMClassifier(**study11.best_trial.params)

In [289]:
model11 = model.fit(X11_train, y11_train,
          eval_set = [(X11_train, y11_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.671797	training's multi_logloss: 0.671797
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.585541	training's multi_logloss: 0.585541
[3]	training's multi_logloss: 0.510075	training's multi_logloss: 0.510075
[4]	training's multi_logloss: 0.449003	training's multi_logloss: 0.449003
[5]	training's multi_logloss: 0.396394	training's multi_logloss: 0.396394
[6]	training's multi_logloss: 0.352827	training's multi_logloss: 0.352827
[7]	training's multi_logloss: 0.317195	training's multi_logloss: 0.317195
[8]	training's multi_logloss: 0.286701	training's multi_logloss: 0.286701
[9]	training's multi_logloss: 0.256612	training's multi_logloss: 0.256612
[10]	training's multi_logloss: 0.232152	training's multi_logloss: 0.232152
[11]	training's multi_logloss: 0.208188	training's multi_logloss: 0.208188
[12]	training's multi_logloss: 0.188264	training's multi_logloss: 0.188264
[13]	training's multi_logloss: 0.170344	traini

In [290]:
train11_preds = model11.predict(X11_train)
val11_preds = model11.predict(X11_val)

In [291]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [292]:
get_clf_eval(y11_train, train11_preds)
get_clf_eval(y11_val, val11_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [293]:
preds_11= model11.predict(X11_test)
preds_11

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2.

In [294]:
test_nov['classification'] = preds_11
test_nov

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-11-01,1,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,982,55.67,1.0,2021-11-01 00:00:00,2021,11,1,0
1,2021-11-01,1,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,611,58.62,1.0,2021-11-01 01:00:00,2021,11,1,1
2,2021-11-01,1,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,465,57.95,1.0,2021-11-01 02:00:00,2021,11,1,2
3,2021-11-01,1,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,485,57.75,1.0,2021-11-01 03:00:00,2021,11,1,3
4,2021-11-01,1,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1158,56.05,1.0,2021-11-01 04:00:00,2021,11,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2021-11-30,2,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3144,36.03,2.0,2021-11-30 19:00:00,2021,11,30,19
716,2021-11-30,2,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3274,44.76,1.0,2021-11-30 20:00:00,2021,11,30,20
717,2021-11-30,2,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3397,46.81,1.0,2021-11-30 21:00:00,2021,11,30,21
718,2021-11-30,2,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,2634,50.33,1.0,2021-11-30 22:00:00,2021,11,30,22


# 12월 데이터 머신러닝

## 데이터 가공

In [295]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [296]:
X12 = train_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [297]:
y12 = train_dec[['datetime', 'classification']]
X12_1 = X12.drop(columns = ['datetime', 'classification'])
y12_1 = X12.datetime

In [298]:
X12_1_scaler = scaler.fit_transform(X12_1)

In [299]:
X12_1_sc = pd.DataFrame(X12_1_scaler)
X12_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12 = pd.concat([y12_1, X12_1_sc], axis = 1)
X12

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-12-01 00:00:00,1.0,0.457664,0.817383,0.0,0.0,0.0,0.000000
1,2019-12-01 01:00:00,1.0,0.321237,0.877197,0.0,0.0,0.0,0.043478
2,2019-12-01 02:00:00,1.0,0.239342,0.930664,0.0,0.0,0.0,0.086957
3,2019-12-01 03:00:00,1.0,0.163593,0.971436,0.0,0.0,0.0,0.130435
4,2019-12-01 04:00:00,1.0,0.127702,0.993408,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-12-31 19:00:00,0.5,0.587349,0.318115,1.0,0.0,1.0,0.826087
1484,2020-12-31 20:00:00,0.5,0.596272,0.554932,1.0,0.0,1.0,0.869565
1485,2020-12-31 21:00:00,0.5,0.556613,0.601074,1.0,0.0,1.0,0.913043
1486,2020-12-31 22:00:00,0.5,0.435455,0.680908,1.0,0.0,1.0,0.956522


In [300]:
X12_test = test_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [301]:
y12_test = test_dec[['datetime', 'classification']]
X12_1_test = X12_test.drop(columns = ['datetime', 'classification'])
y12_1_test = X12_test.datetime

In [302]:
X12_1_test_scaler = scaler.fit_transform(X12_1_test)

In [303]:
X12_1_test_sc = pd.DataFrame(X12_1_test_scaler)
X12_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12_test = pd.concat([y12_1_test, X12_1_test_sc], axis = 1)
X12_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-12-01 00:00:00,0.333333,0.314924,0.925243,0.0,0.0,0.0,0.000000
1,2021-12-01 01:00:00,0.333333,0.191102,0.936840,0.0,0.0,0.0,0.043478
2,2021-12-01 02:00:00,0.333333,0.120973,0.941810,0.0,0.0,0.0,0.086957
3,2021-12-01 03:00:00,0.333333,0.097524,0.915925,0.0,0.0,0.0,0.130435
4,2021-12-01 04:00:00,0.333333,0.224195,0.895838,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-12-31 19:00:00,0.666667,0.623712,0.399669,0.0,0.0,1.0,0.826087
740,2021-12-31 20:00:00,0.666667,0.657681,0.600538,0.0,0.0,1.0,0.869565
741,2021-12-31 21:00:00,0.666667,0.683542,0.639677,0.0,0.0,1.0,0.913043
742,2021-12-31 22:00:00,0.666667,0.555556,0.707393,0.0,0.0,1.0,0.956522


## LightGBM

In [304]:
X12 = X12.drop(columns = ['datetime'])
y12 = y12.drop(columns = ['datetime'])
X12_test = X12_test.drop(columns = ['datetime'])
y12_test = y12_test.drop(columns = ['datetime'])

In [305]:
study12 = optuna.create_study(direction='maximize',sampler=TPESampler())
study12.optimize(lambda trial : objectiveLGBM(trial, X12, y12), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study12.best_trial.value,study12.best_trial.params))

[32m[I 2022-12-01 02:40:11,269][0m A new study created in memory with name: no-name-169b3544-b193-469a-8a36-806df9da4206[0m
[32m[I 2022-12-01 02:40:15,261][0m Trial 0 finished with value: 0.9974160206718347 and parameters: {'num_leaves': 347, 'n_estimators': 1912, 'feature_fraction': 0.8403740692253705, 'bagging_fraction': 0.9705513173382303, 'bagging_freq': 2, 'min_child_samples': 31}. Best is trial 0 with value: 0.9974160206718347.[0m
[32m[I 2022-12-01 02:40:17,510][0m Trial 1 finished with value: 0.6511627906976744 and parameters: {'num_leaves': 141, 'n_estimators': 2875, 'feature_fraction': 0.7516670277647142, 'bagging_fraction': 0.5114829122155164, 'bagging_freq': 1, 'min_child_samples': 87}. Best is trial 0 with value: 0.9974160206718347.[0m
[32m[I 2022-12-01 02:40:18,553][0m Trial 2 finished with value: 0.9905437352245863 and parameters: {'num_leaves': 397, 'n_estimators': 1138, 'feature_fraction': 0.5986806236434656, 'bagging_fraction': 0.5694059871236491, 'bagging_f

Best trial: score 1.0,
params {'num_leaves': 215, 'n_estimators': 1182, 'feature_fraction': 0.979289410969621, 'bagging_fraction': 0.8286682791496149, 'bagging_freq': 7, 'min_child_samples': 12}


In [306]:
optuna.visualization.plot_param_importances(study12) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study12) # 최적화 과정 시각화

In [307]:
X12_train, X12_val, y12_train, y12_val = train_test_split(X12, y12, test_size = 0.2, random_state = 42)

In [308]:
X12_train.shape, X12_val.shape, y12_train.shape, y12_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [309]:
model = LGBMClassifier(**study12.best_trial.params)

In [310]:
model12 = model.fit(X12_train, y12_train,
          eval_set = [(X12_train, y12_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.641376	training's multi_logloss: 0.641376
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.558183	training's multi_logloss: 0.558183
[3]	training's multi_logloss: 0.486122	training's multi_logloss: 0.486122
[4]	training's multi_logloss: 0.426608	training's multi_logloss: 0.426608
[5]	training's multi_logloss: 0.376849	training's multi_logloss: 0.376849
[6]	training's multi_logloss: 0.335021	training's multi_logloss: 0.335021
[7]	training's multi_logloss: 0.302605	training's multi_logloss: 0.302605
[8]	training's multi_logloss: 0.273575	training's multi_logloss: 0.273575
[9]	training's multi_logloss: 0.244535	training's multi_logloss: 0.244535
[10]	training's multi_logloss: 0.218512	training's multi_logloss: 0.218512
[11]	training's multi_logloss: 0.195605	training's multi_logloss: 0.195605
[12]	training's multi_logloss: 0.177779	training's multi_logloss: 0.177779
[13]	training's multi_logloss: 0.159584	traini

In [311]:
train12_preds = model12.predict(X12_train)
val12_preds = model12.predict(X12_val)

In [312]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [313]:
get_clf_eval(y12_train, train12_preds)
get_clf_eval(y12_val, val12_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [314]:
preds_12= model12.predict(X12_test)
preds_12

array([1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 1., 1., 1., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 1., 1., 1., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 1., 1., 1., 1., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 2., 2.

In [315]:
test_dec['classification'] = preds_12
test_dec

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-12-01,3,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1972,58.33,1.0,2021-12-01 00:00:00,2021,12,1,0
1,2021-12-01,3,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1407,58.89,1.0,2021-12-01 01:00:00,2021,12,1,1
2,2021-12-01,3,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1087,59.13,1.0,2021-12-01 02:00:00,2021,12,1,2
3,2021-12-01,3,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,980,57.88,1.0,2021-12-01 03:00:00,2021,12,1,3
4,2021-12-01,3,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1558,56.91,1.0,2021-12-01 04:00:00,2021,12,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-12-31,5,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3381,32.95,2.0,2021-12-31 19:00:00,2021,12,31,19
740,2021-12-31,5,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3536,42.65,1.0,2021-12-31 20:00:00,2021,12,31,20
741,2021-12-31,5,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3654,44.54,1.0,2021-12-31 21:00:00,2021,12,31,21
742,2021-12-31,5,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3070,47.81,1.0,2021-12-31 22:00:00,2021,12,31,22


# 월별 데이터 합치기

In [316]:
result = pd.concat([test_jan,
                    test_feb,
                    test_mar,
                    test_apr,
                    test_may,
                    test_jun,
                    test_jul,
                    test_aug,
                    test_sep,
                    test_oct,
                    test_nov,
                    test_dec])
result = result.sort_values(by = 'datetime')
result = result.reset_index(drop = True)
result

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1384,59.28,1.0,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,965,59.58,1.0,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,718,59.09,1.0,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,668,57.30,1.0,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,1299,55.13,1.0,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2022-09-30,5,19:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3894,28.71,3.0,2022-09-30 19:00:00,2022,9,30,19
8756,2022-09-30,5,20:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3743,38.62,2.0,2022-09-30 20:00:00,2022,9,30,20
8757,2022-09-30,5,21:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3471,45.15,2.0,2022-09-30 21:00:00,2022,9,30,21
8758,2022-09-30,5,22:00:00,성산대교,마포구,C-04,성산대교북단,성산대교남단,3,1176,3203,46.70,1.0,2022-09-30 22:00:00,2022,9,30,22


In [317]:
result = result.drop(columns = ['dow', 
                                'district_name', 
                                'branch_num', 
                                'arr_point', 
                                'lane', 
                                'distance', 
                                'volume', 
                                'speed', 
                                'datetime', 
                                'year', 
                                'month', 
                                'day', 
                                'hour'])
result

Unnamed: 0,date,time,branch_name,dep_point,classification
0,2021-10-01,0:00:00,성산대교,성산대교북단,1.0
1,2021-10-01,1:00:00,성산대교,성산대교북단,1.0
2,2021-10-01,2:00:00,성산대교,성산대교북단,1.0
3,2021-10-01,3:00:00,성산대교,성산대교북단,1.0
4,2021-10-01,4:00:00,성산대교,성산대교북단,1.0
...,...,...,...,...,...
8755,2022-09-30,19:00:00,성산대교,성산대교북단,3.0
8756,2022-09-30,20:00:00,성산대교,성산대교북단,2.0
8757,2022-09-30,21:00:00,성산대교,성산대교북단,2.0
8758,2022-09-30,22:00:00,성산대교,성산대교북단,1.0


# csv 파일 만들기

In [318]:
result.to_csv('sungsan_depnorth_result.csv', index = False)