# 라이브러리

In [1]:
import pandas as pd
import random
import os
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# 한글 폰트 깨짐 현상 해결을 위한 나눔 폰트 설치
# 코드 1회 실행 후 주석 처리하고 런타임 재시작 및 모두 실행
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

## 데이터 로드

In [6]:
# 경로 설정
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# csv 파일 읽어오기
# root = '/content/drive/MyDrive/최종프로젝트/교통/분석/3rd_modified_data/'
root = '/content/drive/MyDrive/Project/'
C17_depnorth = pd.read_csv(root + 'Data_chungdam_depnorth.csv', encoding='cp949')
C17_depnorth_test = pd.read_csv(root + 'chungdam_depnorth_test.csv', encoding='cp949')

In [10]:
# 데이터 확인
print(C17_depnorth.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24096 entries, 0 to 24095
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            24096 non-null  object 
 1   dow             24096 non-null  int64  
 2   time            24096 non-null  object 
 3   branch_name     24096 non-null  object 
 4   district_name   24096 non-null  object 
 5   branch_num      24096 non-null  object 
 6   dep_point       24096 non-null  object 
 7   arr_point       24096 non-null  object 
 8   lane            24096 non-null  int64  
 9   distance        24096 non-null  int64  
 10  volume          24096 non-null  int64  
 11  speed           24096 non-null  float64
 12  classification  0 non-null      float64
dtypes: float64(2), int64(4), object(7)
memory usage: 2.4+ MB
None


In [11]:
# 결측치 확인
print(C17_depnorth.isnull().sum())

date                  0
dow                   0
time                  0
branch_name           0
district_name         0
branch_num            0
dep_point             0
arr_point             0
lane                  0
distance              0
volume                0
speed                 0
classification    24096
dtype: int64


In [12]:
# date 컬럼과 time 컬럼을 합쳐 datetime이라는 컬럼 만들기
C17_depnorth['datetime'] = C17_depnorth['date'] + ' ' + C17_depnorth['time']
C17_depnorth_test['datetime'] = C17_depnorth_test['date'] + ' ' + C17_depnorth_test['time']

In [13]:
# date 컬럼과 time 컬럼 제거
# C4_depsouth = C4_depsouth.drop(C4_depsouth[['date', 'time']], axis=1)

In [14]:
# datetime 문자형 컬럼을 datetime 자료형으로 변환
C17_depnorth['datetime'] = pd.to_datetime(C17_depnorth['datetime'])
C17_depnorth_test['datetime'] = pd.to_datetime(C17_depnorth_test['datetime'])

# classification 컬럼값 변경

In [15]:
C17_depnorth.describe()

Unnamed: 0,dow,lane,distance,volume,speed,classification
count,24096.0,24096.0,24096.0,24096.0,24096.0,0.0
mean,3.997012,3.0,1575.0,2571.370103,54.701498,
std,1.998295,0.0,0.0,1123.537353,30.830422,
min,1.0,3.0,1575.0,147.0,6.08,
25%,2.0,3.0,1575.0,1606.75,21.17,
50%,4.0,3.0,1575.0,3058.0,56.33,
75%,6.0,3.0,1575.0,3439.0,85.67,
max,7.0,3.0,1575.0,4620.0,111.5,


In [16]:
C17_depnorth.loc[C17_depnorth['speed'] >= C17_depnorth['speed'].mean(), 'classification'] = 1
C17_depnorth.loc[C17_depnorth['speed'] < 20, 'classification'] = 3
C17_depnorth.loc[(C17_depnorth['speed'] >= 20) 
                      & (C17_depnorth['speed'] < 40) 
                      & ((C17_depnorth['volume'] >= C17_depnorth['volume'].mean())), 'classification'] = 3
C17_depnorth.loc[(C17_depnorth['speed'] >= 20) 
                      & (C17_depnorth['speed'] < C17_depnorth['speed'].mean()) 
                      & ((C17_depnorth['volume'] < C17_depnorth['volume'].mean())), 'classification'] = 2
C17_depnorth.loc[(C17_depnorth['speed'] >= 40) 
                      & (C17_depnorth['speed'] < C17_depnorth['speed'].mean()) 
                      & ((C17_depnorth['volume'] >= C17_depnorth['volume'].mean())), 'classification'] = 2

In [17]:
C17_depnorth['classification']

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
24091    3.0
24092    1.0
24093    1.0
24094    1.0
24095    1.0
Name: classification, Length: 24096, dtype: float64

In [18]:
C17_depnorth['year'] = C17_depnorth['datetime'].dt.year
C17_depnorth['month'] = C17_depnorth['datetime'].dt.month
C17_depnorth['day'] = C17_depnorth['datetime'].dt.day
C17_depnorth['hour'] = C17_depnorth['datetime'].dt.hour

In [19]:
C17_depnorth

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2019-01-01,2,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1406,71.71,1.0,2019-01-01 00:00:00,2019,1,1,0
1,2019-01-01,2,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1904,75.67,1.0,2019-01-01 01:00:00,2019,1,1,1
2,2019-01-01,2,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,985,86.92,1.0,2019-01-01 02:00:00,2019,1,1,2
3,2019-01-01,2,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,591,89.33,1.0,2019-01-01 03:00:00,2019,1,1,3
4,2019-01-01,2,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,552,91.25,1.0,2019-01-01 04:00:00,2019,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24091,2021-09-30,4,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3291,23.00,3.0,2021-09-30 19:00:00,2021,9,30,19
24092,2021-09-30,4,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3041,73.18,1.0,2021-09-30 20:00:00,2021,9,30,20
24093,2021-09-30,4,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2987,76.58,1.0,2021-09-30 21:00:00,2021,9,30,21
24094,2021-09-30,4,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2538,81.83,1.0,2021-09-30 22:00:00,2021,9,30,22


# 월별로 데이터 나누기

In [20]:
C17_dn_month = C17_depnorth['month']
C17_dn_month_list = sorted(set(C17_dn_month))
C17_dn_month_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [21]:
month_data = []
for i in range(0, len(C17_dn_month_list)):
  month = C17_depnorth[C17_depnorth['month'] == C17_dn_month_list[i]]
  month = month.reset_index(drop=True)
  month_data.append(month)

In [22]:
train_jan = month_data[0]
train_feb = month_data[1]
train_mar = month_data[2]
train_apr = month_data[3]
train_may = month_data[4]
train_jun = month_data[5]
train_jul = month_data[6]
train_aug = month_data[7]
train_sep = month_data[8]
train_oct = month_data[9]
train_nov = month_data[10]
train_dec = month_data[11]

In [23]:
C17_depnorth_test['year'] = C17_depnorth_test['datetime'].dt.year
C17_depnorth_test['month'] = C17_depnorth_test['datetime'].dt.month
C17_depnorth_test['day'] = C17_depnorth_test['datetime'].dt.day
C17_depnorth_test['hour'] = C17_depnorth_test['datetime'].dt.hour

In [24]:
C17_dn_test_mon = C17_depnorth_test['month']
C17_dn_test_mon_list  = sorted(set(C17_dn_test_mon))
C17_dn_test_mon_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [25]:
month_test_data = []
for i in range(0, len(C17_dn_month_list)):
  month = C17_depnorth_test[C17_depnorth_test['month'] == C17_dn_test_mon_list[i]]
  month = month.reset_index(drop=True)
  month_test_data.append(month)

In [26]:
test_jan = month_test_data[0]
test_feb = month_test_data[1]
test_mar = month_test_data[2]
test_apr = month_test_data[3]
test_may = month_test_data[4]
test_jun = month_test_data[5]
test_jul = month_test_data[6]
test_aug = month_test_data[7]
test_sep = month_test_data[8]
test_oct = month_test_data[9]
test_nov = month_test_data[10]
test_dec = month_test_data[11]

In [27]:
test_dec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            744 non-null    object        
 1   dow             744 non-null    int64         
 2   time            744 non-null    object        
 3   branch_name     744 non-null    object        
 4   district_name   744 non-null    object        
 5   branch_num      744 non-null    object        
 6   dep_point       744 non-null    object        
 7   arr_point       744 non-null    object        
 8   lane            744 non-null    int64         
 9   distance        744 non-null    int64         
 10  volume          744 non-null    int64         
 11  speed           744 non-null    float64       
 12  classification  0 non-null      float64       
 13  datetime        744 non-null    datetime64[ns]
 14  year            744 non-null    int64         
 15  month 

# 1월 데이터 머신러닝

## 데이터 가공

In [28]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [29]:
X1 = train_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [30]:
y1 = train_jan[['datetime', 'classification']]
X1_1 = X1.drop(columns = ['datetime', 'classification'])
y1_1 = X1.datetime

In [31]:
X1_1_scaler = scaler.fit_transform(X1_1)

In [32]:
X1_1_sc = pd.DataFrame(X1_1_scaler)
X1_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1 = pd.concat([y1_1, X1_1_sc], axis = 1)

In [33]:
X1_test = test_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [34]:
y1_test = test_jan[['datetime', 'classification']]
X1_1_test = X1_test.drop(columns = ['datetime', 'classification'])
y1_1_test = X1_test.datetime

In [35]:
X1_1_test_scaler = scaler.fit_transform(X1_1_test)

In [36]:
X1_1_test_sc = pd.DataFrame(X1_1_test_scaler)
X1_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1_test = pd.concat([y1_1_test, X1_1_test_sc], axis = 1)
X1_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-01-01 00:00:00,0.833333,0.194980,0.757351,0.0,0.0,0.0,0.000000
1,2022-01-01 01:00:00,0.833333,0.230647,0.750839,0.0,0.0,0.0,0.043478
2,2022-01-01 02:00:00,0.833333,0.056275,0.783698,0.0,0.0,0.0,0.086957
3,2022-01-01 03:00:00,0.833333,0.032232,0.791889,0.0,0.0,0.0,0.130435
4,2022-01-01 04:00:00,0.833333,0.025099,0.784488,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-01-31 19:00:00,0.000000,0.599207,0.657095,0.0,0.0,1.0,0.826087
740,2022-01-31 20:00:00,0.000000,0.655746,0.639826,0.0,0.0,1.0,0.869565
741,2022-01-31 21:00:00,0.000000,0.522853,0.650484,0.0,0.0,1.0,0.913043
742,2022-01-31 22:00:00,0.000000,0.266050,0.675942,0.0,0.0,1.0,0.956522


## LightGBM

In [37]:
# optuna 설치
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.5-py3-none-any.whl (348 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.5/348.5 KB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting importlib-metadata<5.0.0
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 KB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting alembic>=1.5.0
  Downloading alembic-1.9.1-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.4/210.4 KB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (

In [38]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error

In [39]:
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score

In [40]:
X1 = X1.drop(columns = ['datetime'])
y1 = y1.drop(columns = ['datetime'])
X1_test = X1_test.drop(columns = ['datetime'])
y1_test = y1_test.drop(columns = ['datetime'])

In [41]:
# LigthGBM 하이퍼파라미터 값 지정
def objectiveLGBM(trial: Trial, X, y):
    param = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'learning_rate': 0.01,
        'n_estimators': trial.suggest_int('n_estimators', 700, 3000),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'gpu_use_dp':True
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

     # 학습 모델 생성
    model = LGBMClassifier(**param)
    lgb_model = model.fit(X_train, y_train, verbose=True) # 학습 진행
    train_preds = lgb_model.predict(X_train)
    test_preds = lgb_model.predict(X_test)

    # 모델 성능 확인
    train_precision = precision_score(y_test, test_preds, average= "macro")
    
    return train_precision

In [42]:
study1 = optuna.create_study(direction='maximize',sampler=TPESampler())
study1.optimize(lambda trial : objectiveLGBM(trial, X1, y1), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study1.best_trial.value,study1.best_trial.params))

[32m[I 2023-01-12 07:59:01,572][0m A new study created in memory with name: no-name-55d38e67-b8a1-4c3f-a383-836bf53aecba[0m
[32m[I 2023-01-12 07:59:06,793][0m Trial 0 finished with value: 0.9966467028510824 and parameters: {'num_leaves': 44, 'n_estimators': 1893, 'feature_fraction': 0.97678274927071, 'bagging_fraction': 0.9223108701260284, 'bagging_freq': 4, 'min_child_samples': 96}. Best is trial 0 with value: 0.9966467028510824.[0m
[32m[I 2023-01-12 07:59:17,968][0m Trial 1 finished with value: 0.9979166666666667 and parameters: {'num_leaves': 279, 'n_estimators': 1123, 'feature_fraction': 0.6592469077840453, 'bagging_fraction': 0.7591111095338128, 'bagging_freq': 3, 'min_child_samples': 36}. Best is trial 1 with value: 0.9979166666666667.[0m
[32m[I 2023-01-12 07:59:24,912][0m Trial 2 finished with value: 0.992729716520039 and parameters: {'num_leaves': 309, 'n_estimators': 1381, 'feature_fraction': 0.5743696477764312, 'bagging_fraction': 0.7149983428636966, 'bagging_freq'

Best trial: score 1.0,
params {'num_leaves': 506, 'n_estimators': 2837, 'feature_fraction': 0.9085929173844878, 'bagging_fraction': 0.474101423312134, 'bagging_freq': 5, 'min_child_samples': 24}


In [43]:
optuna.visualization.plot_param_importances(study1) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study1) # 최적화 과정 시각화

In [44]:
X1_train, X1_val, y1_train, y1_val = train_test_split(X1, y1, test_size = 0.2, random_state = 42)

In [45]:
X1_train.shape, X1_val.shape, y1_train.shape, y1_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [46]:
model = LGBMClassifier(**study1.best_trial.params)

In [47]:
model1 = model.fit(X1_train, y1_train,
          eval_set = [(X1_train, y1_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.732631	training's multi_logloss: 0.732631
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.648	training's multi_logloss: 0.648
[3]	training's multi_logloss: 0.565492	training's multi_logloss: 0.565492
[4]	training's multi_logloss: 0.497252	training's multi_logloss: 0.497252
[5]	training's multi_logloss: 0.439653	training's multi_logloss: 0.439653
[6]	training's multi_logloss: 0.398285	training's multi_logloss: 0.398285
[7]	training's multi_logloss: 0.358884	training's multi_logloss: 0.358884
[8]	training's multi_logloss: 0.324343	training's multi_logloss: 0.324343
[9]	training's multi_logloss: 0.296018	training's multi_logloss: 0.296018
[10]	training's multi_logloss: 0.265437	training's multi_logloss: 0.265437
[11]	training's multi_logloss: 0.238172	training's multi_logloss: 0.238172
[12]	training's multi_logloss: 0.2175	training's multi_logloss: 0.2175
[13]	training's multi_logloss: 0.195826	training's multi

In [48]:
train1_preds = model1.predict(X1_train)
val1_preds = model1.predict(X1_val)

In [49]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [50]:
get_clf_eval(y1_train, train1_preds)
get_clf_eval(y1_val, val1_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9966
재현율: 0.9649


In [51]:
preds_1 = model1.predict(X1_test)
preds_1

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 2., 1., 2., 3., 3.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 3., 2., 3., 3., 3., 1., 1., 2., 3., 1., 1., 1., 1., 1.,
       1., 1., 1., 3., 3., 3., 3., 1., 2., 1., 2., 1., 3., 3., 3., 3., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 2., 3., 3.,
       2., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 1., 1., 3., 3., 1., 3.,
       3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3.,
       2., 1., 3., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 3., 3., 3., 3., 3., 2., 3., 1., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 3., 3., 3., 2.

In [52]:
test_jan['classification'] = preds_1
test_jan

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-01-01,6,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,972,84.33,1.0,2022-01-01 00:00:00,2022,1,1,0
1,2022-01-01,6,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1107,83.67,1.0,2022-01-01 01:00:00,2022,1,1,1
2,2022-01-01,6,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,447,87.00,1.0,2022-01-01 02:00:00,2022,1,1,2
3,2022-01-01,6,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,356,87.83,1.0,2022-01-01 03:00:00,2022,1,1,3
4,2022-01-01,6,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,329,87.08,1.0,2022-01-01 04:00:00,2022,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-01-31,1,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2502,74.17,1.0,2022-01-31 19:00:00,2022,1,31,19
740,2022-01-31,1,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2716,72.42,1.0,2022-01-31 20:00:00,2022,1,31,20
741,2022-01-31,1,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2213,73.50,1.0,2022-01-31 21:00:00,2022,1,31,21
742,2022-01-31,1,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1241,76.08,1.0,2022-01-31 22:00:00,2022,1,31,22


# 2월 데이터 머신러닝

## 데이터 가공

In [53]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [54]:
X2 = train_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [55]:
y2 = train_feb[['datetime', 'classification']]
X2_1 = X2.drop(columns = ['datetime', 'classification'])
y2_1 = X2.datetime

In [56]:
X2_1_scaler = scaler.fit_transform(X2_1)

In [57]:
X2_1_sc = pd.DataFrame(X2_1_scaler)
X2_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2 = pd.concat([y2_1, X2_1_sc], axis = 1)
X2

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-02-01 00:00:00,0.666667,0.301282,0.855978,0.0,0.0,0.000000,0.000000
1,2019-02-01 01:00:00,0.666667,0.190828,0.861413,0.0,0.0,0.000000,0.043478
2,2019-02-01 02:00:00,0.666667,0.133629,0.878696,0.0,0.0,0.000000,0.086957
3,2019-02-01 03:00:00,0.666667,0.120316,0.862391,0.0,0.0,0.000000,0.130435
4,2019-02-01 04:00:00,0.666667,0.156558,0.851522,0.0,0.0,0.000000,0.173913
...,...,...,...,...,...,...,...,...
2035,2021-02-28 19:00:00,1.000000,0.653107,0.731957,1.0,0.0,0.964286,0.826087
2036,2021-02-28 20:00:00,1.000000,0.664694,0.759130,1.0,0.0,0.964286,0.869565
2037,2021-02-28 21:00:00,1.000000,0.638067,0.772717,1.0,0.0,0.964286,0.913043
2038,2021-02-28 22:00:00,1.000000,0.552761,0.803478,1.0,0.0,0.964286,0.956522


In [58]:
X2_test = test_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [59]:
y2_test = test_feb[['datetime', 'classification']]
X2_1_test = X2_test.drop(columns = ['datetime', 'classification'])
y2_1_test = X2_test.datetime

In [60]:
X2_1_test_scaler = scaler.fit_transform(X2_1_test)

In [61]:
X2_1_test_sc = pd.DataFrame(X2_1_test_scaler)
X2_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2_test = pd.concat([y2_1_test, X2_1_test_sc], axis = 1)
X2_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-02-01 00:00:00,0.166667,0.060219,0.844358,0.0,0.0,0.0,0.000000
1,2022-02-01 01:00:00,0.166667,0.020669,0.864283,0.0,0.0,0.0,0.043478
2,2022-02-01 02:00:00,0.166667,0.003828,0.850051,0.0,0.0,0.0,0.086957
3,2022-02-01 03:00:00,0.166667,0.000000,0.774223,0.0,0.0,0.0,0.130435
4,2022-02-01 04:00:00,0.166667,0.003062,0.755209,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
667,2022-02-28 19:00:00,0.000000,0.719571,0.281794,0.0,0.0,1.0,0.826087
668,2022-02-28 20:00:00,0.000000,0.695330,0.726745,0.0,0.0,1.0,0.869565
669,2022-02-28 21:00:00,0.000000,0.605767,0.756120,0.0,0.0,1.0,0.913043
670,2022-02-28 22:00:00,0.000000,0.426895,0.820676,0.0,0.0,1.0,0.956522


## LightGBM

In [62]:
X2 = X2.drop(columns = ['datetime'])
y2 = y2.drop(columns = ['datetime'])
X2_test = X2_test.drop(columns = ['datetime'])
y2_test = y2_test.drop(columns = ['datetime'])

In [63]:
study2 = optuna.create_study(direction='maximize',sampler=TPESampler())
study2.optimize(lambda trial : objectiveLGBM(trial, X2, y2), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study2.best_trial.value,study2.best_trial.params))

[32m[I 2023-01-12 08:00:29,577][0m A new study created in memory with name: no-name-454a8cc8-409c-4b00-bbfe-cd85ebeaee1e[0m
[32m[I 2023-01-12 08:00:33,184][0m Trial 0 finished with value: 1.0 and parameters: {'num_leaves': 303, 'n_estimators': 2957, 'feature_fraction': 0.8044728898390459, 'bagging_fraction': 0.767649283504859, 'bagging_freq': 2, 'min_child_samples': 76}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-01-12 08:00:35,521][0m Trial 1 finished with value: 1.0 and parameters: {'num_leaves': 125, 'n_estimators': 2526, 'feature_fraction': 0.4897249777192446, 'bagging_fraction': 0.4425062129508182, 'bagging_freq': 3, 'min_child_samples': 91}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-01-12 08:00:39,485][0m Trial 2 finished with value: 0.9978354978354979 and parameters: {'num_leaves': 458, 'n_estimators': 1383, 'feature_fraction': 0.47406103084054196, 'bagging_fraction': 0.4636523110470008, 'bagging_freq': 3, 'min_child_samples': 87}. Best is trial 0 with valu

Best trial: score 1.0,
params {'num_leaves': 303, 'n_estimators': 2957, 'feature_fraction': 0.8044728898390459, 'bagging_fraction': 0.767649283504859, 'bagging_freq': 2, 'min_child_samples': 76}


In [64]:
optuna.visualization.plot_param_importances(study2) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study2) # 최적화 과정 시각화

In [65]:
X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [66]:
X2_train.shape, X2_val.shape, y2_train.shape, y2_val.shape

((1632, 7), (408, 7), (1632, 1), (408, 1))

In [67]:
model = LGBMClassifier(**study2.best_trial.params)

In [68]:
model2 = model.fit(X2_train, y2_train,
          eval_set = [(X2_train, y2_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.743913	training's multi_logloss: 0.743913
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.658816	training's multi_logloss: 0.658816
[3]	training's multi_logloss: 0.573424	training's multi_logloss: 0.573424
[4]	training's multi_logloss: 0.503035	training's multi_logloss: 0.503035
[5]	training's multi_logloss: 0.443773	training's multi_logloss: 0.443773
[6]	training's multi_logloss: 0.403156	training's multi_logloss: 0.403156
[7]	training's multi_logloss: 0.36285	training's multi_logloss: 0.36285
[8]	training's multi_logloss: 0.33558	training's multi_logloss: 0.33558
[9]	training's multi_logloss: 0.306674	training's multi_logloss: 0.306674
[10]	training's multi_logloss: 0.277513	training's multi_logloss: 0.277513
[11]	training's multi_logloss: 0.248196	training's multi_logloss: 0.248196
[12]	training's multi_logloss: 0.226425	training's multi_logloss: 0.226425
[13]	training's multi_logloss: 0.207282	training's

In [69]:
train2_preds = model2.predict(X2_train)
val2_preds = model2.predict(X2_val)

In [70]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [71]:
get_clf_eval(y2_train, train2_preds)
get_clf_eval(y2_val, val2_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9977
재현율: 0.9867


In [72]:
preds_2= model2.predict(X2_test)
preds_2

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3.,
       3., 2., 1., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 1., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 3., 3., 3., 1., 1., 1., 2., 2., 1., 2., 3., 3., 3., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 2., 1., 2., 2., 3.,
       3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 3., 2., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       2., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3.,
       3., 3., 3., 2., 1., 1., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 3., 3., 3., 3., 3., 3., 3., 3., 1., 3., 3., 3.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 3., 2., 3.,
       3., 3., 2., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 3., 3., 3.

In [73]:
test_feb['classification'] = preds_2
test_feb

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-02-01,2,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,375,84.58,1.0,2022-02-01 00:00:00,2022,2,1,0
1,2022-02-01,2,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,220,86.33,1.0,2022-02-01 01:00:00,2022,2,1,1
2,2022-02-01,2,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,154,85.08,1.0,2022-02-01 02:00:00,2022,2,1,2
3,2022-02-01,2,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,139,78.42,1.0,2022-02-01 03:00:00,2022,2,1,3
4,2022-02-01,2,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,151,76.75,1.0,2022-02-01 04:00:00,2022,2,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-02-28,1,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2959,35.17,3.0,2022-02-28 19:00:00,2022,2,28,19
668,2022-02-28,1,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2864,74.25,1.0,2022-02-28 20:00:00,2022,2,28,20
669,2022-02-28,1,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2513,76.83,1.0,2022-02-28 21:00:00,2022,2,28,21
670,2022-02-28,1,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1812,82.50,1.0,2022-02-28 22:00:00,2022,2,28,22


# 3월 데이터 머신러닝

## 데이터 가공

In [74]:
X3 = train_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [75]:
y3 = train_mar[['datetime', 'classification']]
X3_1 = X3.drop(columns = ['datetime', 'classification'])
y3_1 = X3.datetime

In [76]:
X3_1_scaler = scaler.fit_transform(X3_1)

In [77]:
X3_1_sc = pd.DataFrame(X3_1_scaler)
X3_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3 = pd.concat([y3_1, X3_1_sc], axis = 1)
X3

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-03-01 00:00:00,0.666667,0.346811,0.838895,0.0,0.0,0.0,0.000000
1,2019-03-01 01:00:00,0.666667,0.243345,0.845304,0.0,0.0,0.0,0.043478
2,2019-03-01 02:00:00,0.666667,0.146911,0.874807,0.0,0.0,0.0,0.086957
3,2019-03-01 03:00:00,0.666667,0.115269,0.868287,0.0,0.0,0.0,0.130435
4,2019-03-01 04:00:00,0.666667,0.130085,0.838895,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-03-31 19:00:00,0.333333,0.757408,0.126188,1.0,0.0,1.0,0.826087
2228,2021-03-31 20:00:00,0.333333,0.681818,0.689724,1.0,0.0,1.0,0.869565
2229,2021-03-31 21:00:00,0.333333,0.697137,0.756022,1.0,0.0,1.0,0.913043
2230,2021-03-31 22:00:00,0.333333,0.673029,0.797459,1.0,0.0,1.0,0.956522


In [78]:
X3_test = test_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [79]:
y3_test = test_mar[['datetime', 'classification']]
X3_1_test = X3_test.drop(columns = ['datetime', 'classification'])
y3_1_test = X3_test.datetime

In [80]:
X3_1_test_scaler = scaler.fit_transform(X3_1_test)

In [81]:
X3_1_test_sc = pd.DataFrame(X3_1_test_scaler)
X3_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3_test = pd.concat([y3_1_test, X3_1_test_sc], axis = 1)
X3_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-03-01 00:00:00,0.166667,0.184175,0.981557,0.0,0.0,0.0,0.000000
1,2022-03-01 01:00:00,0.166667,0.093023,0.979466,0.0,0.0,0.0,0.043478
2,2022-03-01 02:00:00,0.166667,0.065758,0.887249,0.0,0.0,0.0,0.086957
3,2022-03-01 03:00:00,0.166667,0.054798,0.885282,0.0,0.0,0.0,0.130435
4,2022-03-01 04:00:00,0.166667,0.087944,0.861675,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-03-31 19:00:00,0.500000,0.734296,0.108570,0.0,0.0,1.0,0.826087
740,2022-03-31 20:00:00,0.500000,0.634857,0.781753,0.0,0.0,1.0,0.869565
741,2022-03-31 21:00:00,0.500000,0.643411,0.799213,0.0,0.0,1.0,0.913043
742,2022-03-31 22:00:00,0.500000,0.533547,0.861675,0.0,0.0,1.0,0.956522


## LightGBM

In [82]:
X3 = X3.drop(columns = ['datetime'])
y3 = y3.drop(columns = ['datetime'])
X3_test = X3_test.drop(columns = ['datetime'])
y3_test = y3_test.drop(columns = ['datetime'])

In [83]:
study3 = optuna.create_study(direction='maximize',sampler=TPESampler())
study3.optimize(lambda trial : objectiveLGBM(trial, X3, y3), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study3.best_trial.value,study3.best_trial.params))

[32m[I 2023-01-12 08:01:37,023][0m A new study created in memory with name: no-name-f298c560-91a4-415c-94c5-c5d6e6eac06f[0m
[32m[I 2023-01-12 08:01:39,678][0m Trial 0 finished with value: 0.998158379373849 and parameters: {'num_leaves': 152, 'n_estimators': 1894, 'feature_fraction': 0.8584626953538108, 'bagging_fraction': 0.5981536297150956, 'bagging_freq': 2, 'min_child_samples': 66}. Best is trial 0 with value: 0.998158379373849.[0m
[32m[I 2023-01-12 08:01:41,005][0m Trial 1 finished with value: 0.9846586345381526 and parameters: {'num_leaves': 506, 'n_estimators': 1012, 'feature_fraction': 0.6740839345329228, 'bagging_fraction': 0.7510157050338988, 'bagging_freq': 6, 'min_child_samples': 97}. Best is trial 0 with value: 0.998158379373849.[0m
[32m[I 2023-01-12 08:01:43,767][0m Trial 2 finished with value: 0.9979674796747968 and parameters: {'num_leaves': 105, 'n_estimators': 2180, 'feature_fraction': 0.7574490416537427, 'bagging_fraction': 0.8573410755774264, 'bagging_freq

Best trial: score 1.0,
params {'num_leaves': 289, 'n_estimators': 1664, 'feature_fraction': 0.9940446481108777, 'bagging_fraction': 0.7455359985223877, 'bagging_freq': 4, 'min_child_samples': 6}


In [84]:
optuna.visualization.plot_param_importances(study3) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study3) # 최적화 과정 시각화

In [85]:
X3_train, X3_val, y3_train, y3_val = train_test_split(X3, y3, test_size = 0.2, random_state = 42)

In [86]:
X3_train.shape, X3_val.shape, y3_train.shape, y3_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [87]:
model = LGBMClassifier(**study3.best_trial.params)

In [88]:
model3 = model.fit(X3_train, y3_train,
          eval_set = [(X3_train, y3_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.741664	training's multi_logloss: 0.741664
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.649845	training's multi_logloss: 0.649845
[3]	training's multi_logloss: 0.567381	training's multi_logloss: 0.567381
[4]	training's multi_logloss: 0.499055	training's multi_logloss: 0.499055
[5]	training's multi_logloss: 0.440948	training's multi_logloss: 0.440948
[6]	training's multi_logloss: 0.396065	training's multi_logloss: 0.396065
[7]	training's multi_logloss: 0.355043	training's multi_logloss: 0.355043
[8]	training's multi_logloss: 0.31902	training's multi_logloss: 0.31902
[9]	training's multi_logloss: 0.289	training's multi_logloss: 0.289
[10]	training's multi_logloss: 0.258904	training's multi_logloss: 0.258904
[11]	training's multi_logloss: 0.232392	training's multi_logloss: 0.232392
[12]	training's multi_logloss: 0.2108	training's multi_logloss: 0.2108
[13]	training's multi_logloss: 0.189609	training's multi_l

In [89]:
train3_preds = model3.predict(X3_train)
val3_preds = model3.predict(X3_val)

In [90]:
get_clf_eval(y3_train, train3_preds)
get_clf_eval(y3_val, val3_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9576
재현율: 0.9822


In [91]:
preds_3= model3.predict(X3_test)
preds_3

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 1.,
       1., 2., 2., 1., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 3., 3., 3., 3., 2., 2., 1., 2., 3., 3., 3., 3., 3.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 1., 2., 3., 2.,
       1., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 3., 2., 2., 1., 3., 3., 3., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 3.,
       3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3.,
       3., 3., 3., 1., 1., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 3., 3., 3., 1., 3., 2., 2., 3., 3., 3., 3., 3.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 3., 3., 3., 3.

In [92]:
test_mar['classification'] = preds_3
test_mar

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-03-01,2,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,915,90.00,1.0,2022-03-01 00:00:00,2022,3,1,0
1,2022-03-01,2,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,574,89.83,1.0,2022-03-01 01:00:00,2022,3,1,1
2,2022-03-01,2,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,472,82.33,1.0,2022-03-01 02:00:00,2022,3,1,2
3,2022-03-01,2,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,431,82.17,1.0,2022-03-01 03:00:00,2022,3,1,3
4,2022-03-01,2,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,555,80.25,1.0,2022-03-01 04:00:00,2022,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-03-31,4,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2973,19.00,3.0,2022-03-31 19:00:00,2022,3,31,19
740,2022-03-31,4,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2601,73.75,1.0,2022-03-31 20:00:00,2022,3,31,20
741,2022-03-31,4,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2633,75.17,1.0,2022-03-31 21:00:00,2022,3,31,21
742,2022-03-31,4,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2222,80.25,1.0,2022-03-31 22:00:00,2022,3,31,22


# 4월 데이터 머신러닝

## 데이터 가공

In [93]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [94]:
X4 = train_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [95]:
y4 = train_apr[['datetime', 'classification']]
X4_1 = X4.drop(columns = ['datetime', 'classification'])
y4_1 = X4.datetime

In [96]:
X4_1_scaler = scaler.fit_transform(X4_1)

In [97]:
X4_1_sc = pd.DataFrame(X4_1_scaler)
X4_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4 = pd.concat([y4_1, X4_1_sc], axis = 1)
X4

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-04-01 00:00:00,0.000000,0.199486,0.851117,0.0,0.0,0.0,0.000000
1,2019-04-01 01:00:00,0.000000,0.118252,0.851117,0.0,0.0,0.0,0.043478
2,2019-04-01 02:00:00,0.000000,0.077121,0.870191,0.0,0.0,0.0,0.086957
3,2019-04-01 03:00:00,0.000000,0.076350,0.848392,0.0,0.0,0.0,0.130435
4,2019-04-01 04:00:00,0.000000,0.154499,0.830191,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-04-30 19:00:00,0.666667,0.758098,0.148120,1.0,0.0,1.0,0.826087
2156,2021-04-30 20:00:00,0.666667,0.789203,0.140817,1.0,0.0,1.0,0.869565
2157,2021-04-30 21:00:00,0.666667,0.768895,0.179837,1.0,0.0,1.0,0.913043
2158,2021-04-30 22:00:00,0.666667,0.758098,0.335150,1.0,0.0,1.0,0.956522


In [98]:
X4_test = test_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [99]:
y4_test = test_apr[['datetime', 'classification']]
X4_1_test = X4_test.drop(columns = ['datetime', 'classification'])
y4_1_test = X4_test.datetime

In [100]:
X4_1_test_scaler = scaler.fit_transform(X4_1_test)

In [101]:
X4_1_test_sc = pd.DataFrame(X4_1_test_scaler)
X4_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4_test = pd.concat([y4_1_test, X4_1_test_sc], axis = 1)
X4_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-04-01 00:00:00,0.666667,0.176228,0.931091,0.0,0.0,0.0,0.000000
1,2022-04-01 01:00:00,0.666667,0.096624,0.957640,0.0,0.0,0.0,0.043478
2,2022-04-01 02:00:00,0.666667,0.055449,0.929204,0.0,0.0,0.0,0.086957
3,2022-04-01 03:00:00,0.666667,0.071644,0.910442,0.0,0.0,0.0,0.130435
4,2022-04-01 04:00:00,0.666667,0.138348,0.917404,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-04-30 19:00:00,0.833333,0.469668,0.579115,0.0,0.0,1.0,0.826087
716,2022-04-30 20:00:00,0.833333,0.728795,0.741357,0.0,0.0,1.0,0.869565
717,2022-04-30 21:00:00,0.833333,0.765852,0.534867,0.0,0.0,1.0,0.913043
718,2022-04-30 22:00:00,0.833333,0.648092,0.782655,0.0,0.0,1.0,0.956522


## LightGBM

In [102]:
X4 = X4.drop(columns = ['datetime'])
y4 = y4.drop(columns = ['datetime'])
X4_test = X4_test.drop(columns = ['datetime'])
y4_test = y4_test.drop(columns = ['datetime'])

In [103]:
study4 = optuna.create_study(direction='maximize',sampler=TPESampler())
study4.optimize(lambda trial : objectiveLGBM(trial, X4, y4), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study4.best_trial.value,study4.best_trial.params))

[32m[I 2023-01-12 08:02:57,740][0m A new study created in memory with name: no-name-da305222-e53b-411c-8f75-5abfc7538d11[0m
[32m[I 2023-01-12 08:02:59,836][0m Trial 0 finished with value: 1.0 and parameters: {'num_leaves': 450, 'n_estimators': 1871, 'feature_fraction': 0.9108788705421609, 'bagging_fraction': 0.44430256645966415, 'bagging_freq': 7, 'min_child_samples': 78}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-01-12 08:03:03,046][0m Trial 1 finished with value: 0.9984639016897082 and parameters: {'num_leaves': 460, 'n_estimators': 1787, 'feature_fraction': 0.6599584014266312, 'bagging_fraction': 0.6769015596606718, 'bagging_freq': 2, 'min_child_samples': 71}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-01-12 08:03:04,690][0m Trial 2 finished with value: 0.9965457685664939 and parameters: {'num_leaves': 437, 'n_estimators': 851, 'feature_fraction': 0.8236750496907064, 'bagging_fraction': 0.8592571886957783, 'bagging_freq': 3, 'min_child_samples': 54}. Best is tr

Best trial: score 1.0,
params {'num_leaves': 450, 'n_estimators': 1871, 'feature_fraction': 0.9108788705421609, 'bagging_fraction': 0.44430256645966415, 'bagging_freq': 7, 'min_child_samples': 78}


In [104]:
optuna.visualization.plot_param_importances(study4) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study4) # 최적화 과정 시각화

In [105]:
X4_train, X4_val, y4_train, y4_val = train_test_split(X4, y4, test_size = 0.2, random_state = 42)

In [106]:
X4_train.shape, X4_val.shape, y4_train.shape, y4_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [107]:
model = LGBMClassifier(**study4.best_trial.params)

In [108]:
model4 = model.fit(X4_train, y4_train,
          eval_set = [(X4_train, y4_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.753342	training's multi_logloss: 0.753342
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.673821	training's multi_logloss: 0.673821
[3]	training's multi_logloss: 0.593395	training's multi_logloss: 0.593395
[4]	training's multi_logloss: 0.522011	training's multi_logloss: 0.522011
[5]	training's multi_logloss: 0.462198	training's multi_logloss: 0.462198
[6]	training's multi_logloss: 0.421329	training's multi_logloss: 0.421329
[7]	training's multi_logloss: 0.380003	training's multi_logloss: 0.380003
[8]	training's multi_logloss: 0.343966	training's multi_logloss: 0.343966
[9]	training's multi_logloss: 0.31558	training's multi_logloss: 0.31558
[10]	training's multi_logloss: 0.284188	training's multi_logloss: 0.284188
[11]	training's multi_logloss: 0.256534	training's multi_logloss: 0.256534
[12]	training's multi_logloss: 0.234477	training's multi_logloss: 0.234477
[13]	training's multi_logloss: 0.211476	training

In [109]:
train4_preds = model4.predict(X4_train)
val4_preds = model4.predict(X4_val)

In [110]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [111]:
get_clf_eval(y4_train, train4_preds)
get_clf_eval(y4_val, val4_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [112]:
preds_4= model4.predict(X4_test)
preds_4

array([1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       3., 3., 3., 2., 1., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 3., 2., 1., 1., 3., 3., 3., 1.,
       3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3.,
       1., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 3., 3., 3., 3., 3., 3., 3., 2., 3., 3., 3., 3., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3.,
       3., 3., 3., 3., 2., 2., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [113]:
test_apr['classification'] = preds_4
test_apr

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-04-01,5,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,938,87.58,1.0,2022-04-01 00:00:00,2022,4,1,0
1,2022-04-01,5,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,648,89.83,1.0,2022-04-01 01:00:00,2022,4,1,1
2,2022-04-01,5,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,498,87.42,1.0,2022-04-01 02:00:00,2022,4,1,2
3,2022-04-01,5,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,557,85.83,1.0,2022-04-01 03:00:00,2022,4,1,3
4,2022-04-01,5,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,800,86.42,1.0,2022-04-01 04:00:00,2022,4,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-04-30,6,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2007,57.75,1.0,2022-04-30 19:00:00,2022,4,30,19
716,2022-04-30,6,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2951,71.50,1.0,2022-04-30 20:00:00,2022,4,30,20
717,2022-04-30,6,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3086,54.00,1.0,2022-04-30 21:00:00,2022,4,30,21
718,2022-04-30,6,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2657,75.00,1.0,2022-04-30 22:00:00,2022,4,30,22


# 5월 데이터 머신러닝

## 데이터 가공

In [114]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [115]:
X5 = train_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [116]:
y5 = train_may[['datetime', 'classification']]
X5_1 = X5.drop(columns = ['datetime', 'classification'])
y5_1 = X5.datetime

In [117]:
X5_1_scaler = scaler.fit_transform(X5_1)

In [118]:
X5_1_sc = pd.DataFrame(X5_1_scaler)
X5_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5 = pd.concat([y5_1, X5_1_sc], axis = 1)
X5

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-05-01 00:00:00,0.333333,0.375324,0.826252,0.0,0.0,0.0,0.000000
1,2019-05-01 01:00:00,0.333333,0.242510,0.877328,0.0,0.0,0.0,0.043478
2,2019-05-01 02:00:00,0.333333,0.149564,0.868964,0.0,0.0,0.0,0.086957
3,2019-05-01 03:00:00,0.333333,0.118424,0.888480,0.0,0.0,0.0,0.130435
4,2019-05-01 04:00:00,0.333333,0.150035,0.869856,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-05-31 19:00:00,0.000000,0.622552,0.334560,1.0,0.0,1.0,0.826087
2228,2021-05-31 20:00:00,0.000000,0.595895,0.776960,1.0,0.0,1.0,0.869565
2229,2021-05-31 21:00:00,0.000000,0.567351,0.778856,1.0,0.0,1.0,0.913043
2230,2021-05-31 22:00:00,0.000000,0.531021,0.821568,1.0,0.0,1.0,0.956522


In [119]:
X5_test = test_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [120]:
y5_test = test_may[['datetime', 'classification']]
X5_1_test = X5_test.drop(columns = ['datetime', 'classification'])
y5_1_test = X5_test.datetime

In [121]:
X5_1_test_scaler = scaler.fit_transform(X5_1_test)

In [122]:
X5_1_test_sc = pd.DataFrame(X5_1_test_scaler)
X5_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5_test = pd.concat([y5_1_test, X5_1_test_sc], axis = 1)
X5_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-05-01 00:00:00,1.000000,0.200856,0.957159,0.0,0.0,0.0,0.000000
1,2022-05-01 01:00:00,1.000000,0.105108,0.969686,0.0,0.0,0.0,0.043478
2,2022-05-01 02:00:00,1.000000,0.032094,0.986471,0.0,0.0,0.0,0.086957
3,2022-05-01 03:00:00,1.000000,0.020594,0.979081,0.0,0.0,0.0,0.130435
4,2022-05-01 04:00:00,1.000000,0.025408,0.974947,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-05-31 19:00:00,0.166667,0.727735,0.164850,0.0,0.0,1.0,0.826087
740,2022-05-31 20:00:00,0.166667,0.704734,0.364274,0.0,0.0,1.0,0.869565
741,2022-05-31 21:00:00,0.166667,0.701524,0.767255,0.0,0.0,1.0,0.913043
742,2022-05-31 22:00:00,0.166667,0.654988,0.626331,0.0,0.0,1.0,0.956522


## LightGBM

In [123]:
X5 = X5.drop(columns = ['datetime'])
y5 = y5.drop(columns = ['datetime'])
X5_test = X5_test.drop(columns = ['datetime'])
y5_test = y5_test.drop(columns = ['datetime'])

In [124]:
study5 = optuna.create_study(direction='maximize',sampler=TPESampler())
study5.optimize(lambda trial : objectiveLGBM(trial, X5, y5), n_trials=20) 
print('Best trial: score {},\nparams {}'.format(study5.best_trial.value,study5.best_trial.params))

[32m[I 2023-01-12 08:03:56,567][0m A new study created in memory with name: no-name-2c61be39-3856-4bff-a77b-b9ee851c8213[0m
[32m[I 2023-01-12 08:03:59,265][0m Trial 0 finished with value: 0.9939133548156104 and parameters: {'num_leaves': 39, 'n_estimators': 1683, 'feature_fraction': 0.8682327395547423, 'bagging_fraction': 0.6959990604335511, 'bagging_freq': 6, 'min_child_samples': 57}. Best is trial 0 with value: 0.9939133548156104.[0m
[32m[I 2023-01-12 08:04:02,934][0m Trial 1 finished with value: 1.0 and parameters: {'num_leaves': 201, 'n_estimators': 1206, 'feature_fraction': 0.6497695918901321, 'bagging_fraction': 0.43175917066223074, 'bagging_freq': 4, 'min_child_samples': 11}. Best is trial 1 with value: 1.0.[0m
[32m[I 2023-01-12 08:04:05,280][0m Trial 2 finished with value: 0.9885057471264368 and parameters: {'num_leaves': 275, 'n_estimators': 991, 'feature_fraction': 0.5094824231474333, 'bagging_fraction': 0.7856694349668039, 'bagging_freq': 6, 'min_child_samples': 3

Best trial: score 1.0,
params {'num_leaves': 201, 'n_estimators': 1206, 'feature_fraction': 0.6497695918901321, 'bagging_fraction': 0.43175917066223074, 'bagging_freq': 4, 'min_child_samples': 11}


In [125]:
optuna.visualization.plot_param_importances(study5) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study5) # 최적화 과정 시각화

In [126]:
X5_train, X5_val, y5_train, y5_val = train_test_split(X5, y5, test_size = 0.2, random_state = 42)

In [127]:
X5_train.shape, X5_val.shape, y5_train.shape, y5_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [128]:
model = LGBMClassifier(**study5.best_trial.params)

In [129]:
model5 = model.fit(X5_train, y5_train,
          eval_set = [(X5_train, y5_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.819885	training's multi_logloss: 0.819885
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.693851	training's multi_logloss: 0.693851
[3]	training's multi_logloss: 0.613432	training's multi_logloss: 0.613432
[4]	training's multi_logloss: 0.544207	training's multi_logloss: 0.544207
[5]	training's multi_logloss: 0.478762	training's multi_logloss: 0.478762
[6]	training's multi_logloss: 0.430684	training's multi_logloss: 0.430684
[7]	training's multi_logloss: 0.389941	training's multi_logloss: 0.389941
[8]	training's multi_logloss: 0.351809	training's multi_logloss: 0.351809
[9]	training's multi_logloss: 0.322747	training's multi_logloss: 0.322747
[10]	training's multi_logloss: 0.293728	training's multi_logloss: 0.293728
[11]	training's multi_logloss: 0.275003	training's multi_logloss: 0.275003
[12]	training's multi_logloss: 0.256451	training's multi_logloss: 0.256451
[13]	training's multi_logloss: 0.233433	traini

In [130]:
train5_preds = model5.predict(X5_train)
val5_preds = model5.predict(X5_val)

In [131]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [132]:
get_clf_eval(y5_train, train5_preds)
get_clf_eval(y5_val, val5_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [133]:
preds_5= model5.predict(X5_test)
preds_5

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3.,
       2., 3., 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 3., 3., 3., 3., 1., 1., 1., 1., 2., 3., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 2., 3.,
       3., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 3., 3., 3., 3.

In [134]:
test_may['classification'] = preds_5
test_may

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-05-01,7,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1119,87.83,1.0,2022-05-01 00:00:00,2022,5,1,0
1,2022-05-01,7,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,761,88.83,1.0,2022-05-01 01:00:00,2022,5,1,1
2,2022-05-01,7,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,488,90.17,1.0,2022-05-01 02:00:00,2022,5,1,2
3,2022-05-01,7,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,445,89.58,1.0,2022-05-01 03:00:00,2022,5,1,3
4,2022-05-01,7,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,463,89.25,1.0,2022-05-01 04:00:00,2022,5,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-05-31,2,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3089,24.58,3.0,2022-05-31 19:00:00,2022,5,31,19
740,2022-05-31,2,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3003,40.50,2.0,2022-05-31 20:00:00,2022,5,31,20
741,2022-05-31,2,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2991,72.67,1.0,2022-05-31 21:00:00,2022,5,31,21
742,2022-05-31,2,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2817,61.42,1.0,2022-05-31 22:00:00,2022,5,31,22


# 6월 데이터 머신러닝

## 데이터 가공

In [135]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [136]:
X6 = train_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [137]:
y6 = train_jun[['datetime', 'classification']]
X6_1 = X6.drop(columns = ['datetime', 'classification'])
y6_1 = X6.datetime

In [138]:
X6_1_scaler = scaler.fit_transform(X6_1)

In [139]:
X6_1_sc = pd.DataFrame(X6_1_scaler)
X6_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6 = pd.concat([y6_1, X6_1_sc], axis = 1)
X6

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-06-01 00:00:00,0.833333,0.345244,0.758031,0.0,0.0,0.0,0.000000
1,2019-06-01 01:00:00,0.833333,0.227206,0.824263,0.0,0.0,0.0,0.043478
2,2019-06-01 02:00:00,0.833333,0.136767,0.861022,0.0,0.0,0.0,0.086957
3,2019-06-01 03:00:00,0.833333,0.124938,0.870295,0.0,0.0,0.0,0.130435
4,2019-06-01 04:00:00,0.833333,0.156235,0.841704,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-06-30 19:00:00,0.333333,0.644899,0.336682,1.0,0.0,1.0,0.826087
2156,2021-06-30 20:00:00,0.333333,0.635042,0.746992,1.0,0.0,1.0,0.869565
2157,2021-06-30 21:00:00,0.333333,0.671267,0.708356,1.0,0.0,1.0,0.913043
2158,2021-06-30 22:00:00,0.333333,0.560867,0.795783,1.0,0.0,1.0,0.956522


In [140]:
X6_test = test_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [141]:
y6_test = test_jun[['datetime', 'classification']]
X6_1_test = X6_test.drop(columns = ['datetime', 'classification'])
y6_1_test = X6_test.datetime

In [142]:
X6_1_test_scaler = scaler.fit_transform(X6_1_test)

In [143]:
X6_1_test_sc = pd.DataFrame(X6_1_test_scaler)
X6_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6_test = pd.concat([y6_1_test, X6_1_test_sc], axis = 1)
X6_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-06-01 00:00:00,0.333333,0.306429,0.912469,0.0,0.0,0.0,0.000000
1,2022-06-01 01:00:00,0.333333,0.176733,0.926914,0.0,0.0,0.0,0.043478
2,2022-06-01 02:00:00,0.333333,0.094072,0.989630,0.0,0.0,0.0,0.086957
3,2022-06-01 03:00:00,0.333333,0.068745,0.970123,0.0,0.0,0.0,0.130435
4,2022-06-01 04:00:00,0.333333,0.129140,0.938272,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-06-30 19:00:00,0.500000,0.450320,0.691358,0.0,0.0,1.0,0.826087
716,2022-06-30 20:00:00,0.500000,0.381854,0.696420,0.0,0.0,1.0,0.869565
717,2022-06-30 21:00:00,0.500000,0.396048,0.759259,0.0,0.0,1.0,0.913043
718,2022-06-30 22:00:00,0.500000,0.394935,0.814815,0.0,0.0,1.0,0.956522


## LightGBM

In [144]:
X6 = X6.drop(columns = ['datetime'])
y6 = y6.drop(columns = ['datetime'])
X6_test = X6_test.drop(columns = ['datetime'])
y6_test = y6_test.drop(columns = ['datetime'])

In [145]:
study6 = optuna.create_study(direction='maximize',sampler=TPESampler())
study6.optimize(lambda trial : objectiveLGBM(trial, X6, y6), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study6.best_trial.value,study6.best_trial.params))

[32m[I 2023-01-12 08:05:05,434][0m A new study created in memory with name: no-name-ec142747-13ee-4616-85c9-6bf1b18edb79[0m
[32m[I 2023-01-12 08:05:07,544][0m Trial 0 finished with value: 0.9984051036682615 and parameters: {'num_leaves': 95, 'n_estimators': 1550, 'feature_fraction': 0.9245673565703048, 'bagging_fraction': 0.5412971305491867, 'bagging_freq': 6, 'min_child_samples': 60}. Best is trial 0 with value: 0.9984051036682615.[0m
[32m[I 2023-01-12 08:05:10,138][0m Trial 1 finished with value: 0.9969969969969971 and parameters: {'num_leaves': 299, 'n_estimators': 1130, 'feature_fraction': 0.9247541400738318, 'bagging_fraction': 0.5731523751758392, 'bagging_freq': 6, 'min_child_samples': 26}. Best is trial 0 with value: 0.9984051036682615.[0m
[32m[I 2023-01-12 08:05:13,697][0m Trial 2 finished with value: 0.9666666666666667 and parameters: {'num_leaves': 80, 'n_estimators': 981, 'feature_fraction': 0.7953449385373434, 'bagging_fraction': 0.6156496967723301, 'bagging_freq

Best trial: score 1.0,
params {'num_leaves': 262, 'n_estimators': 1614, 'feature_fraction': 0.9982794913272247, 'bagging_fraction': 0.5013884938484308, 'bagging_freq': 4, 'min_child_samples': 89}


In [146]:
optuna.visualization.plot_param_importances(study6) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study6) # 최적화 과정 시각화

In [147]:
X6_train, X6_val, y6_train, y6_val = train_test_split(X6, y6, test_size = 0.2, random_state = 42)

In [148]:
X6_train.shape, X6_val.shape, y6_train.shape, y6_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [149]:
model = LGBMClassifier(**study6.best_trial.params)

In [150]:
model6 = model.fit(X6_train, y6_train,
          eval_set = [(X6_train, y6_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.746666	training's multi_logloss: 0.746666
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.667399	training's multi_logloss: 0.667399
[3]	training's multi_logloss: 0.583192	training's multi_logloss: 0.583192
[4]	training's multi_logloss: 0.515256	training's multi_logloss: 0.515256
[5]	training's multi_logloss: 0.45642	training's multi_logloss: 0.45642
[6]	training's multi_logloss: 0.415254	training's multi_logloss: 0.415254
[7]	training's multi_logloss: 0.374374	training's multi_logloss: 0.374374
[8]	training's multi_logloss: 0.338491	training's multi_logloss: 0.338491
[9]	training's multi_logloss: 0.31076	training's multi_logloss: 0.31076
[10]	training's multi_logloss: 0.279323	training's multi_logloss: 0.279323
[11]	training's multi_logloss: 0.251416	training's multi_logloss: 0.251416
[12]	training's multi_logloss: 0.230722	training's multi_logloss: 0.230722
[13]	training's multi_logloss: 0.20816	training's 

In [151]:
train6_preds = model6.predict(X6_train)
val6_preds = model6.predict(X6_val)

In [152]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [153]:
get_clf_eval(y6_train, train6_preds)
get_clf_eval(y6_val, val6_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [154]:
preds_6= model6.predict(X6_test)
preds_6

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 3.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3.,
       3., 3., 3., 2., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 2., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3.,
       3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3.,
       3., 3., 3., 3., 1., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 2., 3., 3., 3., 3., 3.,
       2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3.,
       3., 2., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 3., 3., 3., 3.

In [155]:
test_jun['classification'] = preds_6
test_jun

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-06-01,3,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1524,84.08,1.0,2022-06-01 00:00:00,2022,6,1,0
1,2022-06-01,3,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1058,85.25,1.0,2022-06-01 01:00:00,2022,6,1,1
2,2022-06-01,3,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,761,90.33,1.0,2022-06-01 02:00:00,2022,6,1,2
3,2022-06-01,3,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,670,88.75,1.0,2022-06-01 03:00:00,2022,6,1,3
4,2022-06-01,3,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,887,86.17,1.0,2022-06-01 04:00:00,2022,6,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-06-30,4,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2041,66.17,1.0,2022-06-30 19:00:00,2022,6,30,19
716,2022-06-30,4,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1795,66.58,1.0,2022-06-30 20:00:00,2022,6,30,20
717,2022-06-30,4,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1846,71.67,1.0,2022-06-30 21:00:00,2022,6,30,21
718,2022-06-30,4,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1842,76.17,1.0,2022-06-30 22:00:00,2022,6,30,22


# 7월 데이터 머신러닝

## 데이터 가공

In [156]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [157]:
X7 = train_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [158]:
y7 = train_jul[['datetime', 'classification']]
X7_1 = X7.drop(columns = ['datetime', 'classification'])
y7_1 = X7.datetime

In [159]:
X7_1_scaler = scaler.fit_transform(X7_1)

In [160]:
X7_1_sc = pd.DataFrame(X7_1_scaler)
X7_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7 = pd.concat([y7_1, X7_1_sc], axis = 1)
X7

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-07-01 00:00:00,0.000000,0.213879,0.797766,0.0,0.0,0.0,0.000000
1,2019-07-01 01:00:00,0.000000,0.102389,0.840914,0.0,0.0,0.0,0.043478
2,2019-07-01 02:00:00,0.000000,0.074858,0.823147,0.0,0.0,0.0,0.086957
3,2019-07-01 03:00:00,0.000000,0.068942,0.806193,0.0,0.0,0.0,0.130435
4,2019-07-01 04:00:00,0.000000,0.173606,0.810457,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-07-31 19:00:00,0.833333,0.444369,0.769848,1.0,0.0,1.0,0.826087
2228,2021-07-31 20:00:00,0.833333,0.428896,0.757970,1.0,0.0,1.0,0.869565
2229,2021-07-31 21:00:00,0.833333,0.448009,0.756345,1.0,0.0,1.0,0.913043
2230,2021-07-31 22:00:00,0.833333,0.484414,0.775736,1.0,0.0,1.0,0.956522


In [161]:
X7_test = test_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [162]:
y7_test = test_jul[['datetime', 'classification']]
X7_1_test = X7_test.drop(columns = ['datetime', 'classification'])
y7_1_test = X7_test.datetime

In [163]:
X7_1_test_scaler = scaler.fit_transform(X7_1_test)

In [164]:
X7_1_test_sc = pd.DataFrame(X7_1_test_scaler)
X7_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7_test = pd.concat([y7_1_test, X7_1_test_sc], axis = 1)
X7_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-07-01 00:00:00,0.666667,0.145562,0.913380,0.0,0.0,0.0,0.000000
1,2022-07-01 01:00:00,0.666667,0.092425,0.937576,0.0,0.0,0.0,0.043478
2,2022-07-01 02:00:00,0.666667,0.047202,0.921365,0.0,0.0,0.0,0.086957
3,2022-07-01 03:00:00,0.666667,0.046354,0.894145,0.0,0.0,0.0,0.130435
4,2022-07-01 04:00:00,0.666667,0.135105,0.879023,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-07-31 19:00:00,1.000000,0.622668,0.602831,0.0,0.0,1.0,0.826087
740,2022-07-31 20:00:00,1.000000,0.721594,0.369949,0.0,0.0,1.0,0.869565
741,2022-07-31 21:00:00,1.000000,0.611362,0.467699,0.0,0.0,1.0,0.913043
742,2022-07-31 22:00:00,1.000000,0.488977,0.687515,0.0,0.0,1.0,0.956522


## LightGBM

In [165]:
X7 = X7.drop(columns = ['datetime'])
y7 = y7.drop(columns = ['datetime'])
X7_test = X7_test.drop(columns = ['datetime'])
y7_test = y7_test.drop(columns = ['datetime'])

In [166]:
study7 = optuna.create_study(direction='maximize',sampler=TPESampler())
study7.optimize(lambda trial : objectiveLGBM(trial, X7, y7), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study7.best_trial.value,study7.best_trial.params))

[32m[I 2023-01-12 08:05:56,223][0m A new study created in memory with name: no-name-e9b3c591-d18a-4460-b2d6-a57f4c1d659a[0m
[32m[I 2023-01-12 08:05:59,906][0m Trial 0 finished with value: 0.9967159277504104 and parameters: {'num_leaves': 304, 'n_estimators': 1749, 'feature_fraction': 0.8172742497292171, 'bagging_fraction': 0.8889011307918517, 'bagging_freq': 5, 'min_child_samples': 45}. Best is trial 0 with value: 0.9967159277504104.[0m
[32m[I 2023-01-12 08:06:02,737][0m Trial 1 finished with value: 1.0 and parameters: {'num_leaves': 49, 'n_estimators': 1996, 'feature_fraction': 0.9779768728586129, 'bagging_fraction': 0.43337478055636913, 'bagging_freq': 7, 'min_child_samples': 34}. Best is trial 1 with value: 1.0.[0m
[32m[I 2023-01-12 08:06:04,068][0m Trial 2 finished with value: 1.0 and parameters: {'num_leaves': 16, 'n_estimators': 903, 'feature_fraction': 0.6951179945767609, 'bagging_fraction': 0.8981334743370892, 'bagging_freq': 3, 'min_child_samples': 71}. Best is tria

Best trial: score 1.0,
params {'num_leaves': 49, 'n_estimators': 1996, 'feature_fraction': 0.9779768728586129, 'bagging_fraction': 0.43337478055636913, 'bagging_freq': 7, 'min_child_samples': 34}


In [167]:
optuna.visualization.plot_param_importances(study7) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study7) # 최적화 과정 시각화

In [168]:
X7_train, X7_val, y7_train, y7_val = train_test_split(X7, y7, test_size = 0.2, random_state = 42)

In [169]:
X7_train.shape, X7_val.shape, y7_train.shape, y7_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [170]:
model = LGBMClassifier(**study7.best_trial.params)

In [171]:
model7 = model.fit(X7_train, y7_train,
          eval_set = [(X7_train, y7_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.726707	training's multi_logloss: 0.726707
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.640857	training's multi_logloss: 0.640857
[3]	training's multi_logloss: 0.55951	training's multi_logloss: 0.55951
[4]	training's multi_logloss: 0.49187	training's multi_logloss: 0.49187
[5]	training's multi_logloss: 0.434567	training's multi_logloss: 0.434567
[6]	training's multi_logloss: 0.392456	training's multi_logloss: 0.392456
[7]	training's multi_logloss: 0.352458	training's multi_logloss: 0.352458
[8]	training's multi_logloss: 0.317605	training's multi_logloss: 0.317605
[9]	training's multi_logloss: 0.288958	training's multi_logloss: 0.288958
[10]	training's multi_logloss: 0.258513	training's multi_logloss: 0.258513
[11]	training's multi_logloss: 0.231731	training's multi_logloss: 0.231731
[12]	training's multi_logloss: 0.210608	training's multi_logloss: 0.210608
[13]	training's multi_logloss: 0.189363	training's

In [172]:
train7_preds = model7.predict(X7_train)
val7_preds = model7.predict(X7_val)

In [173]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [174]:
get_clf_eval(y7_train, train7_preds)
get_clf_eval(y7_val, val7_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9983
재현율: 0.9872


In [175]:
preds_7= model7.predict(X7_test)
preds_7

array([1., 1., 1., 1., 1., 1., 2., 3., 3., 2., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 1., 1.,
       3., 1., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 2., 3., 3., 3., 3., 1., 1.,
       2., 3., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 1., 2., 3., 3.,
       2., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 3., 3., 1., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 2., 3.,
       1., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [176]:
test_jul['classification'] = preds_7
test_jul

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-07-01,5,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,870,85.17,1.0,2022-07-01 00:00:00,2022,7,1,0
1,2022-07-01,5,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,682,87.17,1.0,2022-07-01 01:00:00,2022,7,1,1
2,2022-07-01,5,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,522,85.83,1.0,2022-07-01 02:00:00,2022,7,1,2
3,2022-07-01,5,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,519,83.58,1.0,2022-07-01 03:00:00,2022,7,1,3
4,2022-07-01,5,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,833,82.33,1.0,2022-07-01 04:00:00,2022,7,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-07-31,7,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2558,59.50,1.0,2022-07-31 19:00:00,2022,7,31,19
740,2022-07-31,7,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2908,40.25,2.0,2022-07-31 20:00:00,2022,7,31,20
741,2022-07-31,7,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2518,48.33,2.0,2022-07-31 21:00:00,2022,7,31,21
742,2022-07-31,7,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2085,66.50,1.0,2022-07-31 22:00:00,2022,7,31,22


# 8월 데이터 머신러닝

## 데이터 가공

In [177]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [178]:
X8 = train_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [179]:
y8 = train_aug[['datetime', 'classification']]
X8_1 = X8.drop(columns = ['datetime', 'classification'])
y8_1 = X8.datetime

In [180]:
X8_1_scaler = scaler.fit_transform(X8_1)

In [181]:
X8_1_sc = pd.DataFrame(X8_1_scaler)
X8_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8 = pd.concat([y8_1, X8_1_sc], axis = 1)
X8

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-08-01 00:00:00,0.500000,0.255410,0.728526,0.0,0.0,0.0,0.000000
1,2019-08-01 01:00:00,0.500000,0.132360,0.619549,0.0,0.0,0.0,0.043478
2,2019-08-01 02:00:00,0.500000,0.084298,0.656215,0.0,0.0,0.0,0.086957
3,2019-08-01 03:00:00,0.500000,0.079014,0.669799,0.0,0.0,0.0,0.130435
4,2019-08-01 04:00:00,0.500000,0.138148,0.692779,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2227,2021-08-31 19:00:00,0.166667,0.727730,0.098662,1.0,0.0,1.0,0.826087
2228,2021-08-31 20:00:00,0.166667,0.719930,0.178736,1.0,0.0,1.0,0.869565
2229,2021-08-31 21:00:00,0.166667,0.616004,0.274027,1.0,0.0,1.0,0.913043
2230,2021-08-31 22:00:00,0.166667,0.418470,0.608518,1.0,0.0,1.0,0.956522


In [182]:
X8_test = test_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [183]:
y8_test = test_aug[['datetime', 'classification']]
X8_1_test = X8_test.drop(columns = ['datetime', 'classification'])
y8_1_test = X8_test.datetime

In [184]:
X8_1_test_scaler = scaler.fit_transform(X8_1_test)

In [185]:
X8_1_test_sc = pd.DataFrame(X8_1_test_scaler)
X8_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8_test = pd.concat([y8_1_test, X8_1_test_sc], axis = 1)
X8_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-08-01 00:00:00,0.000000,0.174714,0.887850,0.0,0.0,0.0,0.000000
1,2022-08-01 01:00:00,0.000000,0.106272,0.937695,0.0,0.0,0.0,0.043478
2,2022-08-01 02:00:00,0.000000,0.069189,0.903427,0.0,0.0,0.0,0.086957
3,2022-08-01 03:00:00,0.000000,0.091837,0.795389,0.0,0.0,0.0,0.130435
4,2022-08-01 04:00:00,0.000000,0.153559,0.766355,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-08-31 19:00:00,0.333333,0.729965,0.130841,0.0,0.0,1.0,0.826087
740,2022-08-31 20:00:00,0.333333,0.703584,0.340561,0.0,0.0,1.0,0.869565
741,2022-08-31 21:00:00,0.333333,0.661274,0.552399,0.0,0.0,1.0,0.913043
742,2022-08-31 22:00:00,0.333333,0.548532,0.618941,0.0,0.0,1.0,0.956522


## LightGBM

In [186]:
X8 = X8.drop(columns = ['datetime'])
y8 = y8.drop(columns = ['datetime'])
X8_test = X8_test.drop(columns = ['datetime'])
y8_test = y8_test.drop(columns = ['datetime'])

In [187]:
study8 = optuna.create_study(direction='maximize',sampler=TPESampler())
study8.optimize(lambda trial : objectiveLGBM(trial, X8, y8), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study8.best_trial.value,study8.best_trial.params))

[32m[I 2023-01-12 08:06:53,068][0m A new study created in memory with name: no-name-cb1f3a9d-31a6-4815-a0d0-f8fa13009c4b[0m
[32m[I 2023-01-12 08:06:56,800][0m Trial 0 finished with value: 0.9952598722415796 and parameters: {'num_leaves': 74, 'n_estimators': 2408, 'feature_fraction': 0.8030996112630261, 'bagging_fraction': 0.7324268772300233, 'bagging_freq': 3, 'min_child_samples': 46}. Best is trial 0 with value: 0.9952598722415796.[0m
[32m[I 2023-01-12 08:07:00,285][0m Trial 1 finished with value: 0.9833333333333334 and parameters: {'num_leaves': 335, 'n_estimators': 1679, 'feature_fraction': 0.87971049539669, 'bagging_fraction': 0.5946372955690611, 'bagging_freq': 3, 'min_child_samples': 27}. Best is trial 0 with value: 0.9952598722415796.[0m
[32m[I 2023-01-12 08:07:09,463][0m Trial 2 finished with value: 0.9984051036682615 and parameters: {'num_leaves': 176, 'n_estimators': 1896, 'feature_fraction': 0.4456116443398696, 'bagging_fraction': 0.9035611058008589, 'bagging_freq

Best trial: score 1.0,
params {'num_leaves': 289, 'n_estimators': 992, 'feature_fraction': 0.6169208240293004, 'bagging_fraction': 0.6834294272675097, 'bagging_freq': 7, 'min_child_samples': 89}


In [188]:
optuna.visualization.plot_param_importances(study8) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study8) # 최적화 과정 시각화

In [189]:
X8_train, X8_val, y8_train, y8_val = train_test_split(X8, y8, test_size = 0.2, random_state = 42)

In [190]:
X8_train.shape, X8_val.shape, y8_train.shape, y8_val.shape

((1785, 7), (447, 7), (1785, 1), (447, 1))

In [191]:
model = LGBMClassifier(**study8.best_trial.params)

In [192]:
model8 = model.fit(X8_train, y8_train,
          eval_set = [(X8_train, y8_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.795351	training's multi_logloss: 0.795351
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.683424	training's multi_logloss: 0.683424
[3]	training's multi_logloss: 0.612827	training's multi_logloss: 0.612827
[4]	training's multi_logloss: 0.549411	training's multi_logloss: 0.549411
[5]	training's multi_logloss: 0.486807	training's multi_logloss: 0.486807
[6]	training's multi_logloss: 0.438877	training's multi_logloss: 0.438877
[7]	training's multi_logloss: 0.401637	training's multi_logloss: 0.401637
[8]	training's multi_logloss: 0.365138	training's multi_logloss: 0.365138
[9]	training's multi_logloss: 0.337276	training's multi_logloss: 0.337276
[10]	training's multi_logloss: 0.308619	training's multi_logloss: 0.308619
[11]	training's multi_logloss: 0.290687	training's multi_logloss: 0.290687
[12]	training's multi_logloss: 0.273556	training's multi_logloss: 0.273556
[13]	training's multi_logloss: 0.252436	traini

In [193]:
train8_preds = model8.predict(X8_train)
val8_preds = model8.predict(X8_val)

In [194]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [195]:
get_clf_eval(y8_train, train8_preds)
get_clf_eval(y8_val, val8_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9379
재현율: 0.9250


In [196]:
preds_8= model8.predict(X8_test)
preds_8

array([1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3.,
       2., 3., 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 3., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 1., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [197]:
test_aug['classification'] = preds_8
test_aug

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-08-01,1,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,846,83.00,1.0,2022-08-01 00:00:00,2022,8,1,0
1,2022-08-01,1,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,571,87.00,1.0,2022-08-01 01:00:00,2022,8,1,1
2,2022-08-01,1,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,422,84.25,1.0,2022-08-01 02:00:00,2022,8,1,2
3,2022-08-01,1,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,513,75.58,1.0,2022-08-01 03:00:00,2022,8,1,3
4,2022-08-01,1,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,761,73.25,1.0,2022-08-01 04:00:00,2022,8,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2022-08-31,3,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3077,22.25,3.0,2022-08-31 19:00:00,2022,8,31,19
740,2022-08-31,3,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2971,39.08,2.0,2022-08-31 20:00:00,2022,8,31,20
741,2022-08-31,3,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2801,56.08,1.0,2022-08-31 21:00:00,2022,8,31,21
742,2022-08-31,3,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2348,61.42,1.0,2022-08-31 22:00:00,2022,8,31,22


# 9월 데이터 머신러닝

## 데이터 가공

In [198]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [199]:
X9 = train_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [200]:
y9 = train_sep[['datetime', 'classification']]
X9_1 = X9.drop(columns = ['datetime', 'classification'])
y9_1 = X9.datetime

In [201]:
X9_1_scaler = scaler.fit_transform(X9_1)

In [202]:
X9_1_sc = pd.DataFrame(X9_1_scaler)
X9_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9 = pd.concat([y9_1, X9_1_sc], axis = 1)
X9

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-09-01 00:00:00,1.0,0.306078,0.826561,0.0,0.0,0.0,0.000000
1,2019-09-01 01:00:00,1.0,0.177447,0.882222,0.0,0.0,0.0,0.043478
2,2019-09-01 02:00:00,1.0,0.132536,0.910052,0.0,0.0,0.0,0.086957
3,2019-09-01 03:00:00,1.0,0.098121,0.892352,0.0,0.0,0.0,0.130435
4,2019-09-01 04:00:00,1.0,0.118135,0.905377,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
2155,2021-09-30 19:00:00,0.5,0.767391,0.156741,1.0,0.0,1.0,0.826087
2156,2021-09-30 20:00:00,0.5,0.706371,0.715351,1.0,0.0,1.0,0.869565
2157,2021-09-30 21:00:00,0.5,0.693190,0.753200,1.0,0.0,1.0,0.913043
2158,2021-09-30 22:00:00,0.5,0.583598,0.811644,1.0,0.0,1.0,0.956522


In [203]:
X9_test = test_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [204]:
y9_test = test_sep[['datetime', 'classification']]
X9_1_test = X9_test.drop(columns = ['datetime', 'classification'])
y9_1_test = X9_test.datetime

In [205]:
X9_1_test_scaler = scaler.fit_transform(X9_1_test)

In [206]:
X9_1_test_sc = pd.DataFrame(X9_1_test_scaler)
X9_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9_test = pd.concat([y9_1_test, X9_1_test_sc], axis = 1)
X9_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-09-01 00:00:00,0.500000,0.217855,0.940684,0.0,0.0,0.0,0.000000
1,2022-09-01 01:00:00,0.500000,0.119990,0.967782,0.0,0.0,0.0,0.043478
2,2022-09-01 02:00:00,0.500000,0.083811,0.952173,0.0,0.0,0.0,0.086957
3,2022-09-01 03:00:00,0.500000,0.074440,0.943806,0.0,0.0,0.0,0.130435
4,2022-09-01 04:00:00,0.500000,0.174909,0.932443,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2022-09-30 19:00:00,0.666667,0.878969,0.055195,0.0,0.0,1.0,0.826087
716,2022-09-30 20:00:00,0.666667,0.773555,0.077048,0.0,0.0,1.0,0.869565
717,2022-09-30 21:00:00,0.666667,0.679334,0.324675,0.0,0.0,1.0,0.913043
718,2022-09-30 22:00:00,0.666667,0.657730,0.467283,0.0,0.0,1.0,0.956522


## LightGBM

In [207]:
X9 = X9.drop(columns = ['datetime'])
y9 = y9.drop(columns = ['datetime'])
X9_test = X9_test.drop(columns = ['datetime'])
y9_test = y9_test.drop(columns = ['datetime'])

In [208]:
study9 = optuna.create_study(direction='maximize',sampler=TPESampler())
study9.optimize(lambda trial : objectiveLGBM(trial, X9, y9), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study9.best_trial.value,study9.best_trial.params))

[32m[I 2023-01-12 08:08:02,194][0m A new study created in memory with name: no-name-b80c58b6-a895-425e-9abf-6fa5b091f7da[0m
[32m[I 2023-01-12 08:08:03,742][0m Trial 0 finished with value: 1.0 and parameters: {'num_leaves': 90, 'n_estimators': 857, 'feature_fraction': 0.8230791687144741, 'bagging_fraction': 0.7839596871332524, 'bagging_freq': 6, 'min_child_samples': 51}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-01-12 08:08:08,293][0m Trial 1 finished with value: 0.9706349206349206 and parameters: {'num_leaves': 334, 'n_estimators': 2874, 'feature_fraction': 0.4982266049343747, 'bagging_fraction': 0.8864167585559708, 'bagging_freq': 1, 'min_child_samples': 48}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-01-12 08:08:09,807][0m Trial 2 finished with value: 1.0 and parameters: {'num_leaves': 47, 'n_estimators': 746, 'feature_fraction': 0.5141435908914062, 'bagging_fraction': 0.7284499562426764, 'bagging_freq': 5, 'min_child_samples': 33}. Best is trial 0 with value: 1

Best trial: score 1.0,
params {'num_leaves': 90, 'n_estimators': 857, 'feature_fraction': 0.8230791687144741, 'bagging_fraction': 0.7839596871332524, 'bagging_freq': 6, 'min_child_samples': 51}


In [209]:
optuna.visualization.plot_param_importances(study9) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study9) # 최적화 과정 시각화

In [210]:
X9_train, X9_val, y9_train, y9_val = train_test_split(X9, y9, test_size = 0.2, random_state = 42)

In [211]:
X9_train.shape, X9_val.shape, y9_train.shape, y9_val.shape

((1728, 7), (432, 7), (1728, 1), (432, 1))

In [212]:
model = LGBMClassifier(**study9.best_trial.params)

In [213]:
model9 = model.fit(X9_train, y9_train,
          eval_set = [(X9_train, y9_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.757699	training's multi_logloss: 0.757699
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.670095	training's multi_logloss: 0.670095
[3]	training's multi_logloss: 0.585766	training's multi_logloss: 0.585766
[4]	training's multi_logloss: 0.515834	training's multi_logloss: 0.515834
[5]	training's multi_logloss: 0.45625	training's multi_logloss: 0.45625
[6]	training's multi_logloss: 0.413601	training's multi_logloss: 0.413601
[7]	training's multi_logloss: 0.372963	training's multi_logloss: 0.372963
[8]	training's multi_logloss: 0.343722	training's multi_logloss: 0.343722
[9]	training's multi_logloss: 0.313925	training's multi_logloss: 0.313925
[10]	training's multi_logloss: 0.284717	training's multi_logloss: 0.284717
[11]	training's multi_logloss: 0.255732	training's multi_logloss: 0.255732
[12]	training's multi_logloss: 0.233617	training's multi_logloss: 0.233617
[13]	training's multi_logloss: 0.213814	training

In [214]:
train9_preds = model9.predict(X9_train)
val9_preds = model9.predict(X9_val)

In [215]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [216]:
get_clf_eval(y9_train, train9_preds)
get_clf_eval(y9_val, val9_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9951
재현율: 0.9545


In [217]:
preds_9= model9.predict(X9_test)
preds_9

array([1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 1., 3., 3., 3., 3., 3., 3.,
       3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3.,
       3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 3., 3., 2., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 3., 3., 3.,
       3., 1., 1., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 3., 3.

In [218]:
test_sep['classification'] = preds_9
test_sep

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2022-09-01,4,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1114,87.08,1.0,2022-09-01 00:00:00,2022,9,1,0
1,2022-09-01,4,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,738,89.25,1.0,2022-09-01 01:00:00,2022,9,1,1
2,2022-09-01,4,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,599,88.00,1.0,2022-09-01 02:00:00,2022,9,1,2
3,2022-09-01,4,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,563,87.33,1.0,2022-09-01 03:00:00,2022,9,1,3
4,2022-09-01,4,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,949,86.42,1.0,2022-09-01 04:00:00,2022,9,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-09-30,5,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3654,16.17,3.0,2022-09-30 19:00:00,2022,9,30,19
716,2022-09-30,5,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3249,17.92,3.0,2022-09-30 20:00:00,2022,9,30,20
717,2022-09-30,5,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2887,37.75,3.0,2022-09-30 21:00:00,2022,9,30,21
718,2022-09-30,5,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2804,49.17,2.0,2022-09-30 22:00:00,2022,9,30,22


# 10월 데이터 머신러닝

## 데이터 가공

In [219]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [220]:
X10 = train_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [221]:
y10 = train_oct[['datetime', 'classification']]
X10_1 = X10.drop(columns = ['datetime', 'classification'])
y10_1 = X10.datetime

In [222]:
X10_1_scaler = scaler.fit_transform(X10_1)

In [223]:
X10_1_sc = pd.DataFrame(X10_1_scaler)
X10_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10 = pd.concat([y10_1, X10_1_sc], axis = 1)
X10

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-10-01 00:00:00,0.166667,0.238464,0.795866,0.0,0.0,0.0,0.000000
1,2019-10-01 01:00:00,0.166667,0.138438,0.822532,0.0,0.0,0.0,0.043478
2,2019-10-01 02:00:00,0.166667,0.084042,0.816537,0.0,0.0,0.0,0.086957
3,2019-10-01 03:00:00,0.166667,0.075277,0.812196,0.0,0.0,0.0,0.130435
4,2019-10-01 04:00:00,0.166667,0.159577,0.796693,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-10-31 19:00:00,0.833333,0.673112,0.335090,1.0,0.0,1.0,0.826087
1484,2020-10-31 20:00:00,0.833333,0.747358,0.344496,1.0,0.0,1.0,0.869565
1485,2020-10-31 21:00:00,0.833333,0.691415,0.597726,1.0,0.0,1.0,0.913043
1486,2020-10-31 22:00:00,0.833333,0.626966,0.745943,1.0,0.0,1.0,0.956522


In [224]:
X10_test = test_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [225]:
y10_test = test_oct[['datetime', 'classification']]
X10_1_test = X10_test.drop(columns = ['datetime', 'classification'])
y10_1_test = X10_test.datetime

In [226]:
X10_1_test_scaler = scaler.fit_transform(X10_1_test)

In [227]:
X10_1_test_sc = pd.DataFrame(X10_1_test_scaler)
X10_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10_test = pd.concat([y10_1_test, X10_1_test_sc], axis = 1)
X10_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-10-01 00:00:00,0.666667,0.183454,0.953728,0.0,0.0,0.0,0.000000
1,2021-10-01 01:00:00,0.666667,0.103143,0.975385,0.0,0.0,0.0,0.043478
2,2021-10-01 02:00:00,0.666667,0.074940,0.962604,0.0,0.0,0.0,0.086957
3,2021-10-01 03:00:00,0.666667,0.077088,0.973373,0.0,0.0,0.0,0.130435
4,2021-10-01 04:00:00,0.666667,0.182917,0.956686,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-10-31 19:00:00,1.000000,0.784045,0.149941,0.0,0.0,1.0,0.826087
740,2021-10-31 20:00:00,1.000000,0.741606,0.081893,0.0,0.0,1.0,0.869565
741,2021-10-31 21:00:00,1.000000,0.760408,0.164734,0.0,0.0,1.0,0.913043
742,2021-10-31 22:00:00,1.000000,0.676873,0.597633,0.0,0.0,1.0,0.956522


## LightGBM

In [228]:
X10 = X10.drop(columns = ['datetime'])
y10 = y10.drop(columns = ['datetime'])
X10_test = X10_test.drop(columns = ['datetime'])
y10_test = y10_test.drop(columns = ['datetime'])

In [229]:
study10 = optuna.create_study(direction='maximize',sampler=TPESampler())
study10.optimize(lambda trial : objectiveLGBM(trial, X10, y10), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study10.best_trial.value,study10.best_trial.params))

[32m[I 2023-01-12 08:08:58,839][0m A new study created in memory with name: no-name-3a1e3f54-a283-4541-a55f-a0aa05e2e303[0m
[32m[I 2023-01-12 08:09:01,250][0m Trial 0 finished with value: 0.9975845410628019 and parameters: {'num_leaves': 146, 'n_estimators': 2793, 'feature_fraction': 0.7589794121063239, 'bagging_fraction': 0.9369550447164671, 'bagging_freq': 7, 'min_child_samples': 66}. Best is trial 0 with value: 0.9975845410628019.[0m
[32m[I 2023-01-12 08:09:03,918][0m Trial 1 finished with value: 0.9782442748091603 and parameters: {'num_leaves': 321, 'n_estimators': 2881, 'feature_fraction': 0.8859907915407369, 'bagging_fraction': 0.9094732289421061, 'bagging_freq': 2, 'min_child_samples': 41}. Best is trial 0 with value: 0.9975845410628019.[0m
[32m[I 2023-01-12 08:09:05,512][0m Trial 2 finished with value: 0.9775899603022152 and parameters: {'num_leaves': 276, 'n_estimators': 1063, 'feature_fraction': 0.4507578611157707, 'bagging_fraction': 0.820965330683018, 'bagging_fr

Best trial: score 1.0,
params {'num_leaves': 95, 'n_estimators': 1757, 'feature_fraction': 0.7567339804005654, 'bagging_fraction': 0.5334042467253567, 'bagging_freq': 4, 'min_child_samples': 55}


In [230]:
optuna.visualization.plot_param_importances(study10) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study10) # 최적화 과정 시각화

In [231]:
X10_train, X10_val, y10_train, y10_val = train_test_split(X10, y10, test_size = 0.2, random_state = 42)

In [232]:
X10_train.shape, X10_val.shape, y10_train.shape, y10_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [233]:
model = LGBMClassifier(**study10.best_trial.params)

In [234]:
model10 = model.fit(X10_train, y10_train,
          eval_set = [(X10_train, y10_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.772367	training's multi_logloss: 0.772367
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.686605	training's multi_logloss: 0.686605
[3]	training's multi_logloss: 0.604003	training's multi_logloss: 0.604003
[4]	training's multi_logloss: 0.533952	training's multi_logloss: 0.533952
[5]	training's multi_logloss: 0.47352	training's multi_logloss: 0.47352
[6]	training's multi_logloss: 0.4304	training's multi_logloss: 0.4304
[7]	training's multi_logloss: 0.389242	training's multi_logloss: 0.389242
[8]	training's multi_logloss: 0.360076	training's multi_logloss: 0.360076
[9]	training's multi_logloss: 0.330012	training's multi_logloss: 0.330012
[10]	training's multi_logloss: 0.300521	training's multi_logloss: 0.300521
[11]	training's multi_logloss: 0.270444	training's multi_logloss: 0.270444
[12]	training's multi_logloss: 0.247491	training's multi_logloss: 0.247491
[13]	training's multi_logloss: 0.227195	training's m

In [235]:
train10_preds = model10.predict(X10_train)
val10_preds = model10.predict(X10_val)

In [236]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [237]:
get_clf_eval(y10_train, train10_preds)
get_clf_eval(y10_val, val10_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [238]:
preds_10= model10.predict(X10_test)
preds_10

array([1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 3., 2., 3., 3., 3., 3., 3., 3., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 2.,
       2., 2., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [239]:
test_oct['classification'] = preds_10
test_oct

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,945,91.67,1.0,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,646,93.50,1.0,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,541,92.42,1.0,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,549,93.33,1.0,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,943,91.92,1.0,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-10-31,7,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3181,23.75,3.0,2021-10-31 19:00:00,2021,10,31,19
740,2021-10-31,7,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3023,18.00,3.0,2021-10-31 20:00:00,2021,10,31,20
741,2021-10-31,7,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3093,25.00,3.0,2021-10-31 21:00:00,2021,10,31,21
742,2021-10-31,7,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2782,61.58,1.0,2021-10-31 22:00:00,2021,10,31,22


# 11월 데이터 머신러닝

## 데이터 가공

In [240]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [241]:
X11 = train_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [242]:
y11 = train_nov[['datetime', 'classification']]
X11_1 = X11.drop(columns = ['datetime', 'classification'])
y11_1 = X11.datetime

In [243]:
X11_1_scaler = scaler.fit_transform(X11_1)

In [244]:
X11_1_sc = pd.DataFrame(X11_1_scaler)
X11_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11 = pd.concat([y11_1, X11_1_sc], axis = 1)
X11

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-11-01 00:00:00,0.666667,0.297395,0.795599,0.0,0.0,0.0,0.000000
1,2019-11-01 01:00:00,0.666667,0.151712,0.835778,0.0,0.0,0.0,0.043478
2,2019-11-01 02:00:00,0.666667,0.096237,0.843989,0.0,0.0,0.0,0.086957
3,2019-11-01 03:00:00,0.666667,0.068741,0.844865,0.0,0.0,0.0,0.130435
4,2019-11-01 04:00:00,0.666667,0.138447,0.806547,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1435,2020-11-30 19:00:00,0.000000,0.611433,0.560215,1.0,0.0,1.0,0.826087
1436,2020-11-30 20:00:00,0.000000,0.637723,0.778301,1.0,0.0,1.0,0.869565
1437,2020-11-30 21:00:00,0.000000,0.661602,0.811145,1.0,0.0,1.0,0.913043
1438,2020-11-30 22:00:00,0.000000,0.411963,0.852201,1.0,0.0,1.0,0.956522


In [245]:
X11_test = test_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [246]:
y11_test = test_nov[['datetime', 'classification']]
X11_1_test = X11_test.drop(columns = ['datetime', 'classification'])
y11_1_test = X11_test.datetime

In [247]:
X11_1_test_scaler = scaler.fit_transform(X11_1_test)

In [248]:
X11_1_test_sc = pd.DataFrame(X11_1_test_scaler)
X11_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11_test = pd.concat([y11_1_test, X11_1_test_sc], axis = 1)
X11_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-11-01 00:00:00,0.000000,0.148232,0.851606,0.0,0.0,0.0,0.000000
1,2021-11-01 01:00:00,0.000000,0.050071,0.855447,0.0,0.0,0.0,0.043478
2,2021-11-01 02:00:00,0.000000,0.013013,0.876862,0.0,0.0,0.0,0.086957
3,2021-11-01 03:00:00,0.000000,0.036492,0.840899,0.0,0.0,0.0,0.130435
4,2021-11-01 04:00:00,0.000000,0.177935,0.841946,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2021-11-30 19:00:00,0.166667,0.807921,0.120345,0.0,0.0,1.0,0.826087
716,2021-11-30 20:00:00,0.166667,0.728996,0.504423,0.0,0.0,1.0,0.869565
717,2021-11-30 21:00:00,0.166667,0.664498,0.770135,0.0,0.0,1.0,0.913043
718,2021-11-30 22:00:00,0.166667,0.459406,0.802142,0.0,0.0,1.0,0.956522


## LightGBM

In [249]:
X11 = X11.drop(columns = ['datetime'])
y11 = y11.drop(columns = ['datetime'])
X11_test = X11_test.drop(columns = ['datetime'])
y11_test = y11_test.drop(columns = ['datetime'])

In [250]:
study11 = optuna.create_study(direction='maximize',sampler=TPESampler())
study11.optimize(lambda trial : objectiveLGBM(trial, X11, y11), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study11.best_trial.value,study11.best_trial.params))

[32m[I 2023-01-12 08:09:41,439][0m A new study created in memory with name: no-name-47660d9f-2e47-449d-a09a-ba6b715ed0c6[0m
[32m[I 2023-01-12 08:09:43,471][0m Trial 0 finished with value: 0.9833333333333334 and parameters: {'num_leaves': 186, 'n_estimators': 2643, 'feature_fraction': 0.7735990607384096, 'bagging_fraction': 0.4734782527664506, 'bagging_freq': 1, 'min_child_samples': 56}. Best is trial 0 with value: 0.9833333333333334.[0m
[32m[I 2023-01-12 08:09:44,682][0m Trial 1 finished with value: 0.9555555555555556 and parameters: {'num_leaves': 219, 'n_estimators': 1730, 'feature_fraction': 0.5602450555862877, 'bagging_fraction': 0.5270688195601956, 'bagging_freq': 2, 'min_child_samples': 87}. Best is trial 0 with value: 0.9833333333333334.[0m
[32m[I 2023-01-12 08:09:46,249][0m Trial 2 finished with value: 1.0 and parameters: {'num_leaves': 416, 'n_estimators': 1762, 'feature_fraction': 0.7392881542712737, 'bagging_fraction': 0.6100984147422484, 'bagging_freq': 2, 'min_c

Best trial: score 1.0,
params {'num_leaves': 416, 'n_estimators': 1762, 'feature_fraction': 0.7392881542712737, 'bagging_fraction': 0.6100984147422484, 'bagging_freq': 2, 'min_child_samples': 71}


In [251]:
optuna.visualization.plot_param_importances(study11) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study11) # 최적화 과정 시각화

In [252]:
X11_train, X11_val, y11_train, y11_val = train_test_split(X11, y11, test_size = 0.2, random_state = 42)

In [253]:
X11_train.shape, X11_val.shape, y11_train.shape, y11_val.shape

((1152, 7), (288, 7), (1152, 1), (288, 1))

In [254]:
model = LGBMClassifier(**study11.best_trial.params)

In [255]:
model11 = model.fit(X11_train, y11_train,
          eval_set = [(X11_train, y11_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.759053	training's multi_logloss: 0.759053
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.676336	training's multi_logloss: 0.676336
[3]	training's multi_logloss: 0.594214	training's multi_logloss: 0.594214
[4]	training's multi_logloss: 0.525984	training's multi_logloss: 0.525984
[5]	training's multi_logloss: 0.465926	training's multi_logloss: 0.465926
[6]	training's multi_logloss: 0.423692	training's multi_logloss: 0.423692
[7]	training's multi_logloss: 0.383109	training's multi_logloss: 0.383109
[8]	training's multi_logloss: 0.354268	training's multi_logloss: 0.354268
[9]	training's multi_logloss: 0.324718	training's multi_logloss: 0.324718
[10]	training's multi_logloss: 0.295273	training's multi_logloss: 0.295273
[11]	training's multi_logloss: 0.266649	training's multi_logloss: 0.266649
[12]	training's multi_logloss: 0.243515	training's multi_logloss: 0.243515
[13]	training's multi_logloss: 0.223263	traini

In [256]:
train11_preds = model11.predict(X11_train)
val11_preds = model11.predict(X11_val)

In [257]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [258]:
get_clf_eval(y11_train, train11_preds)
get_clf_eval(y11_val, val11_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9688
재현율: 0.9350


In [259]:
preds_11= model11.predict(X11_test)
preds_11

array([1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 2., 3., 3., 3.,
       3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3.,
       3., 3., 3., 2., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 1., 1., 3., 1., 1., 1., 1., 1., 1., 1.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 2., 3.,
       3., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 3., 3., 3., 3.

In [260]:
test_nov['classification'] = preds_11
test_nov

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-11-01,1,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,914,82.00,1.0,2021-11-01 00:00:00,2021,11,1,0
1,2021-11-01,1,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,567,82.33,1.0,2021-11-01 01:00:00,2021,11,1,1
2,2021-11-01,1,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,436,84.17,1.0,2021-11-01 02:00:00,2021,11,1,2
3,2021-11-01,1,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,519,81.08,1.0,2021-11-01 03:00:00,2021,11,1,3
4,2021-11-01,1,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,1019,81.17,1.0,2021-11-01 04:00:00,2021,11,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2021-11-30,2,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3246,19.17,3.0,2021-11-30 19:00:00,2021,11,30,19
716,2021-11-30,2,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2967,52.17,1.0,2021-11-30 20:00:00,2021,11,30,20
717,2021-11-30,2,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2739,75.00,1.0,2021-11-30 21:00:00,2021,11,30,21
718,2021-11-30,2,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2014,77.75,1.0,2021-11-30 22:00:00,2021,11,30,22


# 12월 데이터 머신러닝

## 데이터 가공

In [261]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [262]:
X12 = train_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [263]:
y12 = train_dec[['datetime', 'classification']]
X12_1 = X12.drop(columns = ['datetime', 'classification'])
y12_1 = X12.datetime

In [264]:
X12_1_scaler = scaler.fit_transform(X12_1)

In [265]:
X12_1_sc = pd.DataFrame(X12_1_scaler)
X12_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12 = pd.concat([y12_1, X12_1_sc], axis = 1)
X12

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-12-01 00:00:00,1.0,0.265801,0.937621,0.0,0.0,0.0,0.000000
1,2019-12-01 01:00:00,1.0,0.172621,0.936713,0.0,0.0,0.0,0.043478
2,2019-12-01 02:00:00,1.0,0.123407,0.958376,0.0,0.0,0.0,0.086957
3,2019-12-01 03:00:00,1.0,0.067949,0.954633,0.0,0.0,0.0,0.130435
4,2019-12-01 04:00:00,1.0,0.060954,0.959397,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-12-31 19:00:00,0.5,0.695229,0.562436,1.0,0.0,1.0,0.826087
1484,2020-12-31 20:00:00,0.5,0.699226,0.632301,1.0,0.0,1.0,0.869565
1485,2020-12-31 21:00:00,0.5,0.630277,0.734377,1.0,0.0,1.0,0.913043
1486,2020-12-31 22:00:00,0.5,0.427679,0.806283,1.0,0.0,1.0,0.956522


In [266]:
X12_test = test_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [267]:
y12_test = test_dec[['datetime', 'classification']]
X12_1_test = X12_test.drop(columns = ['datetime', 'classification'])
y12_1_test = X12_test.datetime

In [268]:
X12_1_test_scaler = scaler.fit_transform(X12_1_test)

In [269]:
X12_1_test_sc = pd.DataFrame(X12_1_test_scaler)
X12_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12_test = pd.concat([y12_1_test, X12_1_test_sc], axis = 1)
X12_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-12-01 00:00:00,0.333333,0.205834,0.878374,0.0,0.0,0.0,0.000000
1,2021-12-01 01:00:00,0.333333,0.127045,0.851384,0.0,0.0,0.0,0.043478
2,2021-12-01 02:00:00,0.333333,0.081788,0.911271,0.0,0.0,0.0,0.086957
3,2021-12-01 03:00:00,0.333333,0.094602,0.927603,0.0,0.0,0.0,0.130435
4,2021-12-01 04:00:00,0.333333,0.162486,0.914167,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-12-31 19:00:00,0.666667,0.711287,0.610101,0.0,0.0,1.0,0.826087
740,2021-12-31 20:00:00,0.666667,0.747274,0.681455,0.0,0.0,1.0,0.869565
741,2021-12-31 21:00:00,0.666667,0.750000,0.785706,0.0,0.0,1.0,0.913043
742,2021-12-31 22:00:00,0.666667,0.535169,0.859145,0.0,0.0,1.0,0.956522


## LightGBM

In [270]:
X12 = X12.drop(columns = ['datetime'])
y12 = y12.drop(columns = ['datetime'])
X12_test = X12_test.drop(columns = ['datetime'])
y12_test = y12_test.drop(columns = ['datetime'])

In [271]:
study12 = optuna.create_study(direction='maximize',sampler=TPESampler())
study12.optimize(lambda trial : objectiveLGBM(trial, X12, y12), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study12.best_trial.value,study12.best_trial.params))

[32m[I 2023-01-12 08:10:19,947][0m A new study created in memory with name: no-name-10f729c1-f909-4b75-9d5f-4d1ecd03e175[0m
[32m[I 2023-01-12 08:10:20,712][0m Trial 0 finished with value: 1.0 and parameters: {'num_leaves': 462, 'n_estimators': 1144, 'feature_fraction': 0.5695638076575624, 'bagging_fraction': 0.4347312746504447, 'bagging_freq': 4, 'min_child_samples': 87}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-01-12 08:10:23,331][0m Trial 1 finished with value: 0.9971988795518207 and parameters: {'num_leaves': 318, 'n_estimators': 2618, 'feature_fraction': 0.8185990042591605, 'bagging_fraction': 0.9880052290591286, 'bagging_freq': 2, 'min_child_samples': 78}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-01-12 08:10:25,312][0m Trial 2 finished with value: 1.0 and parameters: {'num_leaves': 202, 'n_estimators': 2513, 'feature_fraction': 0.9234059951036638, 'bagging_fraction': 0.8083878425609838, 'bagging_freq': 1, 'min_child_samples': 89}. Best is trial 0 with valu

Best trial: score 1.0,
params {'num_leaves': 462, 'n_estimators': 1144, 'feature_fraction': 0.5695638076575624, 'bagging_fraction': 0.4347312746504447, 'bagging_freq': 4, 'min_child_samples': 87}


In [272]:
optuna.visualization.plot_param_importances(study12) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study12) # 최적화 과정 시각화

In [273]:
X12_train, X12_val, y12_train, y12_val = train_test_split(X12, y12, test_size = 0.2, random_state = 42)

In [274]:
X12_train.shape, X12_val.shape, y12_train.shape, y12_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [275]:
model = LGBMClassifier(**study12.best_trial.params)

In [276]:
model12 = model.fit(X12_train, y12_train,
          eval_set = [(X12_train, y12_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.854728	training's multi_logloss: 0.854728
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.752488	training's multi_logloss: 0.752488
[3]	training's multi_logloss: 0.682163	training's multi_logloss: 0.682163
[4]	training's multi_logloss: 0.619615	training's multi_logloss: 0.619615
[5]	training's multi_logloss: 0.555625	training's multi_logloss: 0.555625
[6]	training's multi_logloss: 0.507154	training's multi_logloss: 0.507154
[7]	training's multi_logloss: 0.467327	training's multi_logloss: 0.467327
[8]	training's multi_logloss: 0.431016	training's multi_logloss: 0.431016
[9]	training's multi_logloss: 0.402914	training's multi_logloss: 0.402914
[10]	training's multi_logloss: 0.373154	training's multi_logloss: 0.373154
[11]	training's multi_logloss: 0.35262	training's multi_logloss: 0.35262
[12]	training's multi_logloss: 0.334577	training's multi_logloss: 0.334577
[13]	training's multi_logloss: 0.311363	training

In [277]:
train12_preds = model12.predict(X12_train)
val12_preds = model12.predict(X12_val)

In [278]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [279]:
get_clf_eval(y12_train, train12_preds)
get_clf_eval(y12_val, val12_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [280]:
preds_12= model12.predict(X12_test)
preds_12

array([1., 1., 1., 1., 1., 1., 1., 3., 3., 2., 1., 3., 3., 3., 3., 3., 3.,
       3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3.,
       3., 2., 2., 1., 2., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 3., 3., 3., 3., 1., 2., 2., 1., 3., 3., 3., 3., 3., 3.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 3., 2., 3., 3., 3., 3., 3., 2., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 3.,
       3., 3., 3., 1., 2., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 3., 3., 3., 3., 3., 3., 2., 3., 3., 3., 3., 3.,
       3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3., 3., 3., 3., 3.,
       3., 2., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 3., 3., 3.

In [281]:
test_dec['classification'] = preds_12
test_dec

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-12-01,3,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,959,81.58,1.0,2021-12-01 00:00:00,2021,12,1,0
1,2021-12-01,3,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,670,79.25,1.0,2021-12-01 01:00:00,2021,12,1,1
2,2021-12-01,3,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,504,84.42,1.0,2021-12-01 02:00:00,2021,12,1,2
3,2021-12-01,3,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,551,85.83,1.0,2021-12-01 03:00:00,2021,12,1,3
4,2021-12-01,3,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,800,84.67,1.0,2021-12-01 04:00:00,2021,12,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-12-31,5,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2813,58.42,1.0,2021-12-31 19:00:00,2021,12,31,19
740,2021-12-31,5,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2945,64.58,1.0,2021-12-31 20:00:00,2021,12,31,20
741,2021-12-31,5,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2955,73.58,1.0,2021-12-31 21:00:00,2021,12,31,21
742,2021-12-31,5,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2167,79.92,1.0,2021-12-31 22:00:00,2021,12,31,22


# 월별 데이터 합치기

In [282]:
result = pd.concat([test_jan,
                    test_feb,
                    test_mar,
                    test_apr,
                    test_may,
                    test_jun,
                    test_jul,
                    test_aug,
                    test_sep,
                    test_oct,
                    test_nov,
                    test_dec])
result = result.sort_values(by = 'datetime')
result = result.reset_index(drop = True)
result

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,945,91.67,1.0,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,646,93.50,1.0,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,541,92.42,1.0,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,549,93.33,1.0,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,943,91.92,1.0,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2022-09-30,5,19:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3654,16.17,3.0,2022-09-30 19:00:00,2022,9,30,19
8756,2022-09-30,5,20:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,3249,17.92,3.0,2022-09-30 20:00:00,2022,9,30,20
8757,2022-09-30,5,21:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2887,37.75,3.0,2022-09-30 21:00:00,2022,9,30,21
8758,2022-09-30,5,22:00:00,청담대교,광진구,C-17,청담대교북단,청담대교남단,3,1575,2804,49.17,2.0,2022-09-30 22:00:00,2022,9,30,22


In [283]:
result = result.drop(columns = ['dow', 
                                'district_name', 
                                'branch_num', 
                                'arr_point', 
                                'lane', 
                                'distance', 
                                'volume', 
                                'speed', 
                                'datetime', 
                                'year', 
                                'month', 
                                'day', 
                                'hour'])
result

Unnamed: 0,date,time,branch_name,dep_point,classification
0,2021-10-01,0:00:00,청담대교,청담대교북단,1.0
1,2021-10-01,1:00:00,청담대교,청담대교북단,1.0
2,2021-10-01,2:00:00,청담대교,청담대교북단,1.0
3,2021-10-01,3:00:00,청담대교,청담대교북단,1.0
4,2021-10-01,4:00:00,청담대교,청담대교북단,1.0
...,...,...,...,...,...
8755,2022-09-30,19:00:00,청담대교,청담대교북단,3.0
8756,2022-09-30,20:00:00,청담대교,청담대교북단,3.0
8757,2022-09-30,21:00:00,청담대교,청담대교북단,3.0
8758,2022-09-30,22:00:00,청담대교,청담대교북단,2.0


# csv 파일 만들기

In [284]:
result.to_csv('chungdam_depnorth_result.csv', index = False)