# 라이브러리

In [2]:
import sys
print(sys.version)

3.8.15 (default, Oct 12 2022, 19:14:39) 
[GCC 7.5.0]


In [3]:
import pandas as pd
import random
import os
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

In [4]:
# 한글 폰트 깨짐 현상 해결을 위한 나눔 폰트 설치
# 코드 1회 실행 후 주석 처리하고 런타임 재시작 및 모두 실행
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

## 데이터 로드

In [6]:
# 경로 설정
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# csv 파일 읽어오기
# root = '/content/drive/MyDrive/최종프로젝트/교통/분석/2nd_modified_data/'
root = '/content/drive/MyDrive/Project/'
C17_depsouth = pd.read_csv(root + 'Data_chungdam_depsouth.csv', encoding='cp949')
C17_depsouth_test = pd.read_csv(root + 'chungdam_depsouth_test.csv', encoding='cp949')

In [8]:
# 데이터 확인
print(C17_depsouth.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24096 entries, 0 to 24095
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            24096 non-null  object 
 1   dow             24096 non-null  int64  
 2   time            24096 non-null  object 
 3   branch_name     24096 non-null  object 
 4   district_name   24096 non-null  object 
 5   branch_num      24096 non-null  object 
 6   dep_point       24096 non-null  object 
 7   arr_point       24096 non-null  object 
 8   lane            24096 non-null  int64  
 9   distance        24096 non-null  int64  
 10  volume          24096 non-null  int64  
 11  speed           24096 non-null  float64
 12  classification  0 non-null      float64
dtypes: float64(2), int64(4), object(7)
memory usage: 2.4+ MB
None


In [9]:
# 결측치 확인
print(C17_depsouth.isnull().sum())

date                  0
dow                   0
time                  0
branch_name           0
district_name         0
branch_num            0
dep_point             0
arr_point             0
lane                  0
distance              0
volume                0
speed                 0
classification    24096
dtype: int64


In [10]:
# date 컬럼과 time 컬럼을 합쳐 datetime이라는 컬럼 만들기
C17_depsouth['datetime'] = C17_depsouth['date'] + ' ' + C17_depsouth['time']
C17_depsouth_test['datetime'] = C17_depsouth_test['date'] + ' ' + C17_depsouth_test['time']

In [11]:
# date 컬럼과 time 컬럼 제거
# C4_depsouth = C4_depsouth.drop(C4_depsouth[['date', 'time']], axis=1)

In [12]:
# datetime 문자형 컬럼을 datetime 자료형으로 변환
C17_depsouth['datetime'] = pd.to_datetime(C17_depsouth['datetime'])
C17_depsouth_test['datetime'] = pd.to_datetime(C17_depsouth_test['datetime'])

# classification 컬럼값 변경

In [13]:
C17_depsouth.describe()

Unnamed: 0,dow,lane,distance,volume,speed,classification
count,24096.0,24096.0,24096.0,24096.0,24096.0,0.0
mean,3.997012,3.0,1138.0,2548.210201,75.394719,
std,1.998295,0.0,0.0,1154.699366,17.156033,
min,1.0,3.0,1138.0,130.0,8.58,
25%,2.0,3.0,1138.0,1384.75,70.0,
50%,4.0,3.0,1138.0,3129.0,77.08,
75%,6.0,3.0,1138.0,3439.0,88.58,
max,7.0,3.0,1138.0,5106.0,105.25,


In [14]:
C17_depsouth.loc[C17_depsouth['speed'] >= C17_depsouth['speed'].mean(), 'classification'] = 1
C17_depsouth.loc[C17_depsouth['speed'] < 20, 'classification'] = 3
C17_depsouth.loc[(C17_depsouth['speed'] >= 20) 
                      & (C17_depsouth['speed'] < 40) 
                      & ((C17_depsouth['volume'] >= C17_depsouth['volume'].mean())), 'classification'] = 3
C17_depsouth.loc[(C17_depsouth['speed'] >= 20) 
                      & (C17_depsouth['speed'] < C17_depsouth['speed'].mean()) 
                      & ((C17_depsouth['volume'] < C17_depsouth['volume'].mean())), 'classification'] = 2
C17_depsouth.loc[(C17_depsouth['speed'] >= 40) 
                      & (C17_depsouth['speed'] < C17_depsouth['speed'].mean()) 
                      & ((C17_depsouth['volume'] >= C17_depsouth['volume'].mean())), 'classification'] = 2

In [15]:
C17_depsouth['classification']

0        2.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
24091    3.0
24092    2.0
24093    2.0
24094    1.0
24095    1.0
Name: classification, Length: 24096, dtype: float64

In [16]:
C17_depsouth_test.loc[C17_depsouth_test['speed'] >= C17_depsouth_test['speed'].mean(), 'classification_act'] = 1
C17_depsouth_test.loc[C17_depsouth_test['speed'] < 20, 'classification_act'] = 3
C17_depsouth_test.loc[(C17_depsouth_test['speed'] >= 20) 
                      & (C17_depsouth_test['speed'] < 40) 
                      & ((C17_depsouth_test['volume'] >= C17_depsouth_test['volume'].mean())), 'classification_act'] = 3
C17_depsouth_test.loc[(C17_depsouth_test['speed'] >= 20) 
                      & (C17_depsouth_test['speed'] < C17_depsouth_test['speed'].mean()) 
                      & ((C17_depsouth_test['volume'] < C17_depsouth_test['volume'].mean())), 'classification_act'] = 2
C17_depsouth_test.loc[(C17_depsouth_test['speed'] >= 40) 
                      & (C17_depsouth_test['speed'] < C17_depsouth_test['speed'].mean()) 
                      & ((C17_depsouth_test['volume'] >= C17_depsouth_test['volume'].mean())), 'classification_act'] = 2

In [17]:
C17_depsouth_test.to_csv('C17_depsouth_test.csv', index=False)

In [18]:
C17_depsouth['year'] = C17_depsouth['datetime'].dt.year
C17_depsouth['month'] = C17_depsouth['datetime'].dt.month
C17_depsouth['day'] = C17_depsouth['datetime'].dt.day
C17_depsouth['hour'] = C17_depsouth['datetime'].dt.hour

In [19]:
C17_depsouth

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2019-01-01,2,0:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,1550,72.86,2.0,2019-01-01 00:00:00,2019,1,1,0
1,2019-01-01,2,1:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,1470,83.42,1.0,2019-01-01 01:00:00,2019,1,1,1
2,2019-01-01,2,2:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,852,94.92,1.0,2019-01-01 02:00:00,2019,1,1,2
3,2019-01-01,2,3:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,535,97.33,1.0,2019-01-01 03:00:00,2019,1,1,3
4,2019-01-01,2,4:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,488,95.00,1.0,2019-01-01 04:00:00,2019,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24091,2021-09-30,4,19:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3102,17.40,3.0,2021-09-30 19:00:00,2021,9,30,19
24092,2021-09-30,4,20:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3985,54.55,2.0,2021-09-30 20:00:00,2021,9,30,20
24093,2021-09-30,4,21:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3402,69.50,2.0,2021-09-30 21:00:00,2021,9,30,21
24094,2021-09-30,4,22:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,2784,75.92,1.0,2021-09-30 22:00:00,2021,9,30,22


# 월별로 데이터 나누기

In [20]:
C17_ds_month = C17_depsouth['month']
C17_ds_month_list  = sorted(set(C17_ds_month))
C17_ds_month_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [21]:
month_data = []
for i in range(0, len(C17_ds_month_list)):
  month = C17_depsouth[C17_depsouth['month'] == C17_ds_month_list[i]]
  month = month.reset_index(drop=True)
  month_data.append(month)

In [22]:
train_jan = month_data[0]
train_feb = month_data[1]
train_mar = month_data[2]
train_apr = month_data[3]
train_may = month_data[4]
train_jun = month_data[5]
train_jul = month_data[6]
train_aug = month_data[7]
train_sep = month_data[8]
train_oct = month_data[9]
train_nov = month_data[10]
train_dec = month_data[11]

In [23]:
C17_depsouth_test['year'] = C17_depsouth_test['datetime'].dt.year
C17_depsouth_test['month'] = C17_depsouth_test['datetime'].dt.month
C17_depsouth_test['day'] = C17_depsouth_test['datetime'].dt.day
C17_depsouth_test['hour'] = C17_depsouth_test['datetime'].dt.hour

In [24]:
C17_ds_test_mon = C17_depsouth_test['month']
C17_ds_test_mon_list  = sorted(set(C17_ds_test_mon))
C17_ds_test_mon_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [25]:
month_test_data = []
for i in range(0, len(C17_ds_month_list)):
  month = C17_depsouth_test[C17_depsouth_test['month'] == C17_ds_test_mon_list[i]]
  month = month.reset_index(drop=True)
  month_test_data.append(month)

In [26]:
test_jan = month_test_data[0]
test_feb = month_test_data[1]
test_mar = month_test_data[2]
test_apr = month_test_data[3]
test_may = month_test_data[4]
test_jun = month_test_data[5]
test_jul = month_test_data[6]
test_aug = month_test_data[7]
test_sep = month_test_data[8]
test_oct = month_test_data[9]
test_nov = month_test_data[10]
test_dec = month_test_data[11]

In [27]:
test_dec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date                744 non-null    object        
 1   dow                 744 non-null    int64         
 2   time                744 non-null    object        
 3   branch_name         744 non-null    object        
 4   district_name       744 non-null    object        
 5   branch_num          744 non-null    object        
 6   dep_point           744 non-null    object        
 7   arr_point           744 non-null    object        
 8   lane                744 non-null    int64         
 9   distance            744 non-null    int64         
 10  volume              744 non-null    int64         
 11  speed               744 non-null    float64       
 12  classification      0 non-null      float64       
 13  datetime            744 non-null    datetime64[ns]

# 1월 데이터 머신러닝

## 데이터 가공

In [74]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [75]:
X1 = train_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [76]:
y1 = train_jan[['datetime', 'classification']]
X1_1 = X1.drop(columns = ['datetime', 'classification'])
y1_1 = X1.datetime

In [77]:
X1_1_scaler = scaler.fit_transform(X1_1)

In [78]:
X1_1_sc = pd.DataFrame(X1_1_scaler)
X1_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1 = pd.concat([y1_1, X1_1_sc], axis = 1)

In [79]:
X1_test = test_jan.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance',
                               'classification_act'])

In [80]:
y1_test = test_jan[['datetime', 'classification_act']]
X1_1_test = X1_test.drop(columns = ['datetime', 'classification'])
y1_1_test = X1_test.datetime

In [81]:
X1_1_test_scaler = scaler.fit_transform(X1_1_test)

In [82]:
X1_1_test_sc = pd.DataFrame(X1_1_test_scaler)
X1_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X1_test = pd.concat([y1_1_test, X1_1_test_sc], axis = 1)
X1_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2022-01-01 00:00:00,0.833333,0.218144,0.924651,0.0,0.0,0.0,0.000000
1,2022-01-01 01:00:00,0.833333,0.224436,0.916944,0.0,0.0,0.0,0.043478
2,2022-01-01 02:00:00,0.833333,0.080493,0.972359,0.0,0.0,0.0,0.086957
3,2022-01-01 03:00:00,0.833333,0.045884,0.945781,0.0,0.0,0.0,0.130435
4,2022-01-01 04:00:00,0.833333,0.046670,0.943522,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2022-01-31 19:00:00,0.000000,0.556109,0.785116,0.0,0.0,1.0,0.826087
740,2022-01-31 20:00:00,0.000000,0.601730,0.795083,0.0,0.0,1.0,0.869565
741,2022-01-31 21:00:00,0.000000,0.509439,0.789635,0.0,0.0,1.0,0.913043
742,2022-01-31 22:00:00,0.000000,0.310960,0.779668,0.0,0.0,1.0,0.956522


## LightGBM

In [1]:
# optuna 설치
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [40]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error

In [41]:
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score

In [86]:
X1 = X1.drop(columns = ['datetime'])
y1 = y1.drop(columns = ['datetime'])
X1_test = X1_test.drop(columns = ['datetime'])
y1_test = y1_test.drop(columns = ['datetime'])

In [47]:
# LigthGBM 하이퍼파라미터 값 지정
def objectiveLGBM(trial: Trial, X, y):
    param = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'learning_rate': 0.01,
        'n_estimators': trial.suggest_int('n_estimators', 700, 3000),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'gpu_use_dp':True
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

     # 학습 모델 생성
    model = LGBMClassifier(**param)
    lgb_model = model.fit(X_train, y_train, verbose=True) # 학습 진행
    train_preds = lgb_model.predict(X_train)
    test_preds = lgb_model.predict(X_test)

    # 모델 성능 확인
    train_precision = precision_score(y_test, test_preds, average= "macro")
    
    return train_precision

In [88]:
study1 = optuna.create_study(direction='maximize',sampler=TPESampler())
study1.optimize(lambda trial : objectiveLGBM(trial, X1, y1), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study1.best_trial.value,study1.best_trial.params))

[32m[I 2022-12-01 10:56:34,427][0m A new study created in memory with name: no-name-05c0e924-5eea-42f7-a85e-e338d62ddea8[0m
[32m[I 2022-12-01 10:56:36,190][0m Trial 0 finished with value: 0.9742211193824097 and parameters: {'num_leaves': 2, 'n_estimators': 1673, 'feature_fraction': 0.7659889226610512, 'bagging_fraction': 0.8740628593951206, 'bagging_freq': 7, 'min_child_samples': 96}. Best is trial 0 with value: 0.9742211193824097.[0m
[32m[I 2022-12-01 10:56:40,651][0m Trial 1 finished with value: 0.9986666666666667 and parameters: {'num_leaves': 106, 'n_estimators': 895, 'feature_fraction': 0.7076178653807161, 'bagging_fraction': 0.8906751063943867, 'bagging_freq': 3, 'min_child_samples': 22}. Best is trial 1 with value: 0.9986666666666667.[0m
[32m[I 2022-12-01 10:56:46,017][0m Trial 2 finished with value: 0.9543166203547783 and parameters: {'num_leaves': 492, 'n_estimators': 2569, 'feature_fraction': 0.8066173524607496, 'bagging_fraction': 0.7592484209562143, 'bagging_freq

KeyboardInterrupt: ignored

In [None]:
optuna.visualization.plot_param_importances(study1) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study1) # 최적화 과정 시각화

In [None]:
X1_train, X1_val, y1_train, y1_val = train_test_split(X1, y1, test_size = 0.2, random_state = 42)

In [None]:
X1_train.shape, X1_val.shape, y1_train.shape, y1_val.shape

In [None]:
model = LGBMClassifier(**study1.best_trial.params)

In [None]:
model1 = model.fit(X1_train, y1_train,
          eval_set = [(X1_train, y1_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

In [None]:
train1_preds = model1.predict(X1_train)
val1_preds = model1.predict(X1_val)

In [None]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [None]:
train_pre_score = precision_score(y1_train, train1_preds, average= "macro")
train_rec_score = recall_score(y1_train, train1_preds, average= "macro")
val_pre_score = precision_score(y1_val, val1_preds, average= "macro")
val_rec_score = recall_score(y1_val, val1_preds, average= "macro")

In [None]:
get_clf_eval(y1_train, train1_preds)
get_clf_eval(y1_val, val1_preds)

In [None]:
preds_1 = model1.predict(X1_test)
preds_1

In [None]:
get_clf_eval(y1_train, train1_preds)
get_clf_eval(y1_test, preds_1)

In [None]:
test_pre_score = precision_score(y1_test, preds_1, average= "macro")
test_rec_score = recall_score(y1_test, preds_1, average= "macro")

In [None]:
test_jan['classification'] = preds_1
test_jan

# 2월 데이터 머신러닝

## 데이터 가공

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [None]:
X2 = train_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [None]:
y2 = train_feb[['datetime', 'classification']]
X2_1 = X2.drop(columns = ['datetime', 'classification'])
y2_1 = X2.datetime

In [None]:
X2_1_scaler = scaler.fit_transform(X2_1)

In [None]:
X2_1_sc = pd.DataFrame(X2_1_scaler)
X2_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2 = pd.concat([y2_1, X2_1_sc], axis = 1)
X2

In [None]:
X2_test = test_feb.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [None]:
y2_test = test_feb[['datetime', 'classification']]
X2_1_test = X2_test.drop(columns = ['datetime', 'classification'])
y2_1_test = X2_test.datetime

In [None]:
X2_1_test_scaler = scaler.fit_transform(X2_1_test)

In [None]:
X2_1_test_sc = pd.DataFrame(X2_1_test_scaler)
X2_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X2_test = pd.concat([y2_1_test, X2_1_test_sc], axis = 1)
X2_test

## LightGBM

In [None]:
X2 = X2.drop(columns = ['datetime'])
y2 = y2.drop(columns = ['datetime'])
X2_test = X2_test.drop(columns = ['datetime'])
y2_test = y2_test.drop(columns = ['datetime'])

In [None]:
study2 = optuna.create_study(direction='maximize',sampler=TPESampler())
study2.optimize(lambda trial : objectiveLGBM(trial, X2, y2), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study2.best_trial.value,study2.best_trial.params))

In [None]:
optuna.visualization.plot_param_importances(study2) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study2) # 최적화 과정 시각화

In [None]:
X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [None]:
X2_train.shape, X2_val.shape, y2_train.shape, y2_val.shape

In [None]:
model = LGBMClassifier(**study2.best_trial.params)

In [None]:
model2 = model.fit(X2_train, y2_train,
          eval_set = [(X2_train, y2_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

In [None]:
train2_preds = model2.predict(X2_train)
val2_preds = model2.predict(X2_val)

In [None]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [None]:
get_clf_eval(y2_train, train2_preds)
get_clf_eval(y2_val, val2_preds)

In [None]:
preds_2= model2.predict(X2_test)
preds_2

In [None]:
test_feb['classification'] = preds_2
test_feb

# 3월 데이터 머신러닝

## 데이터 가공

In [None]:
X3 = train_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [None]:
y3 = train_mar[['datetime', 'classification']]
X3_1 = X3.drop(columns = ['datetime', 'classification'])
y3_1 = X3.datetime

In [None]:
X3_1_scaler = scaler.fit_transform(X3_1)

In [None]:
X3_1_sc = pd.DataFrame(X3_1_scaler)
X3_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3 = pd.concat([y3_1, X3_1_sc], axis = 1)
X3

In [None]:
X3_test = test_mar.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [None]:
y3_test = test_mar[['datetime', 'classification']]
X3_1_test = X3_test.drop(columns = ['datetime', 'classification'])
y3_1_test = X3_test.datetime

In [None]:
X3_1_test_scaler = scaler.fit_transform(X3_1_test)

In [None]:
X3_1_test_sc = pd.DataFrame(X3_1_test_scaler)
X3_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X3_test = pd.concat([y3_1_test, X3_1_test_sc], axis = 1)
X3_test

## LightGBM

In [None]:
X3 = X3.drop(columns = ['datetime'])
y3 = y3.drop(columns = ['datetime'])
X3_test = X3_test.drop(columns = ['datetime'])
y3_test = y3_test.drop(columns = ['datetime'])

In [None]:
study3 = optuna.create_study(direction='maximize',sampler=TPESampler())
study3.optimize(lambda trial : objectiveLGBM(trial, X3, y3), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study3.best_trial.value,study3.best_trial.params))

In [None]:
optuna.visualization.plot_param_importances(study3) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study3) # 최적화 과정 시각화

In [None]:
X3_train, X3_val, y3_train, y3_val = train_test_split(X3, y3, test_size = 0.2, random_state = 42)

In [None]:
X3_train.shape, X3_val.shape, y3_train.shape, y3_val.shape

In [None]:
model = LGBMClassifier(**study3.best_trial.params)

In [None]:
model3 = model.fit(X3_train, y3_train,
          eval_set = [(X3_train, y3_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

In [None]:
train3_preds = model3.predict(X3_train)
val3_preds = model3.predict(X3_val)

In [None]:
get_clf_eval(y3_train, train3_preds)
get_clf_eval(y3_val, val3_preds)

In [None]:
preds_3= model3.predict(X3_test)
preds_3

In [None]:
test_mar['classification'] = preds_3
test_mar

# 4월 데이터 머신러닝

## 데이터 가공

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [None]:
X4 = train_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [None]:
y4 = train_apr[['datetime', 'classification']]
X4_1 = X4.drop(columns = ['datetime', 'classification'])
y4_1 = X4.datetime

In [None]:
X4_1_scaler = scaler.fit_transform(X4_1)

In [None]:
X4_1_sc = pd.DataFrame(X4_1_scaler)
X4_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4 = pd.concat([y4_1, X4_1_sc], axis = 1)
X4

In [None]:
X4_test = test_apr.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [None]:
y4_test = test_apr[['datetime', 'classification']]
X4_1_test = X4_test.drop(columns = ['datetime', 'classification'])
y4_1_test = X4_test.datetime

In [None]:
X4_1_test_scaler = scaler.fit_transform(X4_1_test)

In [None]:
X4_1_test_sc = pd.DataFrame(X4_1_test_scaler)
X4_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X4_test = pd.concat([y4_1_test, X4_1_test_sc], axis = 1)
X4_test

## LightGBM

In [None]:
X4 = X4.drop(columns = ['datetime'])
y4 = y4.drop(columns = ['datetime'])
X4_test = X4_test.drop(columns = ['datetime'])
y4_test = y4_test.drop(columns = ['datetime'])

In [None]:
study4 = optuna.create_study(direction='maximize',sampler=TPESampler())
study4.optimize(lambda trial : objectiveLGBM(trial, X4, y4), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study4.best_trial.value,study4.best_trial.params))

In [None]:
optuna.visualization.plot_param_importances(study4) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study4) # 최적화 과정 시각화

In [None]:
X4_train, X4_val, y4_train, y4_val = train_test_split(X4, y4, test_size = 0.2, random_state = 42)

In [None]:
X4_train.shape, X4_val.shape, y4_train.shape, y4_val.shape

In [None]:
model = LGBMClassifier(**study4.best_trial.params)

In [None]:
model4 = model.fit(X4_train, y4_train,
          eval_set = [(X4_train, y4_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

In [None]:
train4_preds = model4.predict(X4_train)
val4_preds = model4.predict(X4_val)

In [None]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [None]:
get_clf_eval(y4_train, train4_preds)
get_clf_eval(y4_val, val4_preds)

In [None]:
preds_4= model4.predict(X4_test)
preds_4

In [None]:
test_apr['classification'] = preds_4
test_apr

# 5월 데이터 머신러닝

## 데이터 가공

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [None]:
X5 = train_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [None]:
y5 = train_may[['datetime', 'classification']]
X5_1 = X5.drop(columns = ['datetime', 'classification'])
y5_1 = X5.datetime

In [None]:
X5_1_scaler = scaler.fit_transform(X5_1)

In [None]:
X5_1_sc = pd.DataFrame(X5_1_scaler)
X5_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5 = pd.concat([y5_1, X5_1_sc], axis = 1)
X5

In [None]:
X5_test = test_may.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [None]:
y5_test = test_may[['datetime', 'classification']]
X5_1_test = X5_test.drop(columns = ['datetime', 'classification'])
y5_1_test = X5_test.datetime

In [None]:
X5_1_test_scaler = scaler.fit_transform(X5_1_test)

In [None]:
X5_1_test_sc = pd.DataFrame(X5_1_test_scaler)
X5_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X5_test = pd.concat([y5_1_test, X5_1_test_sc], axis = 1)
X5_test

## LightGBM

In [None]:
X5 = X5.drop(columns = ['datetime'])
y5 = y5.drop(columns = ['datetime'])
X5_test = X5_test.drop(columns = ['datetime'])
y5_test = y5_test.drop(columns = ['datetime'])

In [None]:
study5 = optuna.create_study(direction='maximize',sampler=TPESampler())
study5.optimize(lambda trial : objectiveLGBM(trial, X5, y5), n_trials=20) 
print('Best trial: score {},\nparams {}'.format(study5.best_trial.value,study5.best_trial.params))

In [None]:
optuna.visualization.plot_param_importances(study5) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study5) # 최적화 과정 시각화

In [None]:
X5_train, X5_val, y5_train, y5_val = train_test_split(X5, y5, test_size = 0.2, random_state = 42)

In [None]:
X5_train.shape, X5_val.shape, y5_train.shape, y5_val.shape

In [None]:
model = LGBMClassifier(**study5.best_trial.params)

In [None]:
model5 = model.fit(X5_train, y5_train,
          eval_set = [(X5_train, y5_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

In [None]:
train5_preds = model5.predict(X5_train)
val5_preds = model5.predict(X5_val)

In [None]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [None]:
get_clf_eval(y5_train, train5_preds)
get_clf_eval(y5_val, val5_preds)

In [None]:
preds_5= model5.predict(X5_test)
preds_5

In [None]:
test_may['classification'] = preds_5
test_may

# 6월 데이터 머신러닝

## 데이터 가공

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [None]:
X6 = train_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [None]:
y6 = train_jun[['datetime', 'classification']]
X6_1 = X6.drop(columns = ['datetime', 'classification'])
y6_1 = X6.datetime

In [None]:
X6_1_scaler = scaler.fit_transform(X6_1)

In [None]:
X6_1_sc = pd.DataFrame(X6_1_scaler)
X6_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6 = pd.concat([y6_1, X6_1_sc], axis = 1)
X6

In [None]:
X6_test = test_jun.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [None]:
y6_test = test_jun[['datetime', 'classification']]
X6_1_test = X6_test.drop(columns = ['datetime', 'classification'])
y6_1_test = X6_test.datetime

In [None]:
X6_1_test_scaler = scaler.fit_transform(X6_1_test)

In [None]:
X6_1_test_sc = pd.DataFrame(X6_1_test_scaler)
X6_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X6_test = pd.concat([y6_1_test, X6_1_test_sc], axis = 1)
X6_test

## LightGBM

In [None]:
X6 = X6.drop(columns = ['datetime'])
y6 = y6.drop(columns = ['datetime'])
X6_test = X6_test.drop(columns = ['datetime'])
y6_test = y6_test.drop(columns = ['datetime'])

In [None]:
study6 = optuna.create_study(direction='maximize',sampler=TPESampler())
study6.optimize(lambda trial : objectiveLGBM(trial, X6, y6), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study6.best_trial.value,study6.best_trial.params))

In [None]:
optuna.visualization.plot_param_importances(study6) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study6) # 최적화 과정 시각화

In [None]:
X6_train, X6_val, y6_train, y6_val = train_test_split(X6, y6, test_size = 0.2, random_state = 42)

In [None]:
X6_train.shape, X6_val.shape, y6_train.shape, y6_val.shape

In [None]:
model = LGBMClassifier(**study6.best_trial.params)

In [None]:
model6 = model.fit(X6_train, y6_train,
          eval_set = [(X6_train, y6_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

In [None]:
train6_preds = model6.predict(X6_train)
val6_preds = model6.predict(X6_val)

In [None]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [None]:
get_clf_eval(y6_train, train6_preds)
get_clf_eval(y6_val, val6_preds)

In [None]:
preds_6= model6.predict(X6_test)
preds_6

In [None]:
test_jun['classification'] = preds_6
test_jun

# 7월 데이터 머신러닝

## 데이터 가공

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [None]:
X7 = train_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [None]:
y7 = train_jul[['datetime', 'classification']]
X7_1 = X7.drop(columns = ['datetime', 'classification'])
y7_1 = X7.datetime

In [None]:
X7_1_scaler = scaler.fit_transform(X7_1)

In [None]:
X7_1_sc = pd.DataFrame(X7_1_scaler)
X7_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7 = pd.concat([y7_1, X7_1_sc], axis = 1)
X7

In [None]:
X7_test = test_jul.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [None]:
y7_test = test_jul[['datetime', 'classification']]
X7_1_test = X7_test.drop(columns = ['datetime', 'classification'])
y7_1_test = X7_test.datetime

In [None]:
X7_1_test_scaler = scaler.fit_transform(X7_1_test)

In [None]:
X7_1_test_sc = pd.DataFrame(X7_1_test_scaler)
X7_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X7_test = pd.concat([y7_1_test, X7_1_test_sc], axis = 1)
X7_test

## LightGBM

In [None]:
X7 = X7.drop(columns = ['datetime'])
y7 = y7.drop(columns = ['datetime'])
X7_test = X7_test.drop(columns = ['datetime'])
y7_test = y7_test.drop(columns = ['datetime'])

In [None]:
study7 = optuna.create_study(direction='maximize',sampler=TPESampler())
study7.optimize(lambda trial : objectiveLGBM(trial, X7, y7), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study7.best_trial.value,study7.best_trial.params))

In [None]:
optuna.visualization.plot_param_importances(study7) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study7) # 최적화 과정 시각화

In [None]:
X7_train, X7_val, y7_train, y7_val = train_test_split(X7, y7, test_size = 0.2, random_state = 42)

In [None]:
X7_train.shape, X7_val.shape, y7_train.shape, y7_val.shape

In [None]:
model = LGBMClassifier(**study7.best_trial.params)

In [None]:
model7 = model.fit(X7_train, y7_train,
          eval_set = [(X7_train, y7_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

In [None]:
train7_preds = model7.predict(X7_train)
val7_preds = model7.predict(X7_val)

In [None]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [None]:
get_clf_eval(y7_train, train7_preds)
get_clf_eval(y7_val, val7_preds)

In [None]:
preds_7= model7.predict(X7_test)
preds_7

In [None]:
test_jul['classification'] = preds_7
test_jul

# 8월 데이터 머신러닝

## 데이터 가공

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [None]:
X8 = train_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [None]:
y8 = train_aug[['datetime', 'classification']]
X8_1 = X8.drop(columns = ['datetime', 'classification'])
y8_1 = X8.datetime

In [None]:
X8_1_scaler = scaler.fit_transform(X8_1)

In [None]:
X8_1_sc = pd.DataFrame(X8_1_scaler)
X8_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8 = pd.concat([y8_1, X8_1_sc], axis = 1)
X8

In [None]:
X8_test = test_aug.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [None]:
y8_test = test_aug[['datetime', 'classification']]
X8_1_test = X8_test.drop(columns = ['datetime', 'classification'])
y8_1_test = X8_test.datetime

In [None]:
X8_1_test_scaler = scaler.fit_transform(X8_1_test)

In [None]:
X8_1_test_sc = pd.DataFrame(X8_1_test_scaler)
X8_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X8_test = pd.concat([y8_1_test, X8_1_test_sc], axis = 1)
X8_test

## LightGBM

In [None]:
X8 = X8.drop(columns = ['datetime'])
y8 = y8.drop(columns = ['datetime'])
X8_test = X8_test.drop(columns = ['datetime'])
y8_test = y8_test.drop(columns = ['datetime'])

In [None]:
study8 = optuna.create_study(direction='maximize',sampler=TPESampler())
study8.optimize(lambda trial : objectiveLGBM(trial, X8, y8), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study8.best_trial.value,study8.best_trial.params))

In [None]:
optuna.visualization.plot_param_importances(study8) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study8) # 최적화 과정 시각화

In [None]:
X8_train, X8_val, y8_train, y8_val = train_test_split(X8, y8, test_size = 0.2, random_state = 42)

In [None]:
X8_train.shape, X8_val.shape, y8_train.shape, y8_val.shape

In [None]:
model = LGBMClassifier(**study8.best_trial.params)

In [None]:
model8 = model.fit(X8_train, y8_train,
          eval_set = [(X8_train, y8_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

In [None]:
train8_preds = model8.predict(X8_train)
val8_preds = model8.predict(X8_val)

In [None]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [None]:
get_clf_eval(y8_train, train8_preds)
get_clf_eval(y8_val, val8_preds)

In [None]:
preds_8= model8.predict(X8_test)
preds_8

In [None]:
test_aug['classification'] = preds_8
test_aug

# 9월 데이터 머신러닝

## 데이터 가공

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [None]:
X9 = train_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [None]:
y9 = train_sep[['datetime', 'classification']]
X9_1 = X9.drop(columns = ['datetime', 'classification'])
y9_1 = X9.datetime

In [None]:
X9_1_scaler = scaler.fit_transform(X9_1)

In [None]:
X9_1_sc = pd.DataFrame(X9_1_scaler)
X9_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9 = pd.concat([y9_1, X9_1_sc], axis = 1)
X9

In [None]:
X9_test = test_sep.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [None]:
y9_test = test_sep[['datetime', 'classification']]
X9_1_test = X9_test.drop(columns = ['datetime', 'classification'])
y9_1_test = X9_test.datetime

In [None]:
X9_1_test_scaler = scaler.fit_transform(X9_1_test)

In [None]:
X9_1_test_sc = pd.DataFrame(X9_1_test_scaler)
X9_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X9_test = pd.concat([y9_1_test, X9_1_test_sc], axis = 1)
X9_test

## LightGBM

In [None]:
X9 = X9.drop(columns = ['datetime'])
y9 = y9.drop(columns = ['datetime'])
X9_test = X9_test.drop(columns = ['datetime'])
y9_test = y9_test.drop(columns = ['datetime'])

In [None]:
study9 = optuna.create_study(direction='maximize',sampler=TPESampler())
study9.optimize(lambda trial : objectiveLGBM(trial, X9, y9), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study9.best_trial.value,study9.best_trial.params))

In [None]:
optuna.visualization.plot_param_importances(study9) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study9) # 최적화 과정 시각화

In [None]:
X9_train, X9_val, y9_train, y9_val = train_test_split(X9, y9, test_size = 0.2, random_state = 42)

In [None]:
X9_train.shape, X9_val.shape, y9_train.shape, y9_val.shape

In [None]:
model = LGBMClassifier(**study9.best_trial.params)

In [None]:
model9 = model.fit(X9_train, y9_train,
          eval_set = [(X9_train, y9_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

In [None]:
train9_preds = model9.predict(X9_train)
val9_preds = model9.predict(X9_val)

In [None]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [None]:
get_clf_eval(y9_train, train9_preds)
get_clf_eval(y9_val, val9_preds)

In [None]:
preds_9= model9.predict(X9_test)
preds_9

In [None]:
test_sep['classification'] = preds_9
test_sep

# 10월 데이터 머신러닝

## 데이터 가공

In [28]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [29]:
X10 = train_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [30]:
y10 = train_oct[['datetime', 'classification']]
X10_1 = X10.drop(columns = ['datetime', 'classification'])
y10_1 = X10.datetime

In [31]:
X10_1_scaler = scaler.fit_transform(X10_1)

In [32]:
X10_1_sc = pd.DataFrame(X10_1_scaler)
X10_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10 = pd.concat([y10_1, X10_1_sc], axis = 1)
X10

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-10-01 00:00:00,0.166667,0.245917,0.877381,0.0,0.0,0.0,0.000000
1,2019-10-01 01:00:00,0.166667,0.151543,0.899167,0.0,0.0,0.0,0.043478
2,2019-10-01 02:00:00,0.166667,0.100726,0.875442,0.0,0.0,0.0,0.086957
3,2019-10-01 03:00:00,0.166667,0.107532,0.897342,0.0,0.0,0.0,0.130435
4,2019-10-01 04:00:00,0.166667,0.125681,0.887761,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-10-31 19:00:00,0.833333,0.801270,0.253793,1.0,0.0,1.0,0.826087
1484,2020-10-31 20:00:00,0.833333,0.742287,0.638759,1.0,0.0,1.0,0.869565
1485,2020-10-31 21:00:00,0.833333,0.759528,0.581727,1.0,0.0,1.0,0.913043
1486,2020-10-31 22:00:00,0.833333,0.700318,0.647314,1.0,0.0,1.0,0.956522


In [34]:
X10_test = test_oct.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance',
                               'classification_act'])

In [35]:
y10_test = test_oct[['datetime', 'classification_act']]
X10_1_test = X10_test.drop(columns = ['datetime', 'classification'])
y10_1_test = X10_test.datetime

In [36]:
X10_1_test_scaler = scaler.fit_transform(X10_1_test)

In [37]:
X10_1_test_sc = pd.DataFrame(X10_1_test_scaler)
X10_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X10_test = pd.concat([y10_1_test, X10_1_test_sc], axis = 1)
X10_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-10-01 00:00:00,0.666667,0.194549,0.951216,0.0,0.0,0.0,0.000000
1,2021-10-01 01:00:00,0.666667,0.119030,0.942248,0.0,0.0,0.0,0.043478
2,2021-10-01 02:00:00,0.666667,0.070518,0.938850,0.0,0.0,0.0,0.086957
3,2021-10-01 03:00:00,0.666667,0.068017,0.936540,0.0,0.0,0.0,0.130435
4,2021-10-01 04:00:00,0.666667,0.111528,0.949042,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-10-31 19:00:00,1.000000,0.740185,0.716809,0.0,0.0,1.0,0.826087
740,2021-10-31 20:00:00,1.000000,0.668917,0.609322,0.0,0.0,1.0,0.869565
741,2021-10-31 21:00:00,1.000000,0.823206,0.622911,0.0,0.0,1.0,0.913043
742,2021-10-31 22:00:00,1.000000,0.734934,0.699823,0.0,0.0,1.0,0.956522


## LightGBM

In [38]:
X10 = X10.drop(columns = ['datetime'])
y10 = y10.drop(columns = ['datetime'])
X10_test = X10_test.drop(columns = ['datetime'])
y10_test = y10_test.drop(columns = ['datetime'])

In [48]:
study10 = optuna.create_study(direction='maximize',sampler=TPESampler())
study10.optimize(lambda trial : objectiveLGBM(trial, X10, y10), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study10.best_trial.value,study10.best_trial.params))

[32m[I 2022-12-01 11:01:13,355][0m A new study created in memory with name: no-name-401b515c-f8d7-47c9-ab64-283f31a3d5ba[0m
[32m[I 2022-12-01 11:01:16,696][0m Trial 0 finished with value: 0.970140683318978 and parameters: {'num_leaves': 262, 'n_estimators': 1692, 'feature_fraction': 0.7027109508028988, 'bagging_fraction': 0.8701680673283382, 'bagging_freq': 4, 'min_child_samples': 66}. Best is trial 0 with value: 0.970140683318978.[0m
[32m[I 2022-12-01 11:01:20,074][0m Trial 1 finished with value: 0.9888888888888889 and parameters: {'num_leaves': 124, 'n_estimators': 1112, 'feature_fraction': 0.6676989609245352, 'bagging_fraction': 0.44883488795942866, 'bagging_freq': 5, 'min_child_samples': 10}. Best is trial 1 with value: 0.9888888888888889.[0m
[32m[I 2022-12-01 11:01:22,410][0m Trial 2 finished with value: 0.9831569664902998 and parameters: {'num_leaves': 23, 'n_estimators': 2592, 'feature_fraction': 0.8557767041411353, 'bagging_fraction': 0.48472592950707655, 'bagging_fr

Best trial: score 1.0,
params {'num_leaves': 66, 'n_estimators': 1270, 'feature_fraction': 0.9590632457272333, 'bagging_fraction': 0.6993747957170935, 'bagging_freq': 3, 'min_child_samples': 84}


In [49]:
optuna.visualization.plot_param_importances(study10) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study10) # 최적화 과정 시각화

In [50]:
X10_train, X10_val, y10_train, y10_val = train_test_split(X10, y10, test_size = 0.2, random_state = 42)

In [51]:
X10_train.shape, X10_val.shape, y10_train.shape, y10_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [52]:
model = LGBMClassifier(**study10.best_trial.params)

In [53]:
model10 = model.fit(X10_train, y10_train,
          eval_set = [(X10_train, y10_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.829752	training's multi_logloss: 0.829752
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.738867	training's multi_logloss: 0.738867
[3]	training's multi_logloss: 0.64783	training's multi_logloss: 0.64783
[4]	training's multi_logloss: 0.569495	training's multi_logloss: 0.569495
[5]	training's multi_logloss: 0.50413	training's multi_logloss: 0.50413
[6]	training's multi_logloss: 0.455196	training's multi_logloss: 0.455196
[7]	training's multi_logloss: 0.415655	training's multi_logloss: 0.415655
[8]	training's multi_logloss: 0.380882	training's multi_logloss: 0.380882
[9]	training's multi_logloss: 0.346044	training's multi_logloss: 0.346044
[10]	training's multi_logloss: 0.311413	training's multi_logloss: 0.311413
[11]	training's multi_logloss: 0.281037	training's multi_logloss: 0.281037
[12]	training's multi_logloss: 0.258413	training's multi_logloss: 0.258413
[13]	training's multi_logloss: 0.234103	training's

In [54]:
train10_preds = model10.predict(X10_train)
val10_preds = model10.predict(X10_val)

In [55]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [58]:
train_pre_score = precision_score(y10_train, train10_preds, average= "macro")
train_rec_score = recall_score(y10_train, train10_preds, average= "macro")
val_pre_score = precision_score(y10_val, val10_preds, average= "macro")
val_rec_score = recall_score(y10_val, val10_preds, average= "macro")

In [60]:
get_clf_eval(y10_train, train10_preds)
get_clf_eval(y10_val, val10_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9701
재현율: 0.9797


In [62]:
preds_10= model10.predict(X10_test)
preds_10

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 3., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 1., 2., 2.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 1., 2., 2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 1.,
       1., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 1., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 3.,
       3., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       1., 2., 2., 2., 2., 2., 2., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 1.

In [63]:
get_clf_eval(y10_train, train10_preds)
get_clf_eval(y10_test, preds_10)

정밀도: 1.0000
재현율: 1.0000
정밀도: 0.9597
재현율: 0.9576


In [65]:
test_pre_score = precision_score(y10_test, preds_10, average= "macro")
test_rec_score = recall_score(y10_test, preds_10, average= "macro")

In [69]:
print('정밀도: {:.6f}'.format(test_pre_score))
print('재현율: {:.6f}'.format(test_rec_score))

정밀도: 0.959719
재현율: 0.957576


In [70]:
test_oct['classification'] = preds_10
test_oct

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,classification_act,year,month,day,hour
0,2021-10-01,5,0:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,1045,86.83,1.0,2021-10-01 00:00:00,1.0,2021,10,1,0
1,2021-10-01,5,1:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,743,86.17,1.0,2021-10-01 01:00:00,1.0,2021,10,1,1
2,2021-10-01,5,2:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,549,85.92,1.0,2021-10-01 02:00:00,1.0,2021,10,1,2
3,2021-10-01,5,3:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,539,85.75,1.0,2021-10-01 03:00:00,1.0,2021,10,1,3
4,2021-10-01,5,4:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,713,86.67,1.0,2021-10-01 04:00:00,1.0,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-10-31,7,19:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3227,69.58,2.0,2021-10-31 19:00:00,2.0,2021,10,31,19
740,2021-10-31,7,20:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,2942,61.67,2.0,2021-10-31 20:00:00,2.0,2021,10,31,20
741,2021-10-31,7,21:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3559,62.67,2.0,2021-10-31 21:00:00,2.0,2021,10,31,21
742,2021-10-31,7,22:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3206,68.33,2.0,2021-10-31 22:00:00,2.0,2021,10,31,22


# 11월 데이터 머신러닝

## 데이터 가공

In [237]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [238]:
X11 = train_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [239]:
y11 = train_nov[['datetime', 'classification']]
X11_1 = X11.drop(columns = ['datetime', 'classification'])
y11_1 = X11.datetime

In [240]:
X11_1_scaler = scaler.fit_transform(X11_1)

In [241]:
X11_1_sc = pd.DataFrame(X11_1_scaler)
X11_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11 = pd.concat([y11_1, X11_1_sc], axis = 1)
X11

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-11-01 00:00:00,0.666667,0.313590,0.938679,0.0,0.0,0.0,0.000000
1,2019-11-01 01:00:00,0.666667,0.180256,0.950184,0.0,0.0,0.0,0.043478
2,2019-11-01 02:00:00,0.666667,0.112308,0.937644,0.0,0.0,0.0,0.086957
3,2019-11-01 03:00:00,0.666667,0.102051,0.907041,0.0,0.0,0.0,0.130435
4,2019-11-01 04:00:00,0.666667,0.141282,0.893580,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1435,2020-11-30 19:00:00,0.000000,0.759231,0.265647,1.0,0.0,1.0,0.826087
1436,2020-11-30 20:00:00,0.000000,0.753846,0.741141,1.0,0.0,1.0,0.869565
1437,2020-11-30 21:00:00,0.000000,0.787436,0.741141,1.0,0.0,1.0,0.913043
1438,2020-11-30 22:00:00,0.000000,0.552308,0.797745,1.0,0.0,1.0,0.956522


In [242]:
X11_test = test_nov.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [243]:
y11_test = test_nov[['datetime', 'classification']]
X11_1_test = X11_test.drop(columns = ['datetime', 'classification'])
y11_1_test = X11_test.datetime

In [244]:
X11_1_test_scaler = scaler.fit_transform(X11_1_test)

In [245]:
X11_1_test_sc = pd.DataFrame(X11_1_test_scaler)
X11_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X11_test = pd.concat([y11_1_test, X11_1_test_sc], axis = 1)
X11_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-11-01 00:00:00,0.000000,0.166903,0.885993,0.0,0.0,0.0,0.000000
1,2021-11-01 01:00:00,0.000000,0.078209,0.884821,0.0,0.0,0.0,0.043478
2,2021-11-01 02:00:00,0.000000,0.031454,0.871792,0.0,0.0,0.0,0.086957
3,2021-11-01 03:00:00,0.000000,0.033154,0.884821,0.0,0.0,0.0,0.130435
4,2021-11-01 04:00:00,0.000000,0.086710,0.876221,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
715,2021-11-30 19:00:00,0.166667,0.727402,0.717655,0.0,0.0,1.0,0.826087
716,2021-11-30 20:00:00,0.166667,0.754888,0.740456,0.0,0.0,1.0,0.869565
717,2021-11-30 21:00:00,0.166667,0.719751,0.771987,0.0,0.0,1.0,0.913043
718,2021-11-30 22:00:00,0.166667,0.589402,0.798046,0.0,0.0,1.0,0.956522


## LightGBM

In [246]:
X11 = X11.drop(columns = ['datetime'])
y11 = y11.drop(columns = ['datetime'])
X11_test = X11_test.drop(columns = ['datetime'])
y11_test = y11_test.drop(columns = ['datetime'])

In [247]:
study11 = optuna.create_study(direction='maximize',sampler=TPESampler())
study11.optimize(lambda trial : objectiveLGBM(trial, X11, y11), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study11.best_trial.value,study11.best_trial.params))

[32m[I 2022-12-01 03:28:49,208][0m A new study created in memory with name: no-name-8ca5f579-0a6f-47db-af20-ae7cbfff3270[0m
[32m[I 2022-12-01 03:28:52,640][0m Trial 0 finished with value: 1.0 and parameters: {'num_leaves': 107, 'n_estimators': 2930, 'feature_fraction': 0.7314506361960377, 'bagging_fraction': 0.8359406348385503, 'bagging_freq': 7, 'min_child_samples': 39}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-12-01 03:28:56,676][0m Trial 1 finished with value: 1.0 and parameters: {'num_leaves': 369, 'n_estimators': 2305, 'feature_fraction': 0.6261455708901436, 'bagging_fraction': 0.8460746456847057, 'bagging_freq': 3, 'min_child_samples': 33}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-12-01 03:28:58,416][0m Trial 2 finished with value: 0.8984480431848852 and parameters: {'num_leaves': 178, 'n_estimators': 2160, 'feature_fraction': 0.7165541935536528, 'bagging_fraction': 0.4727493506622853, 'bagging_freq': 4, 'min_child_samples': 79}. Best is trial 0 with valu

Best trial: score 1.0,
params {'num_leaves': 107, 'n_estimators': 2930, 'feature_fraction': 0.7314506361960377, 'bagging_fraction': 0.8359406348385503, 'bagging_freq': 7, 'min_child_samples': 39}


In [248]:
optuna.visualization.plot_param_importances(study11) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study11) # 최적화 과정 시각화

In [249]:
X11_train, X11_val, y11_train, y11_val = train_test_split(X11, y11, test_size = 0.2, random_state = 42)

In [250]:
X11_train.shape, X11_val.shape, y11_train.shape, y11_val.shape

((1152, 7), (288, 7), (1152, 1), (288, 1))

In [251]:
model = LGBMClassifier(**study11.best_trial.params)

In [252]:
model11 = model.fit(X11_train, y11_train,
          eval_set = [(X11_train, y11_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.810793	training's multi_logloss: 0.810793
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.713671	training's multi_logloss: 0.713671
[3]	training's multi_logloss: 0.619977	training's multi_logloss: 0.619977
[4]	training's multi_logloss: 0.543539	training's multi_logloss: 0.543539
[5]	training's multi_logloss: 0.479286	training's multi_logloss: 0.479286
[6]	training's multi_logloss: 0.428728	training's multi_logloss: 0.428728
[7]	training's multi_logloss: 0.390184	training's multi_logloss: 0.390184
[8]	training's multi_logloss: 0.359164	training's multi_logloss: 0.359164
[9]	training's multi_logloss: 0.323483	training's multi_logloss: 0.323483
[10]	training's multi_logloss: 0.295949	training's multi_logloss: 0.295949
[11]	training's multi_logloss: 0.264458	training's multi_logloss: 0.264458
[12]	training's multi_logloss: 0.240432	training's multi_logloss: 0.240432
[13]	training's multi_logloss: 0.219152	traini

In [253]:
train11_preds = model11.predict(X11_train)
val11_preds = model11.predict(X11_val)

In [254]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [255]:
get_clf_eval(y11_train, train11_preds)
get_clf_eval(y11_val, val11_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [256]:
preds_11= model11.predict(X11_test)
preds_11

array([1., 1., 1., 1., 1., 1., 2., 2., 1., 1., 2., 2., 1., 2., 2., 2., 2.,
       1., 2., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1.,
       2., 2., 2., 2., 2., 2., 2., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 1., 1., 2., 2., 1., 2., 2., 2., 2., 2., 3., 3.,
       2., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 2., 1., 1., 2., 2., 1.,
       2., 2., 2., 3., 3., 3., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3., 2., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.,
       2., 3., 3., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 3., 2., 2., 2., 2., 3.,
       3., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 3., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 1., 1.

In [257]:
test_nov['classification'] = preds_11
test_nov

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-11-01,1,0:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,932,85.67,1.0,2021-11-01 00:00:00,2021,11,1,0
1,2021-11-01,1,1:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,619,85.58,1.0,2021-11-01 01:00:00,2021,11,1,1
2,2021-11-01,1,2:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,454,84.58,1.0,2021-11-01 02:00:00,2021,11,1,2
3,2021-11-01,1,3:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,460,85.58,1.0,2021-11-01 03:00:00,2021,11,1,3
4,2021-11-01,1,4:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,649,84.92,1.0,2021-11-01 04:00:00,2021,11,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2021-11-30,2,19:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,2910,72.75,1.0,2021-11-30 19:00:00,2021,11,30,19
716,2021-11-30,2,20:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3007,74.50,1.0,2021-11-30 20:00:00,2021,11,30,20
717,2021-11-30,2,21:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,2883,76.92,1.0,2021-11-30 21:00:00,2021,11,30,21
718,2021-11-30,2,22:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,2423,78.92,1.0,2021-11-30 22:00:00,2021,11,30,22


# 12월 데이터 머신러닝

## 데이터 가공

In [258]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from lightgbm.sklearn import LGBMClassifier

In [259]:
X12 = train_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])


In [260]:
y12 = train_dec[['datetime', 'classification']]
X12_1 = X12.drop(columns = ['datetime', 'classification'])
y12_1 = X12.datetime

In [261]:
X12_1_scaler = scaler.fit_transform(X12_1)

In [262]:
X12_1_sc = pd.DataFrame(X12_1_scaler)
X12_1_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12 = pd.concat([y12_1, X12_1_sc], axis = 1)
X12

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2019-12-01 00:00:00,1.0,0.274173,0.944483,0.0,0.0,0.0,0.000000
1,2019-12-01 01:00:00,1.0,0.166196,0.964598,0.0,0.0,0.0,0.043478
2,2019-12-01 02:00:00,1.0,0.115671,0.968391,0.0,0.0,0.0,0.086957
3,2019-12-01 03:00:00,1.0,0.086689,0.988506,0.0,0.0,0.0,0.130435
4,2019-12-01 04:00:00,1.0,0.064375,0.976092,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
1483,2020-12-31 19:00:00,0.5,0.730700,0.189655,1.0,0.0,1.0,0.826087
1484,2020-12-31 20:00:00,0.5,0.682739,0.603448,1.0,0.0,1.0,0.869565
1485,2020-12-31 21:00:00,0.5,0.710952,0.677241,1.0,0.0,1.0,0.913043
1486,2020-12-31 22:00:00,0.5,0.467299,0.751954,1.0,0.0,1.0,0.956522


In [263]:
X12_test = test_dec.drop(columns = ['date', 
                               'time', 
                               'branch_name',
                               'district_name',
                               'branch_num',
                               'dep_point',
                               'arr_point',
                               'lane',
                               'distance'])

In [264]:
y12_test = test_dec[['datetime', 'classification']]
X12_1_test = X12_test.drop(columns = ['datetime', 'classification'])
y12_1_test = X12_test.datetime

In [265]:
X12_1_test_scaler = scaler.fit_transform(X12_1_test)

In [266]:
X12_1_test_sc = pd.DataFrame(X12_1_test_scaler)
X12_1_test_sc.columns = ['dow', 'volume', 'speed', 'year', 'month', 'day', 'hour']
X12_test = pd.concat([y12_1_test, X12_1_test_sc], axis = 1)
X12_test

Unnamed: 0,datetime,dow,volume,speed,year,month,day,hour
0,2021-12-01 00:00:00,0.333333,0.243748,0.854942,0.0,0.0,0.0,0.000000
1,2021-12-01 01:00:00,0.333333,0.151092,0.826757,0.0,0.0,0.0,0.043478
2,2021-12-01 02:00:00,0.333333,0.092656,0.855944,0.0,0.0,0.0,0.086957
3,2021-12-01 03:00:00,0.333333,0.102132,0.842415,0.0,0.0,0.0,0.130435
4,2021-12-01 04:00:00,0.333333,0.131877,0.853940,0.0,0.0,0.0,0.173913
...,...,...,...,...,...,...,...,...
739,2021-12-31 19:00:00,0.666667,0.842327,0.187899,0.0,0.0,1.0,0.826087
740,2021-12-31 20:00:00,0.666667,0.803896,0.638858,0.0,0.0,1.0,0.869565
741,2021-12-31 21:00:00,0.666667,0.800737,0.719278,0.0,0.0,1.0,0.913043
742,2021-12-31 22:00:00,0.666667,0.605686,0.800701,0.0,0.0,1.0,0.956522


## LightGBM

In [267]:
X12 = X12.drop(columns = ['datetime'])
y12 = y12.drop(columns = ['datetime'])
X12_test = X12_test.drop(columns = ['datetime'])
y12_test = y12_test.drop(columns = ['datetime'])

In [268]:
study12 = optuna.create_study(direction='maximize',sampler=TPESampler())
study12.optimize(lambda trial : objectiveLGBM(trial, X12, y12), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study12.best_trial.value,study12.best_trial.params))

[32m[I 2022-12-01 03:29:55,092][0m A new study created in memory with name: no-name-bd114f1f-3980-44d9-8416-1986171880ce[0m
[32m[I 2022-12-01 03:29:57,749][0m Trial 0 finished with value: 0.9366391184573003 and parameters: {'num_leaves': 463, 'n_estimators': 2475, 'feature_fraction': 0.9047059231327088, 'bagging_fraction': 0.721277624316727, 'bagging_freq': 5, 'min_child_samples': 95}. Best is trial 0 with value: 0.9366391184573003.[0m
[32m[I 2022-12-01 03:30:02,442][0m Trial 1 finished with value: 0.9786566227244192 and parameters: {'num_leaves': 206, 'n_estimators': 2658, 'feature_fraction': 0.40496172428935456, 'bagging_fraction': 0.5621311167403866, 'bagging_freq': 2, 'min_child_samples': 23}. Best is trial 1 with value: 0.9786566227244192.[0m
[32m[I 2022-12-01 03:30:05,382][0m Trial 2 finished with value: 0.9331874087971649 and parameters: {'num_leaves': 488, 'n_estimators': 2547, 'feature_fraction': 0.5124566710183155, 'bagging_fraction': 0.8640723932705754, 'bagging_f

Best trial: score 1.0,
params {'num_leaves': 66, 'n_estimators': 2163, 'feature_fraction': 0.949509779154371, 'bagging_fraction': 0.43466507367735363, 'bagging_freq': 3, 'min_child_samples': 44}


In [269]:
optuna.visualization.plot_param_importances(study12) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study12) # 최적화 과정 시각화

In [270]:
X12_train, X12_val, y12_train, y12_val = train_test_split(X12, y12, test_size = 0.2, random_state = 42)

In [271]:
X12_train.shape, X12_val.shape, y12_train.shape, y12_val.shape

((1190, 7), (298, 7), (1190, 1), (298, 1))

In [272]:
model = LGBMClassifier(**study12.best_trial.params)

In [273]:
model12 = model.fit(X12_train, y12_train,
          eval_set = [(X12_train, y12_train)],
          verbose = True, eval_metric = "multi_logloss", early_stopping_rounds = 100)

[1]	training's multi_logloss: 0.781236	training's multi_logloss: 0.781236
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_logloss: 0.690829	training's multi_logloss: 0.690829
[3]	training's multi_logloss: 0.603898	training's multi_logloss: 0.603898
[4]	training's multi_logloss: 0.53294	training's multi_logloss: 0.53294
[5]	training's multi_logloss: 0.473508	training's multi_logloss: 0.473508
[6]	training's multi_logloss: 0.426825	training's multi_logloss: 0.426825
[7]	training's multi_logloss: 0.388108	training's multi_logloss: 0.388108
[8]	training's multi_logloss: 0.354293	training's multi_logloss: 0.354293
[9]	training's multi_logloss: 0.318962	training's multi_logloss: 0.318962
[10]	training's multi_logloss: 0.287902	training's multi_logloss: 0.287902
[11]	training's multi_logloss: 0.260588	training's multi_logloss: 0.260588
[12]	training's multi_logloss: 0.241005	training's multi_logloss: 0.241005
[13]	training's multi_logloss: 0.218935	training

In [274]:
train12_preds = model12.predict(X12_train)
val12_preds = model12.predict(X12_val)

In [275]:
def get_clf_eval(y_act, y_pred):
  precision = precision_score(y_act, y_pred, average= "macro")
  recall = recall_score(y_act, y_pred, average= "macro")
  print('정밀도: {:.4f}'.format(precision))
  print('재현율: {:.4f}'.format(recall))

In [276]:
get_clf_eval(y12_train, train12_preds)
get_clf_eval(y12_val, val12_preds)

정밀도: 1.0000
재현율: 1.0000
정밀도: 1.0000
재현율: 1.0000


In [277]:
preds_12= model12.predict(X12_test)
preds_12

array([1., 1., 1., 1., 1., 1., 2., 2., 1., 1., 2., 2., 2., 2., 2., 2., 2.,
       2., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 1., 2.,
       3., 2., 2., 1., 2., 2., 2., 2., 3., 3., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 1., 1., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3.,
       2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3., 2.,
       2., 2., 2., 2., 3., 3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 2., 2., 1., 1., 2., 2.,
       2., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 1., 2., 2., 2., 2., 3., 3., 2., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 3.,
       3., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 1., 1., 2., 2.,
       1., 2., 2., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 1., 1.

In [278]:
test_dec['classification'] = preds_12
test_dec

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-12-01,3,0:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,1122,81.75,1.0,2021-12-01 00:00:00,2021,12,1,0
1,2021-12-01,3,1:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,770,79.50,1.0,2021-12-01 01:00:00,2021,12,1,1
2,2021-12-01,3,2:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,548,81.83,1.0,2021-12-01 02:00:00,2021,12,1,2
3,2021-12-01,3,3:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,584,80.75,1.0,2021-12-01 03:00:00,2021,12,1,3
4,2021-12-01,3,4:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,697,81.67,1.0,2021-12-01 04:00:00,2021,12,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2021-12-31,5,19:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3396,28.50,3.0,2021-12-31 19:00:00,2021,12,31,19
740,2021-12-31,5,20:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3250,64.50,2.0,2021-12-31 20:00:00,2021,12,31,20
741,2021-12-31,5,21:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3238,70.92,2.0,2021-12-31 21:00:00,2021,12,31,21
742,2021-12-31,5,22:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,2497,77.42,1.0,2021-12-31 22:00:00,2021,12,31,22


# 월별 데이터 합치기

In [279]:
result = pd.concat([test_jan,
                    test_feb,
                    test_mar,
                    test_apr,
                    test_may,
                    test_jun,
                    test_jul,
                    test_aug,
                    test_sep,
                    test_oct,
                    test_nov,
                    test_dec])
result = result.sort_values(by = 'datetime')
result = result.reset_index(drop = True)
result

Unnamed: 0,date,dow,time,branch_name,district_name,branch_num,dep_point,arr_point,lane,distance,volume,speed,classification,datetime,year,month,day,hour
0,2021-10-01,5,0:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,1045,86.83,1.0,2021-10-01 00:00:00,2021,10,1,0
1,2021-10-01,5,1:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,743,86.17,1.0,2021-10-01 01:00:00,2021,10,1,1
2,2021-10-01,5,2:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,549,85.92,1.0,2021-10-01 02:00:00,2021,10,1,2
3,2021-10-01,5,3:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,539,85.75,1.0,2021-10-01 03:00:00,2021,10,1,3
4,2021-10-01,5,4:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,713,86.67,1.0,2021-10-01 04:00:00,2021,10,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2022-09-30,5,19:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3208,23.58,3.0,2022-09-30 19:00:00,2022,9,30,19
8756,2022-09-30,5,20:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3694,64.67,2.0,2022-09-30 20:00:00,2022,9,30,20
8757,2022-09-30,5,21:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3504,66.25,2.0,2022-09-30 21:00:00,2022,9,30,21
8758,2022-09-30,5,22:00:00,청담대교,강남구,C-17,청담대교남단,청담대교북단,3,1138,3506,69.25,2.0,2022-09-30 22:00:00,2022,9,30,22


In [280]:
result = result.drop(columns = ['dow', 
                                'district_name', 
                                'branch_num', 
                                'arr_point', 
                                'lane', 
                                'distance', 
                                'volume', 
                                'speed', 
                                'datetime', 
                                'year', 
                                'month', 
                                'day', 
                                'hour'])
result

Unnamed: 0,date,time,branch_name,dep_point,classification
0,2021-10-01,0:00:00,청담대교,청담대교남단,1.0
1,2021-10-01,1:00:00,청담대교,청담대교남단,1.0
2,2021-10-01,2:00:00,청담대교,청담대교남단,1.0
3,2021-10-01,3:00:00,청담대교,청담대교남단,1.0
4,2021-10-01,4:00:00,청담대교,청담대교남단,1.0
...,...,...,...,...,...
8755,2022-09-30,19:00:00,청담대교,청담대교남단,3.0
8756,2022-09-30,20:00:00,청담대교,청담대교남단,2.0
8757,2022-09-30,21:00:00,청담대교,청담대교남단,2.0
8758,2022-09-30,22:00:00,청담대교,청담대교남단,2.0


# csv 파일 만들기

In [281]:
result.to_csv('chungdam_depsouth_result.csv', index = False)