In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import warnings                                      
warnings.filterwarnings('ignore')

import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw records from "input.csv" (relative path) and preview the first rows.
df = pd.read_csv("input.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,수집시분초,평균속도,교통량,콘존ID,차로번호
0,0,0,90.0,3.0,0010CZE010,1
1,1,0,82.0,3.0,0010CZE010,2
2,2,0,58.0,1.0,0010CZE010,3
3,6,0,104.0,1.0,0010CZE011,1
4,8,0,61.0,1.0,0010CZE011,3


In [3]:
# Keep only rows whose 콘존ID contains the substring '0010';
# rows with a missing ID are excluded (na = False).
df = df[df['콘존ID'].str.contains('0010', na = False)]

In [4]:
# Label each row with a congestion level derived from the average speed (평균속도):
#   2 -> speed <= 40
#   1 -> 40 < speed <= 80
#   0 -> everything else (speed > 80, and also NaN, since every comparison is False)
# np.select evaluates the conditions in order and takes the first match, which
# mirrors an if/elif/else chain but runs vectorized instead of row by row.
congestion = np.select(
    [df['평균속도'] <= 40, df['평균속도'] <= 80],
    [2, 1],
    default = 0,
).astype('int64')  # pin the dtype so it does not depend on the platform's default int

df['congestion'] = congestion
df.head()

Unnamed: 0.1,Unnamed: 0,수집시분초,평균속도,교통량,콘존ID,차로번호,congestion
0,0,0,90.0,3.0,0010CZE010,1,0
1,1,0,82.0,3.0,0010CZE010,2,0
2,2,0,58.0,1.0,0010CZE010,3,1
3,6,0,104.0,1.0,0010CZE011,1,0
4,8,0,61.0,1.0,0010CZE011,3,1


In [5]:
# Keep only the columns used downstream (this drops the leftover "Unnamed" index
# columns from the CSV), then display the resulting frame.
df = df[['수집시분초', '평균속도', '교통량', '콘존ID', '차로번호', 'congestion']]
df

Unnamed: 0,수집시분초,평균속도,교통량,콘존ID,차로번호,congestion
0,0,90.0,3.0,0010CZE010,1,0
1,0,82.0,3.0,0010CZE010,2,0
2,0,58.0,1.0,0010CZE010,3,1
3,0,104.0,1.0,0010CZE011,1,0
4,0,61.0,1.0,0010CZE011,3,1
...,...,...,...,...,...,...
103408193,235930,70.0,4.0,0010CZS280,2,1
103408194,235930,62.0,2.0,0010CZS280,3,1
103408195,235930,91.0,5.0,0010CZS380,2,0
103408196,235930,21.0,2.0,0010CZS380,3,2


In [6]:
# Count missing values per column.
df.isna().sum()

수집시분초         0
평균속도          0
교통량           0
콘존ID          0
차로번호          0
congestion    0
dtype: int64

In [7]:
# Remove duplicate rows (currently disabled: the lines below are commented out)
#df = df.drop_duplicates()
#df

In [None]:
def time_elim(df, interval_min = 5):
    """Return the rows of ``df`` sampled exactly on an ``interval_min``-minute mark.

    The ``수집시분초`` column is read as an integer HHMMSS timestamp (seconds are
    ``value % 100``, minutes are ``(value // 100) % 100``). A row is kept only
    when its seconds are 00 and its minutes are a multiple of ``interval_min``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``수집시분초`` column. It is not modified.
    interval_min : int, default 5
        Sampling interval in minutes (e.g. 5 or 15). Must be positive.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows on the requested interval. The caller
        must use the return value; the input frame is left untouched.

    Raises
    ------
    ValueError
        If ``interval_min`` is not positive.
    """
    if interval_min <= 0:
        raise ValueError("interval_min must be a positive number of minutes")

    timestamp = df['수집시분초']
    seconds = timestamp % 100
    minutes = (timestamp // 100) % 100

    # Keep rows at SS == 00 whose minute falls on the interval. Modulo is the
    # right test here; dividing and comparing to 0 only ever matches value 0.
    on_interval = (seconds == 0) & ((minutes % interval_min) == 0)
    return df.loc[on_interval]

# NOTE(review): the result of this call is discarded and `df` is not reassigned,
# so the frame displayed below is not reduced by time_elim. Confirm whether the
# filtered frame was meant to replace `df` before the encoding step.
time_elim(df)
#time_elim(df_test)

df

In [9]:
# Encoding step: turn the identifier-like columns into integer codes with
# LabelEncoder and leave the numeric columns and the target as they are.
encoding_columns = ['수집시분초', '콘존ID', '차로번호']
not_encoding_columns = ['교통량', '평균속도', 'congestion']

# Column name -> array of original class values, filled in by encoding_label
# so the integer codes can be mapped back to the original values later.
enc_classes = {}

def encoding_label(x):
    """Label-encode one column and remember its classes.

    Fits a fresh LabelEncoder on the Series ``x``, stores the fitted
    ``classes_`` in the module-level ``enc_classes`` dict under ``x.name``,
    and returns the integer codes for ``x``.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(x)
    enc_classes[x.name] = encoder.classes_
    return codes

# Encoded columns first, untouched columns after; both share df's index.
d1 = df[encoding_columns].apply(encoding_label)
d2 = df[not_encoding_columns]

# Modelling table for the traffic-congestion task.
data = d1.join(d2)
data


Unnamed: 0,수집시분초,콘존ID,차로번호,교통량,평균속도,congestion
0,0,0,0,3.0,90.0,0
1,0,0,1,3.0,82.0,0
2,0,0,2,1.0,58.0,1
3,0,1,0,1.0,104.0,0
4,0,1,2,1.0,61.0,1
...,...,...,...,...,...,...
103408193,16666,86,1,4.0,70.0,1
103408194,16666,86,2,2.0,62.0,1
103408195,16666,98,1,5.0,91.0,0
103408196,16666,98,2,2.0,21.0,2


In [10]:
# Build the feature matrix / target vector and hold out a stratified test set.
# No scaler is applied; a disabled RobustScaler experiment is kept below as comments.
from sklearn.model_selection import train_test_split

y = data['congestion']
# Features including the average speed (the label is derived from that column).
X_velocity = data.drop(columns = ['congestion'])
# Features actually used for training: the same frame without the average speed.
X = X_velocity.drop(columns = ['평균속도'])
# Leftover sanity check; not displayed because it is not the cell's last expression.
type(y), type(X)

# Disabled scaling experiment:
# from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
#
# sc = RobustScaler()
# X = sc.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    stratify = y,
    random_state = 42,
)
X_train, y_train


(           수집시분초  콘존ID  차로번호   교통량
 47747641   16177    88     0   1.0
 60874445    8443    74     3   5.0
 11825878     792    89     3   1.0
 4911931     5540   106     2  12.0
 100928181  14603    51     3   7.0
 ...          ...   ...   ...   ...
 67191215    1473    46     2   3.0
 25798064     233    46     2   2.0
 40370646    7801    49     1  16.0
 77073131   15790    42     1   7.0
 89663138   10635     9     0   5.0
 
 [15100995 rows x 4 columns],
 47747641     0
 60874445     1
 11825878     0
 4911931      2
 100928181    1
             ..
 67191215     0
 25798064     0
 40370646     1
 77073131     0
 89663138     0
 Name: congestion, Length: 15100995, dtype: int64)

In [11]:
# Oversample the training split with BorderlineSMOTE; X_train / y_train are
# overwritten with the resampled data. The test split is not touched here.
# NOTE(review): three of the four feature columns (수집시분초, 콘존ID, 차로번호) are
# label-encoded integer codes. SMOTE-family samplers create synthetic rows by
# numeric interpolation between neighbours, so synthetic rows may carry
# in-between code values. Confirm this is acceptable, or whether a
# categorical-aware sampler (e.g. SMOTENC) was intended.
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE

sm = BorderlineSMOTE(random_state = 1)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [12]:
# Hyper-parameter tuning with GridSearchCV.
# The search is disabled by wrapping it in a triple-quoted string; nothing in it
# runs. Because the string is the cell's last expression, the notebook echoes it
# as the cell output.
'''
params = {'max_depth' : [6, 8, 10, 12],
          'min_samples_leaf' : [8, 12, 18],
          'min_samples_split' : [8, 16, 20]
         }
classifier = RandomForestClassifier(random_state = 0, n_jobs = -1)
grid_cv = GridSearchCV(classifier, param_grid = params, cv = 3, n_jobs = -1)
grid_cv.fit(X_train, y_train)

print("최적 하이퍼 파라미터: ", grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))
'''

'\nparams = {\'max_depth\' : [6, 8, 10, 12],\n          \'min_samples_leaf\' : [8, 12, 18],\n          \'min_samples_split\' : [8, 16, 20]\n         }\nclassifier = RandomForestClassifier(random_state = 0, n_jobs = -1)\ngrid_cv = GridSearchCV(classifier, param_grid = params, cv = 3, n_jobs = -1)\ngrid_cv.fit(X_train, y_train)\n\nprint("최적 하이퍼 파라미터: ", grid_cv.best_params_)\nprint(\'최고 예측 정확도: {:.4f}\'.format(grid_cv.best_score_))\n'

In [None]:
# Fit the random forest on the current training split and report plain accuracy
# on both the training and the held-out test split.
# random_state pins the bootstrap / feature sampling so repeated runs give the
# same model, and n_jobs = -1 uses all cores; both match the settings of the
# (disabled) grid-search classifier in this notebook.
classifier = RandomForestClassifier(
    max_depth = 12,
    min_samples_leaf = 12,
    min_samples_split = 8,
    n_estimators = 100,
    random_state = 0,
    n_jobs = -1,
)

classifier.fit(X_train, y_train)

pred_train = classifier.predict(X_train)
pred_test = classifier.predict(X_test)
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)

print(f'학습: {acc_train}, 테스트: {acc_test}')

In [None]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

def print_score(classifier, X_train, y_train, X_test, y_test, train = True):
    """Print accuracy, classification report and confusion matrix for one split.

    Parameters
    ----------
    classifier
        A fitted estimator exposing ``predict``.
    X_train, y_train
        Training features and labels, used when ``train`` is truthy.
    X_test, y_test
        Test features and labels, used when ``train`` is falsy.
    train : bool, default True
        Selects which split is evaluated and which header is printed.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    if train:
        header, features, target = "Training results:\n", X_train, y_train
    else:
        header, features, target = "Test results:\n", X_test, y_test

    # Predict once and reuse the result for all three metrics; predicting
    # separately for each metric repeats the same work on the same data.
    predicted = classifier.predict(features)

    print(header)
    print("Accuracy Score: {0:.4f}\n".format(accuracy_score(target, predicted)))
    print("Classification Report:\n{}\n".format(classification_report(target, predicted)))
    print("Confusion Matrix:\n{}\n".format(confusion_matrix(target, predicted)))

    # Disabled: 10-fold cross-validation on the training split.
    #res = cross_val_score(classifier, X_train, y_train, cv = 10, n_jobs = -1, scoring = 'accuracy')
    #print("Average Accuracy:\t{0:.4f}\n".format(res.mean()))
    #print("Standard Deviation:\t{0:.4f}\n".format(res.std()))

In [None]:
# Report metrics on the training split (train = True).
print_score(classifier, X_train, y_train, X_test, y_test, train = True)

In [None]:
# Report metrics on the held-out test split (train = False).
print_score(classifier, X_train, y_train, X_test, y_test, train = False)