# 교통 데이터 VDS 
- 머신러닝 모델
- 딥러닝 모델 사용

## 1. 라이브러리 설정하기

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
def plot_ml_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 20)):   
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

    plt.legend(loc='lower right')
    return plt

## 2. 데이터 가져오기 
- 판다스를 이용하여 iris 데이터 가져오기

In [3]:
import pandas as pd

In [4]:
from pandas import datetime

In [5]:
def parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M')

In [6]:
df = pd.read_csv('./daejeon_vds16.csv',  date_parser=parser)

In [7]:
df.head()

Unnamed: 0,Date,ToVol,SmVol,MeVol,LaVol,Speed,Occ.Rate
0,2017-04-02 0:00,43,34,9,0,50.3,1.9
1,2017-04-02 0:05,45,32,13,0,58.9,1.84
2,2017-04-02 0:10,46,34,12,0,50.6,1.87
3,2017-04-02 0:15,45,36,9,0,50.9,1.72
4,2017-04-02 0:20,27,13,13,1,62.2,1.12


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8064 entries, 0 to 8063
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      8064 non-null   object 
 1   ToVol     8064 non-null   int64  
 2   SmVol     8064 non-null   int64  
 3   MeVol     8064 non-null   int64  
 4   LaVol     8064 non-null   int64  
 5   Speed     8064 non-null   float64
 6   Occ.Rate  8064 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 441.1+ KB


In [9]:
df.describe()

Unnamed: 0,ToVol,SmVol,MeVol,LaVol,Speed,Occ.Rate
count,8064.0,8064.0,8064.0,8064.0,8064.0,8064.0
mean,110.459945,79.353299,29.948537,1.15811,49.327431,6.166941
std,63.954451,46.802106,19.081136,1.530192,7.921856,6.739946
min,6.0,2.0,0.0,0.0,9.1,0.23
25%,50.0,35.0,13.0,0.0,44.9,2.14
50%,122.0,87.0,29.0,1.0,48.5,5.55
75%,155.0,111.0,44.0,2.0,54.2,7.29
max,338.0,250.0,145.0,16.0,87.8,82.1


In [10]:
maxs = df.max()
print(maxs)

Date        2017-04-29 9:55
ToVol                   338
SmVol                   250
MeVol                   145
LaVol                    16
Speed                  87.8
Occ.Rate               82.1
dtype: object


In [11]:
df.tail()

Unnamed: 0,Date,ToVol,SmVol,MeVol,LaVol,Speed,Occ.Rate
8059,2017-04-29 23:35,45,35,10,0,51.3,2.01
8060,2017-04-29 23:40,47,33,14,0,53.9,1.89
8061,2017-04-29 23:45,32,28,4,0,50.6,1.36
8062,2017-04-29 23:50,31,21,10,0,59.3,1.4
8063,2017-04-29 23:55,39,33,6,0,52.5,1.74


## 라벨을 정하기
### 속도 Speed를 라벨로 정하자

In [12]:
def get_score(v):
    if v < 20:
        score = 'Jam'
    elif v < 40:
        score = 'Slow'
    else :
        score = 'Normal'
    return score

In [13]:
df["label"] = df["Speed"].apply(lambda v: get_score(v))
df

Unnamed: 0,Date,ToVol,SmVol,MeVol,LaVol,Speed,Occ.Rate,label
0,2017-04-02 0:00,43,34,9,0,50.3,1.90,Normal
1,2017-04-02 0:05,45,32,13,0,58.9,1.84,Normal
2,2017-04-02 0:10,46,34,12,0,50.6,1.87,Normal
3,2017-04-02 0:15,45,36,9,0,50.9,1.72,Normal
4,2017-04-02 0:20,27,13,13,1,62.2,1.12,Normal
...,...,...,...,...,...,...,...,...
8059,2017-04-29 23:35,45,35,10,0,51.3,2.01,Normal
8060,2017-04-29 23:40,47,33,14,0,53.9,1.89,Normal
8061,2017-04-29 23:45,32,28,4,0,50.6,1.36,Normal
8062,2017-04-29 23:50,31,21,10,0,59.3,1.40,Normal


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8064 entries, 0 to 8063
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      8064 non-null   object 
 1   ToVol     8064 non-null   int64  
 2   SmVol     8064 non-null   int64  
 3   MeVol     8064 non-null   int64  
 4   LaVol     8064 non-null   int64  
 5   Speed     8064 non-null   float64
 6   Occ.Rate  8064 non-null   float64
 7   label     8064 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 504.1+ KB


In [15]:
df.head()

Unnamed: 0,Date,ToVol,SmVol,MeVol,LaVol,Speed,Occ.Rate,label
0,2017-04-02 0:00,43,34,9,0,50.3,1.9,Normal
1,2017-04-02 0:05,45,32,13,0,58.9,1.84,Normal
2,2017-04-02 0:10,46,34,12,0,50.6,1.87,Normal
3,2017-04-02 0:15,45,36,9,0,50.9,1.72,Normal
4,2017-04-02 0:20,27,13,13,1,62.2,1.12,Normal


** 라벨을 위해 'label'의 텍스트는 범주형 혹은 숫자로 인코딩 해야한다

## 3. 데이터 가시화 

In [16]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("classic")

In [None]:
import seaborn as sns
plt.figure(figsize=(14,8))
sns.scatterplot(data=df, x = 'Date', y = 'Speed', hue='label', style='label')

<matplotlib.axes._subplots.AxesSubplot at 0x19fa940fbc8>

In [None]:
df.head()

In [None]:
# df.set_index('Date', inplace=True)

In [None]:
plt.figure(figsize=(14,6))
sns.scatterplot(data=df, x = 'Date', y = 'ToVol', hue='label', style='label')

In [None]:
df.hist(edgecolor='black', linewidth=1.2)
fig=plt.gcf()
fig.set_size_inches(16,8)
plt.show()

In [None]:
plt.figure(figsize=(15,10))

plt.subplot(2,2,1)
sns.violinplot(x='label',y='ToVol',data=df)
plt.subplot(2,2,2)
sns.violinplot(x='label',y='Speed',data=df)
plt.subplot(2,2,3)
sns.violinplot(x='label',y='Occ.Rate',data=df)
plt.subplot(2,2,4)
sns.violinplot(x='label',y='LaVol',data=df)

In [None]:
sns.pairplot(df,hue='label')

In [None]:
plt.figure(figsize=(12,6)) 
sns.heatmap(df.corr(),annot=True,cmap='YlGnBu')
plt.show()

## 4 머신러닝을 위한 입력 데이터 만들자

### 1) 입력 X와 출력 y의 값을 정하기

In [None]:
df['label'].unique()

In [None]:
#feature_cols = ['ToVol', 'SmVol', 'Speed', 'Occ.Rate']
#feature_cols = ['ToVol', 'SmVol', 'LaVol', 'MeVol']
feature_cols = ['ToVol','Occ.Rate']
target_col = 'label'
X = df[feature_cols]
y = df[target_col]

X = df.drop(['Species','Id'], axis=1)

In [None]:
X.head()

y = df['Species']

In [None]:
y.head()

### 2) 출력용 라벨을 머신러닝 
#### 텍스트를 숫자로 바꾸자 

In [None]:
class_dic = {'Jam':0, 'Slow':1, 'Normal':2}
y_ohc = y.apply(lambda z: class_dic[z])

In [None]:
y_ohc.head()

### 3) 데이터를 훈련과 테스트로 나누자 
- (실전) 데이터를 validation을 포함해서 나눌수 있다.
- (해보기) 전체 데이터를 train : validation : test = 0.6: 0.2: 0.2 로 나누어라

In [None]:
from sklearn.model_selection import train_test_split,  ShuffleSplit, learning_curve
#from sklearn.model_selection import learning_curve, train_test_split, KFold,  ShuffleSplit

X_train, X_test, y_train, y_test = train_test_split(X, y_ohc, test_size=0.20, random_state=30)

In [None]:
print(X_train.shape, y_train.shape)
print(X_test.shape,  y_test.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 5. 머신러닝 모델을 만들자

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm 
from sklearn.neighbors import KNeighborsClassifier       

In [None]:
#from sklearn.model_selection import ShuffleSplit

### 0) 머신러닝 훈련과정을 그려보자

### 1) 로지스틱 회귀

In [None]:
from sklearn.linear_model import LogisticRegression  
from sklearn import metrics   

In [None]:
m_lr = LogisticRegression()
m_lr.fit(X_train,y_train)

In [None]:
pred = m_lr.predict(X_test)

acc_lr = metrics.accuracy_score(pred,y_test)
print('The accuracy of the Logistic Regression is', acc_lr)

In [None]:
title = "Learning Curves (Logistic Regression)"
cv = ShuffleSplit(n_splits=4, test_size=0.2, random_state=0)

In [None]:
#import myUtil as myutil
plot_ml_curve(m_lr, title, X, y, ylim=(0.6, 1.01), cv=cv)

### 2 Support Vector Machine (SVM)

In [None]:
sv = svm.SVC() 
sv.fit(X_train,y_train) 
pred = sv.predict(X_test)
acc_svm = metrics.accuracy_score(pred,y_test)
print('The accuracy of the SVM is:', acc_svm)

In [None]:
%%time
title = "SVM Classification"
cv = ShuffleSplit(n_splits=4, test_size=0.2, random_state=0)
plot_ml_curve(sv, title, X, y, ylim=(0.6, 1.01), cv=cv )

### 3 K-Nearest Neighbours

In [None]:
knc = KNeighborsClassifier(n_neighbors=6)
knc.fit(X_train,y_train)
pred = knc.predict(X_test)
acc_knn = metrics.accuracy_score(pred,y_test)
print('The accuracy of the KNN is', acc_knn)

In [None]:
title = "K-Nearest Neighbours (KNN)"
cv = ShuffleSplit(n_splits=4, test_size=0.2, random_state=0)

plot_ml_curve(knc, title, X, y, ylim=(0.6, 1.01), cv=cv, n_jobs=4)

#### 3.1 최적의 k-NN의 정확도 확인 

In [None]:
a_index = list(range(1,11))
a = pd.Series()
x = [1,2,3,4,5,6,7,8,9,10]
for i in list(range(1,11)):
    kcs = KNeighborsClassifier(n_neighbors=i) 
    kcs.fit(X_train,y_train)
    y_pred = kcs.predict(X_test)
    a=a.append(pd.Series(
        metrics.accuracy_score(y_pred,y_test)))
    
plt.plot(a_index, a)
plt.xticks(x)

### 4 램덤포레스트 (RandomForestClassifier)

In [None]:
m_rf = RandomForestClassifier(n_estimators=100, max_depth = 3)

In [None]:
m_rf.fit(X_train, y_train)

In [None]:
pred = m_rf.predict(X_test)

In [None]:
acc_rf = metrics.accuracy_score(pred,y_test)
print('The accuracy of the RFC is:', acc_rf)

In [None]:
title = "Random Forest Classifier"
cv = ShuffleSplit(n_splits=4, test_size=0.2, random_state=0)
plot_ml_curve(m_rf, title, X, y, ylim=(0.6, 1.01), cv=cv)

### 5 의사결정나무 (DecisionTreeClassifier)

In [None]:
m_tree = DecisionTreeClassifier()

m_tree.fit(X_train, y_train)

In [None]:
prd = m_tree.predict(X_test)

In [None]:
acc_dt = metrics.accuracy_score(pred,y_test)
print('The accuracy of the Decision Tree is:', acc_dt)

In [None]:
title = "Decision Tree Classifier"
cv = ShuffleSplit(n_splits=4, test_size=0.2, random_state=0)

plot_ml_curve(m_tree, title, X, y, ylim=(0.6, 1.01), cv=cv)

### 6. 딥러닝 

- 데이터의 범위를 [0,1] 사이로 스케일 
- 사이킷런의 MinMaxScaler()

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_ohc, test_size=0.2,random_state=33)

In [None]:
X_train.shape

In [None]:
num_features = len(X_train[1]) 
num_features

In [None]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import Sequential, optimizers
from tensorflow.keras.layers import Flatten, Dense, Softmax

In [None]:

model = keras.Sequential([
    keras.layers.Dense(64, activation = 'relu', input_shape=[num_features]),
    keras.layers.Dense(64, activation = 'relu'),
    keras.layers.Dense(32, activation = 'relu'),
    keras.layers.Dense(3, activation = 'softmax')
])

model.compile(loss='SparseCategoricalCrossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs = 100, 
                    validation_data=(X_test, y_test), batch_size = 128, verbose=2)            

In [None]:
model.evaluate(X_test, y_test)

In [None]:
history.history.keys()

In [None]:
acc_dnn = history.history['accuracy'][np.argmin(history.history['loss'])]
print('The accuracy of the Deep Learning is:', acc_dnn)

In [None]:
import myutil_plot
myutil_plot.plot_history(history)

In [None]:
models = pd.DataFrame({
    'Model': ['Logistic Regression',  'Support Vector Machines','RandomForest',
              'K-Nearest Neighbours', 'Decision Tree','Deep Learning'],
    'Score': [acc_lr, acc_svm, acc_rf, acc_knn, acc_dt, acc_dnn]})
models.sort_values(by='Score', ascending=False)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

fig, loss_ax = plt.subplots()

acc_ax = loss_ax.twinx()

loss_ax.plot(history.history['loss'],'y',label='train loss')
loss_ax.plot(history.history['val_loss'],'r',label='val loss')
acc_ax.plot(history.history['accuracy'],'b',label='train acc')
acc_ax.plot(history.history['val_accuracy'],'g',label='val acc')

loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
acc_ax.set_ylabel('accuracy')

loss_ax.legend(loc='center right')
acc_ax.legend(loc='center')
plt.show()