### Import Common Modules

In [None]:
import import_ipynb

import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import matplotlib as mpl
from matplotlib import font_manager, rc
# Take the family name from the font file at this (Windows-specific) path and make it
# the default plot font -- presumably so Korean labels render; confirm.
font_name = font_manager.FontProperties(
    fname="c:/Windows/Fonts/malgun.ttf"
).get_name()
mpl.rcParams['font.family'] = font_name
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False

# Widen the notebook display limits so large frames are shown in full.
# The canonical key 'display.max_rows' is used instead of the abbreviated
# 'display.max_row', which only resolved through pandas' pattern-matching shorthand
# and would break if another option with a similar name were ever added.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from traffic_common import get_category_age, get_category_season, get_category_time, drop_features, cleansing, \
                            encode_features, conv2XYarr, transform_dataframe, bar_chart, pie_chart

### Load Dataset and Pre-processing

In [None]:
# Load the raw traffic CSV, decoding it explicitly as EUC-KR.
traffic_df = pd.read_csv(
    'dataset/seoul_traffic.csv',
    encoding='euc-kr',
)

In [None]:
# Run the shared preprocessing helper imported from traffic_common on the raw frame.
# NOTE(review): what transform_dataframe actually does is not visible in this notebook
# -- presumably cleansing / feature preparation; confirm against traffic_common.
traffic_df = transform_dataframe(traffic_df)

In [None]:
# Column 0 becomes the target; every remaining column becomes a feature.
Y, X = traffic_df.iloc[:, 0], traffic_df.iloc[:, 1:]

In [None]:
# One-hot encode the target and keep only its second indicator column.
Y = pd.get_dummies(Y).iloc[:, 1]
# One-hot encode the feature columns.
X = pd.get_dummies(X)

In [None]:
# Replace both indexes with a fresh 0..n-1 range (the old index is discarded) so that
# features and target line up by position.
X = X.reset_index(drop=True)
Y = Y.reset_index(drop=True)

### DBSCAN

In [None]:
# Fit a 3-nearest-neighbour index on the features and query it with the same points.
nbrs = NearestNeighbors(n_neighbors=3, n_jobs=-1).fit(X)
neigh = nbrs  # fit() returns the estimator itself, so both names refer to one object
distances, indices = nbrs.kneighbors(X)

In [None]:
# Sort every column of the neighbour-distance matrix independently (ascending), keep
# column 1 and plot it as a sorted-distance curve.
distances = np.sort(distances, axis=0)[:, 1]
plt.plot(distances)

In [None]:
# Candidate DBSCAN hyper-parameters consumed by the clustering loop in the next cell:
# eps values and min_samples values respectively.
eps_arr, min_samples_arr = [1.5], [30]

In [None]:
# Run DBSCAN for every (eps, min_samples) combination and print, for each target
# class, how many samples fall into each cluster (-1 is DBSCAN's noise label).
#
# Fixes relative to the previous version of this cell:
#   * the outer loop iterated over an undefined name (`metric`) -> NameError;
#     it now iterates over eps_arr;
#   * eps was always eps_arr[0], so extra eps candidates were never tried.
#
# After the loop, `dbscan` / `dbscan_labels` hold the result of the LAST combination;
# later cells read `dbscan_labels`.
for i, eps in enumerate(eps_arr):
    for j, min_samples in enumerate(min_samples_arr):
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1)
        dbscan_labels = dbscan.fit_predict(X)

        # Pair each sample's cluster id with its target value.
        trafficDF = pd.DataFrame()
        trafficDF['dbscan_cluster'] = dbscan_labels
        trafficDF['target'] = Y

        traffic_result = trafficDF.groupby(['target'])['dbscan_cluster'].value_counts()
        print(traffic_result)
        # i / j identify the eps / min_samples candidate that produced this table.
        print("\n", str(i) + " " + str(j), "-----------------------\n\n")

In [None]:
# Build an empty frame that keeps X's columns: copy X, then drop every row label.
X_dbscan = X.copy().drop(index=X.index)

In [None]:
# Append two bookkeeping columns (cluster id and target), filled with -1 as placeholder.
X_dbscan = X_dbscan.assign(dbscan_labels=-1, target=-1)

In [None]:
# Keep only the samples assigned to clusters 0, 1 or 2 and attach their cluster id and
# target value as two extra columns.
#
# This replaces a row-by-row loop that
#   * called DataFrame.append (deprecated in pandas 1.4, removed in 2.0),
#   * wrote values through chained indexing (X_dbscan['col'][i] = ...), which may
#     silently update a temporary copy instead of the frame, and
#   * re-allocated the whole frame on every appended row (quadratic time).
# X_dbscan is rebuilt from X here, so the result does not depend on its prior contents.
kept_clusters = [0, 1, 2]
cluster_mask = np.isin(dbscan_labels, kept_clusters)

X_dbscan = X.loc[cluster_mask].copy()
X_dbscan['dbscan_labels'] = np.asarray(dbscan_labels)[cluster_mask]
# Select the target positionally so it pairs with the same rows as the features.
X_dbscan['target'] = Y.to_numpy()[cluster_mask]

### PCA Visualization

In [None]:
# Project the clustered samples onto their first two principal components and plot them.
#
# The bookkeeping columns 'dbscan_labels' and 'target' are excluded from the PCA input:
# they are not features, and fitting on them lets the labels themselves shape the
# projection that is supposed to visualise the clusters.
pca_features = X_dbscan.drop(columns=['dbscan_labels', 'target'])
pca_dbscan = PCA(n_components=2)
pc_dbscan = pca_dbscan.fit_transform(pca_features)
# Colour each point by its DBSCAN cluster id so the groups are distinguishable.
plt.scatter(pc_dbscan[:, 0], pc_dbscan[:, 1], c=X_dbscan['dbscan_labels'])

### Check the distribution of variables by cluster

In [None]:
# One sub-frame per cluster id (0, 1, 2), selected by the 'dbscan_labels' column.
dbscan_gr0_df, dbscan_gr1_df, dbscan_gr2_df = (
    X_dbscan.loc[X_dbscan['dbscan_labels'].eq(cluster_id)]
    for cluster_id in (0, 1, 2)
)

In [None]:
# Column-wise mean of every variable for cluster 0 (a Series; transposing it is a no-op).
dbscan_gr0_df.mean(axis=0)

In [None]:
# Column-wise mean of every variable for cluster 1 (a Series; transposing it is a no-op).
dbscan_gr1_df.mean(axis=0)

In [None]:
# Column-wise mean of every variable for cluster 2 (a Series; transposing it is a no-op).
dbscan_gr2_df.mean(axis=0)