In [25]:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.cluster import DBSCAN, KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances_argmin_min

In [10]:
# Load the per-node table and the original record table in one pass.
# low_memory=False makes pandas read each file in a single chunk so column
# dtypes are inferred consistently.
full_data, full_origin = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "./CalculatedData/full_data.csv",
        "./CalculatedData/full_origin.csv",
    )
)

def info(input_data, eps=0.15, min_samples=10):
    """Cluster ``input_data`` with DBSCAN and fold noise points into the nearest cluster.

    Side effects: writes two columns on the module-level ``full_data`` frame,
    ``'origin_label'`` (raw DBSCAN labels, noise == -1) and ``'new_label'``
    (labels after noise reassignment). ``input_data`` is therefore expected to
    have exactly one row per row of ``full_data``.

    Parameters
    ----------
    input_data : array-like or DataFrame of shape (n_samples, n_features)
        Numeric feature matrix passed to ``DBSCAN.fit``.
    eps : float, default 0.15
        DBSCAN neighbourhood radius.
    min_samples : int, default 10
        DBSCAN core-point threshold.

    Returns
    -------
    tuple
        ``(n_clusters_, n_noise_, labels, db)`` where ``n_clusters_`` and
        ``n_noise_`` describe the raw DBSCAN result, ``labels`` is the label
        array after noise reassignment, and ``db`` is the fitted estimator
        (its ``labels_`` attribute is left untouched).
    """
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(input_data)

    # Copy: reassigning noise below must not rewrite the estimator's labels_.
    labels = db.labels_.copy()

    noise_mask = labels == -1
    # Sorted distinct cluster ids, noise excluded.
    cluster_ids = np.unique(labels[~noise_mask])

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(cluster_ids)
    n_noise_ = int(noise_mask.sum())

    # Labels before outlier processing.
    full_data['origin_label'] = db.labels_

    # With no cluster at all there is nothing to attach noise to; leave it as -1
    # instead of calling the distance routine with an empty centre list.
    if n_noise_ > 0 and n_clusters_ > 0:
        features = np.asarray(input_data)
        cluster_centers = np.vstack(
            [features[labels == cluster_id].mean(axis=0) for cluster_id in cluster_ids]
        )
        nearest_positions, _ = pairwise_distances_argmin_min(
            features[noise_mask], cluster_centers
        )
        # argmin returns positions into cluster_centers; map them back to the
        # actual cluster ids rather than assuming position == label.
        labels[noise_mask] = cluster_ids[nearest_positions]

    # Labels after outlier processing.
    full_data['new_label'] = labels

    print("Estimated number of clusters: %d" % n_clusters_)
    print("Estimated number of noise points: %d" % n_noise_)

    return n_clusters_, n_noise_, labels, db

# Feature matrix for clustering: the three columns that sit just before the
# last column of full_data (positional selection, so it depends on column order).
TDD2 = full_data.iloc[:, slice(-4, -1)]

# Tuple returned by info(): (n_clusters, n_noise, labels, fitted DBSCAN).
get_info2 = info(TDD2)
# np.unique(get_info2[2])

Estimated number of clusters: 3
Estimated number of noise points: 11


### Count how often each original record ID appears across nodes

In [3]:
def _parse_ids(ids_text):
    """Parse a stringified integer list such as ``"[1, 2, 3]"`` into ``[1, 2, 3]``.

    Blank tokens are skipped, so an empty list (``"[]"``) yields ``[]`` instead
    of raising ``ValueError`` on ``int('')``.
    """
    return [int(token) for token in ids_text.strip('[]').split(',') if token.strip()]


# Each row is resolved to the 'ids' of the FIRST row sharing its node value.
# Build that lookup once per distinct node instead of re-scanning the whole
# frame for every row (which was quadratic in the number of rows).
first_rows = full_data.drop_duplicates(subset='node', keep='first')
ids_by_node = {
    node: _parse_ids(ids_text)
    for node, ids_text in zip(first_rows['node'], first_rows['ids'])
}

# count[id] = number of times the original record id occurs across all rows.
count = {}
for n in full_data['node']:
    for num in ids_by_node[n]:
        count[num] = count.get(num, 0) + 1
# len(count)
# count

### Look up the original data points for a given node key

In [5]:
# Node whose member records we want to inspect.
key_to_search = 4

# 'ids' cell of the first row whose node equals the key (stored as a string
# like "[1, 2, 3]"); raises IndexError when no row matches.
ids_to_search = full_data.loc[full_data['node'] == key_to_search, 'ids'].iloc[0]

# Turn the stringified list into a list of ints.
number_list = [int(num) for num in ids_to_search.strip('[]').split(', ')]

# full_origin.loc[number_list].head()

### Add the count column to the original data

In [6]:
# Attach the per-record occurrence count in a single assignment.
# Records never referenced by any node get 0. Ids in `count` that are not in
# full_origin's index are ignored rather than appended as new, mostly-NaN rows
# (which per-key `.loc[key, 'count'] = value` would do).
full_origin['count'] = [count.get(row_label, 0) for row_label in full_origin.index]

full_origin.head()

Unnamed: 0,發生年度,發生月份,發生日期,發生時間,事故類別名稱,處理單位名稱警局層,發生地點,天候名稱,光線名稱,道路類別-第1當事者-名稱,...,車輛撞擊部位子類別名稱-最初,車輛撞擊部位大類別名稱-其他,車輛撞擊部位子類別名稱-其他,肇因研判大類別名稱-個別,肇因研判子類別名稱-個別,肇事逃逸類別名稱-是否肇逃,經度,緯度,new_col,count
0,2023,1.0,20230101.0,140800.0,A1,雲林縣警察局,雲林縣崙背鄉五魁村雲11鄉道路燈桿225403號處,晴,日間自然光線,村里道路,...,前車頭,,,駕駛人,逆向行駛,否,120.322165,23.774017,單路部分行車分向線無號誌路段,11
1,2023,1.0,20230101.0,140800.0,A1,雲林縣警察局,雲林縣崙背鄉五魁村雲11鄉道路燈桿225403號處,晴,日間自然光線,村里道路,...,,,,無(非車輛駕駛人因素),尚未發現肇事因素,否,120.322165,23.774017,單路部分行車分向線無號誌路段,13
2,2023,1.0,20230101.0,144312.0,A1,臺南市政府警察局,臺南市北門區錦湖里區道南2線東側 / 臺南市北門區錦湖里省道臺17線北側,陰,日間自然光線,省道,...,前車頭,,,駕駛人,酒醉(後)駕駛失控,否,120.14672,23.293584,交岔路無行車管制號誌交叉路口,5
3,2023,1.0,20230101.0,144312.0,A1,臺南市政府警察局,臺南市北門區錦湖里區道南2線東側 / 臺南市北門區錦湖里省道臺17線北側,陰,日間自然光線,省道,...,,,,無(非車輛駕駛人因素),尚未發現肇事因素,否,120.14672,23.293584,交岔路無行車管制號誌交叉路口,14
4,2023,1.0,20230101.0,144312.0,A1,臺南市政府警察局,臺南市北門區錦湖里區道南2線東側 / 臺南市北門區錦湖里省道臺17線北側,陰,日間自然光線,省道,...,非汽(機)車,,,無(非車輛駕駛人因素),尚未發現肇事因素,否,120.14672,23.293584,交岔路無行車管制號誌交叉路口,16


In [1]:
# labels = get_info2[2]
# db = get_info2[3]
# n_clusters_ = get_info2[0]

# unique_labels = set(labels)
# core_samples_mask = np.zeros_like(labels, dtype=bool)
# core_samples_mask[db.core_sample_indices_] = True

# colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]

# fig = plt.figure(figsize=(12, 8), dpi=100)
# ax = fig.add_subplot(111, projection='3d')

# for k, col in zip(unique_labels, colors):
#     if k == -1:
#         # Black used for noise.
#         col = [0, 0, 0, 1]

#     class_member_mask = labels == k

#     xyz = np.array(TDD2)[class_member_mask & core_samples_mask]
#     ax.scatter(
#         xyz[:, 0],
#         xyz[:, 1],
#         xyz[:, 2],
#         c=[tuple(col)],
#         edgecolor='k',
#         s=140,
#     )

#     xyz = np.array(TDD2)[class_member_mask & ~core_samples_mask]
#     ax.scatter(
#         xyz[:, 0],
#         xyz[:, 1],
#         xyz[:, 2],
#         c=[tuple(col)],
#         edgecolor='k',
#         s=60,
#     )

# plt.title(f"Estimated number of clusters: {n_clusters_}")
# plt.show()