In [145]:
import os
import pandas as pd

# Make <parent of the current working directory>/utils the working directory.
# Presumably this lets `from utils import read_data` in a later cell resolve,
# and lets that helper use paths relative to the utils folder -- confirm.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
analyze_path = os.path.join(parent_dir, "utils")
os.chdir(analyze_path)

In [None]:
import bnlearn as bn

import matplotlib.pyplot as plt
# Select the font family used by matplotlib figures
# (presumably chosen so the Chinese column names render -- confirm the font exists on this machine).
plt.rcParams['font.family'] = ['Arial Unicode Ms']
# Disabled alternative font setting:
# plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

In [None]:
# Load the input table through the project helper `read_data` from the local `utils` module.
# Its data source and schema are not visible in this notebook; later cells index the
# result by column name, so it is used as a pandas DataFrame.
from utils import read_data
combined_data = read_data()

In [151]:
# Columns of `combined_data` that are kept for the Bayesian-network analysis.
# Commented-out entries are candidate columns that are currently disabled.
select_group = [
    # Traffic signal
    '號誌-號誌種類名稱',
    # '號誌-號誌動作名稱',

    # Lane-division facilities
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',

    # Category columns: the major-category variants are disabled,
    # the sub-category variants are used instead.
    # '車輛撞擊部位大類別名稱-最初',
    # '事故類型及型態大類別名稱',
    # '車道劃分設施-分向設施大類別名稱',
    # '道路型態大類別名稱',
    # '車輛撞擊部位子類別名稱-最初',
    '事故類型及型態子類別名稱',
    '車道劃分設施-分向設施子類別名稱',
    '道路型態子類別名稱',

    # Other
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',

    # Nearby facilities
    'youbike_100m_count',
    'mrt_100m_count',
    'parkinglot_100m_count',

    # Driver / pedestrian behaviour
    '肇因研判子類別名稱-主要',
]

In [158]:
# The three "count within 100 m" columns that are collapsed into one flag below.
facility_cols = ['youbike_100m_count', 'mrt_100m_count', 'parkinglot_100m_count']

# Work on a copy so `combined_data` is never modified and this cell can be re-run safely.
data = combined_data[select_group].copy()

# Binary flag: '1' when at least one of the facility counts is positive, else '0'.
# Vectorised comparison instead of a row-wise apply with a Python lambda;
# missing counts compare as False, exactly as in the row-wise version.
data['facility'] = (
    data[facility_cols]
    .gt(0)
    .any(axis=1)
    .map({True: '1', False: '0'})
)
data = data.drop(columns=facility_cols)

# Discretise the speed limit into left-closed bins of width 10: [0, 10), [10, 20), ...
# The "+ 11" guarantees that the last bin edge lies strictly above the maximum value,
# so the maximum itself still falls inside a bin.
max_speed = data['速限-第1當事者'].max()
bins = range(0, int(max_speed) + 11, 10)

data['速限-第1當事者'] = pd.cut(
    data['速限-第1當事者'],
    bins=bins,
    right=False,
    include_lowest=True,
    labels=[f"{i}-{i+9}" for i in bins[:-1]]
)

In [None]:
# Print the number of distinct values per column (one "name count" line each)
# as a quick check on how many states every network node will have.
for column_name, column_values in data.items():
    print(column_name, len(column_values.unique()))

In [159]:
# Domain constraint: the judged cause is always upstream of the accident type,
# so the edge accident type -> cause is forbidden.
forbidden_edges = [('事故類型及型態子類別名稱', '肇因研判子類別名稱-主要')]

# Structure learning: decide which variables are connected by an edge; the result is a DAG.
model = bn.structure_learning.fit(
    data,
    methodtype='hc',
    scoretype='bic',
    bw_list_method='edges',
    black_list=forbidden_edges,
)

# Parameter learning: estimate the conditional probability table (CPT) of every node.
model_param = bn.parameter_learning.fit(model, data)

# Edge strength: chi-square test per edge; a p-value below the significance level
# means the two variables are associated.
model_independence = bn.independence_test(
    model_param,
    data,
    test='chi_square',
    prune=True,
)

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Filter edges based on black_list/white_list
[bnlearn] >Compute structure scores for model comparison (higher is better).
[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.
[bnlearn] >CPD of 號誌-號誌種類名稱:
+-----------------------------+-----------+
| 號誌-號誌種類名稱(無號誌)              | 0.589508  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌)           | 0.260669  |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(行車管制號誌(附設行人專用號誌)) | 0.0990085 |
+-----------------------------+-----------+
| 號誌-號誌種類名稱(閃光號誌)             | 0.0508145 |
+-----------------------------+-----------+
[bnlearn] >CPD of 道路型態子類別名稱:
+--------------------+-----+
| 肇因研判子類別名稱-主要       | ... |
+--------------------+-----+
| 號誌-號誌種類名稱          | ... |
+--------------------+----

In [160]:
# Show the per-edge test results stored under 'independence_test'
# (one row per edge with its source, target and p_value, among other columns).
model_independence['independence_test']

Unnamed: 0,source,target,stat_test,p_value,chi_square,dof
0,號誌-號誌種類名稱,道路型態子類別名稱,True,0.0,91290.16,45
1,號誌-號誌種類名稱,速限-第1當事者,True,0.0,26174.84,33
2,號誌-號誌種類名稱,肇因研判子類別名稱-主要,True,0.0,157351.7,336
3,號誌-號誌種類名稱,facility,True,0.0,6364.917,3
4,道路型態子類別名稱,車道劃分設施-分向設施子類別名稱,True,0.0,30192.76,75
5,速限-第1當事者,車道劃分設施-分道設施-快慢車道間名稱,True,0.0,24992.25,44
6,速限-第1當事者,道路類別-第1當事者-名稱,True,0.0,300058.8,88
7,肇因研判子類別名稱-主要,車道劃分設施-分道設施-快車道或一般車道間名稱,True,0.0,24497.59,448
8,肇因研判子類別名稱-主要,事故類型及型態子類別名稱,True,0.0,1324956.0,3248
9,肇因研判子類別名稱-主要,道路型態子類別名稱,True,0.0,288284.7,1680


In [161]:
# List the parents of every node using the edges of `model`, i.e. the structure-learning
# result obtained before bn.independence_test(..., prune=True) was run.
# NOTE(review): confirm that the unpruned `model` (not `model_independence`) is intended here.
bn.get_parents(model['model_edges'])

{'肇因研判子類別名稱-主要': ['號誌-號誌種類名稱'],
 '速限-第1當事者': ['號誌-號誌種類名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱', '車道劃分設施-分向設施子類別名稱'],
 '道路型態子類別名稱': ['號誌-號誌種類名稱', '肇因研判子類別名稱-主要'],
 'facility': ['號誌-號誌種類名稱', '道路類別-第1當事者-名稱'],
 '車道劃分設施-分向設施子類別名稱': ['車道劃分設施-分道設施-快車道或一般車道間名稱', '道路型態子類別名稱'],
 '車道劃分設施-分道設施-路面邊線名稱': ['車道劃分設施-分道設施-快車道或一般車道間名稱',
  '車道劃分設施-分向設施子類別名稱',
  '道路類別-第1當事者-名稱',
  'facility'],
 '車道劃分設施-分道設施-快慢車道間名稱': ['車道劃分設施-分道設施-路面邊線名稱', '車道劃分設施-分向設施子類別名稱', '速限-第1當事者'],
 '道路類別-第1當事者-名稱': ['車道劃分設施-分向設施子類別名稱', '速限-第1當事者'],
 '事故類型及型態子類別名稱': ['肇因研判子類別名稱-主要'],
 '車道劃分設施-分道設施-快車道或一般車道間名稱': ['肇因研判子類別名稱-主要'],
 '號誌-號誌種類名稱': []}

In [None]:
from matplotlib import rcParams

# Font settings for the static plot: a sans-serif font for the node labels
# and ASCII minus signs instead of the Unicode minus glyph.
rcParams.update({
    'font.sans-serif': ['Microsoft JhengHei'],
    'axes.unicode_minus': False,
})

# Disabled alternative: plot the structure-learning result without edge labels.
# G = bn.plot(model, interactive=False, node_color="#36AA5B", edge_labels=None)

# Static plot of the network after the independence test, with p-values as edge labels.
static_style = {'layout': 'spring_layout', 'font_color': "#974848", 'edge_color': "#974848"}
bn.plot(model_independence, interactive=False, edge_labels='pvalue', params_static=static_style)

In [163]:
import numpy as np
import pandas as pd
import networkx as nx
import plotly.graph_objects as go

def draw_bn_plotly(model, alpha=0.05, layout_algo=""):
    """Draw the edges of a learned network as an interactive Plotly figure.

    Parameters
    ----------
    model : dict
        Must provide ``model['model_edges']``, an iterable of ``(source, target)``
        pairs. ``model['independence_test']`` is optional; when present it must be
        a DataFrame with ``source``, ``target`` and ``p_value`` columns. Edges
        without a p-value are drawn as thin grey dotted lines labelled "N/A".
    alpha : float, default 0.05
        Tested edges with ``p_value <= alpha`` are drawn solid; the remaining
        tested edges are dotted and use a lighter colour.
    layout_algo : str, default ""
        ``"spring"`` selects ``nx.spring_layout`` with a fixed seed (42);
        any other value selects ``nx.kamada_kawai_layout``.

    Returns
    -------
    plotly.graph_objects.Figure
        One line trace per edge, one marker+text trace for the nodes, and one
        arrow annotation per edge.
    """
    edges = [(str(u), str(v)) for u, v in model['model_edges']]

    # p-value lookup keyed by (source, target). A missing or empty test table simply
    # leaves the lookup empty, so every edge falls into the "N/A" branch below.
    p_map = {}
    tests = model.get('independence_test')
    if tests is not None and len(tests) > 0:
        rows = tests[['source', 'target', 'p_value']].itertuples(index=False, name=None)
        for src, dst, p_value in rows:
            p_map[(str(src), str(dst))] = p_value
        # Also allow lookup in the reverse direction, but never overwrite a
        # p-value that was reported for that direction explicitly.
        for (src, dst), p_value in list(p_map.items()):
            p_map.setdefault((dst, src), p_value)

    G = nx.DiGraph()
    G.add_edges_from(edges)

    pos = (nx.spring_layout(G, seed=42) if layout_algo == "spring"
           else nx.kamada_kawai_layout(G))

    # nodes: marker size grows with the node degree relative to the largest degree
    deg = dict(G.degree())
    mdeg = max(deg.values()) if deg else 1
    node_x, node_y, node_text, node_size = [], [], [], []
    for n in G.nodes():
        x, y = pos[n]
        node_x.append(x)
        node_y.append(y)
        node_text.append(f"{n}<br>degree: {deg.get(n,0)}")
        node_size.append(10 + 25*(deg.get(n,1)/mdeg))

    node_trace = go.Scatter(
        x=node_x, y=node_y, mode='markers+text',
        text=[str(n) for n in G.nodes()], textposition="top center",
        hovertext=node_text, hoverinfo="text",
        marker=dict(size=node_size, line=dict(width=1), color="#0B4772")
    )

    # edges: one line trace (styled by significance) and one arrow annotation per edge
    edge_traces = []
    annotations = []
    for u, v in G.edges():
        x0, y0 = pos[u]
        x1, y1 = pos[v]

        p = p_map.get((u, v))
        if p is None or pd.isna(p):
            width, dash, color = 1.0, "dot", "#999"
            tip = f"{u} → {v}<br>p-value: N/A"
        else:
            # Line width is kept constant; significance is shown by dash style and colour.
            width = 1
            sig = (p <= alpha)
            dash = "solid" if sig else "dot"
            color = "#396bac" if sig else "#53a3dd"
            tip = f"{u} → {v}<br>p-value: {p:.3e}"

        # The midpoint is added as an extra (collinear) point so the tooltip can be
        # reached in the middle of the edge; the endpoints lie under the node markers.
        edge_traces.append(go.Scatter(
            x=[x0, (x0 + x1) / 2, x1], y=[y0, (y0 + y1) / 2, y1],
            mode='lines',
            hoverinfo='text', text=[tip, tip, tip],
            line=dict(width=width, color=color, dash=dash)
        ))

        # arrow head pointing from the source node to the target node
        annotations.append(dict(
            ax=x0, ay=y0, x=x1, y=y1,
            xref="x", yref="y", axref="x", ayref="y",
            showarrow=True, arrowhead=3, arrowsize=1.2, opacity=0.8
        ))

    fig = go.Figure(data=edge_traces + [node_trace],
        layout=go.Layout(
            template=None, showlegend=False,
            hovermode='closest',
            margin=dict(l=10, r=10, t=10, b=10),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            annotations=annotations,
            width=1000, height=600,
        )
    )
    return fig

# Draw the network after the independence test; edges with p <= 0.01 are solid.
# An empty layout_algo selects the Kamada-Kawai layout (only "spring" selects spring layout).
draw_bn_plotly(model_independence, alpha=0.01, layout_algo='')

In [None]:
# Conditional Probability Distributions (CPDs)
# Later cells index `CPDs` by node name and treat each entry as a DataFrame
# that has a probability column 'p' next to the node and parent columns.
CPDs = bn.print_CPD(model_param)

## 肇因研判依賴於號誌種類

In [None]:
parent = '號誌-號誌種類名稱'
child = '肇因研判子類別名稱-主要'

# CPD rows of the child node, most probable combinations first.
dfprob_cause = CPDs[child].sort_values('p', ascending=False)

# Observed number of records for every (child, parent) combination.
counts = data.groupby([child, parent]).size().reset_index(name='n')

# Attach the observed count n to every CPD row; combinations that never occur
# in `data` get n = NaN from the left merge.
dfprob_cause_counts = (
    dfprob_cause
    .merge(counts, on=[child, parent], how='left')
    .sort_values('p', ascending=False)
)

# Keep only combinations observed at least 30 times (rows with NaN n are dropped too).
# .loc + .assign build a new frame instead of assigning into a slice,
# which avoids pandas' SettingWithCopyWarning.
filtered = (
    dfprob_cause_counts
    .loc[dfprob_cause_counts['n'] >= 30]
    .assign(p=lambda frame: frame['p'].round(4))
)
filtered.head(30)


Unnamed: 0,肇因研判子類別名稱-主要,號誌-號誌種類名稱,p,n
0,違反閃光號誌,閃光號誌,0.2517,4083.0
1,未保持行車安全距離,行車管制號誌(附設行人專用號誌),0.142,4490.0
2,有號誌路口，轉彎車未讓直行車先行,行車管制號誌,0.1319,10978.0
3,未保持行車安全距離,行車管制號誌,0.1298,10808.0
4,其他未依規定讓車,閃光號誌,0.1153,1870.0
5,有號誌路口，轉彎車未讓直行車先行,行車管制號誌(附設行人專用號誌),0.1143,3612.0
6,其他不當駕車行為,無號誌,0.109,20516.0
7,其他不當駕車行為,行車管制號誌(附設行人專用號誌),0.1016,3212.0
8,未保持行車安全距離,無號誌,0.1001,18843.0
9,其他不當駕車行為,行車管制號誌,0.0952,7926.0


## 事故類型依賴於肇因研判

In [201]:
parent = '肇因研判子類別名稱-主要'
child = '事故類型及型態子類別名稱'

# CPD rows of the child node, most probable combinations first.
dfprob_cause = CPDs[child].sort_values('p', ascending=False)

# Observed number of records for every (child, parent) combination.
counts = data.groupby([child, parent]).size().reset_index(name='n')

# Attach the observed count n to every CPD row; combinations that never occur
# in `data` get n = NaN from the left merge.
dfprob_cause_counts = (
    dfprob_cause
    .merge(counts, on=[child, parent], how='left')
    .sort_values('p', ascending=False)
)

# Keep only combinations observed at least 30 times (rows with NaN n are dropped too).
# .loc + .assign build a new frame instead of assigning into a slice,
# which avoids pandas' SettingWithCopyWarning.
filtered = (
    dfprob_cause_counts
    .loc[dfprob_cause_counts['n'] >= 30]
    .assign(p=lambda frame: frame['p'].round(4))
)
filtered.head(30)

Unnamed: 0,事故類型及型態子類別名稱,肇因研判子類別名稱-主要,p,n
0,路口交岔撞,無號誌路口，左方車未讓右方車先行,0.8748,5802.0
1,穿越道路中,未依標誌或標線穿越道路,0.8431,1072.0
2,其他,開啟或關閉車門不當,0.8177,1864.0
3,穿越道路中,未依規定行走地下道、天橋穿越道路,0.7884,121.0
4,倒車撞,倒車未依規定,0.7665,2139.0
5,穿越道路中,車輛未依規定暫停讓行人先行,0.7665,2522.0
6,其他,未待乘客安全上下而開車,0.7579,39.0
7,穿越道路中,未依號誌或手勢指揮(示)穿越道路,0.7529,169.0
8,其他,裝載貨物不穩妥,0.7436,271.0
9,穿越道路中,穿越道路未注意左右來車,0.7295,554.0


## 快車道、一般車道依賴於肇因研判

In [202]:
parent = '肇因研判子類別名稱-主要'
child = '車道劃分設施-分道設施-快車道或一般車道間名稱'

# CPD rows of the child node, most probable combinations first.
dfprob_cause = CPDs[child].sort_values('p', ascending=False)

# Observed number of records for every (child, parent) combination.
counts = data.groupby([child, parent]).size().reset_index(name='n')

# Attach the observed count n to every CPD row; combinations that never occur
# in `data` get n = NaN from the left merge.
dfprob_cause_counts = (
    dfprob_cause
    .merge(counts, on=[child, parent], how='left')
    .sort_values('p', ascending=False)
)

# Keep only combinations observed at least 30 times (rows with NaN n are dropped too).
# .loc + .assign build a new frame instead of assigning into a slice,
# which avoids pandas' SettingWithCopyWarning.
filtered = (
    dfprob_cause_counts
    .loc[dfprob_cause_counts['n'] >= 30]
    .assign(p=lambda frame: frame['p'].round(4))
)
filtered.head(30)

Unnamed: 0,車道劃分設施-分道設施-快車道或一般車道間名稱,肇因研判子類別名稱-主要,p,n
0,未繪設車道線,無號誌路口，左方車未讓右方車先行,0.9251,6134.0
1,未繪設車道線,未靠右行駛,0.8708,2041.0
2,未繪設車道線,無號誌路口，支線道未讓幹線道先行,0.8248,10215.0
3,未繪設車道線,違反閃光號誌,0.7798,3289.0
4,未繪設車道線,峻狹坡路會車，下坡車未讓上坡車先行,0.767,38.0
5,未繪設車道線,無號誌路口，轉彎車未讓直行車先行,0.7518,7732.0
6,車道線(附標記),使用車輛自動駕駛或先進駕駛輔助系統設備(裝置)不符規定,0.72,37.0
7,未繪設車道線,無號誌路口，少線道未讓多線道先行,0.7126,715.0
8,未繪設車道線,倒車未依規定,0.6725,1875.0
9,未繪設車道線,違反禁止超車標誌(線),0.6645,658.0


## 道路型態依賴於肇因研判、號誌種類

In [203]:
parent = '肇因研判子類別名稱-主要'
parent2 = '號誌-號誌種類名稱'
child = '道路型態子類別名稱'

# CPD rows of the child node, most probable combinations first.
dfprob_cause = CPDs[child].sort_values('p', ascending=False)

# Observed number of records for every (child, parent, parent2) combination.
counts = data.groupby([child, parent, parent2]).size().reset_index(name='n')

# Attach the observed count n to every CPD row; combinations that never occur
# in `data` get n = NaN from the left merge.
dfprob_cause_counts = (
    dfprob_cause
    .merge(counts, on=[child, parent, parent2], how='left')
    .sort_values('p', ascending=False)
)

# Keep only combinations observed at least 30 times (rows with NaN n are dropped too).
# .loc + .assign build a new frame instead of assigning into a slice,
# which avoids pandas' SettingWithCopyWarning.
filtered = (
    dfprob_cause_counts
    .loc[dfprob_cause_counts['n'] >= 30]
    .assign(p=lambda frame: frame['p'].round(4))
)
filtered.head(30)

Unnamed: 0,道路型態子類別名稱,肇因研判子類別名稱-主要,號誌-號誌種類名稱,p,n
0,四岔路,無號誌路口，左方車未讓右方車先行,無號誌,0.9202,6000.0
1,直路,未待乘客安全上下而開車,無號誌,0.9127,32.0
2,直路,開啟或關閉車門不當,無號誌,0.8786,1730.0
3,直路,在道路上嬉戲或奔走不定,無號誌,0.8537,82.0
4,直路,違反禁止迴轉或迴車標誌,無號誌,0.8511,326.0
5,直路,停車操作時未注意安全,無號誌,0.844,314.0
6,直路,變換車道不當,無號誌,0.8253,3878.0
7,四岔路,無號誌路口，左方車未讓右方車先行,行車管制號誌,0.8228,42.0
8,四岔路,無號誌路口，左方車未讓右方車先行,閃光號誌,0.8208,46.0
9,直路,操作、觀看行車輔助或娛樂性顯示設備,無號誌,0.8178,72.0
