In [None]:
import os
import pandas as pd

# Switch the working directory to the sibling ``utils`` folder of the current
# directory's parent; the relative paths and local imports in later cells are
# resolved from there.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
analyze_path = os.path.join(parent_dir, "utils")
os.chdir(analyze_path)

In [None]:
import bnlearn as bn

import matplotlib.pyplot as plt
# Use a font family that contains CJK glyphs so the Chinese column names render.
plt.rcParams.update({'font.family': ['Arial Unicode Ms']})
# Alternative that was tried earlier:
# plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

In [None]:
# Load the modelling table, then drop the rows that have no COUNTYNAME
# (per the original note, these are the outlying islands).
combined_data = pd.read_csv('../ComputedData/ForModel/combined_data_with_hotspot.csv')
combined_data = combined_data.dropna(subset=['COUNTYNAME'])

In [None]:
# Columns fed into the Bayesian-network model, grouped by theme.
select_group = [
    # Traffic signals
    '號誌-號誌種類名稱',
    '號誌-號誌動作名稱',
    # Lane separation facilities
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    # High-level categories
    '事故類型及型態子類別名稱',
    '車道劃分設施-分向設施子類別名稱',
    '道路型態子類別名稱',
    # Other
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    # Nearby facilities (counts within 100 m)
    'youbike_100m_count',
    'mrt_100m_count',
    'parkinglot_100m_count',
    # Driver / pedestrian behaviour (primary cause)
    '肇因研判子類別名稱-主要',
    # County
    'COUNTYNAME',
]

In [None]:
# Build the modelling frame from the selected columns. The cell always starts
# again from ``combined_data``, so re-running it gives the same result.
facility_cols = ['youbike_100m_count', 'mrt_100m_count', 'parkinglot_100m_count']
data = combined_data[select_group].copy()

# Collapse the three facility counts into one flag: '1' when at least one
# count is positive, otherwise '0'. The comparison is vectorised instead of a
# row-wise ``apply``; missing counts compare as False in both forms.
near_facility = (data[facility_cols] > 0).any(axis=1)
data['facility'] = near_facility.map({True: '1', False: '0'})
data = data.drop(columns=facility_cols)

# Discretise the speed limit into 10-wide, left-closed bins labelled
# "0-9", "10-19", ... The upper edge is pushed past the maximum so the
# largest observed value still falls inside the last bin.
max_speed = data['速限-第1當事者'].max()
bins = range(0, int(max_speed) + 11, 10)

data['速限-第1當事者'] = pd.cut(
    data['速限-第1當事者'],
    bins=bins,
    right=False,
    include_lowest=True,
    labels=[f"{i}-{i+9}" for i in bins[:-1]]
)

In [None]:
# Print the number of distinct values per column; a missing value counts as
# one distinct value.
for i in data.columns:
    print(i, data[i].nunique(dropna=False))

In [None]:
# Node tiers used to constrain edge directions: road/environment attributes
# (parent) -> primary cause (cause) -> accident type (result).
parent = [
    '號誌-號誌種類名稱',
    '號誌-號誌動作名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '車道劃分設施-分向設施子類別名稱',
    '道路型態子類別名稱',
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    'facility',
    'COUNTYNAME',
]
cause = ['肇因研判子類別名稱-主要']
result = ['事故類型及型態子類別名稱']

# Forbidden edges:
#   * cause -> parent
#   * result -> parent / cause / result
black_list = (
    [(c, p) for c in cause for p in parent]
    + [(r, x) for r in result for x in (parent + cause + result)]
)
# Optional safeguard: also forbid parent -> result, so that parents can only
# reach the result through the cause (no direct shortcut).
# black_list += [(p, r) for p in parent for r in result]

# Edges passed to the learner as both white list and fixed edges:
# five selected parents -> cause, and cause -> result.
white_list = [
    (src, '肇因研判子類別名稱-主要')
    for src in (
        '速限-第1當事者',
        '道路類別-第1當事者-名稱',
        '道路型態子類別名稱',
        'facility',
        '號誌-號誌種類名稱',
    )
] + [('肇因研判子類別名稱-主要', '事故類型及型態子類別名稱')]

In [None]:
# 1) Structure learning: decide which variables are connected; the result is a DAG.
#    Domain constraint behind the lists: the cause is always upstream of the
#    accident type (e.g. hitting a roadside tree does not cause an illness;
#    the illness causes hitting the tree).
#    NOTE(review): in bnlearn/pgmpy an edge white_list appears to restrict the
#    search to the listed edges, which would make the DAG exactly these edges
#    - confirm this is intended.
model = bn.structure_learning.fit(
    data,
    methodtype='hc',
    scoretype='bic',
    bw_list_method='edges',
    black_list=black_list,
    white_list=white_list,
    fixed_edges=white_list,
    max_indegree=None,
)

# 2) Parameter learning: estimate the conditional probability table (CPT) of
#    every node.
model_param = bn.parameter_learning.fit(
    model,
    data,
    scoretype='bdeu',
    methodtype='bayes',
)

# 3) Edge strength: chi-square test per edge; a p-value below the significance
#    level indicates the two variables are associated.
model_independence = bn.independence_test(
    model_param,
    data,
    test='chi_square',
    prune=True,
)

# model_independence['independence_test']
# bn.get_parents(model['model_edges'])

In [None]:
from matplotlib import rcParams
# Font settings for the static plot.
# NOTE(review): 'font.sans-serif' is only consulted when 'font.family'
# resolves to 'sans-serif'; an earlier cell sets 'font.family' to a concrete
# font name, so this setting may have no visible effect - confirm.
rcParams.update({
    'font.sans-serif': ['Microsoft JhengHei'],
    'axes.unicode_minus': False,
})

# Alternative without edge labels:
# G = bn.plot(model, interactive=False, node_color="#36AA5B", edge_labels=None)
bn.plot(
    model_independence,
    interactive=False,
    edge_labels='pvalue',
    params_static={
        'layout': 'spring_layout',
        'font_color': "#974848",
        'edge_color': "#974848",
    },
)

In [None]:
import numpy as np
import networkx as nx
import plotly.graph_objects as go
from utils_behaviour import feature_name_map

def draw_bn_plotly(model, layout_algo="", en=False, width=1000, height=500, seed=42, iter=100):
    """Draw a learned Bayesian-network DAG as a Plotly figure.

    Parameters
    ----------
    model : mapping
        Must provide ``model['model_edges']`` (iterable of ``(source, target)``
        pairs) and ``model['independence_test']`` (DataFrame with ``source``,
        ``target`` and ``p_value`` columns).
    layout_algo : str
        ``"spring"`` selects ``nx.spring_layout``; any other value (including
        the default empty string) selects ``nx.kamada_kawai_layout``.
    en : bool
        When True, node names are translated through ``feature_name_map``;
        names without an entry are kept unchanged.
    width, height : int
        Figure size passed to the Plotly layout.
    seed : int
        Seed for the spring layout (unused by the Kamada-Kawai layout).
    iter : int
        Iteration count for the spring layout. The name shadows the builtin
        ``iter`` and is kept only so existing keyword callers keep working.

    Returns
    -------
    plotly.graph_objects.Figure
        Nodes are drawn as labelled markers, edges as arrow annotations. Each
        edge also has an invisible marker at its midpoint whose hover text
        shows the edge's p-value.
    """
    edges = [(str(u), str(v)) for u, v in model['model_edges']]
    tests = model['independence_test'][['source', 'target', 'p_value']].copy()

    if en:
        tests['source'] = tests['source'].map(feature_name_map).fillna(tests['source'])
        tests['target'] = tests['target'].map(feature_name_map).fillna(tests['target'])
        edges = [(feature_name_map.get(u, u), feature_name_map.get(v, v)) for (u, v) in edges]
    else:
        tests['source'] = tests['source'].astype(str)
        tests['target'] = tests['target'].astype(str)

    # p-value lookup keyed by (source, target). The reversed orientation is
    # added only as a fallback so an exact match always wins.
    p_map = {(s, t): p for s, t, p in tests.itertuples(index=False, name=None)}
    for (s, t), p in list(p_map.items()):
        p_map.setdefault((t, s), p)

    G = nx.DiGraph()
    G.add_edges_from(edges)

    pos = (nx.spring_layout(G, seed=seed, iterations=iter) if layout_algo == "spring"
           else nx.kamada_kawai_layout(G))

    # --- nodes ---------------------------------------------------------
    deg = dict(G.degree())
    node_x, node_y, node_text, node_size = [], [], [], []
    for n in G.nodes():
        x, y = pos[n]
        node_x.append(x)
        node_y.append(y)
        node_text.append(f"{n}<br>degree: {deg.get(n, 0)}")
        node_size.append(50)

    node_trace = go.Scatter(
        x=node_x, y=node_y, mode='markers+text',
        text=[str(n) for n in G.nodes()],
        textposition="middle center",
        hovertext=node_text, hoverinfo="text",
        marker=dict(size=node_size,
                    line=dict(color="#24475E", width=2),
                    color="#5390B9")
    )

    # --- edges ---------------------------------------------------------
    annotations = []
    mid_x, mid_y, edge_hover = [], [], []
    r = 0.15  # preferred gap (layout units) between a node centre and the arrow end
    for u, v in G.edges():
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        dx, dy = x1 - x0, y1 - y0
        d = (dx**2 + dy**2)**0.5
        if d == 0:
            continue

        # Never trim more than 45% of the edge at each end; with a fixed gap
        # the start would overshoot the end for nodes closer than 2*r and the
        # arrow would point the wrong way.
        gap = min(r, 0.45 * d)

        # Start: move from the source towards the target by ``gap``.
        sx = x0 + dx / d * gap
        sy = y0 + dy / d * gap
        # End: back off from the target towards the source by ``gap``.
        ex = x1 - dx / d * gap
        ey = y1 - dy / d * gap

        annotations.append(dict(
            ax=sx, ay=sy, x=ex, y=ey,
            xref="x", yref="y", axref="x", ayref="y",
            showarrow=True, arrowhead=3, arrowsize=2, opacity=0.8
        ))

        p_value = p_map.get((u, v))
        p_text = "n/a" if p_value is None else f"{p_value:.4g}"
        mid_x.append((x0 + x1) / 2)
        mid_y.append((y0 + y1) / 2)
        edge_hover.append(f"{u} → {v}<br>p-value: {p_text}")

    # Transparent midpoint markers exist only to carry the edge hover text.
    edge_traces = []
    if mid_x:
        edge_traces.append(go.Scatter(
            x=mid_x, y=mid_y, mode='markers',
            hovertext=edge_hover, hoverinfo="text",
            marker=dict(size=14, color="rgba(0,0,0,0)")
        ))

    fig = go.Figure(
        data=edge_traces + [node_trace],
        layout=go.Layout(
            template=None, showlegend=False,
            hovermode='closest',
            margin=dict(l=10, r=10, t=10, b=10),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            annotations=annotations,
            width=width, height=height,
        )
    )
    return fig

# Render the pruned network with a seeded spring layout; the returned figure
# is the cell output.
draw_bn_plotly(
    model_independence,
    layout_algo='spring',
    en=False,
    width=600,
    height=400,
    seed=42,
    iter=30,
)


In [None]:
# Parents of the primary-cause node (the white-listed edges into the cause).
# Note: this rebinds ``parent``, which an earlier cell used for the full list
# of road/environment attributes.
parent = [
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    '道路型態子類別名稱',
    'facility',
    '號誌-號誌種類名稱',
]
# The node whose CPD / posterior is analysed below.
child = '肇因研判子類別名稱-主要'

def cpd_add_n(parent, child, model, data, cpd=True, threshold=50):
    """Attach observed sample counts to a probability table and filter by support.

    Parameters
    ----------
    parent : list of str
        Parent column names.
    child : str
        Child column name.
    model : pandas.DataFrame
        Probability table with a ``p`` column plus the key columns
        (``parent + [child]`` when ``cpd`` is True, ``parent`` otherwise).
        Despite the name this is a table, not a fitted model.
    data : pandas.DataFrame
        Observations used for counting. For the posterior case the caller is
        expected to have already restricted it to the evidence value.
    cpd : bool
        True  -> table is P(child | parent); counts are per parent+child combination.
        False -> table is P(parent | child=v); counts are per parent combination.
    threshold : int
        Minimum count ``n`` a row needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Rows of ``model`` with ``n >= threshold``, sorted by ``p`` descending,
        with ``p`` rounded to 4 decimals and ``n`` as int. Combinations never
        seen in ``data`` get ``n = 0``.
    """
    parent_cols = list(parent)
    key_cols = (parent_cols + [child]) if cpd else parent_cols

    # observed=True keeps categorical keys from expanding into every possible
    # category combination; unseen combinations become n=0 after the left
    # merge, so the final table is the same.
    counts = (
        data.groupby(key_cols, dropna=False, observed=True)
        .size()
        .reset_index(name='n')
    )

    with_counts = (
        model
        .merge(counts, on=key_cols, how='left')
        .sort_values('p', ascending=False)
    )
    with_counts['n'] = with_counts['n'].fillna(0)

    filtered = with_counts[with_counts['n'] >= threshold].copy()
    filtered['p'] = filtered['p'].round(4)
    filtered['n'] = filtered['n'].astype(int)

    return filtered

def filter_cpd(filtered):
    """Keep rows for urban roads with a speed limit below 60 and a nearby facility.

    The ``facility`` flag is compared numerically so the filter works both on
    the in-memory table (flag stored as the strings '1'/'0') and on a table
    reloaded from CSV (flag parsed as a number). The previous ``== 1`` check
    matched only the latter and returned no rows for the former.

    Parameters
    ----------
    filtered : pandas.DataFrame
        Table with the columns '速限-第1當事者', 'facility' and
        '道路類別-第1當事者-名稱'.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``filtered`` (original index preserved).
    """
    low_speed_bins = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59']

    is_low_speed = filtered['速限-第1當事者'].isin(low_speed_bins)
    has_facility = pd.to_numeric(filtered['facility'], errors='coerce') == 1
    is_urban_road = filtered['道路類別-第1當事者-名稱'] == '市區道路'

    return filtered[is_low_speed & has_facility & is_urban_road]

def get_outlier(filtered, new_filtered):
    """Return the rows of ``new_filtered`` whose ``p`` is an upper outlier.

    The cut-off (Q3 + 1.5 * IQR) is computed from the ``p`` column of the
    original ``filtered`` table and then applied to ``new_filtered``.
    """
    lower_q, upper_q = (filtered['p'].quantile(q) for q in (0.25, 0.75))
    upper_fence = upper_q + 1.5 * (upper_q - lower_q)
    return new_filtered[new_filtered['p'] > upper_fence]

## Inference
此方法針對特定的推論查詢得出 CPT。這裡討論的是反向推論，因此 evidence 是肇因，用來比較在特定肇因下、不同道路設計特徵組合的機率。

In [None]:
# Frequency of each primary cause, most frequent first.
data.loc[:, '肇因研判子類別名稱-主要'].value_counts()

In [None]:
# Causes used as evidence (hand-picked list; a shorter alternative would be
# data['肇因研判子類別名稱-主要'].value_counts().head().index).
evidence_v = ['未保持行車安全距離', '其他不當駕車行為', '恍神、緊張、心不在焉分心駕駛', '尚未發現肇事因素', '其他未依規定讓車',
              '有號誌路口，轉彎車未讓直行車先行', '左轉彎未依規定', '無號誌路口，支線道未讓幹線道先行', '未保持行車安全間隔', '起步時未注意安全']

# Make sure the output folder exists before writing the per-cause CSV files.
os.makedirs('../ComputedData/Behaviour', exist_ok=True)

for v in evidence_v:
    # Posterior over the parent variables given the cause.
    q2 = bn.inference.fit(
        model_param,
        variables=parent,
        evidence={'肇因研判子類別名稱-主要': v})

    # Stored under its own name: assigning it to ``model`` would overwrite the
    # learned DAG and break re-running the parameter-learning cell.
    posterior_df = q2.df
    evidence_df = data[data['肇因研判子類別名稱-主要'] == v]

    filtered_condition = cpd_add_n(parent, child, posterior_df, evidence_df, cpd=False, threshold=30)

    filtered_condition.to_csv(f'../ComputedData/Behaviour/{v}.csv', encoding='utf-8', index=False)

In [None]:
# For inference results n and p are not proportional: p is derived from the
# CPTs, not computed from the counts n.
filtered_condition = pd.read_csv(
    '../ComputedData/Behaviour/{}.csv'.format(evidence_v[0]),
    encoding='utf-8',
)
new_con_dist = filter_cpd(filtered_condition)
print(new_con_dist.shape)

# The outlier threshold comes from the full table and is applied to the
# filtered subset.
new_con_dist_out = get_outlier(filtered_condition, new_con_dist)
new_con_dist_out

In [None]:
# Print, for every evidence cause, the total posterior probability that
# remains after filter_cpd. Iterating over the list itself (instead of a
# hard-coded range(10)) keeps the loop correct if evidence_v changes length.
for v in evidence_v:
    # For inference results n and p are not proportional: p is derived from
    # the CPTs, not computed from the counts n.
    filtered_condition = pd.read_csv(f'../ComputedData/Behaviour/{v}.csv', encoding='utf-8')
    new_con_dist = filter_cpd(filtered_condition)

    print(round(new_con_dist['p'].sum(), 4))

In [None]:
# Sum of p in ``new_con_dist`` as left by the most recently executed cell that
# assigned it (after the loop above, that is the last cause in evidence_v).
round(new_con_dist['p'].sum(), 4)

## CPD
沒有針對特徵，回傳肇因的因

In [None]:
from utils_behaviour import category_value_map

# Conditional probability distributions (CPDs) of every node; keep the table
# of the cause node.
CPDs = bn.print_CPD(model_param)
dfprob_cause = CPDs[child]

# A cause with a very large overall sample count can still have a small p
# under a given parent combination, because it is not the dominant cause in
# that condition.
filtered = cpd_add_n(
    parent,
    child,
    dfprob_cause,
    data,
    threshold=0,
)

# filtered = filtered[
#     (filtered['速限-第1當事者'] == '0-9') |
#     (filtered['速限-第1當事者'] == '10-19') |
#     (filtered['速限-第1當事者'] == '20-29') |
#     (filtered['速限-第1當事者'] == '30-39') |
#     (filtered['速限-第1當事者'] == '40-49')
# ]

# filtered = filtered[filtered['facility'] == '1']
# filtered = filtered[filtered['道路類別-第1當事者-名稱'] == '市區道路']

# filtered[parent] = filtered[parent].map(category_value_map[parent])
# filtered[child] = filtered[child].map(category_value_map[child])
# filtered[parent2] = filtered[parent2].map(category_value_map[parent2])

### 確認CPT的機率

In [None]:
# Number of parent-state combinations in the CPT; it must equal the product
# of the parents' cardinalities in the data.
q = dfprob_cause.groupby(parent, dropna=False).ngroups
assert np.prod([data[col].nunique() for col in parent]) == q, \
    "CPT parent combinations do not match the parents' cardinalities"

# Within each parent combination the probabilities must sum to ~1. This was
# previously computed but never checked.
chk = dfprob_cause.groupby(parent, dropna=False)['p'].sum().unique()
assert np.allclose(chk, 1.0, atol=1e-6), \
    f"CPT rows do not sum to 1 for every parent combination: {chk}"

In [None]:
import matplotlib.cm as cm
import matplotlib.colors as mcolors
from utils_behaviour import BubbleChart

# Label for each CPT row: "<parent values> -> <cause>". ``parent`` is a list,
# so ``filtered[parent]`` is a DataFrame; the parent values are joined row by
# row first. Adding the child Series directly to that DataFrame aligns on the
# wrong axis and the single-column assignment fails.
parent_label = filtered[parent].astype(str).agg(', '.join, axis=1)
filtered['特徵組合'] = parent_label + '\n->' + filtered[child].astype(str)

labels = filtered['特徵組合'].reset_index(drop=True)
n_values = filtered['n'].reset_index(drop=True)

# Bubble colour encodes the sample count n; bubble area encodes p.
cmap = cm.Blues
norm = mcolors.Normalize(vmin=n_values.min(), vmax=n_values.max())
colors = cmap(norm(n_values.values))

bubble_chart = BubbleChart(area=filtered['p'],
                           bubble_spacing=0.1, text_rotation=20)
bubble_chart.collapse()

fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"), figsize=(18, 10))
bubble_chart.plot(ax, labels, colors)
for t in ax.texts:
    t.set_fontsize(8)

ax.axis("off")
ax.relim()
ax.autoscale_view()
ax.set_title('Bubble Chart')

# Colour bar for the n scale (an empty array is enough for the mappable).
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
fig.colorbar(sm, ax=ax, label='n value')

plt.show()
