In [None]:
import os
import pandas as pd

# Switch the working directory to the `utils` folder that sits next to the
# directory the kernel was started in.
# NOTE(review): presumably this lets later cells import the local helper modules
# and resolve the '../ComputedData/...' relative paths -- confirm.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
analyze_path = os.path.join(parent_dir, "utils")

os.chdir(analyze_path)

In [None]:
import bnlearn as bn

import matplotlib.pyplot as plt
# Default font family for every matplotlib figure in this notebook.
# NOTE(review): presumably chosen so the Chinese column names render -- confirm
# the font is installed on the target machine.
cjk_font_family = ['Arial Unicode Ms']
plt.rcParams['font.family'] = cjk_font_family
# plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

In [None]:
# Input table comes from TDAv2; the path is relative to the working directory.
hotspot_csv_path = '../ComputedData/ForModel/combined_data_with_hotspot.csv'
combined_data = pd.read_csv(hotspot_csv_path)
combined_data.shape

In [None]:
# Columns kept for the Bayesian-network model, grouped by theme.
select_group = (
    # Traffic signals
    ['號誌-號誌種類名稱', '號誌-號誌動作名稱']
    # Lane-division facilities
    + [
        '車道劃分設施-分道設施-快車道或一般車道間名稱',
        '車道劃分設施-分道設施-快慢車道間名稱',
        '車道劃分設施-分道設施-路面邊線名稱',
    ]
    # Broad categories
    + [
        '事故類型及型態子類別名稱',
        '車道劃分設施-分向設施子類別名稱',
        '道路型態子類別名稱',
    ]
    # Others
    + ['速限-第1當事者', '道路類別-第1當事者-名稱']
    # Nearby facilities
    + ['youbike_100m_count', 'mrt_100m_count', 'parkinglot_100m_count']
    # Driver / pedestrian behaviour
    + ['肇因研判子類別名稱-主要']
    + ['COUNTYNAME']
)

# Work on a copy restricted to the modelling columns so `combined_data` stays untouched.
data = combined_data[select_group].copy()

# Binary facility flag stored as the strings '1' / '0': '1' when
# `youbike_100m_count` is positive. Computed column-wise instead of with a
# row-wise `apply`; missing counts compare as False and therefore map to '0',
# exactly as before.
# NOTE(review): only `youbike_100m_count` feeds the flag, although the MRT and
# parking-lot counts are selected and dropped alongside it -- confirm this is intended.
has_youbike = data['youbike_100m_count'] > 0
data['facility'] = has_youbike.map({True: '1', False: '0'})
data = data.drop(columns=['youbike_100m_count', 'mrt_100m_count', 'parkinglot_100m_count'])

# Bucket the speed limit into left-closed bins of width 10: [0, 10), [10, 20), ...
# The last edge lies beyond the observed maximum so the maximum itself falls in a bin.
max_speed = data['速限-第1當事者'].max()
bins = range(0, int(max_speed) + 11, 10)

data['速限-第1當事者'] = pd.cut(
    data['速限-第1當事者'],
    bins=bins,
    right=False,
    include_lowest=True,
    # One label per bin, e.g. "50-59" for [50, 60).
    labels=[f"{i}-{i+9}" for i in bins[:-1]],
)

In [None]:
# Node roles used to constrain the network structure:
#   parent -> road / environment attributes
#   cause  -> main judged cause of the accident
#   result -> accident type
parent = [
    '號誌-號誌種類名稱',
    '號誌-號誌動作名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '車道劃分設施-分向設施子類別名稱',
    '道路型態子類別名稱',
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    'facility',
    'COUNTYNAME',
]
cause = ['肇因研判子類別名稱-主要']
result = ['事故類型及型態子類別名稱']

# Edges that point from selected attributes into the cause node.
white_list = [
    (upstream, '肇因研判子類別名稱-主要')
    for upstream in (
        '速限-第1當事者',
        '道路類別-第1當事者-名稱',
        '道路型態子類別名稱',
        'facility',
        '號誌-號誌種類名稱',
    )
]
# Not whitelisted for now:
# ('肇因研判子類別名稱-主要', '事故類型及型態子類別名稱'),

# This is for english version
# from config import feature_name_map
# parent = [feature_name_map.get(col, col) for col in parent]
# cause = [feature_name_map.get(col, col) for col in cause]
# result = [feature_name_map.get(col, col) for col in result]
# white_list = [
#     (feature_name_map.get(src, src), feature_name_map.get(dst, dst))
#     for src, dst in white_list
# ]
# data = data.rename(columns=feature_name_map)

# Forbidden directions.
# cause -> parent
black_list = [(c, p) for c in cause for p in parent]
# result -> parent / cause / result
black_list.extend((r, x) for r in result for x in (parent + cause + result))
# Safety net: parent -> result (blocks direct shortcuts when effects should only
# be explained through the cause)
# black_list += [(p, r) for p in parent for r in result]

In [None]:
# Structure learning: find which variables are connected by edges; the result is a DAG.
# The cause must sit upstream of the accident type, e.g. hitting a roadside tree
# does not make someone ill -- being ill is what leads to hitting the tree.
structure_options = dict(
    methodtype='hc',
    scoretype='bic',
    bw_list_method='edges',
    black_list=black_list,
    white_list=white_list,
    fixed_edges=white_list,
    max_indegree=None,
)
model = bn.structure_learning.fit(data, **structure_options)

# Parameter learning: conditional probability table (CPT) for every node.
model_param = bn.parameter_learning.fit(
    model,
    data,
    scoretype='bdeu',
    methodtype='bayes',
)

# Edge strength: a p-value below the significance level means the pair is related.
model_independence = bn.independence_test(
    model_param,
    data,
    test='chi_square',
    prune=True,
)

# model_independence['independence_test']
# bn.get_parents(model['model_edges'])

In [None]:
from matplotlib import rcParams

# Fonts tried, in order, for text drawn with the generic 'sans-serif' family.
# 'Microsoft JhengHei' keeps first priority; 'Arial Unicode MS' (the family this
# notebook configures earlier) is added as a fallback so Chinese labels still
# render on machines where JhengHei is not installed.
rcParams['font.sans-serif'] = ['Microsoft JhengHei', 'Arial Unicode MS']
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
rcParams['axes.unicode_minus'] = False

# G = bn.plot(model, interactive=False, node_color="#36AA5B", edge_labels=None)
# Static plot of the independence-tested network, with p-values as edge labels.
bn.plot(model_independence, interactive=False, edge_labels='pvalue', 
        params_static={'layout': 'spring_layout', 'font_color': "#974848", 'edge_color': "#974848"})

In [None]:
import numpy as np
from utils_behaviour import draw_bn_plotly

# Draw the independence-tested network with the project's Plotly helper.
draw_bn_plotly(
    model_independence,
    layout_algo='spring',
    en=False,
    width=600,
    height=400,
    seed=42,
    iter=30,
)

## Inference
這個方法針對特定的查詢（query）推論出對應的條件機率表（CPT）。這裡討論的是反向推論，因此 evidence 是肇因：在給定某個肇因的條件下，比較各種道路設計特徵組合出現的機率。

In [None]:
from utils_behaviour import cpd_add_n, filter_cpd, get_outlier

In [None]:
# Variables queried during inference and the node used as evidence.
parent = ['速限-第1當事者', '道路類別-第1當事者-名稱', '道路型態子類別名稱', 'facility', '號誌-號誌種類名稱']
child = '肇因研判子類別名稱-主要'

# The 10 most frequent cause values; one inference run per value.
evidence_v = list(data[child].value_counts().head(10).index)

# Make sure the output folder exists so the first run on a fresh checkout
# does not fail when writing the CSV files.
behaviour_dir = '../ComputedData/Behaviour'
os.makedirs(behaviour_dir, exist_ok=True)

for v in evidence_v:
    # Joint distribution of the parent variables given that the cause equals `v`.
    q2 = bn.inference.fit(
        model_param,
        variables=parent,
        evidence={child: v})

    # Keep the query table in its own name: assigning it to `model` would
    # overwrite the learned network with a DataFrame and break any later
    # (or re-run) cell that still expects the network.
    query_df = q2.df
    evidence_df = data[data[child] == v]

    # NOTE(review): `cpd_add_n` is a project helper; it presumably attaches the
    # observed row counts and applies the threshold -- confirm in utils_behaviour.
    filtered_condition = cpd_add_n(parent, child, query_df, evidence_df, cpd=False, threshold=30)

    filtered_condition.to_csv(f'{behaviour_dir}/{v}.csv', encoding='utf-8', index=False)

In [None]:
# For every cause value, reload its saved inference table and print the outliers.
for cause_value in evidence_v:
    print(cause_value)
    # For inference results, n and p are not proportional, mainly because p is
    # derived back from the CPTs rather than computed from the counts n.
    csv_path = f'../ComputedData/Behaviour/{cause_value}.csv'
    filtered_condition = pd.read_csv(csv_path, encoding='utf-8')
    new_con_dist = filter_cpd(filtered_condition)
    # print(new_con_dist.shape)
    new_con_dist_out = get_outlier(filtered_condition, new_con_dist)
    print(new_con_dist_out)

In [None]:
# For every cause value, print the total probability kept after `filter_cpd`.
for cause_value in evidence_v:
    # For inference results, n and p are not proportional, mainly because p is
    # derived back from the CPTs rather than computed from the counts n.
    csv_path = f'../ComputedData/Behaviour/{cause_value}.csv'
    filtered_condition = pd.read_csv(csv_path, encoding='utf-8')
    new_con_dist = filter_cpd(filtered_condition)

    kept_probability = new_con_dist['p'].sum()
    print(round(kept_probability, 4))

## CPD
這裡不針對特定特徵值（不給 evidence），直接取出肇因節點的 CPD，也就是肇因在其上游因素（父節點）各種組合下的條件機率。

In [None]:
from config import category_value_map

# Pull the learned CPDs out of the fitted model and keep the table of the cause node.
CPDs = bn.print_CPD(model_param)
dfprob_cause = CPDs[child]

# NOTE(review): `cpd_add_n` is a project helper; threshold=0 presumably keeps
# every parent combination -- confirm in utils_behaviour.
filtered = cpd_add_n(
    parent,
    child,
    dfprob_cause,
    data,
    threshold=0,
)

# Optional value-label mapping, currently disabled:
# filtered[parent] = filtered[parent].map(category_value_map[parent])
# filtered[child] = filtered[child].map(category_value_map[child])
# filtered[parent2] = filtered[parent2].map(category_value_map[parent2])

In [None]:
# Keep only rows whose `n` exceeds 100.
# The explicit .copy() makes the result an independent frame, so the column
# assignment below writes to it unambiguously instead of to a slice of the
# previous `filtered` (which triggers pandas' SettingWithCopyWarning).
filtered = filtered[filtered['n'] > 100].copy()
# `facility` holds '0' / '1' strings; convert to integers for the next step.
filtered['facility'] = filtered['facility'].astype(int)
final_filtered = filter_cpd(filtered)

### 確認CPT的機率

In [None]:
import numpy as np
# Number of distinct parent-value combinations present in the CPD table.
q = dfprob_cause.groupby(parent, dropna=False).ngroups
expected_q = np.prod([data[col].nunique() for col in parent])
assert expected_q == q, (
    f"CPD table has {q} parent combinations, expected {expected_q}"
)
# The probabilities under every parent combination should sum to ~1.
chk = dfprob_cause.groupby(parent, dropna=False)['p'].sum().unique()
# Enforce the invariant instead of only computing it, so a malformed CPT
# stops the notebook here rather than silently flowing into later results.
assert np.allclose(chk, 1.0, atol=1e-6), (
    f"Some parent combinations do not sum to 1: {chk}"
)