本分析改為計算：在特定道路組合下，有設施（facility = 1）的機率

In [None]:
import os
import pandas as pd
import bnlearn as bn
import matplotlib.pyplot as plt
# Font used by matplotlib; presumably chosen so that the Chinese column names
# and labels render correctly in figures — TODO confirm the font is installed.
plt.rcParams['font.family'] = ['Arial Unicode Ms']

# Build the path of the "utils" directory that sits next to the current
# working directory (both share the same parent folder).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

# Switch the process working directory to utils. Later relative paths such as
# '../<data folder>/...' are resolved from here; presumably this also lets the
# project-local imports in the next cell be found — verify.
os.chdir(analyze_path)

In [None]:
from utils_behaviour import get_model, draw_bn_plotly, cpd_add_n, filter_cpd_for_hotspot, get_outlier
from config import category_value_map, feature_name_map, select_group_behaviour

In [None]:
# Name of the computed-data folder; interpolated into the CSV path below.
computeddata = 'ComputedDataV2'
# Input table; according to the original note it is produced by TDAv2.
combined_data = pd.read_csv(f'../{computeddata}/ForModel/combined_data_with_hotspot.csv')
# Cell output: (rows, columns) of the loaded table.
combined_data.shape

In [None]:
# Keep only the columns selected for the behaviour model; .copy() detaches the
# frame from combined_data so the edits below never touch the loaded table.
data = combined_data[select_group_behaviour].copy()

# Binary facility flag stored as the strings '1' / '0': '1' when
# youbike_100m_count is positive. A vectorized comparison replaces the former
# row-wise apply; missing counts compare as False and map to '0', as before.
data['facility'] = data['youbike_100m_count'].gt(0).map({True: '1', False: '0'})
data = data.drop(columns=['youbike_100m_count'])

# Discretize the speed limit into 10-wide, left-closed bins labelled
# "0-9", "10-19", ... The upper edge is pushed past the maximum (+11) so the
# largest observed value still falls inside the last bin.
speed_col = '速限-第1當事者'
max_speed = data[speed_col].max()
bins = range(0, int(max_speed) + 11, 10)

data[speed_col] = pd.cut(
    data[speed_col],
    bins=bins,
    right=False,
    include_lowest=True,
    labels=[f"{i}-{i+9}" for i in bins[:-1]]
)

In [None]:
# Variable roles used to build the edge constraints passed to get_model.
cause = ['肇因研判子類別名稱-主要']
result = ['事故類型及型態子類別名稱']
parent = [
    '號誌-號誌種類名稱',
    '號誌-號誌動作名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '車道劃分設施-分向設施子類別名稱',
    '道路型態子類別名稱',
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    'facility',
    'COUNTYNAME',
]

# White-listed edges: five road attributes, each pointing at the cause node.
white_list = [
    (road_attr, cause[0])
    for road_attr in (
        '速限-第1當事者',
        '道路類別-第1當事者-名稱',
        '道路型態子類別名稱',
        'facility',
        '號誌-號誌種類名稱',
    )
]

# Black-listed edges, in two groups:
#   1. cause  -> every parent variable
#   2. result -> every parent, cause and result variable
black_list = [(c, p) for c in cause for p in parent]
black_list.extend((r, x) for r in result for x in parent + cause + result)

In [None]:
# Fit the model on the prepared data with the edge constraints defined above.
# get_model is a project helper that returns three objects; judging by the
# names they are the learned structure, the parameter-fitted model and an
# independence-tested model — TODO confirm in utils_behaviour.
model_all, model_param_all, model_independence_all = get_model(data, black_list=black_list, white_list=white_list)
# Optional network plot, left disabled.
# draw_bn_plotly(model_independence_all, layout_algo='spring', en=False, width=600, height=400, seed=42, iter=30)

In [None]:
# From here on `parent` is narrowed to the five variables that were
# white-listed as direct parents of the cause node.
parent = ['速限-第1當事者', '道路類別-第1當事者-名稱', '道路型態子類別名稱', 'facility', '號誌-號誌種類名稱']
# Node whose conditional probability table is inspected below.
child = '肇因研判子類別名稱-主要'
# The five most frequent values of the cause column (value_counts orders by
# descending frequency).
evidence_v = list(data['肇因研判子類別名稱-主要'].value_counts().head(5).index)

In [8]:
# Collect the CPDs of the fitted model; the result is indexed by node name below.
CPDs = bn.print_CPD(model_param_all)
# CPD table of the cause node.
dfprob_cause = CPDs[child]
# cpd_add_n is a project helper. Its result is used below through the columns
# 'p' and 'n' — presumably the CPD probability and the number of matching rows
# in `data` for each parent/child combination (TODO confirm in utils_behaviour).
filtered = cpd_add_n(parent, child, dfprob_cause, data, threshold=0)

In [9]:
# Keep only rows with a positive n. .copy() makes filtered_n an independent
# frame, so adding the total_n column below does not write into a slice of
# `filtered` (the original in-place operations on the slice raise
# SettingWithCopyWarning).
filtered_n = filtered[filtered['n'] > 0].copy()
filtered_n = filtered_n.sort_values(
    by=['號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱', 'facility', 'p'],
    ascending=False,
)

# total_n: sum of n over all rows that share the same parent combination,
# broadcast back onto every row of that combination.
filtered_n['total_n'] = (
    filtered_n
    .groupby(parent)['n']
    .transform('sum')
)

# Final order: largest parent combinations first, then highest p within each.
filtered_n = filtered_n.sort_values(by=['total_n', 'p'], ascending=False)

# check_sum must be around 1
# check_sum = (filtered_n.groupby(parent)['p'].sum())
# check_sum

In [10]:
# Keep the parent columns, the cause column and the probability/count columns.
# The row order stays as sorted by (total_n, p) descending earlier in the
# notebook. The sort_values(..., inplace=False) call that used to follow
# discarded its return value, so it never changed the displayed table and has
# been removed.
filtered_n = filtered_n[parent + ['肇因研判子類別名稱-主要', 'p', 'n', 'total_n']]
filtered_n

Unnamed: 0,速限-第1當事者,道路類別-第1當事者-名稱,道路型態子類別名稱,facility,號誌-號誌種類名稱,肇因研判子類別名稱-主要,p,n,total_n
623208,50-59,市區道路,直路,0,無號誌,未保持行車安全距離,0.1407,11427,81243
111720,50-59,市區道路,直路,0,無號誌,其他不當駕車行為,0.1356,11019,81243
402024,50-59,市區道路,直路,0,無號誌,恍神、緊張、心不在焉分心駕駛,0.1064,8645,81243
996456,50-59,市區道路,直路,0,無號誌,起步時未注意安全,0.0745,6050,81243
319080,50-59,市區道路,直路,0,無號誌,尚未發現肇事因素,0.0587,4769,81243
...,...,...,...,...,...,...,...,...,...
753443,0-9,鄉道,四岔路,1,無號誌,無號誌路口，左方車未讓右方車先行,0.9331,1,1
200473,0-9,縣道,其他,1,無號誌,動物竄出,0.9331,1,1
221209,0-9,縣道,其他,0,無號誌,右轉彎未依規定,0.9331,1,1
1230362,0-9,鄉道,其他,0,無號誌,違反其他標誌(線)禁制,0.9331,1,1


In [11]:
import numpy as np
# Columns that define one "road combination".
design = ['道路類別-第1當事者-名稱', '號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱']
df = filtered_n.copy()
df['facility'] = df['facility'].astype(int)

# Sum n for every (road combination, facility) pair and spread the two facility
# values into columns. reindex guarantees that both the 0 and the 1 column
# exist even if one value never occurs, so the lookups below cannot raise
# KeyError.
g = (df.groupby(design + ['facility'])['n']
       .sum()
       .unstack('facility', fill_value=0)
       .reindex(columns=[0, 1], fill_value=0)
       .rename(columns={0: 'facility_0', 1: 'facility_1'})
    )

# P(facility=1 | road combination); a zero total becomes NaN instead of
# dividing by zero.
g['total_n'] = g['facility_0'] + g['facility_1']
g['Facility_P'] = g['facility_1'] / (g['total_n']).replace(0, np.nan)

g = g.reset_index()
# Keep combinations with data and a facility share above 0.1, highest share
# first. Chaining the sort returns a new frame rather than sorting a filtered
# slice in place.
g = (
    g[(g['total_n'] > 0) & (g['Facility_P'] > 0.1)]
    .sort_values(by='Facility_P', ascending=False)
)
# Project helper; presumably restricts the rows to hotspot combinations —
# TODO confirm in utils_behaviour.
g_hotspot = filter_cpd_for_hotspot(g)

In [12]:
# Select the report columns. .copy() makes the result an independent frame so
# the column assignments below do not write into a slice (the original
# in-place operations raise SettingWithCopyWarning).
# NOTE(review): the road-category column used in the grouping is not kept
# here, so rows that differ only by road category look identical in the
# report — confirm this is intended.
g_hotspot = g_hotspot[['號誌-號誌種類名稱', '速限-第1當事者', '道路型態子類別名稱', 'Facility_P', 'total_n']].copy()
g_hotspot = g_hotspot.sort_values(
    by=['號誌-號誌種類名稱', '速限-第1當事者', 'Facility_P'],
    ascending=False,
)
g_hotspot['Facility_P'] = g_hotspot['Facility_P'].round(3)
# g_hotspot = g_hotspot[g_hotspot['total_n']>100]

# Replace category values and column names through the project lookup tables.
# Values missing from a lookup become NaN (Series.map behaviour).
for col in ['號誌-號誌種類名稱', '道路型態子類別名稱']:
    g_hotspot[col] = g_hotspot[col].map(category_value_map[col])
g_hotspot = g_hotspot.rename(columns=feature_name_map)

# g_hotspot.to_excel(f'../{computeddata}/Behaviour/Behaviour_hotspot.xlsx', index=False)

# g_hotspot.to_excel(f'../{computeddata}/Behaviour/Behaviour_hotspot.xlsx', index=False)

In [19]:
# Roundabout rows with more than 10 observations, highest facility share first.
roundabout_mask = (
    (g_hotspot['Road type (subcategory)'] == 'Roundabout')
    & (g_hotspot['total_n'] > 10)
)
roundabout_rows = g_hotspot[roundabout_mask].sort_values(by='Facility_P', ascending=False)
print(roundabout_rows)

facility                                Traffic signal type  \
754                                         Flashing signal   
528                                               No signal   
703       Traffic control signal (with pedestrian-only s...   
615                                  Traffic control signal   
498                                               No signal   
627                                  Traffic control signal   
513                                               No signal   

facility Speed limit (Party 1) Road type (subcategory)  Facility_P  total_n  
754                      40-49              Roundabout       1.000       20  
528                      50-59              Roundabout       0.472      521  
703                      50-59              Roundabout       0.408       49  
615                      40-49              Roundabout       0.333       12  
498                      30-39              Roundabout       0.328       58  
627                      50

### 去掉肇因重新分析
此分析的效果與 BehaviourV2_hotspot 相似，都是找出有設施的機率。

In [None]:
# Variable roles for the second model. 'facility' is the cause node here, so it
# is left out of `parent`. With it listed in both roles the black list
# contained a ('facility', 'facility') self-pair and a duplicated
# (result, 'facility') pair.
parent = [
    '號誌-號誌種類名稱', '號誌-號誌動作名稱','車道劃分設施-分道設施-快車道或一般車道間名稱', '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱', '車道劃分設施-分向設施子類別名稱', '道路型態子類別名稱',
    '速限-第1當事者', '道路類別-第1當事者-名稱', 'COUNTYNAME'
    ]
cause = ['facility']
result = ['事故類型及型態子類別名稱']

# White-listed edges: four road attributes, each pointing at 'facility'.
white_list = [
    ('速限-第1當事者', 'facility'),
    ('道路類別-第1當事者-名稱', 'facility'),
    ('道路型態子類別名稱', 'facility'),
    ('號誌-號誌種類名稱', 'facility'),
]

black_list = []
# cause -> parent
black_list += [(c, p) for c in cause for p in parent]
# result -> parent/cause
black_list += [(r, x) for r in result for x in (parent + cause + result)]

# Refit with the facility-centred constraints; this overwrites the three model
# variables produced by the earlier get_model call.
model_all, model_param_all, model_independence_all = get_model(data, black_list=black_list, white_list=white_list)

In [None]:
# Inspect the CPD of 'facility' given the four white-listed road attributes.
child = 'facility'
parent = [
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    '道路型態子類別名稱',
    '號誌-號誌種類名稱',
]
# Most frequent facility values (at most five), ordered by descending count.
evidence_v = list(data[child].value_counts().head(5).index)

CPDs = bn.print_CPD(model_param_all)
dfprob_cause = CPDs[child]
filtered = cpd_add_n(parent, child, dfprob_cause, data, threshold=0)

# Apply the project hotspot filter, then keep rows with p above 0.1 and n above 100.
hotspot = filter_cpd_for_hotspot(filtered)
keep_rows = (hotspot['p'] > 0.1) & (hotspot['n'] > 100)
hotspot = hotspot[keep_rows]

In [None]:
# Split the hotspot rows by facility flag, each ordered by descending p.
without_facility = hotspot['facility'] == '0'
with_facility = hotspot['facility'] == '1'
filtered_0 = hotspot[without_facility].sort_values(by=['p'], ascending=False)
filtered_1 = hotspot[with_facility].sort_values(by=['p'], ascending=False)

# Stack both groups and order the combined table for display.
full_filter = pd.concat([filtered_0, filtered_1], axis=0)
full_filter = full_filter.sort_values(
    by=['facility', '號誌-號誌種類名稱', '速限-第1當事者', 'p'],
    ascending=False,
)
full_filter
# for col in ['道路類別-第1當事者-名稱', '道路型態子類別名稱', '號誌-號誌種類名稱']:
#     full_filter[col] = full_filter[col].map(category_value_map[col])
# full_filter.rename(columns=feature_name_map, inplace=True)
# full_filter.to_excel(f'../{computeddata}/Behaviour/Origin_outlierV2.xlsx', index=False)