設施和當事者區分（類別：腳踏自行車）是否有關聯

In [None]:
import os
import pandas as pd

# Locate the "utils" folder that sits next to the current working directory
# and switch into it, so the relative paths used by later cells (e.g.
# '../ComputedData/...') are resolved from there.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
analyze_path = os.path.join(parent_dir, "utils")

os.chdir(analyze_path)

In [None]:
import bnlearn as bn

import matplotlib.pyplot as plt
# Set the default matplotlib font family (presumably chosen so the Chinese
# category labels in the plots below render correctly - confirm on your OS).
plt.rcParams.update({'font.family': ['Arial Unicode Ms']})

In [None]:
# Load the A1 and A2 accident tables, tag every row with the table it came
# from plus a unit accident counter, then stack both into one frame.
filtered_A1 = pd.read_csv('../ComputedData/Accident/DataA1_with_youbike.csv').assign(
    source='A1', num_accidents=1
)
filtered_A2 = pd.read_csv('../ComputedData/Accident/DataA2_with_youbike.csv').assign(
    source='A2', num_accidents=1
)
combined_data = pd.concat((filtered_A1, filtered_A2), ignore_index=True)

# Replace outliers with the median of the in-range values.
speed_col = '速限-第1當事者'
age_col = '當事者事故發生時年齡'
speed = combined_data[speed_col]
age = combined_data[age_col]

# Medians are taken over plausible values only: speed limit < 200, 0 < age < 100.
median_speed = speed[speed.lt(200)].median()
median_age = age[age.gt(0) & age.lt(100)].median()

# The replacement masks are spelled out explicitly (not as the negation of the
# valid masks) so that missing values are left untouched.
combined_data.loc[speed.ge(200), speed_col] = median_speed
combined_data.loc[age.ge(100) | age.le(0), age_col] = median_age

In [None]:
# Split the rows by party rank: rank 1 versus every rank greater than 1,
# then show the shape of both subsets.
party_rank = combined_data['當事者順位']
combined_data1 = combined_data.loc[party_rank.eq(1)]
combined_data2 = combined_data.loc[party_rank.gt(1)]
(combined_data1.shape, combined_data2.shape)

In [None]:
def create_crosstab(data, type='大', scale='100'):
    """Return, per vehicle category, the share of rows at each YouBike count.

    Rows of the result are the values of the column
    ``當事者區分-類別-{type}類別名稱-車種``; columns are the distinct values of
    ``youbike_{scale}m_count``. Each row is normalised to sum to 1, and only
    categories with more than 30 rows in ``data`` are kept.

    Args:
        data: DataFrame containing both columns named above.
        type: Fragment inserted into the category column name (e.g. '大' or
            '子'). The name shadows the builtin but is kept because callers
            pass it by keyword.
        scale: Fragment inserted into the count column name (e.g. '50', '100').

    Returns:
        DataFrame of row-wise proportions, sorted ascending by the share of
        the ``0`` count column. If the data contains no ``0`` count at all,
        the table is returned in crosstab order instead of raising KeyError.
    """
    type_name = f'當事者區分-類別-{type}類別名稱-車種'
    count_name = f'youbike_{scale}m_count'

    # Build the contingency table once; the normalised table is derived from
    # it instead of running a second crosstab.
    raw_counts = pd.crosstab(data[type_name], data[count_name])
    row_sum = raw_counts.sum(axis=1)

    # Drop sparsely observed categories before normalising (more than 30 rows
    # required), so every remaining divisor is strictly positive.
    enough_rows = row_sum > 30
    table = raw_counts.loc[enough_rows].div(row_sum[enough_rows], axis=0)

    # Sorting by the zero-count share is only possible when that column exists.
    if 0 not in table.columns:
        return table
    return table.sort_values(by=0, ascending=True)

In [None]:
# For first-ranked parties: per vehicle sub-category, the share of rows whose
# 50 m YouBike count is non-zero (sum of every column except the 0 column).
c1 = create_crosstab(combined_data1, type='子', scale='50')
c1 = c1.loc[:, c1.columns != 0].sum(axis=1)

# Horizontal bar chart of those shares.
ax = c1.plot.barh(color='#36AA5B', figsize=(8, 6))
ax.set_xlabel('Ratio')
ax.grid(axis='x')
ax.figure.tight_layout()

### 分析第二順位和第一順位的關係

In [None]:
types = '子'
vehicle_col = f'當事者區分-類別-{types}類別名稱-車種'

# Synthetic accident key: date, time and location joined with underscores.
# NOTE(review): the merge below assumes this key identifies one accident; if
# two accidents share date/time/location, rows will be duplicated - confirm.
combined_data['事故ID'] = combined_data['發生日期'].astype(str).str.cat(
    [combined_data['發生時間'].astype(str), combined_data['發生地點'].astype(str)],
    sep='_',
)

# Rank-1 rows keep every column; rank-2 rows contribute only their vehicle type.
rank = combined_data['當事者順位']
first_rank = combined_data.loc[rank.eq(1.0)].copy()
second_rank = combined_data.loc[rank.eq(2.0), ['事故ID', vehicle_col]].rename(
    columns={vehicle_col: '第二順位車種'}
)

# Attach the second party's vehicle type to each first-party row.
result = first_rank.merge(second_rank, on='事故ID', how='left').rename(
    columns={vehicle_col: '第一順位車種'}
)

# Keep only accidents for which a second party was found.
result_filter = result.loc[result['第二順位車種'].notna()]
result_filter.shape

In [None]:
# Keep only vehicle types that occur more than 1000 times: first filter on the
# first party's type, then on the second party's type (counted on the rows
# that survived the first filter).
first_counts = result_filter['第一順位車種'].value_counts()
frequent_first = first_counts.loc[first_counts.gt(1000)].index
result_filter = result_filter[result_filter['第一順位車種'].isin(frequent_first)]

second_counts = result_filter['第二順位車種'].value_counts()
frequent_second = second_counts.loc[second_counts.gt(1000)].index
result_filter = result_filter[result_filter['第二順位車種'].isin(frequent_second)]

In [None]:
import plotly.graph_objects as go

# Sankey diagram of accident pairs: first-party vehicle type on the left,
# second-party vehicle type on the right, link width = number of rows.
df = result_filter.dropna(subset=['第一順位車種', '第二順位車種'])

primary_types = sorted(df['第一順位車種'].unique())
secondary_types = sorted(df['第二順位車種'].unique())
left_nodes = [f"{name}(主)" for name in primary_types]
right_nodes = [f"{name}(次)" for name in secondary_types]

# Node positions: left labels come first, right labels follow directly after.
all_nodes = left_nodes + right_nodes
left_map = dict(zip(left_nodes, range(len(left_nodes))))
right_map = dict(zip(right_nodes, range(len(left_nodes), len(all_nodes))))

# One row per (first type, second type) pair with its frequency.
flows = (
    df.groupby(['第一順位車種', '第二順位車種'])
    .size()
    .reset_index(name='count')
)

sources = flows['第一順位車種'].map(lambda vehicle: left_map[f"{vehicle}(主)"])
targets = flows['第二順位車種'].map(lambda vehicle: right_map[f"{vehicle}(次)"])
values = flows['count']

# Links touching a type whose name contains "慢車" are drawn green; every
# other link is a translucent grey. The original cell also carried a disabled
# variant that matched on "腳踏自行車" instead.
# NOTE(review): `types` is set to '子' in an earlier cell - confirm that the
# sub-category labels actually contain "慢車", otherwise nothing is highlighted.
link_colors = [
    "#349a76" if ("慢車" in first or "慢車" in second) else "rgba(150,150,150,0.3)"
    for first, second in zip(flows['第一順位車種'], flows['第二順位車種'])
]

node_style = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=all_nodes,
    color=["#6fa8dc"] * len(left_nodes) + ["#f6b26b"] * len(right_nodes),
)
link_style = dict(
    source=sources,
    target=targets,
    value=values,
    color=link_colors,
)

fig = go.Figure(data=[go.Sankey(node=node_style, link=link_style)])
fig.update_layout(
    title_text="Sankey diagram",
    font_size=12,
    width=800,
    height=1200,
)
fig.show()

### 分析設施和腳踏車事故的相關性
設施數量越多，腳踏車事故的發生比例越高

In [None]:
# Name of the YouBike count column used as the facility measure in the cells
# below (presumably the number of stations within 100 m - confirm).
bicycle_scale = 'youbike_100m_count'

In [None]:
# First-party rows whose vehicle sub-category is a pedal bicycle.
is_bicycle = combined_data1['當事者區分-類別-子類別名稱-車種'].eq('腳踏自行車')
bicycle = combined_data1.loc[is_bicycle]

# Side by side: distribution of the facility count among bicycle rows versus
# among all first-party rows.
(
    bicycle[bicycle_scale].value_counts(normalize=True),
    combined_data1[bicycle_scale].value_counts(normalize=True),
)

In [None]:
# First-party rows whose vehicle sub-category is a pedal bicycle (re-created
# here so this cell can run on its own).
bicycle = combined_data1[combined_data1['當事者區分-類別-子類別名稱-車種'] == '腳踏自行車']

# Analyse accidents in which either party is a bicycle.
# result_filter was produced by boolean indexing, so take an explicit copy
# before adding columns to avoid pandas' chained-assignment warning.
result_filter = result_filter.copy()

# 1 when the first or the second party is a bicycle, else 0. Vectorised
# comparisons replace the former row-by-row apply; results are identical.
result_filter['accident_include_bike'] = (
    result_filter['第一順位車種'].eq('腳踏自行車')
    | result_filter['第二順位車種'].eq('腳踏自行車')
).astype('int64')

# Collapse the facility count into a 0/1 flag; this overwrites the count
# column in result_filter. Missing counts compare False and become 0.
result_filter[bicycle_scale] = result_filter[bicycle_scale].gt(0).astype('int64')

# 2x2 contingency counts: bicycle involved x facility present.
result_filter[['accident_include_bike', bicycle_scale]].value_counts()

In [None]:
youbike_type = 'youbike_300m_count'

# Flag accidents in which the first or the second party is a bicycle. The mask
# is computed once (vectorised) and reused for both the flag column and the
# bicycle-only subset.
bike_involved = (
    result['第一順位車種'].eq('腳踏自行車')
    | result['第二順位車種'].eq('腳踏自行車')
)
result['accident_include_bike'] = bike_involved.astype('int64')
include_youbike = result[bike_involved]

# Accident counts per facility-count value: all accidents vs. bicycle ones.
total_accidents_by_facility = result.groupby(youbike_type).size()
bike_accidents_by_facility = include_youbike.groupby(youbike_type).size()

# Share of bicycle accidents per facility-count value; values with no bicycle
# accident at all get a ratio of 0 instead of NaN.
facility_ratio = (bike_accidents_by_facility / total_accidents_by_facility).fillna(0)
facility_ratio_df = facility_ratio.reset_index()
facility_ratio_df.columns = [youbike_type, 'bike_accident_ratio']

plt.figure(figsize=(8, 6))
# Draw the bars exactly once (the earlier version drew them twice on top of
# each other) and keep the handles for the annotations below.
bars = plt.bar(
    facility_ratio_df[youbike_type],
    facility_ratio_df['bike_accident_ratio'],
    color="#59B275"
)
plt.xlabel('Number of YouBike Facilities around Accident')
plt.ylabel('Bicycle Accident Ratio')
plt.grid(axis='y', linestyle='--', alpha=0.5)

# Annotate every bar with the total number of accidents behind its ratio.
for bar, xval in zip(bars, facility_ratio_df[youbike_type]):
    height = bar.get_height()
    total = total_accidents_by_facility.loc[xval]
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        height,
        str(total),
        ha='center', va='bottom', fontsize=9
    )
plt.tight_layout()
plt.show()

### 分析附近有腳踏車租借情況下的差異

In [None]:
# First-party rows with a non-zero value in the 200 m YouBike count column.
has_station_nearby = combined_data1['youbike_200m_count'].gt(0)
bike_around = combined_data1.loc[has_station_nearby]

In [None]:
# Relative frequency of each vehicle sub-category among the rows in bike_around.
bike_around['當事者區分-類別-子類別名稱-車種'].value_counts(normalize=True)

In [None]:
# Baseline for comparison: relative frequency of each vehicle sub-category
# among all first-party rows.
combined_data1['當事者區分-類別-子類別名稱-車種'].value_counts(normalize=True)