In [3]:
import os
import sys
import time
import ast
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.font_manager import FontProperties

# Resolve the working directory and, relative to it, the path of the legacy
# tdamapper core module.
current_dir = os.getcwd()
version3_path = os.path.join(
    current_dir,
    *("TrafficTDApython", "Version3", "tdamapper", "core_old.py"),
)

from utils.utils_v3 import *
from utils.plots import *
from utils.mappping_model import *

# Plot styling: whitegrid theme plus a CJK-capable font so Chinese labels render.
#
# The PingFang font file only exists on macOS. Looking it up is best effort:
# on failure the error is printed and the theme is still applied without the
# font override (previously a missing font file skipped sns.set() altogether,
# so the whitegrid style was silently lost).
try:
    myfont = FontProperties(fname=r"/System/Library/Fonts/PingFang.ttc")
    cjk_font_name = myfont.get_name()
except Exception as e:
    print(e)
    cjk_font_name = None

if cjk_font_name is not None:
    sns.set(style="whitegrid", font=cjk_font_name)
else:
    sns.set(style="whitegrid")

# Set after sns.set() so these values are not replaced by the theme defaults.
# 'Microsoft YaHei' is the sans-serif fallback used where PingFang is absent.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# Input files: the A1 and A2 accident tables and the per-node info table
# (its 'ids' column is parsed further down in the notebook).
a1_csv_path = "../Version3/Data/A1.csv"
a2_csv_path = "../Version3/Data/A2.csv"
info_csv_path = "./Data/CarData/full_info.csv"

dataA1 = pd.read_csv(a1_csv_path)
dataA2 = pd.read_csv(a2_csv_path, low_memory=False)
info = pd.read_csv(info_csv_path, low_memory=False)

[Errno 2] No such file or directory: 'C:\\System\\Library\\Fonts\\PingFang.ttc'


In [15]:
# Columns kept by preprocess(). Order matters only for readability.
select_lst = [
    # Month of occurrence: used to sample a fixed number of records per month.
    '發生月份',

    # Weather and light
    '天候名稱',
    '光線名稱',

    # Road class and speed limit (first party)
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',

    # Road surface
    '路面狀況-路面鋪裝名稱',
    '路面狀況-路面狀態名稱',
    '路面狀況-路面缺陷名稱',

    # Obstacles and sight distance
    '道路障礙-障礙物名稱',
    '道路障礙-視距品質名稱',
    '道路障礙-視距名稱',

    # Traffic signals
    '號誌-號誌種類名稱',
    '號誌-號誌動作名稱',

    # Lane-division facilities
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',

    # Party attributes
    '當事者屬-性-別名稱',
    '當事者事故發生時年齡',
    '保護裝備名稱',
    '行動電話或電腦或其他相類功能裝置名稱',
    '肇事逃逸類別名稱-是否肇逃',
    '死亡受傷人數',

    # Major-category columns
    '道路型態大類別名稱',
    '事故位置大類別名稱',
    '車道劃分設施-分向設施大類別名稱',
    '事故類型及型態大類別名稱',
    '當事者區分-類別-大類別名稱-車種',
    '當事者行動狀態大類別名稱',
    '車輛撞擊部位大類別名稱-最初',
    '車輛撞擊部位大類別名稱-其他',

    # Sub-category columns (currently not selected):
    # '道路型態子類別名稱', '事故位置子類別名稱', '事故類型及型態子類別名稱', '肇因研判子類別名稱-主要',
    # '當事者區分-類別-子類別名稱-車種', '當事者行動狀態子類別名稱', '車輛撞擊部位子類別名稱-最初',
    # '車輛撞擊部位子類別名稱-其他', '肇因研判子類別名稱-個別',

    # The "main" and "individual" cause columns differ in only two
    # observations, so only the main one is kept.
    '肇因研判大類別名稱-主要',
    # '肇因研判大類別名稱-個別',

    # Coordinates
    '經度',
    '緯度',
]

def preprocess(input_data, select_lst):
    """Reduce a raw accident table to the driver-level rows used for analysis.

    Steps, in order:
      1. keep rows where '當事者順位' == 1 (one row per accident);
      2. keep rows where '發生月份' < 11;
      3. keep only the columns in ``select_lst`` and append the columns
         returned by ``split_death_injury`` for '死亡受傷人數';
      4. fill missing values of the non-numeric selected columns with '未紀錄';
      5. keep rows with speed limit < 200 and 0 < age < 100;
      6. keep rows whose '當事者行動狀態大類別名稱' is '車的狀態', then drop
         that column and '死亡受傷人數';
      7. drop rows with excluded main-cause categories or with an unrecorded
         device-use / first-impact value (outliers that distort the MCA
         factor scores).

    Parameters
    ----------
    input_data : pandas.DataFrame
        Raw table; must contain '當事者順位' and every column in ``select_lst``.
    select_lst : list of str
        Columns to keep. Must include every column filtered on above.

    Returns
    -------
    pandas.DataFrame
        The filtered rows. The index is not reset after filtering.
    """
    # Numeric columns are excluded from the '未紀錄' fill: a string placeholder
    # would turn them into mixed str/number columns, which makes the range
    # comparisons below raise TypeError and corrupts the coordinates. Missing
    # speed limit / age values are instead removed by the range filter (NaN
    # comparisons are False); missing coordinates stay NaN.
    numeric_columns = {'發生月份', '速限-第1當事者', '當事者事故發生時年齡', '經度', '緯度'}

    # First-ranked party only: the analysis is about the accident as a whole.
    main_data = input_data[input_data['當事者順位'] == 1].reset_index(drop=True)
    sample_data = main_data[main_data['發生月份'] < 11]
    selected_data = sample_data[select_lst]

    # Split the combined casualty field into separate columns and append them.
    # NOTE(review): concat aligns on the index, so this assumes
    # split_death_injury returns rows indexed like its input - confirm.
    split_death_injury_data = split_death_injury(selected_data['死亡受傷人數'])
    full_data = pd.concat([selected_data, split_death_injury_data], axis=1)

    # Fill missing categorical values with the "not recorded" label.
    categorical_columns = [col for col in select_lst if col not in numeric_columns]
    full_data[categorical_columns] = full_data[categorical_columns].fillna('未紀錄')

    # Plausible speed-limit and age ranges.
    in_range = (
        (full_data['速限-第1當事者'] < 200)
        & (full_data['當事者事故發生時年齡'] < 100)
        & (full_data['當事者事故發生時年齡'] > 0)
    )
    full_data = full_data[in_range].drop(columns=['死亡受傷人數'])

    # Drivers only; the column is constant afterwards, so drop it.
    full_data = full_data[full_data['當事者行動狀態大類別名稱'] == '車的狀態']
    full_data = full_data.drop(columns=['當事者行動狀態大類別名稱'])

    # Remove outlier categories (they affect the MCA factor scores).
    excluded_causes = ['非駕駛者', '無(非車輛駕駛人因素)', '無(車輛駕駛者因素)']
    keep = (
        ~full_data['肇因研判大類別名稱-主要'].isin(excluded_causes)
        & (full_data['行動電話或電腦或其他相類功能裝置名稱'] != '未紀錄')
        & (full_data['車輛撞擊部位大類別名稱-最初'] != '未紀錄')
    )
    return full_data[keep]

full_dataA1 = preprocess(dataA1, select_lst)
full_dataA2 = preprocess(dataA2, select_lst)

# Downsample both tables by month.
# sampling_ratio: downsampling ratio, tuned to the raw A1 / A2 volumes.
# total_ratio: the A1/A2 row ratio, passed on so it can be preserved.
# NOTE(review): the numeric pairs in this cell's recorded output are
# presumably printed by downsample_by_month_simple - confirm.
sampling_ratio = 0.33
total_ratio = len(full_dataA1) / len(full_dataA2)
downsampled_A1, downsampled_A2 = downsample_by_month_simple(
    full_dataA1, full_dataA2, sampling_ratio, total_ratio
)

# Stack A1 on top of A2; the month was only needed for the sampling step.
rbind_data = pd.concat([downsampled_A1, downsampled_A2], axis=0, ignore_index=True)
rbind_data = rbind_data.drop(columns=['發生月份'])

# Age and speed-limit handling is delegated to the project helper.
rbind_data = process_age_speed(rbind_data)

# Single-valued columns carry no information. The list was previously built
# but never applied; it is now dropped before encoding. This can reduce the
# number of dummy columns compared with earlier recorded output.
columns_to_drop = [
    column for column in rbind_data.columns
    if rbind_data[column].nunique() == 1
]

# Treat the speed limit as a category so get_dummies one-hot encodes it.
rbind_data["速限-第1當事者"] = rbind_data["速限-第1當事者"].astype(str)
rbind_data = rbind_data.drop(columns=columns_to_drop)

# One-hot encode the categorical columns.
dummy_data = pd.get_dummies(rbind_data)
print('dummy_data:', dummy_data.shape)
mapper_numpy = dummy_data.to_numpy()

44 10057
45 10286
40 9143
35 8000
36 8229
39 8914
47 10743
43 9829
40 9143
47 10743
dummy_data: (95503, 146)


In [10]:
import folium
from folium.plugins import HeatMap

# Build [latitude, longitude] pairs for the heat layer; rows missing either
# coordinate are skipped.
heat_data = rbind_data[["緯度", "經度"]].dropna().values.tolist()

# Base map with a fixed centre and zoom level.
map_taiwan = folium.Map(location=[23.5, 121], zoom_start=7)

# Add the heat layer.
HeatMap(heat_data).add_to(map_taiwan)

# Save the map; create the output directory first so a fresh checkout
# without ./Map does not fail on save.
heatmap_path = "./Map/traffic_heatmap.html"
os.makedirs(os.path.dirname(heatmap_path), exist_ok=True)
map_taiwan.save(heatmap_path)
print("地圖已生成，保存為 traffic_heatmap.html")


地圖已生成，保存為 traffic_heatmap.html


In [5]:
# For every row of `info`, summarise the dummy-encoded rows it references:
# per-column sum divided by the number of referenced rows, plus the share of
# all rows it covers ('資料數量').
total_rows = len(dummy_data)
all_proportion_tables = []

for raw_ids in info['ids']:
    # 'ids' is stored as the text of a Python list; parse it safely.
    member_positions = ast.literal_eval(raw_ids)
    members = dummy_data.iloc[member_positions]

    profile = members.sum() / len(members)
    profile['資料數量'] = len(member_positions) / total_rows

    # One single-row frame per entry; stacked below.
    all_proportion_tables.append(profile.to_frame(name='比例').T)

final_table = pd.concat(all_proportion_tables, ignore_index=True)

# Drop columns that take a single value across all rows.
columns_to_drop = [
    column for column in final_table.columns
    if final_table[column].nunique() == 1
]
final_table = final_table.drop(columns=columns_to_drop)

In [7]:
# Build features / target from the proportion table, then run the project's
# k-fold ridge and lasso helpers. Each helper returns three values that are
# kept here as (y, decision scores, indices).
# NOTE(review): the recorded output of this cell shows R^2 of about -0.01 and
# 0.00 (presumably ridge then lasso, in call order), i.e. neither fit does
# better than predicting the mean - confirm before relying on the scores
# downstream.
pass_X, pass_y = get_train_test_data(final_table, classify=False)
pass_y_ridge, pass_decision_scores_ridge, pass_indices_ridge = ridge_cm_kfold(pass_X, pass_y)
pass_y_lasso, pass_decision_scores_lasso, pass_indices_lasso = lasso_cm_kfold(pass_X, pass_y)

Mean Squared Error (MSE): 0.0066
R^2 Score: -0.0143
Mean Squared Error (MSE): 0.0065
R^2 Score: -0.0005


In [9]:
info['死亡比例'] = final_table['死亡']
# NOTE(review): this assumes pass_decision_scores_lasso is ordered like the
# rows of `info`. The k-fold helper also returns pass_indices_lasso, which is
# not used here - confirm the scores do not need re-ordering by those indices.
info['score'] = pass_decision_scores_lasso

from collections import defaultdict

# Parse the stringified id lists once. (Assigning the parsed list to the row
# yielded by iterrows() only changes a temporary copy, so the previous code
# had to parse every list twice.)
parsed_ids = [ast.literal_eval(raw_ids) for raw_ids in info['ids']]

# Step 1: count in how many id lists each row index appears.
index_counts = defaultdict(int)
for id_list in parsed_ids:
    for idx in id_list:
        index_counts[idx] += 1

# Step 2: weight of a row index = sum over the lists containing it of
# score / (number of lists containing it), i.e. the average score.
weights = defaultdict(float)
for id_list, score in zip(parsed_ids, info['score']):
    for idx in id_list:
        weights[idx] += score / index_counts[idx]

# Step 3: tabulate the weights, ordered by row index.
weights_df = (
    pd.DataFrame(list(weights.items()), columns=['index', 'weight'])
    .sort_values(by='index')
    .reset_index(drop=True)
)

# Attach the weights to the records with an index join so final_data keeps
# rbind_data's own index (merge(left_index=True, right_on=...) does not).
# The 'index' column is kept so the column layout matches the previous merge.
final_data = (
    rbind_data
    .assign(index=rbind_data.index)
    .join(weights_df.set_index('index')['weight'], how='left')
)

In [10]:
import folium
from folium import plugins
import matplotlib

# Map the weight range onto the 'viridis' colormap.
norm = matplotlib.colors.Normalize(vmin=final_data['weight'].min(), vmax=final_data['weight'].max())
cmap = matplotlib.cm.ScalarMappable(norm=norm, cmap='viridis')

# Centre the map on the mean coordinate of all records.
m = folium.Map(location=[final_data['緯度'].mean(), final_data['經度'].mean()], zoom_start=12)

# Only rows with a weight AND both coordinates can be drawn; a marker with a
# missing coordinate would otherwise be handed to folium.
plot_data = final_data.dropna(subset=['weight', '緯度', '經度'])

# One circle marker per drawable row, coloured by weight. The saved HTML
# grows with the number of rows.
for lat, lon, weight in zip(plot_data['緯度'], plot_data['經度'], plot_data['weight']):
    color = matplotlib.colors.to_hex(cmap.to_rgba(weight))
    folium.CircleMarker(
        location=(lat, lon),
        radius=6,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.8,
        popup=f"Weight: {weight:.3f}"
    ).add_to(m)

# Save the map as an HTML file, creating the output directory if needed.
map_file_path = "./Map/car_weighted_map.html"
os.makedirs(os.path.dirname(map_file_path), exist_ok=True)
m.save(map_file_path)

# Show the output path as the cell result.
map_file_path

'./Map/car_weighted_map.html'