In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.font_manager import FontProperties

# Locate the sibling "Version3" project directory, starting from the directory
# the notebook kernel was launched in.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
version3_path = os.path.join(parent_dir, "Version3")

# Make Version3 the working directory. The switch is never undone in this
# notebook: the relative "./Data/..." reads below are resolved against it, and
# presumably the `utils` imports rely on it as well.
os.chdir(version3_path)

from utils.models import *
from utils.utils_v3 import *
from utils.plots import *

# Plot styling. The PingFang font file only exists on macOS; on other systems
# looking up its name raises, so fall back to the plain whitegrid theme instead
# of silently skipping the seaborn style altogether.
try:
    myfont = FontProperties(fname=r"/System/Library/Fonts/PingFang.ttc")
    sns.set(style="whitegrid", font=myfont.get_name())
except Exception as e:
    print(e)
    sns.set(style="whitegrid")

# Set after sns.set(), which resets rcParams: preferred sans-serif font for
# CJK labels, and an ASCII minus sign so negative ticks render with that font.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# Raw source tables, read relative to the Version3 working directory.
dataA2 = pd.read_csv("./Data/A2.csv", low_memory=False)
dataA1 = pd.read_csv("./Data/A1.csv")

[Errno 2] No such file or directory: 'C:\\System\\Library\\Fonts\\PingFang.ttc'


In [2]:
def _read_group_csv(folder, stem):
    """Read Version3/Data/<folder>/<stem>.csv (UTF-8) into a DataFrame.

    The path is built from `current_dir`, the directory the notebook was
    launched in.
    """
    csv_path = os.path.join(current_dir, f"../Version3/Data/{folder}/{stem}.csv")
    return pd.read_csv(csv_path, encoding='utf-8')


# Car records: three numbered subsets plus the "out" and "overlap" sets.
car_0 = _read_group_csv("CarData", "full_0")
car_1 = _read_group_csv("CarData", "full_1")
car_2 = _read_group_csv("CarData", "full_2")
car_out = _read_group_csv("CarData", "full_out")
car_overlap = _read_group_csv("CarData", "overlap_data")

# Motorcycle records.
motor_0 = _read_group_csv("MotorData", "full_0")
motor_1 = _read_group_csv("MotorData", "full_1")
motor_out = _read_group_csv("MotorData", "full_out")
motor_overlap = _read_group_csv("MotorData", "overlap_data")

# Pedestrian records.
pass_0 = _read_group_csv("PassData", "full_0")
pass_1 = _read_group_csv("PassData", "full_1")
pass_out = _read_group_csv("PassData", "full_out")
pass_overlap = _read_group_csv("PassData", "overlap_data")

### 這是由拓樸得出來的特徵，可新增於拓樸訓練，但不增加在full_data

In [3]:
# Mark each row with the set it was loaded from. The column is added in place,
# so the *_out / *_overlap frames themselves carry 'type' from here on.
for out_frame, overlap_frame in (
    (car_out, car_overlap),
    (motor_out, motor_overlap),
    (pass_out, pass_overlap),
):
    out_frame['type'] = 'out'
    overlap_frame['type'] = 'overlap'

# Stack the two tagged sets per road-user group (original row labels are kept).
car_out_overlap = pd.concat((car_out, car_overlap))
motor_out_overlap = pd.concat((motor_out, motor_overlap))
pass_out_overlap = pd.concat((pass_out, pass_overlap))

### 行人資料需要特別處理，因為他們沒有以下特徵

In [4]:
# Constant placeholder values for the two columns the pedestrian data lacks.
# Insertion order matters: the device column is written before the
# vehicle-category column on every frame.
PEDESTRIAN_PLACEHOLDERS = {
    '行動電話或電腦或其他相類功能裝置名稱': '非駕駛人',
    '當事者區分-類別-大類別名稱-車種': '人',
}

# pass_out_overlap was concatenated before these columns existed, so it is
# patched here alongside its source frames.
for pedestrian_frame in (pass_0, pass_1, pass_out, pass_overlap, pass_out_overlap):
    for column_name, placeholder in PEDESTRIAN_PLACEHOLDERS.items():
        pedestrian_frame[column_name] = placeholder

In [5]:
# Every loaded subset, in car -> motorcycle -> pedestrian order, stacked into
# the single frame used for the full (non-grouped) model.
_full_data_parts = [
    car_0, car_1, car_2, car_out, car_overlap,
    motor_0, motor_1, motor_out, motor_overlap,
    pass_0, pass_1, pass_out, pass_overlap,
]
full_data = pd.concat(_full_data_parts)

In [6]:
# Columns kept for modelling, in the order they appear in the selected frames.
select_lst = [
    '天候名稱',
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',

    # road surface conditions
    '路面狀況-路面鋪裝名稱',
    '路面狀況-路面狀態名稱',
    '路面狀況-路面缺陷名稱',
    '道路障礙-障礙物名稱',
    '道路障礙-視距品質名稱',
    '道路障礙-視距名稱',

    # traffic signals
    '號誌-號誌種類名稱',
    '號誌-號誌動作名稱',

    # lanes
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',

    # involved party
    '當事者屬-性-別名稱',
    '當事者事故發生時年齡',
    '保護裝備名稱',
    '行動電話或電腦或其他相類功能裝置名稱',
    '肇事逃逸類別名稱-是否肇逃',

    # major categories
    '道路型態大類別名稱',
    '事故位置大類別名稱',
    '車道劃分設施-分向設施大類別名稱',
    '事故類型及型態大類別名稱',
    '當事者區分-類別-大類別名稱-車種',
    '車輛撞擊部位大類別名稱-其他',
    '肇因研判大類別名稱-主要',

    # sub-categories
    '道路型態子類別名稱',
    '事故位置子類別名稱',
    '事故類型及型態子類別名稱',
    '肇因研判子類別名稱-主要',
    '當事者區分-類別-子類別名稱-車種',
    '當事者行動狀態子類別名稱',
    '車輛撞擊部位子類別名稱-最初',
    '車輛撞擊部位子類別名稱-其他',
    '肇因研判子類別名稱-個別',

    '死亡',
]

# Narrow the numbered subsets and the combined frame to the selected columns.
# The right-hand side is fully evaluated before any name is rebound.
car_0, car_1, car_2, motor_0, motor_1, pass_0, pass_1, full_data = [
    frame[select_lst]
    for frame in (car_0, car_1, car_2, motor_0, motor_1, pass_0, pass_1, full_data)
]

# The out/overlap frames additionally keep their 'type' tag. Note that
# select_lst itself is extended here, so it contains 'type' from now on.
select_lst.append('type')
motor_out_overlap, car_out_overlap, pass_out_overlap = [
    frame[select_lst]
    for frame in (motor_out_overlap, car_out_overlap, pass_out_overlap)
]

## 唯一值檢查
第一區塊是完整模型的資料<br/>
第二區塊是多個模型個別訓練用的資料，因此需要逐一計算各欄位的唯一值個數，並自動 drop 只有單一取值的欄位

In [11]:
print('詳細群體')
per_group_frames = [
    car_0, car_1, car_2, car_out_overlap,
    motor_0, motor_1, motor_out_overlap,
    pass_0, pass_1, pass_out_overlap,
]
for group_frame in per_group_frames:
    # Distinct non-null values per column; a count of exactly 1 marks a
    # column that is constant within this group.
    distinct_counts = group_frame.nunique()
    columns_to_drop = distinct_counts[distinct_counts == 1].index.tolist()
    print(columns_to_drop)
    # Dropped in place so the module-level frame bound to each name is updated.
    group_frame.drop(columns=columns_to_drop, inplace=True)

詳細群體
['道路障礙-視距品質名稱', '道路障礙-視距名稱']
['道路障礙-視距品質名稱', '道路障礙-視距名稱']
['道路障礙-視距品質名稱', '道路障礙-視距名稱', '當事者行動狀態大類別名稱']
[]
['道路障礙-視距品質名稱', '道路障礙-視距名稱']
['道路障礙-視距品質名稱', '道路障礙-視距名稱']
[]
['路面狀況-路面缺陷名稱', '道路障礙-視距品質名稱', '道路障礙-視距名稱', '號誌-號誌動作名稱', '行動電話或電腦或其他相類功能裝置名稱', '事故類型及型態大類別名稱', '當事者區分-類別-大類別名稱-車種']
['道路障礙-視距品質名稱', '道路障礙-視距名稱', '行動電話或電腦或其他相類功能裝置名稱', '當事者區分-類別-大類別名稱-車種']
['行動電話或電腦或其他相類功能裝置名稱', '肇事逃逸類別名稱-是否肇逃', '當事者區分-類別-大類別名稱-車種']


In [13]:
# One-hot encode every per-group frame and the combined frame with
# pd.get_dummies (default arguments), each frame encoded independently.
# NOTE(review): the pedestrian out/overlap result is bound back to
# `pass_out_overlap` (no `_dummy` suffix), unlike its siblings; the cell that
# builds X/y reads it under that name, so the binding is kept as is.
(
    car_0_dummy,
    car_1_dummy,
    car_2_dummy,
    car_out_overlap_dummy,
    motor_0_dummy,
    motor_1_dummy,
    motor_out_overlap_dummy,
    pass_0_dummy,
    pass_1_dummy,
    pass_out_overlap,
    full_data_dummy,
) = [
    pd.get_dummies(frame)
    for frame in (
        car_0, car_1, car_2, car_out_overlap,
        motor_0, motor_1, motor_out_overlap,
        pass_0, pass_1, pass_out_overlap,
        full_data,
    )
]

In [14]:
# Run get_train_test_data on every encoded frame, in the same order as before,
# unpacking each returned pair into its <group>_X / <group>_y names.
# `pass_out_overlap` already holds the encoded pedestrian out/overlap frame.
(
    (car_0_X, car_0_y),
    (car_1_X, car_1_y),
    (car_2_X, car_2_y),
    (car_out_overlap_X, car_out_overlap_y),
    (motor_0_X, motor_0_y),
    (motor_1_X, motor_1_y),
    (motor_out_overlap_X, motor_out_overlap_y),
    (pass_0_X, pass_0_y),
    (pass_1_X, pass_1_y),
    (pass_out_overlap_X, pass_out_overlap_y),
    (full_data_X, full_data_y),
) = [
    get_train_test_data(encoded_frame)
    for encoded_frame in (
        car_0_dummy, car_1_dummy, car_2_dummy, car_out_overlap_dummy,
        motor_0_dummy, motor_1_dummy, motor_out_overlap_dummy,
        pass_0_dummy, pass_1_dummy, pass_out_overlap,
        full_data_dummy,
    )
]

In [15]:
# Class balance of the target for each group, with the combined data last.
for target in (
    car_0_y, car_1_y, car_2_y, car_out_overlap_y,
    motor_0_y, motor_1_y, motor_out_overlap_y,
    pass_0_y, pass_1_y, pass_out_overlap_y,
    full_data_y,
):
    print(target.value_counts())

0    57472
1      285
Name: y, dtype: int64
0    43045
1      175
Name: y, dtype: int64
0    2084
1      35
Name: y, dtype: int64
0    8772
1      88
Name: y, dtype: int64
0    69156
1      318
Name: y, dtype: int64
0    32699
1       87
Name: y, dtype: int64
0    5353
1      18
Name: y, dtype: int64
0    529
1     11
Name: y, dtype: int64
0    2373
1      33
Name: y, dtype: int64
0    183
1     14
Name: y, dtype: int64
0    221666
1      1064
Name: y, dtype: int64


In [16]:
# (rows, columns) of each feature matrix, with the combined data last.
for features in (
    car_0_X, car_1_X, car_2_X, car_out_overlap_X,
    motor_0_X, motor_1_X, motor_out_overlap_X,
    pass_0_X, pass_1_X, pass_out_overlap_X,
    full_data_X,
):
    print(features.shape)

(57757, 428)
(43220, 413)
(2119, 324)
(8860, 432)
(69474, 399)
(32786, 363)
(5371, 375)
(540, 126)
(2406, 174)
(197, 180)
(222730, 509)


In [None]:
import pickle
import time
import gc

# (name, feature matrix, target) for every dataset to fit, smallest groups
# first and the combined data last.
models = [
    ("pass_0", pass_0_X, pass_0_y),
    ("pass_1", pass_1_X, pass_1_y),
    ("pass_out_overlap", pass_out_overlap_X, pass_out_overlap_y),
    ("car_0", car_0_X, car_0_y),
    ("car_1", car_1_X, car_1_y),
    ("car_2", car_2_X, car_2_y),
    ("car_out_overlap", car_out_overlap_X, car_out_overlap_y),
    ("motor_0", motor_0_X, motor_0_y),
    ("motor_1", motor_1_X, motor_1_y),
    ("motor_out_overlap", motor_out_overlap_X, motor_out_overlap_y),
    ("full_data", full_data_X, full_data_y),
]


def fit_and_save(datasets, fit_func, label, out_dir, prefix=""):
    """Fit `fit_func` on every dataset and pickle each result.

    Parameters
    ----------
    datasets : iterable of (name, X, y)
        Datasets to fit, processed in order.
    fit_func : callable
        Called as ``fit_func(X.astype(float), y)``; must return a 3-tuple that
        is stored under the keys 'y', 'decision_scores' and 'indices'.
    label : str
        Model label used in the progress messages.
    out_dir : str
        Directory the pickles are written to; created if it does not exist.
    prefix : str
        File-name prefix placed before the dataset name.

    Each pickle also records the wall-clock fitting time as 'elapsed_time'.
    """
    # Create the target folder up front so a missing directory cannot throw
    # away a fit that has already taken many minutes.
    os.makedirs(out_dir, exist_ok=True)
    for name, X, y in datasets:
        print(f'{name} {label} start')
        start_time = time.perf_counter()
        y_out, decision_scores, indices = fit_func(X.astype(float), y)
        elapsed_time = time.perf_counter() - start_time
        with open(f"{out_dir}/{prefix}{name}.pkl", "wb") as f:
            pickle.dump({
                'y': y_out,
                'decision_scores': decision_scores,
                'indices': indices,
                'elapsed_time': elapsed_time
            }, f)
        print(f'{name} {label} done in {elapsed_time:.2f} seconds')
        # Release this dataset's results before the next fit. X and y stay
        # alive because the caller's `datasets` still references them.
        del y_out, decision_scores, indices
        gc.collect()


# Logistic (grid search; the k-fold variant is logistic_cm_kfold)
fit_and_save(models, logistic_cm_gridsearch, "logistic",
             "../Models/ModelPerformanceNofold/logistic", prefix="子類別_")

# SVC (grid search; the k-fold variant is linear_svc_kfold(..., n_jobs=14))
fit_and_save(models, linear_svc_cm_gridsearch, "svc",
             "../Models/ModelPerformanceNofold/svc", prefix="子類別_")

# XGBoost (disabled)
# fit_and_save(models, xgboost_cm_kfold, "xgboost",
#              "../Models/ModelPerformance/xgboost")

pass_0 logistic start
Best parameters found by GridSearchCV: {'C': 100, 'penalty': 'l1'}
pass_0 logistic done in 9.18 seconds
pass_1 logistic start
Best parameters found by GridSearchCV: {'C': 100, 'penalty': 'l1'}
pass_1 logistic done in 48.89 seconds
pass_out_overlap logistic start
Best parameters found by GridSearchCV: {'C': 1, 'penalty': 'l2'}
pass_out_overlap logistic done in 3.36 seconds
car_0 logistic start
Best parameters found by GridSearchCV: {'C': 100, 'penalty': 'l1'}
car_0 logistic done in 1601.66 seconds
car_1 logistic start
Best parameters found by GridSearchCV: {'C': 100, 'penalty': 'l1'}
car_1 logistic done in 2334.54 seconds
car_2 logistic start
Best parameters found by GridSearchCV: {'C': 0.01, 'penalty': 'l1'}
car_2 logistic done in 21.75 seconds
car_out_overlap logistic start
Best parameters found by GridSearchCV: {'C': 100, 'penalty': 'l1'}
car_out_overlap logistic done in 356.86 seconds
motor_0 logistic start
