In [1]:
import pickle
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from tdamapper.core import MapperAlgorithm
from tdamapper.cover import CubicalCover
from tdamapper.plot import MapperLayoutInteractive, MapperLayoutStatic
from tdamapper.clustering import FailSafeClustering
from sklearn import metrics

from functions import *
from chi import *
from regressionP import *
from models import *

In [23]:
# The six A2 part files, in concatenation order
tma2_paths = (
    "./Data/NPA_TMA2_1.csv",
    "./Data/NPA_TMA2_2.csv",
    "./Data/NPA_TMA2_3.csv",
    "./Data/NPA_TMA2_4_new.csv",
    "./Data/NPA_TMA2_5.csv",
    "./Data/NPA_TMA2_6.csv",
)

# Every file is read in full and its last two rows are discarded
# (presumably a non-data footer -- TODO confirm against the raw CSVs).
data1, data2, data3, data4, data5, data6 = (
    pd.read_csv(csv_path, low_memory=False)[:-2] for csv_path in tma2_paths
)

dataA2 = pd.concat([data1, data2, data3, data4, data5, data6], ignore_index=True)

# Alternative A1 source kept for reference:
# dataA1 = pd.read_csv("./Data/A1_617.csv")[:-2]
dataA1 = pd.read_csv("./Data/NPA_TMA1_new.csv")[:-2]

In [24]:
def preprocess(input_data, select_lst):
    """Filter the raw records and split the combined casualty column.

    Rows are kept when '當事者順位' equals 1 and '發生月份' is below 7.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Raw records; must contain '當事者順位', '發生月份' and every column
        named in ``select_lst``.
    select_lst : list of str
        Columns carried into the feature frame; must include '死亡受傷人數'.

    Returns
    -------
    dist_df : pandas.DataFrame
        The selected columns with '死亡受傷人數' removed and the columns
        returned by ``split_death_injury`` appended (presumably '死亡' and
        '受傷', which later cells read -- confirm against the helper).
    sample_data : pandas.DataFrame
        The filtered rows with all original columns, indexed 0..n-1.
    """
    first_party = input_data[input_data['當事者順位'] == 1]
    # Reset the index only after *all* row filters. Resetting before the month
    # filter left a gapped index, so the column-wise concat below would
    # misalign (and grow NaN rows) if split_death_injury returns a frame with
    # a fresh 0..n-1 index.
    sample_data = first_party[first_party['發生月份'] < 7].reset_index(drop=True)
    dataA = sample_data[select_lst]

    death_injury_data = split_death_injury(dataA['死亡受傷人數'])
    dist_df = pd.concat([dataA, death_injury_data], axis=1).drop(columns=['死亡受傷人數'])

    return dist_df, sample_data

# Columns fed to preprocess() for the Mapper input
select_lst = [
    '光線名稱',
    '道路類別-第1當事者-名稱',
    # '道路障礙-視距品質名稱' is left out: collinear with the speed limit
    '速限-第1當事者',
    '道路型態大類別名稱',
    '事故位置大類別名稱',
    # Left out because almost every record falls in the "no defect" category:
    # '路面狀況-路面鋪裝名稱', '路面狀況-路面缺陷名稱', '道路障礙-障礙物名稱'
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度',
    '緯度',
]

# preprocess() returns (feature frame, filtered raw frame)
dist_dfA1 = preprocess(dataA1, select_lst)
dist_dfA2 = preprocess(dataA2, select_lst)

# Stack the A1 and A2 feature frames under one fresh index
rbind_data = pd.concat((dist_dfA1[0], dist_dfA2[0]), ignore_index=True)

# Cap the injury count: every value above 1 becomes 2
multi_injury = rbind_data['受傷'] > 1
rbind_data.loc[multi_injury, '受傷'] = 2

# Binarise the speed limit: 1 when above 60, otherwise 0
rbind_data['速限-第1當事者'] = (rbind_data['速限-第1當事者'] > 60).astype('int64')

dist_df = process_data(rbind_data)

scaler = StandardScaler()
scaled_values = scaler.fit_transform(dist_df)
full_dist = pd.DataFrame(scaled_values, columns=dist_df.columns)

# Outcome columns and coordinates are excluded from the Mapper input
X1 = full_dist.drop(columns=['受傷', '死亡', '經度', '緯度']).to_numpy()

full_dist.head()

Unnamed: 0,光線名稱,道路類別-第1當事者-名稱,速限-第1當事者,道路型態大類別名稱,事故位置大類別名稱,號誌-號誌種類名稱,車道劃分設施-分向設施大類別名稱,車道劃分設施-分道設施-快車道或一般車道間名稱,車道劃分設施-分道設施-快慢車道間名稱,車道劃分設施-分道設施-路面邊線名稱,事故類型及型態大類別名稱,經度,緯度,死亡,受傷
0,0.123017,2.096435,-0.148895,1.213567,0.486423,-0.712247,-1.628151,1.775343,-1.706019,-0.916,1.800087,-0.612124,-1.839571,14.069323,-2.803115
1,1.452595,-0.257342,-0.148895,-0.812716,-0.871796,2.776686,-1.628151,-0.819002,-1.706019,-0.916,-0.025626,1.113577,0.990073,14.069323,-0.620256
2,-1.206562,2.881027,-0.148895,1.213567,1.165533,-0.712247,1.18058,1.126756,-1.706019,-0.916,1.800087,-2.416285,-0.610402,14.069323,-2.803115
3,-1.206562,-0.257342,-0.148895,-0.812716,-0.871796,2.776686,-0.223786,-0.819002,0.423384,1.091703,-0.025626,0.940543,1.031724,14.069323,-0.620256
4,1.452595,2.096435,-0.148895,1.213567,1.165533,-0.712247,0.478397,-0.819002,0.423384,-0.916,1.800087,1.826145,0.205783,14.069323,-2.803115


In [25]:
# (rows, columns) of the A2 feature frame, i.e. element 0 of the tuple returned by preprocess()
dist_dfA2[0].shape

(167918, 15)

In [26]:
start_time = time.time()

# Lens: projection of X1 onto its first 10 principal components
lens1 = PCA(n_components=10).fit_transform(X1)

# Cover of the lens space and the clustering applied inside each cover element
cubical_cover = CubicalCover(n_intervals=4, overlap_frac=0.4)
local_clustering = FailSafeClustering(
    clustering=AgglomerativeClustering(n_clusters=3, linkage='ward'),
    verbose=False,
)

mapper_algo1 = MapperAlgorithm(cover=cubical_cover, clustering=local_clustering)
mapper_graph1 = mapper_algo1.fit_transform(X1, lens1)

end_time = time.time()
elapsed_time = end_time - start_time
# Wall-clock seconds for the lens plus the Mapper fit
print(elapsed_time)

1373.2033610343933


In [2]:
# # 道路型態大類別名稱
# mapper_plot1 = MapperLayoutInteractive(
#     mapper_graph1,
#     colors = dist_df[['速限-第1當事者']].to_numpy(),
#     cmap = 'jet',
#     # agg = np.nanmean,
#     agg = most_frequent_nonan,
#     dim = 3,
#     iterations = 30,
#     seed = 6,
#     width = 800,
#     height = 500)

# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

In [27]:
# Persist the fitted Mapper graph so later cells can reload it instead of
# recomputing it. `pickle` is already imported in the first cell, so the
# duplicate mid-notebook import is dropped.
with open('CalculatedData/道路V2.pkl', 'wb') as f:
    pickle.dump(mapper_graph1, f)

# 模型比較

In [28]:
def preprocess(input_data, select_lst):
    """Keep the rows where '當事者順位' is 1 and split the casualty column.

    NOTE(review): this re-defines the `preprocess` declared in an earlier cell
    and shadows it from here on; this version applies no month filter.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Raw records; must contain '當事者順位' and every column in ``select_lst``.
    select_lst : list of str
        Columns carried into the feature frame; must include '死亡受傷人數'.

    Returns
    -------
    dist_df : pandas.DataFrame
        The selected columns, minus '死亡受傷人數', plus whatever columns
        ``split_death_injury`` returns for it.
    sample_data : pandas.DataFrame
        The filtered rows with all original columns, indexed 0..n-1.
    """
    is_first_party = input_data['當事者順位'] == 1
    sample_data = input_data[is_first_party].reset_index(drop=True)

    selected = sample_data[select_lst]
    casualties = split_death_injury(selected['死亡受傷人數'])

    dist_df = pd.concat([selected, casualties], axis=1)
    dist_df = dist_df.drop(columns=['死亡受傷人數'])

    return dist_df, sample_data

# Wider column set used for the model comparison
select_lst = [
    '天候名稱',
    '路面狀況-路面狀態名稱',
    '肇因研判大類別名稱-主要',
    '當事者屬-性-別名稱',
    '當事者事故發生時年齡',
    '車輛撞擊部位大類別名稱-最初',
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',
    '道路型態大類別名稱',
    '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度',
    '緯度',
    '道路型態子類別名稱',
    '事故位置子類別名稱',
    '車道劃分設施-分向設施子類別名稱',
    '事故類型及型態子類別名稱',
    '當事者行動狀態子類別名稱',
    '車輛撞擊部位子類別名稱-最初',
    '車輛撞擊部位子類別名稱-其他',
    '肇因研判子類別名稱-個別',

    '當事者區分-類別-大類別名稱-車種',
    '當事者區分-類別-子類別名稱-車種',
    '保護裝備名稱',
    '行動電話或電腦或其他相類功能裝置名稱',
    '當事者行動狀態大類別名稱',
    '車輛撞擊部位大類別名稱-其他',
    '肇因研判大類別名稱-個別',
    '肇事逃逸類別名稱-是否肇逃',
    '路面狀況-路面鋪裝名稱',
    '路面狀況-路面缺陷名稱',
    '道路障礙-障礙物名稱',
    '道路障礙-視距品質名稱',
    '道路障礙-視距名稱',
    '號誌-號誌動作名稱',
]

# preprocess() returns (feature frame, filtered raw frame)
dist_dfA1 = preprocess(dataA1, select_lst)
dist_dfA2 = preprocess(dataA2, select_lst)

# Stack the A1 and A2 feature frames under one fresh index
rbind_data = pd.concat((dist_dfA1[0], dist_dfA2[0]), ignore_index=True)

# Cap the injury count: every value above 1 becomes 2
multi_injury = rbind_data['受傷'] > 1
rbind_data.loc[multi_injury, '受傷'] = 2

# Binarise the speed limit: 1 when above 60, otherwise 0
rbind_data['速限-第1當事者'] = (rbind_data['速限-第1當事者'] > 60).astype('int64')
rbind_data = process_age(rbind_data)

dist_df = process_data(rbind_data)

scaler = StandardScaler()
scaled_values = scaler.fit_transform(dist_df)
full_dist = pd.DataFrame(scaled_values, columns=dist_df.columns)

# Only the two outcome columns are excluded here
X1 = full_dist.drop(columns=['受傷', '死亡']).to_numpy()

full_dist.head()

Unnamed: 0,天候名稱,路面狀況-路面狀態名稱,肇因研判大類別名稱-主要,當事者屬-性-別名稱,當事者事故發生時年齡,車輛撞擊部位大類別名稱-最初,光線名稱,道路類別-第1當事者-名稱,速限-第1當事者,道路型態大類別名稱,...,肇因研判大類別名稱-個別,肇事逃逸類別名稱-是否肇逃,路面狀況-路面鋪裝名稱,路面狀況-路面缺陷名稱,道路障礙-障礙物名稱,道路障礙-視距品質名稱,道路障礙-視距名稱,號誌-號誌動作名稱,死亡,受傷
0,-0.467608,-0.353118,0.507204,0.744422,-1.094751,-0.371156,0.123016,2.096442,-0.148895,1.213558,...,0.506352,-0.134324,-0.045993,-0.030455,0.000242,0.121368,0.063659,0.836161,14.061035,-2.803042
1,-0.467608,-0.353118,0.507204,0.744422,0.675073,1.176366,1.452598,-0.257341,-0.148895,-0.812722,...,0.506352,-0.134324,-0.045993,-0.030455,0.000242,0.121368,0.063659,-1.194655,14.061035,-0.620227
2,1.125248,-0.353118,0.507204,0.744422,0.675073,-0.371156,-1.206566,2.881037,-0.148895,1.213558,...,0.506352,-0.134324,-0.045993,-0.030455,0.000242,0.121368,0.063659,0.836161,14.061035,-2.803042
3,-0.467608,-0.353118,0.507204,0.744422,-0.209839,-0.371156,-1.206566,-0.257341,-0.148895,-0.812722,...,0.506352,-0.134324,-0.045993,-0.030455,0.000242,0.121368,0.063659,-1.194655,14.061035,-0.620227
4,1.125248,-0.353118,0.507204,0.744422,-1.094751,-0.371156,1.452598,2.096442,-0.148895,1.213558,...,0.506352,-0.134324,-0.045993,-0.030455,0.000242,0.121368,0.063659,0.836161,14.061035,-2.803042


In [29]:
# Reload the Mapper graph written by the earlier pickle.dump cell.
# pickle executes code on load: only open files produced by this notebook.
with open('CalculatedData/道路V2.pkl', 'rb') as graph_file:
    mapper_graph1 = pickle.load(graph_file)

# One colour value per data row.
# NOTE(review): the graph was fitted on the frame built in an earlier cell;
# this assumes dist_df has the same rows in the same order -- TODO confirm.
node_colors = dist_df[['速限-第1當事者']].to_numpy()

mapper_plot1 = MapperLayoutInteractive(
    mapper_graph1,
    colors=node_colors,
    cmap='jet',
    # alternative aggregation: agg = np.nanmean
    agg=most_frequent_nonan,
    dim=3,
    iterations=30,
    seed=6,
    width=800,
    height=500,
)

In [30]:
import re

# Trace at position 1 of the layout's private plotly figure
# (presumably the node markers -- this relies on tda-mapper internals).
node_trace = vars(mapper_plot1._MapperLayoutInteractive__fig)['_data_objs'][1]

x, y, z = node_trace['x'], node_trace['y'], node_trace['z']
threeDimData = pd.DataFrame({'x': x, 'y': y, 'z': z})

# Each hover text carries "color: ..", "node: .." and "size: .." fields
data_tuple = node_trace['text']
data = [
    {
        'color': int(re.search(r'color: (-?\d+)', item).group(1)),
        'node': int(re.search(r'node: (\d+)', item).group(1)),
        'size': int(re.search(r'size: (\d+)', item).group(1)),
    }
    for item in data_tuple
]
component_info = pd.DataFrame(data)

# Hover-text fields side by side with the 3-D coordinates
full_info = pd.concat([component_info, threeDimData], axis=1)

# Node attributes stored on the layout's private graph object
mp_content_origin = vars(mapper_plot1._MapperLayoutInteractive__graph)['_node']
mp_content = (
    pd.DataFrame.from_dict(mp_content_origin, orient='index')
    .reset_index()
    .rename(columns={'index': 'node'})
)

full_info = pd.merge(full_info, mp_content, on=['node', 'size'], how='inner')

In [97]:
# calinski_data = get_calinski_from_db(full_info, 0.045)
# labels = calinski_data[3]
# db = calinski_data[2]
# n_clusters_ = calinski_data[4]

# unique_labels = set(labels)
# core_samples_mask = np.zeros_like(labels, dtype=bool)
# core_samples_mask[db.core_sample_indices_] = True

# def matplotlib_to_plotly(cmap, alpha=1):
#     """rgba"""
#     return f'rgba({int(cmap[0]*200)}, {int(cmap[1]*200)}, {int(cmap[2]*200)}, {alpha})'

# colors = [matplotlib_to_plotly(plt.cm.Spectral(each), alpha=0.8) for each in np.linspace(0, 1, len(unique_labels))]
# fig = go.Figure()

# for k, col in zip(unique_labels, colors):
#     if k == -1:
#         col = 'rgba(0,0,0,0)'

#     class_member_mask = labels == k

#     core_samples = full_info.iloc[:, 3:6][class_member_mask & core_samples_mask]
#     fig.add_trace(go.Scatter3d(
#         x=core_samples.iloc[:, 0],
#         y=core_samples.iloc[:, 1],
#         z=core_samples.iloc[:, 2],
#         mode='markers',
#         marker=dict(
#             size=6,
#             color=col,
#             opacity=0.8
#         ),
#         name=f'Cluster {k} Core'
#     ))

#     non_core_samples = full_info.iloc[:, 3:6][class_member_mask & ~core_samples_mask]
#     fig.add_trace(go.Scatter3d(
#         x=non_core_samples.iloc[:, 0],
#         y=non_core_samples.iloc[:, 1],
#         z=non_core_samples.iloc[:, 2],
#         mode='markers',
#         marker=dict(
#             size=6,
#             color=col,
#             opacity=0.5
#         ),
#         name=f'Cluster {k} Non-Core'
#     ))

# fig.update_layout(
#     title=f"Estimated number of clusters: {n_clusters_}",
#     margin=dict(l=0, r=0, b=0, t=0)
# )

# fig.show()

In [37]:
# No live cell assigns full_info['label']; the only labelling code left in the
# notebook is the commented-out get_calinski_from_db cell. On a fresh kernel
# the lookups below therefore fail, so restore the labels when they are missing.
if 'label' not in full_info.columns:
    # NOTE(review): eps 0.045 and result position 3 are copied from the
    # commented-out cell; confirm they match the run behind the recorded outputs.
    full_info['label'] = get_calinski_from_db(full_info, 0.045)[3]

cluster_labels = full_info['label']

# Nodes of each labelled cluster; every other label is treated as an outlier
label_0 = full_info[cluster_labels == 0]
label_1 = full_info[cluster_labels == 1]
label_2 = full_info[cluster_labels == 2]
label_out = full_info[~cluster_labels.isin([0, 1, 2])]

# Per-group dictionaries from get_count_dict; later cells use their keys as
# row labels of rbind_data
count_0 = get_count_dict(label_0)
count_1 = get_count_dict(label_1)
count_2 = get_count_dict(label_2)
count_out = get_count_dict(label_out)

print(full_info['label'].unique())

len(count_out)

[ 0  1  2 -1]


196

In [38]:
# Rows of rbind_data that belong to each cluster
full_0 = rbind_data.loc[count_0.keys()]
full_1 = rbind_data.loc[count_1.keys()]
full_2 = rbind_data.loc[count_2.keys()]

# Outliers are kept as they are; no junction rows are removed from them
full_out = rbind_data.loc[count_out.keys()]

# Junction rows: row ids shared by two groups
lst01 = list(count_0.keys() & count_1.keys())
lst02 = list(count_0.keys() & count_2.keys())
lst12 = list(count_1.keys() & count_2.keys())
lsto0 = list(count_out.keys() & count_0.keys())
lsto1 = list(count_out.keys() & count_1.keys())
lsto2 = list(count_out.keys() & count_2.keys())

# Pull the shared rows out for separate analysis; duplicates are dropped to
# satisfy the independence assumption of the chi-square test
full_01 = full_0.loc[lst01]
full_02 = full_0.loc[lst02]
full_12 = full_1.loc[lst12]

full_combine = pd.concat([full_01, full_02, full_12], axis=0) # full_out
full_combine = full_combine.reset_index()
full_combine = full_combine.drop_duplicates(subset='index', keep='first')
full_combine = full_combine.drop('index', axis=1)
# full_combine no longer carries the original row ids, so keep them for the
# coverage check at the end of the cell
combine_ids = set(lst01) | set(lst02) | set(lst12)

# Remove the junction rows from each cluster to make the analysis more rigorous
full_0 = full_0.drop(lst01 + lst02 + lsto0, errors='ignore')
full_1 = full_1.drop(lst01 + lst12 + lsto1, errors='ignore')
full_2 = full_2.drop(lst02 + lst12 + lsto2, errors='ignore')

print('01連接點數量', len(lst01))
print('02連接點數量', len(lst02))
print('12連接點數量', len(lst12))
print('o0連接點數量', len(lsto0))
print('o1連接點數量', len(lsto1))
print('o2連接點數量', len(lsto2))
print('離群值數量', full_out.shape[0])

# Coverage check: every row of rbind_data must land in exactly the union of the
# subsets. The previous check summed the sizes of the junction and cluster
# frames only, so it left out the outlier rows and could not hold whenever
# outliers exist. Comparing row ids also avoids double counting a row that is
# both a junction row and an outlier.
covered_ids = combine_ids.union(full_0.index, full_1.index, full_2.index, full_out.index)
covered_ids == set(rbind_data.index)

01連接點數量 971
02連接點數量 0
12連接點數量 0
o0連接點數量 11
o1連接點數量 14
o2連接點數量 69
離群值數量 196


False

In [39]:
# Write the outlier rows to CSV without the index column
full_out.to_csv('CalculatedData/離群比較/拓樸V2.csv', index=False)

In [40]:
# Feature columns handed to get_clusterN_logit
lst_logit = [
    '路面狀況-路面狀態名稱',
    '當事者屬-性-別名稱',
    '當事者事故發生時年齡',
    '車輛撞擊部位大類別名稱-最初',
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',
    '道路型態大類別名稱',
    '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '事故位置子類別名稱',
    '事故類型及型態子類別名稱',
    '當事者行動狀態子類別名稱',
    '車輛撞擊部位子類別名稱-最初',
    '肇因研判子類別名稱-個別',
    '當事者區分-類別-子類別名稱-車種',
    '保護裝備名稱',
    '行動電話或電腦或其他相類功能裝置名稱',
    '當事者行動狀態大類別名稱',
    '車輛撞擊部位大類別名稱-其他',
    '肇因研判大類別名稱-個別',
    '肇事逃逸類別名稱-是否肇逃',
    '路面狀況-路面鋪裝名稱',
    '車道劃分設施-分向設施子類別名稱',
    '道路障礙-障礙物名稱',
    '車輛撞擊部位子類別名稱-其他',

    # Columns noted as lowering the prediction score:
    # '號誌-號誌動作名稱', '當事者區分-類別-大類別名稱-車種', '肇因研判大類別名稱-主要'
    # '道路障礙-視距名稱', '車道劃分設施-分道設施-快慢車道間名稱', '車輛撞擊部位大類別名稱-其他'
    # '道路型態子類別名稱', '路面狀況-路面缺陷名稱', '天候名稱'
    # NOTE(review): '車道劃分設施-分道設施-快慢車道間名稱' and '車輛撞擊部位大類別名稱-其他'
    # appear in the note above but are still active entries of this list -- confirm intent.
]

def get_clusterN_logit(cluster_data, lst):
    """Build standardised features and a binary label for one subset of rows.

    Parameters
    ----------
    cluster_data : pandas.DataFrame
        Rows to model; must contain a '死亡' column and be accepted by
        ``process_data``.
    lst : list of str
        Feature columns to keep after scaling.

    Returns
    -------
    c0_for_lm_X : pandas.DataFrame
        ``process_data(cluster_data)`` passed through a freshly fitted
        StandardScaler, restricted to ``lst``, indexed 0..n-1.
    c0_for_lm_y : pandas.Series
        1 where '死亡' differs from 0, otherwise 2, on the same index as
        ``c0_for_lm_X``.

    Notes
    -----
    Assumes ``process_data`` returns one row per input row, in the same
    order -- TODO confirm.
    """
    scaler = StandardScaler()

    c0_for_lm = process_data(cluster_data)
    c0_for_lm_X = pd.DataFrame(scaler.fit_transform(c0_for_lm), columns=c0_for_lm.columns)
    c0_for_lm_X = c0_for_lm_X[lst]

    # Label: built column-wise instead of with a row-by-row apply, and placed on
    # X's index. The label used to keep cluster_data's (gapped) index while X
    # had a fresh 0..n-1 index, so any index-aligned operation on the pair
    # would mismatch rows.
    is_fatal = cluster_data['死亡'].ne(0).to_numpy()
    c0_for_lm_y = pd.Series(np.where(is_fatal, 1, 2), index=c0_for_lm_X.index, dtype='int64')

    return c0_for_lm_X, c0_for_lm_y


# Features and labels for the junction rows and for each cluster
full_combine_X, full_combine_y = get_clusterN_logit(full_combine, lst_logit)
full_0_X, full_0_y = get_clusterN_logit(full_0, lst_logit)
full_1_X, full_1_y = get_clusterN_logit(full_1, lst_logit)
full_2_X, full_2_y = get_clusterN_logit(full_2, lst_logit)

# Newly added: the outlier rows get their own dataset as well
full_out_X, full_out_y = get_clusterN_logit(full_out, lst_logit)

# Class balance of the junction rows and of each cluster
for label_series in (full_combine_y, full_0_y, full_1_y, full_2_y):
    print(label_series.value_counts())

2    951
1     20
dtype: int64
2    156426
1       596
dtype: int64
2    6902
1      87
dtype: int64
2    3452
1      74
dtype: int64


In [43]:
# Fit the logistic model on the junction rows and on the outlier rows.
# logistic_cm_gridsearch is unpacked into (metrics table, score, confusion matrix).
matrix_combine, score_combine, cm_combine = logistic_cm_gridsearch(full_combine_X,  full_combine_y)
matrix_out, score_out, cm_out = logistic_cm_gridsearch(full_out_X,  full_out_y)
print(score_combine, score_out)


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_i

1.0 0.875


In [45]:
# Confusion matrix returned by logistic_cm_gridsearch for the junction rows
cm_combine

array([[4, 0],
       [0, 4]])

In [46]:
# Confusion matrix returned by logistic_cm_gridsearch for the outlier rows
cm_out

array([[3, 1],
       [0, 4]])

In [47]:
# Fit one logistic model per cluster and time the three fits together
start_time = time.time()
# matrix_combine, score_combine, cm_combine = logistic_cm_gridsearch(full_combine_X,  full_combine_y)
matrix_0, score_0, cm_0 = logistic_cm_gridsearch(full_0_X, full_0_y)
matrix_1, score_1, cm_1 = logistic_cm_gridsearch(full_1_X, full_1_y)
matrix_2, score_2, cm_2 = logistic_cm_gridsearch(full_2_X, full_2_y)
# print(score_combine, score_0, score_1, score_2)
print(score_0, score_1, score_2)
end_time = time.time()
elapsed_time = end_time - start_time
# Wall-clock seconds spent on the three fits (includes the print above)
print(elapsed_time)


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_i

0.7162921348314607 0.875 1.0
1888.997601032257



The max_iter was reached which means the coef_ did not converge



In [56]:
# Confusion matrices of the cluster 0, 1 and 2 models
print(cm_0, '\n', cm_1, '\n', cm_2)

[[114  64]
 [ 37 141]] 
 [[19  5]
 [ 1 23]] 
 [[20  0]
 [ 0 20]]


In [60]:
# Element-wise sum of the three per-cluster confusion matrices
print(cm_0 + cm_1 + cm_2)

[[153  69]
 [ 38 184]]


In [61]:
# Per-class metric tables of the cluster 0, 1 and 2 models
print(matrix_0, '\n\n', matrix_1, '\n\n', matrix_2)

     Label  Precision    Recall  F1 Score
0  Class_0   0.754967  0.640449  0.693009
1  Class_1   0.687805  0.792135  0.736292 

      Label  Precision    Recall  F1 Score
0  Class_0   0.950000  0.791667  0.863636
1  Class_1   0.821429  0.958333  0.884615 

      Label  Precision  Recall  F1 Score
0  Class_0        1.0     1.0       1.0
1  Class_1        1.0     1.0       1.0


In [79]:
# Pool the three per-cluster confusion matrices
conf_matrix = cm_0 + cm_1 + cm_2

# Expand every (true=i, predicted=j) cell count back into one label pair per
# sample so the sklearn metric functions can be applied to the pooled result
label_pairs = [
    (true_cls, pred_cls)
    for true_cls, row in enumerate(conf_matrix)
    for pred_cls, count in enumerate(row)
    for _ in range(count)
]
y_test = [true_cls for true_cls, _ in label_pairs]
y_pred = [pred_cls for _, pred_cls in label_pairs]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None, zero_division=0)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

# One row per class
metrics_df = pd.DataFrame({
    'Label': [f'Class_{idx}' for idx, _ in enumerate(precision)],
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
})

### Mapper總平均和評估指標

In [95]:
# Pooled per-class metrics followed by the pooled confusion matrix
print(metrics_df, '\n\n', cm_0 + cm_1 + cm_2)

     Label  Precision    Recall  F1 Score
0  Class_0   0.801047  0.689189  0.740920
1  Class_1   0.727273  0.828829  0.774737 

 [[153  69]
 [ 38 184]]


In [94]:
# (confusion matrix, score) of every fitted subset model
subset_results = [
    (cm_0, score_0),
    (cm_1, score_1),
    (cm_2, score_2),
    (cm_combine, score_combine),
    (cm_out, score_out),
]

# Total number of samples counted across all five confusion matrices
de = np.sum(cm_0 + cm_1 + cm_2 + cm_combine + cm_out)

# Average of the scores, each weighted by its matrix's share of the samples
logit_avg_score = sum((np.sum(cm) / de) * score for cm, score in subset_results)
print(logit_avg_score)

0.765217391304348


### 原始方法總平均和評估指標

In [41]:
# Baseline: the same feature list and labelling applied to all rows of
# rbind_data, without any cluster split. Dataset construction is not timed.
origin_X, origin_y = get_clusterN_logit(rbind_data, lst_logit)

start_time = time.time()

matrix_origin, score_origin, cm_origin = logistic_cm_gridsearch(origin_X, origin_y)
print(score_origin)

end_time = time.time()
elapsed_time = end_time - start_time
# Wall-clock seconds for the baseline fit (includes the print above)
print(elapsed_time)

0.7061224489795919
196.11265206336975


In [84]:
# Per-class metrics and confusion matrix of the baseline model
print(matrix_origin, '\n\n', cm_origin)

     Label  Precision    Recall  F1 Score
0  Class_0   0.724444  0.665306  0.693617
1  Class_1   0.690566  0.746939  0.717647 

 [[163  82]
 [ 62 183]]
