In [1]:
import pickle
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV


from tdamapper.core import MapperAlgorithm
from tdamapper.cover import CubicalCover
from tdamapper.plot import MapperLayoutInteractive, MapperLayoutStatic
from tdamapper.clustering import FailSafeClustering
from sklearn import metrics

from functions import *
from chi import *
from regressionP import *
from models import *

In [2]:
data1 = pd.read_csv("./Data/NPA_TMA2_1.csv", low_memory=False)[:-2]
data2 = pd.read_csv("./Data/NPA_TMA2_2.csv", low_memory=False)[:-2]
data3 = pd.read_csv("./Data/NPA_TMA2_3.csv", low_memory=False)[:-2]
data4 = pd.read_csv("./Data/NPA_TMA2_4.csv", low_memory=False)[:-2]
dataA2 = pd.concat([data1, data2, data3, data4], ignore_index=True)

dataA1 = pd.read_csv("./Data/NPA_TMA1.csv")[:-2]

In [3]:
def preprocess(input_data, select_lst, sample = 592):
    sample_data = input_data[input_data['當事者順位'] == 1].reset_index(drop=True, inplace=False).sample(sample).reset_index(drop=True)
    dataA = sample_data[select_lst]
    
    death_injury_data = split_death_injury(dataA['死亡受傷人數'])
    dist_df = pd.concat([dataA, death_injury_data], axis=1)
    dist_df.drop(columns=['死亡受傷人數'], inplace=True)
    
    return dist_df, sample_data

# List of columns to select
select_lst = [
    '天候名稱', 
    '路面狀況-路面狀態名稱',
    '肇因研判大類別名稱-主要', '當事者屬-性-別名稱', '當事者事故發生時年齡', 
    '車輛撞擊部位大類別名稱-最初',
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者', '道路障礙-視距品質名稱',
    '道路型態大類別名稱', '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度', '緯度',
    '道路型態子類別名稱', '事故位置子類別名稱', '車道劃分設施-分向設施子類別名稱', '事故類型及型態子類別名稱', 
    '當事者行動狀態子類別名稱', '車輛撞擊部位子類別名稱-最初', '車輛撞擊部位子類別名稱-其他', '肇因研判子類別名稱-個別',
]

# Pipeline
1. 抽樣
2. 帶入mapper
3. cluster
4. 帶入模型

In [13]:
def resample_plot(A1, A2, selected, p_age = False):
    dist_dfA1 = preprocess(dataA1, selected, sample = 592)
    dist_dfA2 = preprocess(dataA2, selected, sample = 11940)

    rbind_data = pd.concat([dist_dfA1[0], dist_dfA2[0]], axis=0, ignore_index=True)
    
    rbind_data.loc[rbind_data['受傷'] > 1, '受傷'] = 2
    rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].apply(lambda x: 1 if x > 60 else 0)
    if p_age == True:
        rbind_data = process_age(rbind_data)

    dist_df = process_data(rbind_data)
    scaler = StandardScaler()

    full_dist = pd.DataFrame(scaler.fit_transform(dist_df), columns = dist_df.columns)
    X1 = full_dist.drop(['受傷', '死亡', '經度', '緯度'], axis=1).to_numpy()

    lens1 = PCA(10).fit_transform(X1)

    mapper_algo1 = MapperAlgorithm(
        cover = CubicalCover(
            n_intervals = 3,
            overlap_frac = 0.2
        ),
        clustering = FailSafeClustering(
            clustering = AgglomerativeClustering(3, linkage='ward'),
            verbose = False)
    )
    mapper_graph1 = mapper_algo1.fit_transform(X1, lens1)

    mapper_plot1 = MapperLayoutInteractive(
        mapper_graph1,
        colors = rbind_data[['速限-第1當事者']].to_numpy(),
        cmap = 'jet',
        agg = np.nanmean,
        # agg = most_frequent_nonan,
        dim = 3,
        iterations = 30,
        seed = 15,
        width = 800,
        height = 500)
    
    return mapper_plot1, rbind_data

def get_full_info(mapper_plot):
    x = vars(mapper_plot._MapperLayoutInteractive__fig)['_data_objs'][1]['x']
    y = vars(mapper_plot._MapperLayoutInteractive__fig)['_data_objs'][1]['y']
    z = vars(mapper_plot._MapperLayoutInteractive__fig)['_data_objs'][1]['z']

    threeDimData = pd.DataFrame({'x': x, 'y': y, 'z': z})

    import re
    data_tuple = vars(mapper_plot._MapperLayoutInteractive__fig)['_data_objs'][1]['text']

    data = []
    for item in data_tuple:
        color = int(re.search(r'color: (\d+)', item).group(1))
        node = int(re.search(r'node: (\d+)', item).group(1))
        size = int(re.search(r'size: (\d+)', item).group(1))
        data.append({'color': color, 'node': node, 'size': size})
    component_info = pd.DataFrame(data)

    full_info = pd.concat([component_info, threeDimData], axis=1)

    mp_content_origin = vars(mapper_plot._MapperLayoutInteractive__graph)['_node']

    mp_content = pd.DataFrame.from_dict(mp_content_origin, orient='index')
    mp_content.reset_index(inplace=True)
    mp_content.rename(columns={'index': 'node'}, inplace=True)

    full_info = pd.merge(full_info, mp_content, on=['node', 'size'], how='inner')
    full_info.iloc[:, 3:6]
    
    return full_info

In [18]:
# mapper_plot1, rbind_data, full_info, calinski_data = process_with_adjusted_threshold(dataA1, dataA2, select_lst)
# mapper_plot1, rbind_data = resample_plot(dataA1, dataA2, select_lst)
# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

In [19]:
# full_info = get_full_info(mapper_plot1)
# calinski_data = get_calinski_from_db(full_info, 0.01)
# print(calinski_data[0])

# labels = calinski_data[3]
# db = calinski_data[2]
# n_clusters_ = calinski_data[4]

# unique_labels = set(labels)
# core_samples_mask = np.zeros_like(labels, dtype=bool)
# core_samples_mask[db.core_sample_indices_] = True

# def matplotlib_to_plotly(cmap, alpha=1):
#     """rgba"""
#     return f'rgba({int(cmap[0]*255)}, {int(cmap[1]*255)}, {int(cmap[2]*255)}, {alpha})'

# # colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]  
# colors = [matplotlib_to_plotly(plt.cm.Spectral(each), alpha=0.8) for each in np.linspace(0, 1, len(unique_labels))]
# fig = go.Figure()

# for k, col in zip(unique_labels, colors):
#     if k == -1:
#         # col = 'rgba(0,0,0,0)'
#         col = 'rgba(0,0,0,0)'

#     class_member_mask = labels == k

#     core_samples = full_info.iloc[:, 3:6][class_member_mask & core_samples_mask]
#     fig.add_trace(go.Scatter3d(
#         x=core_samples.iloc[:, 0],
#         y=core_samples.iloc[:, 1],
#         z=core_samples.iloc[:, 2],
#         mode='markers',
#         marker=dict(
#             size=6,
#             color=col,
#             opacity=0.8
#         ),
#         name=f'Cluster {k} Core'
#     ))

#     non_core_samples = full_info.iloc[:, 3:6][class_member_mask & ~core_samples_mask]
#     fig.add_trace(go.Scatter3d(
#         x=non_core_samples.iloc[:, 0],
#         y=non_core_samples.iloc[:, 1],
#         z=non_core_samples.iloc[:, 2],
#         mode='markers',
#         marker=dict(
#             size=6,
#             color=col,
#             opacity=0.5
#         ),
#         name=f'Cluster {k} Non-Core'
#     ))

# fig.update_layout(
#     title=f"Estimated number of clusters: {n_clusters_}",
#     margin=dict(l=0, r=0, b=0, t=0)
# )

# fig.show()

In [61]:
def get_counts(full_info):
    label_0 = full_info[full_info['label'] == 0]
    label_1 = full_info[full_info['label'] == 1]
    label_out = full_info[(full_info['label'] != 1) & (full_info['label'] != 0)]

    count_0 = get_count_dict(label_0)
    count_1 = get_count_dict(label_1)
    count_out = get_count_dict(label_out)

#     print(full_info['label'].unique())

#     print(len(count_0), len(count_1), len(count_2))
    
    full_0 = rbind_data.loc[count_0.keys()]
    full_1 = rbind_data.loc[count_1.keys()]
    # 離群值不需要被處理
    full_out = rbind_data.loc[count_out.keys()]

    lst01 = list(count_0.keys() & count_1.keys())
    lsto0 = list(count_out.keys() & count_0.keys())
    lsto1 = list(count_out.keys() & count_1.keys())

    # 將重複的key另外拉出進行分析，這裡drop是為了符合卡方的獨立性前提假設
    full_01 = full_0.loc[lst01]

    full_combine = pd.concat([full_01, full_out], axis=0)

    # 四掉連接點，使分析更嚴謹
    full_0 = full_0.drop(lst01, errors='ignore')
    full_0 = full_0.drop(lsto0, errors='ignore')

    full_1 = full_1.drop(lst01, errors='ignore')
    full_1 = full_1.drop(lsto1, errors='ignore')

    # print(full_0.shape, full_1.shape, full_2.shape)
    # print('01連接點數量', len(lst01))
    # print('02連接點數量', len(lst02))
    # print('12連接點數量', len(lst12))
    # print('o0連接點數量', len(lsto0))
    # print('o1連接點數量', len(lsto1))
    # print('o2連接點數量', len(lsto2))
    # print('離群值數量', full_out.shape[0])
    
    return full_0, full_1, full_combine

In [6]:
def get_clusterN_logit(cluster_data, lst):
    scaler = StandardScaler()
    
    c0_for_lm = process_data(cluster_data)
    c0_for_lm_X = pd.DataFrame(scaler.fit_transform(c0_for_lm), columns=c0_for_lm.columns)
    
    # 設置三個等級的label
    # c0_for_lm_y = cluster_data['受傷']
    # c0_for_lm_y = cluster_data.apply(lambda row: 2 if row['受傷'] >= 2 else 1, axis=1)
    c0_for_lm_y = cluster_data.apply(lambda row: 2 if row['死亡'] != 0 else (2 if row['受傷'] >= 2 else 1), axis=1)
        
    c0_for_lm_X = c0_for_lm_X[lst]
    
    return c0_for_lm_X, c0_for_lm_y

# Pipeline

In [62]:
# def process_with_adjusted_threshold(dataA1, dataA2, select_lst, initial_threshold=0.03, min_threshold=0.1, decrement=0.01):
#     threshold = initial_threshold
#     while threshold >= min_threshold:
#         mapper_plot1, rbind_data = resample_plot(dataA1, dataA2, select_lst)
#         full_info = get_full_info(mapper_plot1)
#         calinski_data = get_calinski_from_db(full_info, threshold)

#         if len(full_info['label'].unique()) >= 3:
#             break  # 停止迴圈，因為已達到條件
#         threshold -= decrement

#     mapper_plot1, rbind_data = resample_plot(dataA1, dataA2, select_lst)
#     full_info = get_full_info(mapper_plot1)
#     calinski_data = get_calinski_from_db(full_info, threshold)

#     return mapper_plot1, rbind_data, full_info, calinski_data

def process_with_adjusted_threshold(dataA1, dataA2, select_lst, initial_threshold=0.03, min_threshold=0.15, decrement=0.01):

    mapper_plot1, rbind_data = resample_plot(dataA1, dataA2, select_lst)
    full_info = get_full_info(mapper_plot1)
    calinski_data = get_calinski_from_db(full_info, initial_threshold)

    return mapper_plot1, rbind_data, full_info, calinski_data

In [64]:
lst_logit = [
    '天候名稱', 
    '路面狀況-路面狀態名稱',
    '肇因研判大類別名稱-主要', '當事者屬-性-別名稱', '當事者事故發生時年齡', 
    '車輛撞擊部位大類別名稱-最初',
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',
    '道路型態大類別名稱', '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    # '死亡受傷人數',
    # '經度', '緯度',
    '道路型態子類別名稱', '事故位置子類別名稱', '車道劃分設施-分向設施子類別名稱', '事故類型及型態子類別名稱', 
    '當事者行動狀態子類別名稱', '車輛撞擊部位子類別名稱-最初', '車輛撞擊部位子類別名稱-其他', '肇因研判子類別名稱-個別',
]

In [70]:
TDA_scores_lg = []
TDA_scores_rf = []
TDA_scores_svm = []
Origin_scores_lg = []
Origin_scores_rf = []
Origin_scores_svm = []

# for i in range(2):
#     start_time = time.time()

#     mapper_plot1, rbind_data, full_info, calinski_data = process_with_adjusted_threshold(dataA1, dataA2, select_lst)

#     end_time = time.time()
#     elapsed_time = end_time - start_time
#     print(elapsed_time)

#     full_0, full_1, full_combine = get_counts(full_info)

#     # print(full_combine.shape[0] + full_0.shape[0] + full_1.shape[0] + full_2.shape[0] == rbind_data.shape[0])

#     full_combine_X,  full_combine_y = get_clusterN_logit(full_combine, lst_logit)
#     full_0_X, full_0_y = get_clusterN_logit(full_0, lst_logit)
#     full_1_X, full_1_y = get_clusterN_logit(full_1, lst_logit)

#     # 帶入模型
#     matrix_0, score_0 = logistic_cm_gridsearch(full_0_X, full_0_y)
#     rf_matrix_0, rf_score_0 = decision_tree_cm_with_gridsearch(full_0_X, full_0_y)
#     svc_matrix_0, svc_score_0 = svc_cm_with_grid_search(full_0_X, full_0_y)

#     matrix_1, score_1 = logistic_cm_gridsearch(full_1_X, full_1_y)
#     rf_matrix_1, rf_score_1 = decision_tree_cm_with_gridsearch(full_1_X, full_1_y)
#     svc_matrix_1, svc_score_1 = svc_cm_with_grid_search(full_1_X, full_1_y)

#     matrix_combine, score_combine = logistic_cm_gridsearch(full_combine_X, full_combine_y)
#     rf_matrix_combine, rf_score_combine = decision_tree_cm_with_gridsearch(full_combine_X, full_combine_y)
#     svc_matrix_combine, svc_score_combine = svc_cm_with_grid_search(full_combine_X, full_combine_y)

#     de = full_0_X.shape[0] + full_1_X.shape[0] + full_2_X.shape[0]
#     logit_avg_score = (full_0_X.shape[0]/de)*score_0 + (full_1_X.shape[0]/de)*score_1 + (full_combine_X.shape[0]/de)*score_combine
#     rf_avg_score = (full_0_X.shape[0]/de)*rf_score_0 + (full_1_X.shape[0]/de)*rf_score_1 + (full_combine_X.shape[0]/de)*rf_score_combine
#     svc_avg_score = (full_0_X.shape[0]/de)*svc_score_0 + (full_1_X.shape[0]/de)*svc_score_1 + (full_combine_X.shape[0]/de)*svc_score_combine
#     print(f'Logistic : {round(logit_avg_score, 3)}\nRF : {round(rf_avg_score, 3)}\nSVM : {round(svc_avg_score, 3)}')

#     origin_X, origin_y = get_clusterN_logit(rbind_data, lst_logit)
#     matrix_origin, score_origin = logistic_cm(origin_X, origin_y)
#     rf_matrix_origin, rf_score_origin = decision_tree_cm(origin_X, origin_y)
#     svc_matrix_origin, svc_score_origin = svc_cm(origin_X, origin_y)
#     print(f'Logistic : {round(score_origin, 3)}\nRF : {round(rf_score_origin, 3)}\nSVM : {round(svc_score_origin, 3)}')
    
#     TDA_scores_lg.append(round(logit_avg_score, 3))
#     TDA_scores_rf.append(round(rf_avg_score, 3))
#     TDA_scores_svm.append(round(svc_avg_score, 3))
#     Origin_scores_lg.append(round(score_origin, 3))
#     Origin_scores_rf.append(round(rf_score_origin, 3))
#     Origin_scores_svm.append(round(svc_score_origin, 3))

In [68]:
def svc_cm_with_grid_search(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

    # Set up the parameter grid
    param_grid = {
        'C': [1],
        'multi_class': ['ovr', 'crammer_singer']
    }

    # Initialize the GridSearchCV object
    grid_search = GridSearchCV(LinearSVC(penalty='l2', dual=True, fit_intercept=True, random_state=42, max_iter=3000), param_grid, cv=5, scoring='accuracy')
    
    # Fit the grid search to the data
    grid_search.fit(X_resampled, y_resampled)
    
    # Extract the best model
    best_model = grid_search.best_estimator_
    
    # Make predictions
    y_pred = best_model.predict(X_test)

    # Calculate confusion matrix and accuracy
    conf_matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    # Prepare detailed performance metrics
    cm_df = pd.DataFrame(conf_matrix, index=[f'Actual_{i}' for i in range(conf_matrix.shape[0])], columns=[f'Predicted_{i}' for i in range(conf_matrix.shape[1])])
    precision = precision_score(y_test, y_pred, average=None, zero_division=0)
    recall = recall_score(y_test, y_pred, average=None)
    f1 = f1_score(y_test, y_pred, average=None)

    metrics = {
        'Label': [f'Class_{i}' for i in range(len(precision))],
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
    
    metrics_df = pd.DataFrame(metrics)
    
    return metrics_df, accuracy

def logistic_cm_gridsearch(X, y):
    # 分割資料集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 使用RandomOverSampler來平衡資料
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

    # 建立邏輯回歸模型並使用GridSearchCV來找到最佳參數
    model = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=1000)
    parameters = {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10]
    }
    grid_search = GridSearchCV(model, parameters, cv=5, scoring='accuracy')
    grid_search.fit(X_resampled, y_resampled)

    # 使用最佳模型進行預測
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # 生成並打印混淆矩陣和各項度量指標
    conf_matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=None, zero_division=0)
    recall = recall_score(y_test, y_pred, average=None)
    f1 = f1_score(y_test, y_pred, average=None)

    metrics_df = pd.DataFrame({
        'Label': [f'Class_{i}' for i in range(len(precision))],
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })

    return metrics_df, accuracy

def decision_tree_cm_with_gridsearch(X, y):
    # Splitting the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Handling class imbalance
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
    
    # Scaling features
    scaler = StandardScaler()
    X_resampled_scaled = scaler.fit_transform(X_resampled)
    X_test_scaled = scaler.transform(X_test)

    # Defining the model and parameters for grid search
    dt_model = RandomForestClassifier(random_state=43)
    param_grid = {
        'n_estimators': [100, 200],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [2, 4]
    }

    # Grid search
    grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_resampled_scaled, y_resampled)

    # Predicting labels for the test set
    y_pred = grid_search.predict(X_test_scaled)

    # Calculating the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Formatting the confusion matrix for display
    cm_df = pd.DataFrame(conf_matrix, index=[f'Actual_{i}' for i in range(conf_matrix.shape[0])], 
                         columns=[f'Predicted_{i}' for i in range(conf_matrix.shape[1])])
    precision = precision_score(y_test, y_pred, average=None, zero_division=0)
    recall = recall_score(y_test, y_pred, average=None)
    f1 = f1_score(y_test, y_pred, average=None)

    metrics = {
        'Label': [f'Class_{i}' for i in range(len(precision))],
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
    
    metrics_df = pd.DataFrame(metrics)
    
    return metrics_df, accuracy

In [31]:
matrix_0, score_0,  cm_0 = logistic_cm_gridsearch(full_0_X, full_0_y)
rf_matrix_0, rf_score_0, rf_cm_0 = rf_with_gridsearch(full_0_X, full_0_y)
svc_matrix_0, svc_score_0, svc_cm_0 = svc_cm_with_grid_search(full_0_X, full_0_y)

matrix_1, score_1, cm_1 = logistic_cm_gridsearch(full_1_X, full_1_y)
rf_matrix_1, rf_score_1, rf_cm_1 = rf_with_gridsearch(full_1_X, full_1_y)
svc_matrix_1, svc_score_1, svc_cm_1 = svc_cm_with_grid_search(full_1_X, full_1_y)

matrix_2, score_2, cm_2 = logistic_cm_gridsearch(full_2_X, full_2_y)
rf_matrix_2, rf_score_2, rf_cm_2 = rf_with_gridsearch(full_2_X, full_2_y)
svc_matrix_2, svc_score_2, svc_cm_2 = svc_cm_with_grid_search(full_2_X, full_2_y)

de = full_0_X.shape[0] + full_1_X.shape[0] + full_2_X.shape[0]
logit_avg_score = (full_0_X.shape[0]/de)*score_0 + (full_1_X.shape[0]/de)*score_1 + (full_2_X.shape[0]/de)*score_cb
rf_avg_score = (full_0_X.shape[0]/de)*rf_score_0 + (full_1_X.shape[0]/de)*rf_score_1 + (full_2_X.shape[0]/de)*rf_score_cb
svc_avg_score = (full_0_X.shape[0]/de)*svc_score_0 + (full_1_X.shape[0]/de)*svc_score_1 + (full_2_X.shape[0]/de)*svm_score_cb
print(f'Logistic : {round(logit_avg_score, 3)}\nRF : {round(rf_avg_score, 3)}\nSVM : {round(svc_avg_score, 3)}')

origin_X, origin_y = get_clusterN_logit(rbind_data, lst_logit)

matrix_origin, score_origin, cm_origin = logistic_cm_gridsearch(origin_X, origin_y)
rf_matrix_origin, rf_score_origin, rf_cm_origin = rf_with_gridsearch(origin_X, origin_y)
svc_matrix_origin, svc_score_origin, svc_cm_origin = svc_cm_with_grid_search(origin_X, origin_y)
print(f'Logistic : {round(score_origin, 3)}\nRF : {round(rf_score_origin, 3)}\nSVM : {round(svc_score_origin, 3)}')

[[1612   23]
 [ 677    8]]
[[1526  109]
 [ 613   72]]
[[1635    0]
 [ 685    0]]
[[33  1]
 [14  1]]
[[28  6]
 [13  2]]
[[33  1]
 [14  1]]
[[17  1]
 [13  1]]
[[17  1]
 [10  4]]
[[18  0]
 [14  0]]
Logistic : 0.697
RF : 0.688
SVM : 0.703
[[1671   29]
 [ 688   12]]
[[1529  171]
 [ 619   81]]
[[1700    0]
 [ 700    0]]
Logistic : 0.701
RF : 0.671
SVM : 0.708
