In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.0 colorlog-6.7.0 optuna-3.4.0


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import os
import random
from tqdm.auto import tqdm

# ignore warning
import warnings
warnings.filterwarnings('ignore')

# ML
from sklearn.ensemble import RandomForestClassifier  # Bagging
from sklearn.linear_model import LogisticRegression  # LogisticRegression
# from sklearn.svm import SVC                          # SVM

from xgboost.sklearn import XGBClassifier            # GBM
from lightgbm.sklearn import LGBMClassifier          # LGBM

# train_test_split
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

# DL
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, ReLU, Softmax, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# for checking multi-collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : for optuna
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from functools import partial
from imblearn.over_sampling import SMOTE

# 피처 스케일링
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# AutoML framework
import optuna
from optuna.samplers import TPESampler

In [3]:
def seed_everything(seed: int = 42):
    """Seed the random number generators used in this notebook.

    Stores the seed in the ``PYTHONHASHSEED`` environment variable and seeds
    Python's ``random`` module, NumPy's global generator and TensorFlow.

    Args:
        seed: Integer seed handed to every generator. Defaults to 42.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

seed_everything()

In [4]:
# Toggle: run the IQR-based outlier-removal cells.
is_IQR = False
# NOTE(review): not referenced in the visible part of the notebook — presumably gates feature scaling; confirm.
is_scaler = False
# Toggle: also build the 3-marker subset frames (CA19-9, IL-8, GDF15).
is_dropcol = True
# NOTE(review): not referenced in the visible part of the notebook — presumably selects the 0 / 1-2 / 3-4 grouping; confirm.
is_012 = False
# ======= Set K of K-fold =======
K = 10

In [None]:
# Mount Google Drive at /content/drive so the cohort CSV files can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Drive folder holding the cohort CSV files; a later cell also saves a figure below it.
base_path = '/content/drive/MyDrive/Colab Notebooks/크몽/바이오/'

# Test cohort: pancreatic-cancer file ("췌장암") first, control file ("대조군") second.
df_tc, df_td = (
    pd.read_csv(base_path + file_name)
    for file_name in (
        '(20230923)+Test+cohort+(all+biomarkers)_췌장암.csv',
        '(20230923)+Test+cohort+(all+biomarkers)_대조군.csv',
    )
)

# Validation cohort, same two groups in the same order.
df_vc, df_vd = (
    pd.read_csv(base_path + file_name)
    for file_name in (
        '(20230923)+Validation+cohort+(all+biomarkers)_췌장암.csv',
        '(20230923)+Validation+cohort+(all+biomarkers)_대조군.csv',
    )
)

In [None]:
# 컬럼 이름 변경
import re

# Function to clean the column names
def clean_column_name(col):
    """Return a simplified version of a column label.

    The label 'Stage(TNM)' is returned untouched. For every other label,
    parenthesised parts, a leading "<digits>-" prefix and newline characters
    are removed, and surrounding whitespace is stripped.

    Args:
        col: Column label as a string.

    Returns:
        The cleaned label.
    """
    # The stage column keeps its parentheses
    if col == 'Stage(TNM)':
        return col

    without_parens = re.sub(r'\([^)]*\)', '', col)
    without_prefix = re.sub(r'^\d+-', '', without_parens)
    return without_prefix.replace('\n', '').strip()

# Current column labels of the two test-cohort frames
original_columns1 = list(df_tc.columns)
original_columns2 = list(df_td.columns)

# Cleaned labels, in the same order
new_columns1 = list(map(clean_column_name, original_columns1))
new_columns2 = list(map(clean_column_name, original_columns2))

# Apply the cleaned labels
df_tc.columns = new_columns1
df_td.columns = new_columns2

# EDA

## KNN imputer (결측치 채우기)

In [None]:
# Fill missing biomarker values in the cancer frame with a 5-nearest-neighbour imputer.
imputer = KNNImputer(n_neighbors=5)

# Set the stage label aside: it is re-attached later and must not be imputed.
# NOTE: pop() mutates df_tc, so this cell cannot be re-run without reloading the data.
df_tc_Stage = df_tc.pop('Stage(TNM)')

# 'No.' is dropped further down as a non-feature column (presumably a row
# number), so keep it out of the neighbour distance instead of letting it
# influence which samples count as "near".
feature_cols = [c for c in df_tc.columns if c != 'No.']
imputed_data = imputer.fit_transform(df_tc[feature_cols])

# Same columns and index as df_tc; only the feature columns are replaced.
imputed_df = df_tc.copy()
imputed_df[feature_cols] = imputed_data

In [None]:
# Fill missing biomarker values in the control frame with a separate 5-nearest-neighbour imputer.
imputer2 = KNNImputer(n_neighbors=5)

# 'No.' is dropped further down as a non-feature column (presumably a row
# number), so keep it out of the neighbour distance.
feature_cols2 = [c for c in df_td.columns if c != 'No.']
imputed_data2 = imputer2.fit_transform(df_td[feature_cols2])

# Same columns and index as df_td; only the feature columns are replaced.
imputed_df2 = df_td.copy()
imputed_df2[feature_cols2] = imputed_data2

In [None]:
# From here on the cancer frame is the KNN-imputed version.
df_tc = imputed_df

In [None]:
# From here on the control frame is the KNN-imputed version.
df_td = imputed_df2

In [None]:
# Sanity check: (rows, columns) of the control frame, then of the cancer frame.
print(df_td.shape)
print(df_tc.shape)

## 컬럼 제거



In [None]:
# Remove the 'No.' column from both groups
df_tc = df_tc.drop('No.', axis=1)
df_td = df_td.drop('No.', axis=1)

#### 췌장암

In [None]:
if is_dropcol:
  # Keep a 3-marker view of the cancer group in a separate frame; df_tc itself is untouched.
  # (An earlier variant instead dropped TSP-2, G-CSF, FGF-1, MIF, IL-6, TNFa,
  #  Cyfra21-1, FGF2, bHCG, HE4 and TGFa from df_tc.)
  df_tc_drop = df_tc.loc[:, ['CA19-9', 'IL-8', 'GDF15']]

#### 대조군

In [None]:
if is_dropcol:
  # Keep a 3-marker view of the control group in a separate frame; df_td itself is untouched.
  # (An earlier variant instead dropped TSP-2, G-CSF, FGF-1, MIF, IL-6, TNFa,
  #  Cyfra21-1, FGF2, bHCG, HE4 and TGFa from df_td.)
  df_td_drop = df_td.loc[:, ['CA19-9', 'IL-8', 'GDF15']]

### histplot

In [None]:
# for i in df_td.columns:
#   plt.figure(figsize=(12,6))
#   sns.histplot(df_td[i],bins=30)

In [None]:
# for i in df_tc.columns:
#   plt.figure(figsize=(12,6))
#   sns.histplot(df_tc[i],bins=30)

## 이상치 제거(IQR)

In [None]:
# (rows, columns) before outlier removal: control frame, then cancer frame.
print(df_td.shape)
print(df_tc.shape)

In [None]:
if is_IQR:
  # Filter the cancer group one column at a time; each pass only sees the rows
  # that survived the previous columns.
  # NOTE: rows are removed here while df_tc_Stage keeps every original row, so
  # stage labels have to be matched back by index afterwards.
  for col_name in df_tc.columns:
    values = df_tc[col_name]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)

    # Fence width is 4 * IQR (wider than the conventional 1.5 * IQR)
    fence = 4 * (q3 - q1)

    # A row is an outlier when it lies outside [Q1 - fence, Q3 + fence]
    is_outlier = values.lt(q1 - fence) | values.gt(q3 + fence)

    # Keep the remaining rows
    df_tc = df_tc.loc[~is_outlier]

In [None]:
if is_IQR:
  # Filter the control group one column at a time; each pass only sees the rows
  # that survived the previous columns.
  for col_name in df_td.columns:
    values = df_td[col_name]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)

    # Fence width is 4 * IQR (wider than the conventional 1.5 * IQR)
    fence = 4 * (q3 - q1)

    # A row is an outlier when it lies outside [Q1 - fence, Q3 + fence]
    is_outlier = values.lt(q1 - fence) | values.gt(q3 + fence)

    # Keep the remaining rows
    df_td = df_td.loc[~is_outlier]

In [None]:
# (rows, columns) after the optional outlier removal: control frame, then cancer frame.
print(df_td.shape)
print(df_tc.shape)

## 표준화(데이터 합쳐서 진행)

In [None]:
# Stack the cancer rows on top of the control rows and renumber the index from 0
df_t = pd.concat((df_tc, df_td), axis=0, ignore_index=True)

In [None]:
# Standardize every biomarker column over the combined cancer + control frame
scaler = StandardScaler()

# Leave the label column out if a later cell has already added it to df_t
# (happens when this cell is re-run), so the label is never scaled.
feature_cols = df_t.columns.drop('Stage(TNM)', errors='ignore')
df_ts = scaler.fit_transform(df_t[feature_cols])
df_ts = pd.DataFrame(df_ts, columns=feature_cols)

# Re-attach Stage(TNM). The cancer rows come first in df_t, so take the stage of
# the rows still present in df_tc (IQR filtering may have removed some) and
# renumber them 0..k-1 to line up with df_t's fresh index. Assigning
# df_tc_Stage directly would pair labels with the wrong rows after filtering.
stage_tc = df_tc_Stage.loc[df_tc.index].reset_index(drop=True)
df_ts['Stage(TNM)'] = stage_tc
# Rows without a stage (the control rows) get 0.
# NOTE(review): a cancer row whose stage is missing would also become 0 — confirm none exist.
df_ts['Stage(TNM)'] = df_ts['Stage(TNM)'].fillna(0)

df_ts

In [None]:
# Attach Stage(TNM) to the unstandardized frame. The cancer rows come first in
# df_t, so pick the stage of the rows still present in df_tc (IQR filtering may
# have removed some) and renumber them to match df_t's fresh 0..N-1 index.
df_t['Stage(TNM)'] = df_tc_Stage.loc[df_tc.index].reset_index(drop=True)
# Rows without a stage (the control rows) get 0.
df_t['Stage(TNM)'] = df_t['Stage(TNM)'].fillna(0)

if is_dropcol:
  # Same construction for the 3-marker subset frames.
  # NOTE(review): the *_drop frames were taken before the IQR cells, so with
  # is_IQR enabled df_t_drop can hold more rows than df_t — confirm intended.
  df_t_drop = pd.concat([df_tc_drop, df_td_drop], ignore_index=True)
  df_t_drop['Stage(TNM)'] = df_tc_Stage.loc[df_tc_drop.index].reset_index(drop=True)
  df_t_drop['Stage(TNM)'] = df_t_drop['Stage(TNM)'].fillna(0)

In [None]:
# # CSV 파일로 저장
# df_t.to_csv(base_path + "concat_data.csv", index=False)

### Stage를 0/1 이진 레이블로 변경

In [None]:
# Binary-label copies: every stage >= 1 becomes 1, stage 0 stays 0.
df_t01 = df_t.copy()
df_t01['Stage(TNM)'] = df_t01['Stage(TNM)'].mask(df_t01['Stage(TNM)'] >= 1, 1)

df_ts01 = df_ts.copy()
df_ts01['Stage(TNM)'] = df_ts01['Stage(TNM)'].mask(df_ts01['Stage(TNM)'] >= 1, 1)

if is_dropcol:
  df_t01_drop = df_t_drop.copy()
  df_t01_drop['Stage(TNM)'] = df_t01_drop['Stage(TNM)'].mask(df_t01_drop['Stage(TNM)'] >= 1, 1)

In [None]:
# Split the unstandardized binary-label frame into label 0 and label 1 rows
df_t0 = df_t01[df_t01['Stage(TNM)'].eq(0)]
df_t1 = df_t01[df_t01['Stage(TNM)'].eq(1)]

# Same split for the standardized frame
df_ts0 = df_ts01[df_ts01['Stage(TNM)'].eq(0)]
df_ts1 = df_ts01[df_ts01['Stage(TNM)'].eq(1)]

if is_dropcol:
  # ... and for the 3-marker subset
  df_t0_drop = df_t01_drop[df_t01_drop['Stage(TNM)'].eq(0)]
  df_t1_drop = df_t01_drop[df_t01_drop['Stage(TNM)'].eq(1)]

In [None]:
# Display the unstandardized frame with the binary label.
df_t01

### Stage를 0 / 1–2 / 3–4 세 그룹으로 분리

In [None]:
# Three groups by stage: 0, 1-2 and 3-4 (unstandardized frame)
df_t00 = df_t[df_t['Stage(TNM)'].eq(0)]
df_t12 = df_t[df_t['Stage(TNM)'].isin([1, 2])]
df_t34 = df_t[df_t['Stage(TNM)'].isin([3, 4])]

# Same three groups from the standardized frame
df_ts00 = df_ts[df_ts['Stage(TNM)'].eq(0)]
df_ts12 = df_ts[df_ts['Stage(TNM)'].isin([1, 2])]
df_ts34 = df_ts[df_ts['Stage(TNM)'].isin([3, 4])]

if is_dropcol:
  # ... and from the 3-marker subset
  df_t00_drop = df_t_drop[df_t_drop['Stage(TNM)'].eq(0)]
  df_t12_drop = df_t_drop[df_t_drop['Stage(TNM)'].isin([1, 2])]
  df_t34_drop = df_t_drop[df_t_drop['Stage(TNM)'].isin([3, 4])]

# 히트맵

In [None]:
# Clustered heatmap of the standardized frame (rows = samples, columns = features).
# NOTE(review): df_ts still contains the unscaled 'Stage(TNM)' label column here,
# so the label takes part in the clustering — confirm this is intended.
sns.clustermap(data=df_ts)

In [None]:
# Clustered heatmap of the unstandardized frame, z-scored per column (z_score=1).
# NOTE(review): 'Stage(TNM)' is part of df_t at this point, so it is z-scored
# and clustered like a biomarker — confirm this is intended.
sns.clustermap(data=df_t, z_score=1)

In [None]:
# One colour per stage level actually present. Stages run from 0 to 4, so up to
# five colours are needed; zipping with "rgb" would leave every level beyond
# the third without a colour.
stage_levels = sorted(df_ts['Stage(TNM)'].unique())
lut = dict(zip(stage_levels, sns.color_palette("tab10", len(stage_levels)).as_hex()))

# Per-sample colour, indexed like df_ts
row_colors = df_ts['Stage(TNM)'].map(lut)

# Transpose so that samples run along the x-axis and features along the y-axis
df_trans = df_ts.transpose()

# Heatmap without clustering. After the transpose the samples are the columns,
# so the per-sample colours belong on the column axis; passed as row_colors
# they would be matched against the feature names and come out blank.
sns.clustermap(df_trans, row_cluster=False, col_cluster=False, col_colors=row_colors, figsize=(6, 6))

# Render the figure
plt.show()

In [None]:
# One colour per stage level actually present. Stages run from 0 to 4, so up to
# five colours are needed; zipping with "rgb" would leave every level beyond
# the third without a colour.
stage_levels = sorted(df_ts['Stage(TNM)'].unique())
lut = dict(zip(stage_levels, sns.color_palette("tab10", len(stage_levels)).as_hex()))

# Per-sample colour, indexed like df_ts
row_colors = df_ts['Stage(TNM)'].map(lut)

# Features as clustered rows, samples as columns in their original order,
# with the stage colour strip along the sample axis
sns.clustermap(df_ts.drop(columns=['Stage(TNM)']).T, col_colors=row_colors, row_cluster=True, col_cluster=False)

In [None]:
# Colour per binary label (the levels present are paired with the letters of "rgb" in order)
binary_stage = df_ts01['Stage(TNM)']
lut = {level: colour for level, colour in zip(binary_stage.unique(), "rgb")}

# Per-sample colour, indexed like df_ts01
row_colors = binary_stage.map(lut)

# Features as clustered rows, samples as columns in their original order
features_by_sample = df_ts01.drop('Stage(TNM)', axis=1).T
sns.clustermap(features_by_sample, col_colors=row_colors, row_cluster=True, col_cluster=False)

In [None]:
# Fixed colour per binary label
lut = dict([(0, "r"), (1, "g")])

# Per-sample colour strips for the label-0 and label-1 frames
col_colors0, col_colors1 = (
    frame['Stage(TNM)'].map(lut) for frame in (df_ts0, df_ts1)
)

# One clustermap per group; only the label-1 map clusters its rows
g_0 = sns.clustermap(df_ts0.T, row_cluster=False, col_cluster=False, col_colors=col_colors0)
g_1 = sns.clustermap(df_ts1.T, row_cluster=True, col_cluster=False, col_colors=col_colors1)

# Render the figures
plt.show()

In [None]:
import matplotlib.gridspec as gridspec

# Row-clustered maps of the unstandardized label-0 / label-1 frames
# (colour strips come from the col_colors0 / col_colors1 built in an earlier cell)
g_0 = sns.clustermap(df_t0.T, col_colors=col_colors0, row_cluster=True, col_cluster=False)
g_1 = sns.clustermap(df_t1.T, col_colors=col_colors1, row_cluster=True, col_cluster=False)

# New figure with two equally wide panels
fig = plt.figure(figsize=(12, 6))
gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1])
ax0 = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])

# Re-draw each clustermap's reordered matrix as an image, titled and without axes
for panel_ax, grid, panel_title in ((ax0, g_0, 'Clustermap 1'), (ax1, g_1, 'Clustermap 2')):
    panel_ax.imshow(grid.data2d, aspect='auto', cmap='coolwarm', origin='lower')
    panel_ax.set_title(panel_title)
    panel_ax.axis('off')

# Render
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.gridspec as gridspec

# Combined figure with two equally wide panels
fig = plt.figure(figsize=(20, 11))
gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1])
ax0 = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])

# Split the standardized binary-label frame by label (df_0 / df_1 are reused by later cells)
df_0 = df_ts01[df_ts01['Stage(TNM)'] == 0]
df_1 = df_ts01[df_ts01['Stage(TNM)'] == 1]

# Fixed colour per binary label
lut = {0: "r", 1: "g"}

col_colors0 = df_ts0['Stage(TNM)'].map(lut)
col_colors1 = df_ts1['Stage(TNM)'].map(lut)

# Row-clustered map per group; each call opens its own figure and makes it current
g_0 = sns.clustermap(df_ts0.T, col_colors=col_colors0, row_cluster=True, col_cluster=False)
g_1 = sns.clustermap(df_ts1.T, col_colors=col_colors1, row_cluster=True, col_cluster=False)

# Re-draw each clustermap's reordered matrix as an image on the combined figure
ax0.imshow(g_0.data2d, aspect='auto', cmap='RdBu', origin='lower')
ax1.imshow(g_1.data2d, aspect='auto', cmap='RdBu', origin='lower')

# Only the right-hand panel hides its axes
ax1.axis('off')

# Tighten the combined figure explicitly: plt.tight_layout() would act on the
# current figure, which at this point is the last clustermap, not `fig`.
fig.tight_layout()
plt.show()

In [None]:
# Render two clustermaps off-screen and paste their rasterized images side by
# side into one figure. Uses col_colors0 / col_colors1 from an earlier cell.
from matplotlib import gridspec
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

# Create a new figure with constrained layout
fig = plt.figure(figsize=(20, 15), constrained_layout=True)

# Create a gridspec
gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1], wspace=0.03)  # wspace is the width space

# Create separate clustermaps
# Each clustermap opens its own figure and makes it current, so the plt.close()
# right after it closes that clustermap figure; the order of these calls matters.
g_0 = sns.clustermap(df_ts0.T, col_colors=col_colors0, row_cluster=False, col_cluster=False, figsize=(10, 15), cbar_pos=None)
plt.close()  # Close the plot to prevent it from displaying
g_1 = sns.clustermap(df_ts1.T, col_colors=col_colors1, row_cluster=False, col_cluster=False, figsize=(10, 15), cbar_pos=None)
plt.close()  # Close the plot to prevent it from displaying

# Create a new axes
ax0 = fig.add_subplot(gs[0])
ax1 = fig.add_subplot(gs[1])

# Draw the clustermaps onto the new axes
# (each closed figure is rendered with an Agg canvas and its RGBA buffer is shown as an image)
canvas0 = FigureCanvas(g_0.fig)
canvas0.draw()
ax0.imshow(canvas0.buffer_rgba())

canvas1 = FigureCanvas(g_1.fig)
canvas1.draw()
ax1.imshow(canvas1.buffer_rgba())

# Remove x and y ticks from the new axes
ax0.set_xticks([])
ax0.set_yticks([])
ax1.set_xticks([])
ax1.set_yticks([])

# Show the final plot
plt.show()

In [None]:
# Two equally wide panels with almost no gap between them
fig, (ax1, ax2) = plt.subplots(
    nrows=1, ncols=2, figsize=(20, 15),
    gridspec_kw=dict(width_ratios=[1, 1], wspace=0.003),
)

# Left panel: label-0 samples (unstandardized), no colour bar
sns.heatmap(df_t0.T, ax=ax1, cmap="RdBu", cbar=False)
# Solid red strip, one data unit high, spanning all label-0 samples
stage0_strip = plt.Rectangle((0, 0), len(df_t0), 1, edgecolor="red", facecolor="red", lw=0)
ax1.add_patch(stage0_strip)

# Right panel: label-1 samples (unstandardized), with the colour bar
sns.heatmap(df_t1.T, ax=ax2, cmap="RdBu", cbar=True, yticklabels=False)
# Solid green strip, one data unit high, spanning all label-1 samples
stage1_strip = plt.Rectangle((0, 0), len(df_t1), 1, edgecolor="green", facecolor="green", lw=0)
ax2.add_patch(stage1_strip)

# Column names as horizontal y tick labels on the left panel
# (df_0 is defined in an earlier cell)
ax1.set_yticklabels(df_0.columns, rotation=0)

# The right panel shows no axes
ax2.axis('off')

plt.show()

In [None]:
# Two equally wide panels with a small gap between them
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 15), gridspec_kw={'width_ratios': [1, 1], 'wspace': 0.01})

# Left panel: label-0 samples (unstandardized), no colour bar
sns.heatmap(df_t0.T, ax=ax1, cmap="coolwarm", cbar=False)

# Solid red strip, one data unit high, spanning all label-0 samples
ax1.add_patch(plt.Rectangle((0, 0), df_t0.shape[0], 1, edgecolor="red", facecolor="red", lw=0))

# Right panel: label-1 samples from the same unstandardized frame as the left
# panel. Plotting df_1 (the standardized split) here would put two different
# value scales next to each other under one colour bar.
sns.heatmap(df_t1.T, ax=ax2, cmap="coolwarm", cbar=True, yticklabels=False)

# Solid green strip, one data unit high, spanning all label-1 samples
ax2.add_patch(plt.Rectangle((0, 0), df_t1.shape[0], 1, edgecolor="green", facecolor="green", lw=0))

# Column names as horizontal y tick labels on the left panel
ax1.set_yticklabels(df_t0.columns, rotation=0)

# Remove x-axis labels and ticks from the left panel
ax1.set_xticks([])
ax1.set_xticklabels([])
ax1.set_xlabel("")

# The right panel shows no axes
ax2.axis('off')

plt.show()

In [None]:
# Two equally wide panels with a small gap between them
fig, (ax1, ax2) = plt.subplots(
    nrows=1, ncols=2, figsize=(20, 15),
    gridspec_kw=dict(width_ratios=[1, 1], wspace=0.005),
)

# Both panels share the colour map and colour limits
shared_kws = dict(cmap="coolwarm", vmin=-2.5, vmax=3)

# Left panel: label-0 samples; right panel: label-1 samples plus the colour bar
sns.heatmap(df_t0.T, ax=ax1, cbar=False, **shared_kws)
sns.heatmap(df_t1.T, ax=ax2, cbar=True, yticklabels=False, **shared_kws)

# Column names as horizontal y tick labels on the left panel
ax1.set_yticklabels(df_t0.columns, rotation=0)

# No x ticks or x label on the left panel
ax1.set_xticks([])
ax1.set_xticklabels([])
ax1.set_xlabel("")

# The right panel shows no axes
ax2.axis('off')

# Thin extra axes directly above each panel, used as group strips
box1 = ax1.get_position()
box2 = ax2.get_position()
colorbar_ax1 = fig.add_axes([box1.x0, box1.y1 + 0.005, box1.width, 0.02])
colorbar_ax2 = fig.add_axes([box2.x0, box2.y1 + 0.005, box2.width, 0.02])

# Left strip: label text plus a solid red fill
colorbar_ax1.text(-0.13, 0.5, 'Stage(TNM)', va='center', ha='left', fontsize=12)
colorbar_ax1.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="red", edgecolor="red"))
colorbar_ax1.axis('off')

# Right strip: solid green fill
colorbar_ax2.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="green", edgecolor="green"))
colorbar_ax2.axis('off')

plt.show()


In [None]:
# Two equally wide panels with a small gap between them
fig, (ax1, ax2) = plt.subplots(
    nrows=1, ncols=2, figsize=(20, 15),
    gridspec_kw=dict(width_ratios=[1, 1], wspace=0.005),
)

# Feature-only copies: df_0 / df_1 are replaced only when the label column is present
if 'Stage(TNM)' in df_t0.columns:
    df_0 = df_t0.drop('Stage(TNM)', axis=1)
if 'Stage(TNM)' in df_t1.columns:
    df_1 = df_t1.drop('Stage(TNM)', axis=1)

# Both panels share the colour map and colour limits
shared_kws = dict(cmap="coolwarm", vmin=-2, vmax=2)

# Left panel without colour bar, right panel with it
sns.heatmap(df_0.T, ax=ax1, cbar=False, **shared_kws)
sns.heatmap(df_1.T, ax=ax2, cbar=True, yticklabels=False, **shared_kws)

# Column names as horizontal y tick labels on the left panel
ax1.set_yticklabels(df_0.columns, rotation=0)

# No x ticks or x label on the left panel
ax1.set_xticks([])
ax1.set_xticklabels([])
ax1.set_xlabel("")

# The right panel shows no axes
ax2.axis('off')

# Thin extra axes directly above each panel, used as group strips
box1 = ax1.get_position()
box2 = ax2.get_position()
colorbar_ax1 = fig.add_axes([box1.x0, box1.y1 + 0.005, box1.width, 0.02])
colorbar_ax2 = fig.add_axes([box2.x0, box2.y1 + 0.005, box2.width, 0.02])

# Left strip: label text plus a solid red fill
colorbar_ax1.text(-0.13, 0.5, 'Stage(TNM)', va='center', ha='left', fontsize=12)
colorbar_ax1.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="red", edgecolor="red"))
colorbar_ax1.axis('off')

# Right strip: solid green fill
colorbar_ax2.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="green", edgecolor="green"))
colorbar_ax2.axis('off')

plt.show()

In [None]:
# Two equally wide panels with a small gap between them
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 15), gridspec_kw={'width_ratios': [1, 1], 'wspace': 0.005})

# The colour bar below is titled "z-score" and the limits are z-score sized, so
# plot the standardized splits (df_ts0 / df_ts1); the unstandardized df_t0 /
# df_t1 hold raw values that such limits would simply clip.

# Left panel: label-0 samples, framed in black
cax1 = sns.heatmap(df_ts0.T, ax=ax1, cmap="coolwarm", cbar=False, vmin = -2.5, vmax=3)
cax1.add_patch(plt.Rectangle((0, 0), df_ts0.shape[0], len(df_ts0.columns), fill=False, edgecolor="black", lw=2))

# Right panel: label-1 samples, framed in black
cax2 = sns.heatmap(df_ts1.T, ax=ax2, cmap="coolwarm", cbar=False, yticklabels=False, vmin = -2.5, vmax=3)
cax2.add_patch(plt.Rectangle((0, 0), df_ts1.shape[0], len(df_ts1.columns), fill=False, edgecolor="black", lw=2))

# Column names as horizontal y tick labels on the left panel
ax1.set_yticklabels(df_ts0.columns, rotation=0)

# Remove x-axis labels and ticks from the left panel
ax1.set_xticks([])
ax1.set_xticklabels([])
ax1.set_xlabel("")

# The right panel shows no axes
ax2.axis('off')

# Thin extra axes directly above each panel, used as group strips
colorbar_ax1 = fig.add_axes([ax1.get_position().x0, ax1.get_position().y1 + 0.005, ax1.get_position().width, 0.02])
colorbar_ax2 = fig.add_axes([ax2.get_position().x0, ax2.get_position().y1 + 0.005, ax2.get_position().width, 0.02])

# Black outline rectangles on the strips
colorbar_ax1.add_patch(plt.Rectangle((-1, -1), 2, 2, facecolor="none", edgecolor="black", linewidth=2))
colorbar_ax2.add_patch(plt.Rectangle((-1, -1), 2, 2, facecolor="none", edgecolor="black", linewidth=2))

# Left strip: label text plus a solid red fill
colorbar_ax1.text(-0.13, 0.5, 'Stage(TNM)', verticalalignment='center', horizontalalignment='left', fontsize=12)
colorbar_ax1.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="red", edgecolor="red"))
colorbar_ax1.axis('off')

# Right strip: solid green fill
colorbar_ax2.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="green", edgecolor="green"))
colorbar_ax2.axis('off')

# Separate colour bar for the heatmap values, taken from the right panel's mesh
cbar_ax = fig.add_axes([0.92, 0.4, 0.02, 0.2])
cbar = plt.colorbar(ax2.collections[0], cax=cbar_ax)
cbar_ax.set_title('z-score', pad=10, fontsize=14)

plt.show()

In [None]:
from matplotlib.patches import Patch

# Two equally wide panels with a small gap between them
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 15), gridspec_kw={'width_ratios': [1, 1], 'wspace': 0.007})

# The colour bar below is titled "z-score" and the limits are +/-2, so plot the
# standardized splits (df_ts0 / df_ts1) rather than the raw-valued df_t0 / df_t1.

# Left panel: label-0 samples, framed in black
cax1 = sns.heatmap(df_ts0.T, ax=ax1, cmap="coolwarm", cbar=False, vmin = -2, vmax=2)
cax1.add_patch(plt.Rectangle((0, 0), df_ts0.shape[0], len(df_ts0.columns), fill=False, edgecolor="black", lw=3))

# Right panel: label-1 samples, framed in black
cax2 = sns.heatmap(df_ts1.T, ax=ax2, cmap="coolwarm", cbar=False, yticklabels=False, vmin = -2, vmax=2)
cax2.add_patch(plt.Rectangle((0, 0), df_ts1.shape[0], len(df_ts1.columns), fill=False, edgecolor="black", lw=3))

# Column names as horizontal y tick labels on the left panel
ax1.set_yticklabels(df_ts0.columns, rotation=0)

# Remove x-axis labels and ticks from the left panel
ax1.set_xticks([])
ax1.set_xticklabels([])
ax1.set_xlabel("")

# The right panel shows no axes
ax2.axis('off')

# Thin extra axes directly above each panel, used as group strips
colorbar_ax1 = fig.add_axes([ax1.get_position().x0, ax1.get_position().y1 + 0.005, ax1.get_position().width, 0.025])
colorbar_ax2 = fig.add_axes([ax2.get_position().x0, ax2.get_position().y1 + 0.005, ax2.get_position().width, 0.025])

# The left panel holds the label-0 rows (rows that had no stage), i.e. the HC
# group, so it gets the HC colour; the right panel holds the staged rows and
# gets the PDA colour. The strips were previously the other way round.
colorbar_ax1.text(-0.075, 0.5, 'Group', verticalalignment='center', horizontalalignment='left', fontsize=12)
colorbar_ax1.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="#1874CD", edgecolor="black", lw=3))
colorbar_ax1.axis('off')

colorbar_ax2.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="#B22222", edgecolor="black", lw=3))
colorbar_ax2.axis('off')

# Separate colour bar for the heatmap values, taken from the right panel's mesh
cbar_ax = fig.add_axes([0.92, 0.4, 0.02, 0.2])
cbar = plt.colorbar(ax2.collections[0], cax=cbar_ax)
cbar_ax.set_title('z-score', pad=10, fontsize=14)

# Legend for the two groups, listed in panel order (left, right)
legend_labels = [Patch(facecolor="#1874CD", edgecolor="black", label='HC'),
                 Patch(facecolor="#B22222", edgecolor="black", label='PDA')]
fig.legend(handles=legend_labels, loc='upper right', fontsize=12, frameon=False)

plt.show()


In [None]:
# # Create a figure to contain the subplots
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 15), gridspec_kw={'width_ratios': [1, 1], 'wspace': 0.007})

# # Verify if 'Stage(TNM)' exists in the DataFrame, if so, drop it
# if 'Stage(TNM)' in df_ts0.columns:
#     df_ts0 = df_ts0.drop(columns=['Stage(TNM)'])
# if 'Stage(TNM)' in df_ts1.columns:
#     df_ts1 = df_ts1.drop(columns=['Stage(TNM)'])

# # Create the first heatmap
# cax1 = sns.heatmap(df_ts0.T, ax=ax1, cmap="coolwarm", cbar=False, vmin = -2, vmax=2)
# cax1.add_patch(plt.Rectangle((0, 0), df_ts.shape[0], len(df_ts.columns), fill=False, edgecolor="black", lw=3))

# # Create the second heatmap
# cax2 = sns.heatmap(df_ts1.T, ax=ax2, cmap="coolwarm", cbar=False, yticklabels=False, vmin = -2, vmax=2)
# cax2.add_patch(plt.Rectangle((0, 0), df_ts1.shape[0], len(df_ts.columns), fill=False, edgecolor="black", lw=3))

# # Add column names to the left of the first heatmap
# ax1.set_yticklabels(df_ts.columns, rotation=0)

# # Remove x-axis labels and ticks from the first heatmap
# ax1.set_xticks([])
# ax1.set_xticklabels([])
# ax1.set_xlabel("")

# # Hide the axes of the second heatmap
# ax2.axis('off')

# # Add a color bar above the heatmaps to separate samples based on Stage(TNM)
# colorbar_ax1 = fig.add_axes([ax1.get_position().x0, ax1.get_position().y1 + 0.005, ax1.get_position().width, 0.025])
# colorbar_ax2 = fig.add_axes([ax2.get_position().x0, ax2.get_position().y1 + 0.005, ax2.get_position().width, 0.025])

# colorbar_ax1.text(-0.085, 0.5, 'Group', verticalalignment='center', horizontalalignment='left', fontsize=14, va='center')
# colorbar_ax1.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="#B22222", edgecolor="black", lw=3))
# colorbar_ax1.axis('off')

# colorbar_ax2.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="#1874CD", edgecolor="black", lw=3))
# colorbar_ax2.axis('off')

# # Add a separate color bar (cbar) for the z-score
# cbar_ax = fig.add_axes([0.92, 0.6, 0.015, 0.1])
# cbar = plt.colorbar(ax2.collections[0], cax=cbar_ax)
# cbar_ax.set_title('Z-score', pad=10, fontsize=15, fontweight='bold')
# cbar.ax.tick_params(labelsize=15)  # Set fontsize for cbar ticks to 15

# # Add a legend for Stage(TNM)
# legend_labels = [Patch(facecolor="#B22222", edgecolor="black", label='PDA'),
#                  Patch(facecolor="#1874CD", edgecolor="black", label='HC')]
# legend = fig.legend(handles=legend_labels, loc=(0.92, 0.80), fontsize=15, frameon=False, title='Group')
# legend.get_title().set_fontsize('15') # Legend title fontsize
# legend.get_title().set_fontweight('bold') # Legend title font weight

# plt.show()

In [None]:
# Final two-panel z-score heatmap (HC on the left, PDA on the right), saved under img/.
# `Patch` comes from the `matplotlib.patches` import in an earlier cell.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 15), gridspec_kw={'width_ratios': [1, 1], 'wspace': 0.007})

# Drop the label column so only features are drawn (guarded, so the cell can be re-run)
if 'Stage(TNM)' in df_ts0.columns:
    df_ts0 = df_ts0.drop(columns=['Stage(TNM)'])
if 'Stage(TNM)' in df_ts1.columns:
    df_ts1 = df_ts1.drop(columns=['Stage(TNM)'])

# Left panel: label-0 samples, framed in black
cax1 = sns.heatmap(df_ts0.T, ax=ax1, cmap="coolwarm", cbar=False, vmin = -2, vmax=2)
cax1.add_patch(plt.Rectangle((0, 0), df_ts0.shape[0], len(df_ts0.columns), fill=False, edgecolor="black", lw=3))

# Right panel: label-1 samples, framed in black
cax2 = sns.heatmap(df_ts1.T, ax=ax2, cmap="coolwarm", cbar=False, yticklabels=False, vmin = -2, vmax=2)
cax2.add_patch(plt.Rectangle((0, 0), df_ts1.shape[0], len(df_ts1.columns), fill=False, edgecolor="black", lw=3))

# Column names as horizontal y tick labels on the left panel
ax1.set_yticklabels(df_ts0.columns, rotation=0)

# Remove x-axis labels and ticks from the left panel
ax1.set_xticks([])
ax1.set_xticklabels([])
ax1.set_xlabel("")

# Read back the current y tick labels,
old_y_labels = ax1.get_yticklabels()
# replace any line break inside a label with a space,
new_y_labels = [label.get_text().replace("\n", " ") for label in old_y_labels]
# and set them again with a larger font.
ax1.set_yticklabels(new_y_labels, fontsize=13)

# The right panel shows no axes
ax2.axis('off')

# Thin extra axes directly above each panel, used as group strips
colorbar_ax1 = fig.add_axes([ax1.get_position().x0, ax1.get_position().y1 + 0.005, ax1.get_position().width, 0.025])
colorbar_ax2 = fig.add_axes([ax2.get_position().x0, ax2.get_position().y1 + 0.005, ax2.get_position().width, 0.025])

# Left strip: 'Group' text plus the HC colour
colorbar_ax1.text(-0.085, 0.5, 'Group', verticalalignment='center', horizontalalignment='left', fontsize=14)
colorbar_ax1.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="#1874CD", edgecolor="black", lw=3))
colorbar_ax1.axis('off')

# Right strip: the PDA colour
colorbar_ax2.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor="#B22222", edgecolor="black", lw=3))
colorbar_ax2.axis('off')

# Separate colour bar for the z-scores, taken from the right panel's mesh
cbar_ax = fig.add_axes([0.92, 0.6, 0.015, 0.1])
cbar = plt.colorbar(ax2.collections[0], cax=cbar_ax)
cbar_ax.set_title('Z-score', pad=10, fontsize=15, fontweight='bold')
cbar.ax.tick_params(labelsize=15)  # Set fontsize for cbar ticks to 15

# Legend for the two groups
legend_labels = [Patch(facecolor="#1874CD", edgecolor="black", label='HC'),
                Patch(facecolor="#B22222", edgecolor="black", label='PDA')]
legend = fig.legend(handles=legend_labels, loc=(0.92, 0.80), fontsize=15, frameon=False, title='Group')
legend.get_title().set_fontsize('15') # Legend title fontsize
legend.get_title().set_fontweight('bold') # Legend title font weight

# savefig does not create missing folders, so make sure img/ exists first
os.makedirs(base_path + 'img', exist_ok=True)
plt.savefig(base_path + 'img/heatmap.png')
plt.show()

In [None]:
from matplotlib import gridspec
from matplotlib.patches import Patch

# # Create a figure to contain the subplots
# fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 15))

# Keep only marker columns in each group frame: remove the 'Stage(TNM)' label
# column when it is present (errors='ignore' makes the drop a no-op otherwise).
df_ts00 = df_ts00.drop(columns=['Stage(TNM)'], errors='ignore')
df_ts12 = df_ts12.drop(columns=['Stage(TNM)'], errors='ignore')
df_ts34 = df_ts34.drop(columns=['Stage(TNM)'], errors='ignore')

# One row of three panels; each panel's width is proportional to the number of
# rows in its frame.
fig = plt.figure(figsize=(20, 15))
gs = gridspec.GridSpec(1, 3, width_ratios=[len(df_ts00), len(df_ts12), len(df_ts34)])
plt.subplots_adjust(wspace=0.007)
ax1, ax2, ax3 = [plt.subplot(gs[k]) for k in range(3)]

# (frame, axes, colour of the group strip drawn above the panel)
heatmap_panels = [
    (df_ts00, ax1, "#1874CD"),
    (df_ts12, ax2, "#FF9614"),
    (df_ts34, ax3, "#B22222"),
]

# Draw every frame transposed (one heatmap row per column of the frame) on a
# shared -2..2 colour scale and outline it in black. Only the first panel keeps
# its y tick labels.
for frame, panel_ax, _ in heatmap_panels:
    heatmap_options = dict(ax=panel_ax, cmap="coolwarm", cbar=False, vmin=-2, vmax=2)
    if panel_ax is not ax1:
        heatmap_options["yticklabels"] = False
    outlined = sns.heatmap(frame.T, **heatmap_options)
    outlined.add_patch(plt.Rectangle((0, 0), frame.shape[0], len(frame.columns), fill=False, edgecolor="black", lw=3))

# Column names go to the left of the first panel; its x axis is left bare.
ax1.set_yticklabels(df_ts00.columns, rotation=0)
ax1.set_xticks([])
ax1.set_xticklabels([])
ax1.set_xlabel("")

# The other two panels show no axes at all.
for hidden_ax in (ax2, ax3):
    hidden_ax.axis('off')

# Thin coloured strip just above each panel identifying its group; the first
# strip also carries the 'Group' caption to its left.
for _, panel_ax, strip_color in heatmap_panels:
    panel_box = panel_ax.get_position()
    strip_ax = fig.add_axes([panel_box.x0, panel_box.y1 + 0.003, panel_box.width, 0.025])
    if panel_ax is ax1:
        strip_ax.text(-0.085, 0.5, 'Group', verticalalignment='center', horizontalalignment='left', fontsize=14, va='center')
    strip_ax.add_patch(plt.Rectangle((0, 0), 1, 1, facecolor=strip_color, edgecolor="black", lw=3))
    strip_ax.axis('off')

# Stand-alone colour bar for the z-score scale, taken from the second panel's mesh.
cbar_ax = fig.add_axes([0.92, 0.6, 0.015, 0.1])
cbar = plt.colorbar(ax2.collections[0], cax=cbar_ax)
cbar_ax.set_title('Z-score', pad=10, fontsize=15, fontweight='bold')
cbar.ax.tick_params(labelsize=15)

# Legend mapping strip colours to group names.
legend_labels = [
    Patch(facecolor=group_color, edgecolor="black", label=group_label)
    for group_color, group_label in (
        ("#1874CD", 'HC'),
        ("#FF9614", '(stage I/II)\nPDA'),
        ("#B22222", '(stage III/IV)\nPDA'),
    )
]
legend = fig.legend(handles=legend_labels, loc=(0.895, 0.80), fontsize=15, frameon=False, title='Group')
legend_title = legend.get_title()
legend_title.set_fontsize('15')
legend_title.set_fontweight('bold')

plt.savefig(base_path + 'img/heatmap2.png')
plt.show()

# 2. boxplot

In [None]:
df_ts01.columns

In [None]:
# Markers shown in the second boxplot panel. Every other column of df_ts01
# (including the 'Stage(TNM)' hue column) goes to the first panel. The list is
# written once so the two panels cannot drift apart.
box2_markers = ['TNFa',
                'sEGFR',
                'FGF-1',
                'Ferritin',
                'Kallikrein6',
                'Endoglin',
                'MIF',
                'ALDH1A1',
                'CD44',
                'VEGF',
                'sAXL',
                'G-CSF',
                'DKK-1',
                'sFas',
                'sPECAM-1',
                'bHCG',
                'IGFBP3',
                'FGF2',
                'sVEGFR1',
                'sHer2',
                'sE-Selectin',
                'EpCAM',
                'HE4',
                ]

# Panel 1: df_ts01 without the panel-2 markers.
df_ts01_box1 = df_ts01.drop(columns=box2_markers)

# Panel 2: the panel-2 markers plus the 'Stage(TNM)' column.
df_ts01_box2 = df_ts01[box2_markers + ['Stage(TNM)']]

# Reshape to long form: one row per (sample, marker) with 'Stage(TNM)' kept as
# the identifier column, the marker name in 'Variable' and its value in 'Value'.
df_melted1 = df_ts01_box1.melt(id_vars='Stage(TNM)', var_name='Variable', value_name='Value')
df_melted2 = df_ts01_box2.melt(id_vars='Stage(TNM)', var_name='Variable', value_name='Value')

# One notched boxplot figure per panel. `notch` is a boolean flag; the previous
# value 0.5 was merely truthy, so True draws the same plot.
for melted in (df_melted1, df_melted2):
    plt.figure(figsize=(12,6))
    sns.boxplot(x='Variable', y='Value', hue='Stage(TNM)', data=melted, notch=True)
    plt.xticks(rotation=45)  # rotate the x tick labels

# plt.ylim(-3,7.5)
plt.show()

In [None]:
df_ts01

In [None]:
df_melted1

In [None]:
# Boxplots without whisker caps and with small dot fliers, one figure per
# marker panel. The second figure previously re-plotted df_melted1 (copy-paste
# slip), so the second panel of markers was never shown; it now uses df_melted2.
# The y-limits are applied inside the loop so both figures share the same range
# (previously only the last figure was limited).
for melted in (df_melted1, df_melted2):
    plt.figure(figsize=(12,6))
    sns.boxplot(x='Variable', y='Value', hue='Stage(TNM)', data=melted, showcaps=False, flierprops={"marker": ".", "markersize": 3})
    plt.xticks(rotation=45)  # rotate the x tick labels
    plt.ylim(-3,7.5)

plt.show()

In [None]:
plt.figure(figsize=(30,6))

# Base boxplot.
sns.boxplot(x='Variable', y='Value', hue='Stage(TNM)', data=df_melted1, flierprops={"marker": ".", "markersize": 8})

# Flag values that fall outside the displayed y-range with a star.
# The stars are drawn AT the nearest axis limit (values are clipped to the
# range) and with clip_on=False; drawing them at their true value, as before,
# put them outside the axes where they were never visible.
ylim_lower, ylim_upper = -3, 7.5
for i, variable in enumerate(df_melted1['Variable'].unique()):
    for stage in df_melted1['Stage(TNM)'].unique():
        sub_df = df_melted1[(df_melted1['Variable'] == variable) & (df_melted1['Stage(TNM)'] == stage)]
        outliers = sub_df['Value'][(sub_df['Value'] < ylim_lower) | (sub_df['Value'] > ylim_upper)]
        if outliers.empty:
            continue

        # Shift left for stage 0 and right for any other stage so the star
        # lines up with the matching hue box (offsets assume two hue levels).
        x_position = i - 0.2 if stage == 0 else i + 0.2
        plt.scatter([x_position]*len(outliers),
                    outliers.clip(ylim_lower, ylim_upper),
                    marker='*', color='black', clip_on=False, zorder=3)

plt.xticks(rotation=45)
plt.ylim(ylim_lower, ylim_upper)
plt.show()

In [None]:
# Hue colours keyed by the 'Stage(TNM)' value.
my_palette = {0: '#1874CD', 1: '#B22222'}

# Render and save one legend-free boxplot per marker panel, in order:
# first df_melted1 -> boxplot1.png, then df_melted2 -> boxplot2.png.
for melted_frame, png_name in ((df_melted1, 'img/boxplot1.png'), (df_melted2, 'img/boxplot2.png')):
    plt.figure(figsize=(20,6))
    sns.boxplot(
        x='Variable',
        y='Value',
        hue='Stage(TNM)',
        data=melted_frame,
        showcaps=False,
        flierprops={"marker": ".", "markersize": 3},
        palette=my_palette,
    )
    plt.xticks(rotation=45)  # rotate the x tick labels
    plt.ylim(-3,7.5)
    plt.legend().remove()  # drop the legend
    plt.savefig(base_path + png_name)
    plt.show()

In [None]:
# Work on a copy of df_ts and merge the stage codes into three groups:
# 0 stays 0, {1, 2} -> 1, {3, 4} -> 2. The order matters: 1/2 are merged first,
# so the code 2 written by the second step is never re-mapped.
df_ts012 = df_ts.copy()
for merged_code, original_stages in ((1, [1, 2]), (2, [3, 4])):
    in_group = df_ts012['Stage(TNM)'].isin(original_stages)
    df_ts012.loc[in_group, 'Stage(TNM)'] = merged_code

# Markers that belong to the second boxplot panel; all remaining columns
# (including 'Stage(TNM)') form the first panel.
panel2_markers = [
    'TNFa', 'sEGFR', 'FGF-1', 'Ferritin', 'Kallikrein6', 'Endoglin',
    'MIF', 'ALDH1A1', 'CD44', 'VEGF', 'sAXL', 'G-CSF',
    'DKK-1', 'sFas', 'sPECAM-1', 'bHCG', 'IGFBP3', 'FGF2',
    'sVEGFR1', 'sHer2', 'sE-Selectin', 'EpCAM', 'HE4',
]
df_ts012_box1 = df_ts012.drop(columns=panel2_markers)
df_ts012_box2 = df_ts012[[*panel2_markers, 'Stage(TNM)']]

# Long form: 'Stage(TNM)' is the identifier, marker name in 'Variable',
# measurement in 'Value'. Note these reuse (overwrite) the df_melted1/2 names.
df_melted1 = df_ts012_box1.melt(id_vars='Stage(TNM)', var_name='Variable', value_name='Value')
df_melted2 = df_ts012_box2.melt(id_vars='Stage(TNM)', var_name='Variable', value_name='Value')

# Hue colours keyed by the merged 'Stage(TNM)' code.
my_palette = {0: '#1874CD', 1:'#FF9614', 2: '#B22222'}

# One legend-free boxplot per panel, saved as boxplot3.png and boxplot4.png.
for melted_frame, png_name in ((df_melted1, 'img/boxplot3.png'), (df_melted2, 'img/boxplot4.png')):
    plt.figure(figsize=(20,6))
    sns.boxplot(
        x='Variable',
        y='Value',
        hue='Stage(TNM)',
        data=melted_frame,
        showcaps=False,
        flierprops={"marker": ".", "markersize": 3},
        palette=my_palette,
    )
    plt.xticks(rotation=45)  # rotate the x tick labels
    plt.ylim(-3,7.5)
    plt.legend().remove()  # drop the legend
    plt.savefig(base_path + png_name)
    plt.show()

# SHAP

In [None]:
!pip install shap

In [None]:
import xgboost
import shap

# Prepare the data: every column except 'Stage(TNM)' is a feature, and
# 'Stage(TNM)' is the label.
X = df_ts.drop(columns='Stage(TNM)')
y = df_ts['Stage(TNM)']

# Train a booster for 100 rounds; only the learning rate is overridden.
# NOTE(review): no objective is passed, so xgboost's default is used with the
# raw stage codes as the target - confirm this is intended.
model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 100)

# Build the SHAP explainer for the trained booster.
explainer = shap.Explainer(model)

# Compute SHAP values on the feature matrix (X, not df_ts).
shap_values = explainer(X)

# Draw the summary plot. show=False keeps the figure open; with the default
# (show=True) the plot is displayed and finished inside summary_plot, and the
# savefig call below would write an empty image.
shap.summary_plot(shap_values, X, show=False)

plt.savefig(base_path + 'img/SHAP.png')
plt.show()

# AUC

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pandas as pd
import xgboost
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression

In [None]:
def rf_optimizer(trial, X, y, K):
    """Objective for tuning a RandomForestClassifier with K-fold cross-validation.

    Parameters
    ----------
    trial : object exposing ``suggest_categorical`` and ``suggest_int``
        Supplies the hyper-parameter values to evaluate.
    X, y : objects indexed positionally through ``.iloc``
        Feature rows and their labels.
    K : int
        Number of stratified, shuffled folds.

    Returns
    -------
    The mean of ``evaluation_metric(y_val, predicted_probabilities)`` over the
    K validation folds.
    """
    # Sampled hyper-parameters (asked for in this order).
    sampled = {
        'n_estimators': trial.suggest_categorical('n_estimators', [50, 100, 200]),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'max_features': trial.suggest_categorical('max_features', [0.6, 0.7, 0.8]),
    }

    # A single estimator instance is refitted on every fold.
    forest = RandomForestClassifier(criterion='log_loss',
                                    class_weight='balanced',
                                    **sampled)

    splitter = StratifiedKFold(n_splits=K, shuffle=True)

    def _fold_loss(fit_rows, held_out_rows):
        # Fit on the training rows, then score the probabilities predicted
        # for the held-out rows.
        forest.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
        held_out_proba = forest.predict_proba(X.iloc[held_out_rows, :])
        return evaluation_metric(y.iloc[held_out_rows], held_out_proba)

    fold_losses = [_fold_loss(fit_rows, held_out_rows)
                   for fit_rows, held_out_rows in splitter.split(X, y)]

    # Average score across the folds.
    return np.mean(fold_losses)

In [None]:
def xgb_optimizer(trial, X, y, K):
    """Objective for tuning an XGBClassifier with K-fold cross-validation.

    Parameters
    ----------
    trial : object exposing ``suggest_categorical``, ``suggest_int`` and ``suggest_float``
        Supplies the hyper-parameter values to evaluate.
    X, y : objects indexed positionally through ``.iloc``
        Feature rows and their labels.
    K : int
        Number of stratified, shuffled folds.

    Returns
    -------
    The mean of ``evaluation_metric(y_val, predicted_probabilities)`` over the
    K validation folds.
    """
    # Sampled hyper-parameters (asked for in this order).
    sampled = {
        'n_estimators': trial.suggest_categorical('n_estimators', [500, 1000, 2000]),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-2),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1, 2]),
    }

    # No scale_pos_weight is set here (the original note says class imbalance
    # is handled by a sampling method instead).
    booster = XGBClassifier(**sampled)

    splitter = StratifiedKFold(n_splits=K, shuffle=True)

    def _fold_loss(fit_rows, held_out_rows):
        # Fit on the training rows, then score the probabilities predicted
        # for the held-out rows.
        booster.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
        held_out_proba = booster.predict_proba(X.iloc[held_out_rows, :])
        return evaluation_metric(y.iloc[held_out_rows], held_out_proba)

    fold_losses = [_fold_loss(fit_rows, held_out_rows)
                   for fit_rows, held_out_rows in splitter.split(X, y)]

    return np.mean(fold_losses)

In [None]:
# ### SVM

# if is_tuning:
#     best_loss = 9999.0
#     best_C = 0
#     kernel = 'linear'
#     folds = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)

#     # for Linear SVM
#     for C in tqdm([1, 2, 5, 10, 100]):
#         losses = []
#         l_svm = LinearSVC(C=C, probability=True) ## cuml version. (faster model)

#         for train_idx, val_idx in folds.split(X, y):
#             X_train = X.iloc[train_idx, :]
#             y_train = y.iloc[train_idx]
#             X_val = X.iloc[val_idx, :]
#             y_val = y.iloc[val_idx]

#             l_svm.fit(X_train, y_train)
#             preds = l_svm.predict_proba(X_val).values
#             loss = evaluation_metric(y_val, preds)
#             losses.append(loss)

#         avg_loss = np.mean(losses)
#         if avg_loss < best_loss:
#             best_loss = avg_loss
#             best_C = C

#     # for SVM with RBF kernel.
#     for C in tqdm([1, 2, 5, 10, 100]):
#         losses = []
#         r_svm = SVC(C=C, probability=True) ## cuml version. (with rbf kernel)

#         for train_idx, val_idx in folds.split(X, y):
#             X_train = X.iloc[train_idx, :]
#             y_train = y.iloc[train_idx]
#             X_val = X.iloc[val_idx, :]
#             y_val = y.iloc[val_idx]

#             r_svm.fit(X_train, y_train)
#             preds = r_svm.predict_proba(X_val).values
#             loss = evaluation_metric(y_val, preds)
#             losses.append(loss)

#         avg_loss = np.mean(losses)
#         if avg_loss < best_loss:
#             best_loss = avg_loss
#             best_C = C
#             kernel = 'rbf'

#     print("SVM(%s) log loss : %.4f" % (kernel, best_loss))

In [None]:
class BINN:
    """Dummy stand-in classifier that returns random numbers, not real predictions."""

    def predict_proba(self, X):
        """Return a ``(len(X), 2)`` array of uniform random values in [0, 1).

        The values are drawn independently from NumPy's global random state;
        the two columns are not normalised to sum to 1.
        """
        n_rows = len(X)
        return np.random.rand(n_rows, 2)

# # Load and preprocess your data (you'll replace this part with your actual data loading)
# # Here df1 and df2 are your two dataframes
# df1 = pd.read_csv('your_first_file.csv')
# df2 = pd.read_csv('your_second_file.csv')

# Initialize KNN imputer and Standard Scaler
# imputer = KNNImputer(n_neighbors=5)
scaler = StandardScaler()

# # Stage(TNM) 칼럼 빼기
# df_tc_Stage = df_tc.pop('Stage(TNM)')

# # Impute missing values
# df_tc_imputed = pd.DataFrame(imputer.fit_transform(df_tc), columns=df_tc.columns)
# df_td_imputed = pd.DataFrame(imputer.fit_transform(df_td), columns=df_td.columns)

# # Remove outliers using IQR
# df_tc_filtered = remove_outliers(df_tc_imputed)
# df_td_filtered = remove_outliers(df_td_imputed)

# # Standardize the data
# df_tc_standardized = pd.DataFrame(scaler.fit_transform(df_tc_filtered), columns=df_tc_filtered.columns)
# df_td_standardized = pd.DataFrame(scaler.fit_transform(df_td_filtered), columns=df_td_filtered.columns)

# # Standardize the data
# df_tc_standardized = pd.DataFrame(scaler.fit_transform(df_tc), columns=df_tc.columns)
# df_td_standardized = pd.DataFrame(scaler.fit_transform(df_td), columns=df_td.columns)

# df_tc_standardized['Stage(TNM)'] = df_tc_Stage

# # Combine the data and prepare for modeling
# df_combined = pd.concat([df_tc_standardized, df_td_standardized], ignore_index=True)

# print(df_combined.columns)

# df_combined['Stage(TNM)'] = df_combined['Stage(TNM)'].fillna(0)

# df_combined01 = df_combined.copy()
# df_combined01.loc[df_combined01['Stage(TNM)'] >= 1, 'Stage(TNM)'] = 1

### BINN

In [None]:
pip install binn

In [None]:
# from binn import BINN, Network
# from binn import BINNClassifier

# network = Network(
#     input_data=df_t,
#     pathways=None,
#     mapping=None
#     # verbose=True
# )

# binn = BINNClassifier(
#     pathways=network,
#     n_layers=4,
#     dropout=0.2,
#     validate=True,
#     epochs=10,
#     threads=10,
# )

### X, y 분리

In [None]:
# Choose the modelling frame once: the column-reduced frame when is_dropcol is
# set, otherwise the full binary-target frame. Previously the full frame was
# always split first and the result thrown away when is_dropcol was set.
df_model01 = df_t01_drop if is_dropcol else df_t01

# Separate features from the 'Stage(TNM)' target.
X01 = df_model01.drop('Stage(TNM)', axis=1)
y01 = df_model01['Stage(TNM)']

# 80/20 hold-out split with a fixed seed (original note: "random state 42?").
# NOTE(review): the split is not stratified; consider stratify=y01 if the
# classes are imbalanced - this would change which rows land in the test set.
X_train01, X_test01, y_train01, y_test01 = train_test_split(X01, y01, test_size=0.2, random_state=0)

# if is_012:
#   X = df_t012.drop('Stage(TNM)', axis=1)
#   y = df_t012['Stage(TNM)']

#   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # random state 42?

In [None]:
y01

# 모델 학습

In [None]:
# from binn import Network, BINN
# from binn import BINNClassifier
# import pandas as pd

# # Create a Network object with only input_data
# # Assuming that pathways and mapping are optional parameters, we leave them out

# pathway = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/크몽/바이오/pathways.csv')
# translation = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/크몽/바이오/translation.csv')

# network = Network(
#     input_data=df_t,
#     pathways=pathway,  # Empty DataFrame as there's no pathway info
#     mapping=translation,  # Empty DataFrame as there's no mapping info
#     input_data_column = "Stage(TNM)", # specify the column for entities in input data
#     source_column = "child", # defined by our pathways-file
#     target_column = "parent",
# )

In [None]:
# from binn import BINN, Network, BINNClassifier
# import pandas as pd
# import numpy as np

# network = Network(
#     input_data=df_t,  # X_train from the previous split
#     pathways=pd.DataFrame(),  # Empty DataFrame as there's no pathway info
#     mapping=pd.DataFrame(),  # Empty DataFrame as there's no mapping info
#     input_data_column= 'CA19-9',
#     source_column = 'Stage(TNM)',
#     # target_column = "Stage(TNM)"
# )

In [None]:
# binn = BINNClassifier(
#     pathways=network,
#     n_layers=4,
#     dropout=0.2,
#     validate=True,
#     epochs=10,
#     threads=10,
# )
# binn.fit(X_train, y_train)

# y_pred_binn = binn.predict(X_test)

# conf_matrix_binn = confusion_matrix(y_test, y_pred_binn)
# class_report_binn = classification_report(y_test, y_pred_binn)

# conf_matrix_binn, class_report_binn

In [None]:
# rf_run = RandomForestClassifier(random_state=0, max_depth=5, min_simples_leaf=8,min_samples_split=8,n_estimators=200)
# rf_run.fit

# folds = StratifiedKFold(n_splits=K, shuffle=True)
# losses = []

# ROC 커브

In [None]:
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Classifiers compared on the hold-out split, each with fixed hyper-parameters.
# Dict order determines plotting and legend order.
classifiers_optimized = {
    'XGBoost': xgboost.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=50, max_depth=3),
    'SVM': SVC(probability=True, C=0.5),
    'K-NN': KNeighborsClassifier(n_neighbors=3),
    'LightGBM': lgb.LGBMClassifier(n_estimators=50, max_depth=3,  verbose=-1),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=0, max_depth=5, min_samples_leaf=8, min_samples_split=8, n_estimators=200),
}

# Curve colour per classifier (blue shades, then orange shades, then red).
colors = {
    'XGBoost': '#3232FF',
    'SVM': '#5050FF',
    'K-NN': '#1E82FF',
    'LightGBM': '#E6A055',
    'Logistic Regression': '#EF904C',
    'Random Forest': '#B93232',
}

# Per-classifier lists of interpolated TPR curves and AUC values. Each list
# receives exactly one entry here because there is a single train/test split.
roc_curves = defaultdict(list)
roc_aucs = defaultdict(list)
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid the curves are resampled onto

# Fit every classifier on the training split and score the test split.
for name, clf in classifiers_optimized.items():
    clf.fit(X_train01, y_train01)
    y_pred_proba = clf.predict_proba(X_test01)[:, 1]  # second probability column

    fpr, tpr, _ = roc_curve(y_test01, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Resample the curve onto the shared grid and pin its start at TPR = 0.
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0

    roc_curves[name].append(interp_tpr)
    roc_aucs[name].append(roc_auc)

# ROC figure: mean curve with a +/- one standard deviation band per classifier.
plt.figure(figsize=(8, 8))
for name, tprs in roc_curves.items():
    color = colors.get(name, 'aqua')  # fallback colour for unknown names

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # pin the end of the curve at TPR = 1
    std_tpr = np.std(tprs, axis=0)

    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(roc_aucs[name])

    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)

    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color=color, alpha=.2)
    plt.plot(mean_fpr, mean_tpr, color=color, label=f"{name}: {mean_auc:.2f} +/- {std_auc:.2f}")

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label="Chance (AUC = 0.5)")

# Axis ranges, labels, title and legend.
roc_axes = plt.gca()
roc_axes.set_xlim([0.0, 1.0])
roc_axes.set_ylim([0.0, 1.05])
roc_axes.set_xlabel('1 - Specificity (False Positive Rate)')
roc_axes.set_ylabel('Sensitivity (True Positive Rate)')
roc_axes.set_title('Receiver Operating Characteristic')
roc_axes.legend(loc="lower right")

plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Classifiers compared on the hold-out split, each with fixed hyper-parameters.
# Dict order determines plotting and legend order.
classifiers_optimized = {
    'XGBoost': xgboost.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=50, max_depth=3),
    'SVM': SVC(probability=True, C=0.5),
    'K-NN': KNeighborsClassifier(n_neighbors=3),
    'LightGBM': lgb.LGBMClassifier(n_estimators=50, max_depth=3,  verbose=-1),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=0, max_depth=5, min_samples_leaf=8, min_samples_split=8, n_estimators=200),
}

# Curve colour per classifier (blue shades, then orange shades, then red).
colors = {
    'XGBoost': '#3232FF',
    'SVM': '#5050FF',
    'K-NN': '#1E82FF',
    'LightGBM': '#E6A055',
    'Logistic Regression': '#EF904C',
    'Random Forest': '#B93232',
}

# Confusion matrix summed over all classifiers (binary task -> 2x2).
total_cm = np.zeros((2, 2))

# Per-classifier confusion matrices and test-set predictions.
cm_dict = {}
pred_dict = {}

# Per-classifier lists of interpolated TPR curves and AUC values.
roc_curves = defaultdict(list)
roc_aucs = defaultdict(list)
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid the curves are resampled onto

# Fit every classifier on the training split and score the test split.
for name, clf in classifiers_optimized.items():
    clf.fit(X_train01, y_train01)
    y_pred_proba = clf.predict_proba(X_test01)[:, 1]
    y_pred = clf.predict(X_test01)
    pred_dict[name] = y_pred

    fpr, tpr, _ = roc_curve(y_test01, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Confusion matrix of true vs. predicted labels. Passing the classes seen
    # in training fixes the row/column layout, so the matrix keeps its full
    # shape even if a class is missing from the test labels or the predictions.
    cm = confusion_matrix(y_test01, y_pred, labels=clf.classes_)
    cm_dict[name] = cm

    # Accumulate into the pooled matrix.
    total_cm += cm

    # Resample the curve onto the shared grid and pin its start at TPR = 0.
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0

    roc_curves[name].append(interp_tpr)
    roc_aucs[name].append(roc_auc)

# --- ROC figure -------------------------------------------------------------
# The figure is created, decorated and shown in one place. Previously the
# chance line, limits, labels and legend were issued after the confusion-matrix
# plt.show(), so they ended up on a new, otherwise empty figure.
plt.figure(figsize=(8, 8))
for name, tprs in roc_curves.items():
    color = colors.get(name, 'aqua')  # fallback colour for unknown names
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(roc_aucs[name])
    std_tpr = np.std(tprs, axis=0)

    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color=color, alpha=.2)
    plt.plot(mean_fpr, mean_tpr, color=color, label=f"{name}: {mean_auc:.2f} +/- {std_auc:.2f}")

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label="Chance (AUC = 0.5)")

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1 - Specificity (False Positive Rate)')
plt.ylabel('Sensitivity (True Positive Rate)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# --- Pooled confusion matrix ------------------------------------------------
# Rates from the pooled matrix: row 1 = actual positives, row 0 = actual negatives.
tpr_change_rate = total_cm[1, 1] / (total_cm[1, 1] + total_cm[1, 0])
fpr_change_rate = total_cm[0, 1] / (total_cm[0, 1] + total_cm[0, 0])

plt.figure(figsize=(8, 6))
sns.heatmap(total_cm, annot=True, fmt='.2f', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title(f'Total Confusion Matrix: TPR Change Rate = {tpr_change_rate:.2f}, FPR Change Rate = {fpr_change_rate:.2f}')
plt.show()

# --- Classification reports -------------------------------------------------
# One labelled report per classifier. Previously a single unlabelled report was
# printed from y_pred, i.e. only for the last classifier in the dict.
for name, predicted in pred_dict.items():
    print(f"=== {name} ===")
    print(classification_report(y_test01, predicted))


### 타겟 01, 컬럼 3개 사용했을 때

In [None]:
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

# Curve colour per classifier.
colors = {
    'XGBoost': '#3232FF',
    'SVM': '#5050FF',
    'K-NN': '#1E82FF',
    'LightGBM': '#E6A055',
    'Logistic Regression': '#EF904C',
    'Random Forest': '#B93232',
}

# Stratified folds over the training split (no shuffling).
n_splits = 3
cv = StratifiedKFold(n_splits=n_splits)

# Per-classifier lists with one interpolated TPR curve / AUC value per fold.
roc_curves = defaultdict(list)
roc_aucs = defaultdict(list)
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid the curves are resampled onto

# For every fold, refit each classifier from `classifiers_optimized` (defined
# in an earlier cell) on the fold's training part and score its held-out part.
for train_index, test_index in cv.split(X_train01, y_train01):
    X_train01_fold = X_train01.iloc[train_index]
    X_test_fold = X_train01.iloc[test_index]
    y_train01_fold = y_train01.iloc[train_index]
    y_test_fold = y_train01.iloc[test_index]

    for name, clf in classifiers_optimized.items():
        clf.fit(X_train01_fold, y_train01_fold)
        y_pred_proba = clf.predict_proba(X_test_fold)[:, 1]

        fpr, tpr, _ = roc_curve(y_test_fold, y_pred_proba)
        roc_auc = auc(fpr, tpr)

        # Resample onto the shared grid and pin the start at TPR = 0.
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0

        roc_curves[name].append(interp_tpr)
        roc_aucs[name].append(roc_auc)

# ROC figure: fold-averaged curve with a +/- one standard deviation band.
plt.figure(figsize=(6, 6))
for name, tprs in roc_curves.items():
    color = colors.get(name, 'aqua')  # fallback colour for unknown names

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # pin the end of the curve at TPR = 1
    std_tpr = np.std(tprs, axis=0)

    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(roc_aucs[name])

    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)

    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color=color, alpha=.2)
    plt.plot(mean_fpr, mean_tpr, color=color, label=f"{name}: {mean_auc:.2f} +/- {std_auc:.2f}")

# The chance diagonal and the title are intentionally left out of this figure.

# Axis ranges and labels.
roc_axes = plt.gca()
roc_axes.set_xlim([0.0, 1.0])
roc_axes.set_ylim([0.0, 1.02])
roc_axes.set_xlabel('1 - Specificity (False Positive Rate)')
roc_axes.set_ylabel('Sensitivity (True Positive Rate)')

# Hide the right and top spines.
for side in ('right', 'top'):
    roc_axes.spines[side].set_visible(False)

plt.legend(loc="lower right", frameon=False, title='AUC', title_fontsize=11)
plt.savefig(base_path + 'ROC1.png')
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

# Curve colour per classifier.
colors = {
    'XGBoost': '#3232FF',
    'SVM': '#5050FF',
    'K-NN': '#1E82FF',
    'LightGBM': '#E6A055',
    'Logistic Regression': '#EF904C',
    'Random Forest': '#B93232',
}

# Confusion matrix summed over all folds and classifiers (binary task -> 2x2).
total_cm = np.zeros((2, 2))

# Per-classifier confusion matrices, summed over folds.
cm_dict = {}

# Stratified folds over the training split (no shuffling).
n_splits = 3
cv = StratifiedKFold(n_splits=n_splits)

# Per-classifier lists with one interpolated TPR curve / AUC value per fold.
roc_curves = defaultdict(list)
roc_aucs = defaultdict(list)
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid the curves are resampled onto

# For every fold, refit each classifier from `classifiers_optimized` (defined
# in an earlier cell) on the fold's training part and score its held-out part.
for train_index, test_index in cv.split(X_train01, y_train01):
    X_train01_fold, X_test_fold = X_train01.iloc[train_index], X_train01.iloc[test_index]
    y_train01_fold, y_test_fold = y_train01.iloc[train_index], y_train01.iloc[test_index]

    for name, clf in classifiers_optimized.items():
        clf.fit(X_train01_fold, y_train01_fold)
        y_pred_proba = clf.predict_proba(X_test_fold)[:, 1]
        y_pred = clf.predict(X_test_fold)

        fpr, tpr, _ = roc_curve(y_test_fold, y_pred_proba)
        roc_auc = auc(fpr, tpr)

        # Confusion matrix for THIS fold and THIS classifier. The previous code
        # compared y_test01 with a y_pred left over from an earlier cell, so it
        # added the same unrelated matrix on every iteration. Passing the
        # classes seen in training fixes the row/column layout.
        cm = confusion_matrix(y_test_fold, y_pred, labels=clf.classes_)
        cm_dict[name] = cm_dict.get(name, 0) + cm

        # Accumulate into the pooled matrix.
        total_cm += cm

        # Resample onto the shared grid and pin the start at TPR = 0.
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0

        roc_curves[name].append(interp_tpr)
        roc_aucs[name].append(roc_auc)

# --- ROC figure -------------------------------------------------------------
# Created, decorated and shown in one place. Previously the limits, labels and
# legend were issued after the confusion-matrix plt.show(), so they ended up on
# a new, empty figure.
plt.figure(figsize=(6, 6))
for name, tprs in roc_curves.items():
    color = colors.get(name, 'aqua')  # fallback colour for unknown names
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(roc_aucs[name])
    std_tpr = np.std(tprs, axis=0)

    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color=color, alpha=.2)
    plt.plot(mean_fpr, mean_tpr, color=color, label=f"{name}: {mean_auc:.2f} +/- {std_auc:.2f}")

# The chance diagonal and the title are intentionally left out of this figure.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.02])
plt.xlabel('1 - Specificity (False Positive Rate)')
plt.ylabel('Sensitivity (True Positive Rate)')

# Hide the right and top spines.
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)

plt.legend(loc="lower right", frameon=False, title='AUC', title_fontsize=11)
plt.show()

# --- Pooled confusion matrix ------------------------------------------------
# Rates from the pooled matrix (row 1 = actual positives, row 0 = actual
# negatives). They are computed here because the title below uses them;
# previously the title relied on values left over from an earlier cell.
tpr_change_rate = total_cm[1, 1] / (total_cm[1, 1] + total_cm[1, 0])
fpr_change_rate = total_cm[0, 1] / (total_cm[0, 1] + total_cm[0, 0])

# Column totals, i.e. the number of samples per PREDICTED class.
total_samples_per_class = np.sum(total_cm, axis=0)

# Express every cell as a percentage of its column total (each column sums to 100%).
total_cm_percentage_per_class = (total_cm / total_samples_per_class) * 100

# Build the cell annotations by hand so each shows a whole-number percentage.
rows, cols = total_cm_percentage_per_class.shape
rounded_percentage_per_class = [[f"{total_cm_percentage_per_class[i, j]:.0f}%" for j in range(cols)] for i in range(rows)]

plt.figure(figsize=(8, 6))
sns.heatmap(total_cm_percentage_per_class, annot=rounded_percentage_per_class, fmt='', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], annot_kws={"fontsize": 14}, cbar=False)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title(f'Total Confusion Matrix: TPR Change Rate = {tpr_change_rate:.2f}, FPR Change Rate = {fpr_change_rate:.2f}')
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import math

# Cross-validated ROC curves plus a pooled confusion-matrix heatmap for every model in
# classifiers_optimized, using all columns of X_train01.

# Line colour per model name for the ROC plot; names missing here fall back to 'aqua' below.
colors = {
    'XGBoost': '#3232FF',
    'SVM': '#5050FF',
    'K-NN': '#1E82FF',
    'LightGBM': '#E6A055',
    'Logistic Regression': '#EF904C',
    'Random Forest': '#B93232',
}

# Running sum of every confusion matrix (2x2 because the task is binary).
total_cm = np.zeros((2, 2))

# cm_dict is never filled in this cell; cms collects one matrix per (fold, classifier) pair.
cm_dict = {}
cms = []

# Number of splits
n_splits = 3
cv = StratifiedKFold(n_splits=n_splits)

# Per-classifier lists (one entry per fold) of interpolated TPR curves and AUC values.
roc_curves = defaultdict(list)
roc_aucs = defaultdict(list)
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid every fold's curve is resampled onto

# Loop through each fold
for train_index, test_index in cv.split(X_train01, y_train01):
    X_train01_fold, X_test_fold = X_train01.iloc[train_index], X_train01.iloc[test_index]
    y_train01_fold, y_test_fold = y_train01.iloc[train_index], y_train01.iloc[test_index]

    # Loop through classifiers (each one is refitted on the current training fold)
    for name, clf in classifiers_optimized.items():
        clf.fit(X_train01_fold, y_train01_fold)
        y_pred_proba = clf.predict_proba(X_test_fold)[:, 1]  # score of the second class
        y_pred = clf.predict(X_test_fold)

        fpr, tpr, _ = roc_curve(y_test_fold, y_pred_proba)
        roc_auc = auc(fpr, tpr)

        # Confusion matrix from the true labels and the hard predictions
        cm = confusion_matrix(y_test_fold, y_pred)
        cms.append(cm)
        total_cm += cm

        # Interpolate the TPR onto mean_fpr so curves from different folds can be averaged
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0

        roc_curves[name].append(interp_tpr)
        roc_aucs[name].append(roc_auc)

# --- ROC figure -------------------------------------------------------------------------
# The ROC axes are fully styled and shown BEFORE the heatmap figure is created. Previously
# the limits/labels/legend were applied after the heatmap's plt.show(), so they ended up on
# a new empty figure and the ROC curves were displayed without labels or legend.
roc_fig, roc_ax = plt.subplots(figsize=(6, 6))

for name, tprs in roc_curves.items():
    color = colors.get(name, 'aqua')  # Default color if not found
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # pin the end point of the averaged curve to (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(roc_aucs[name])
    std_tpr = np.std(tprs, axis=0)

    # +/- one standard deviation band, clipped to the valid [0, 1] range
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

    roc_ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color=color, alpha=.2)
    roc_ax.plot(mean_fpr, mean_tpr, color=color, label=f"{name}: {mean_auc:.2f} +/- {std_auc:.2f}")

roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.02])
roc_ax.set_xlabel('1 - Specificity (False Positive Rate)')
roc_ax.set_ylabel('Sensitivity (True Positive Rate)')

# Hide the right and top spines
roc_ax.spines['right'].set_visible(False)
roc_ax.spines['top'].set_visible(False)

roc_ax.legend(loc="lower right", frameon=False, title='AUC', title_fontsize=11)
plt.show()

# --- Confusion-matrix heatmap -----------------------------------------------------------
# Stack the per-(fold, classifier) matrices and take the element-wise mean / std.
# Note the statistics therefore mix all classifiers together, not just the folds.
cms = np.array(cms)
mean_cms = np.mean(cms, axis=0)
std_cms = np.std(cms, axis=0)

# Normalise each column by its column total. confusion_matrix puts true labels on rows and
# predicted labels on columns, so the denominator is the count per *predicted* class.
total_samples_per_class = np.sum(mean_cms, axis=0)
mean_percentage = (mean_cms / total_samples_per_class) * 100
# Same x100 scale as the mean (this was x10, which understated the spread tenfold).
std_percentage = (std_cms / total_samples_per_class) * 100

# Cell annotations "mean±std%"; the std is rounded down with math.floor.
rows, cols = mean_percentage.shape
annotations = [[f"{mean_percentage[i][j]:.0f}±{math.floor(std_percentage[i][j])}%" for j in range(cols)] for i in range(rows)]

# Transposed so that heatmap rows are predicted labels and columns are true labels,
# matching the axis labels below.
# NOTE(review): tick labels assume label 0 = 'More severe', label 1 = 'Less severe' — confirm.
cm_fig, cm_ax = plt.subplots(figsize=(6, 6))
sns.heatmap(mean_percentage.T, annot=np.array(annotations).T, fmt='', cmap='coolwarm', xticklabels=['More severe', 'Less severe'], yticklabels=['More severe', 'Less severe'], annot_kws={"fontsize": 14}, cbar=False, ax=cm_ax)
cm_ax.set_xlabel('True')
cm_ax.set_ylabel('Predicted')
plt.show()

In [None]:
# Debug peek: shows the `fpr` array left over from the last roc_curve() call in the ROC cell above.
fpr

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

# Cross-validated precision-recall curves for every model in classifiers_optimized,
# using all columns of X_train01. Relies on `colors` from the ROC cell.

# Stratified 3-fold split
n_splits = 3
cv = StratifiedKFold(n_splits=n_splits)

# Per-classifier lists (one entry per fold): precision resampled onto a shared recall grid,
# and the fold's average-precision score.
pr_curves = defaultdict(list)
pr_aucs = defaultdict(list)
mean_recall = np.linspace(0, 1, 100)

for train_index, test_index in cv.split(X_train01, y_train01):
    X_train01_fold = X_train01.iloc[train_index]
    X_test_fold = X_train01.iloc[test_index]
    y_train01_fold = y_train01.iloc[train_index]
    y_test_fold = y_train01.iloc[test_index]

    for name, clf in classifiers_optimized.items():
        clf.fit(X_train01_fold, y_train01_fold)
        y_pred_proba = clf.predict_proba(X_test_fold)[:, 1]

        precision, recall, _ = precision_recall_curve(y_test_fold, y_pred_proba)
        avg_precision = average_precision_score(y_test_fold, y_pred_proba)

        # Both arrays are reversed before interpolating (np.interp expects increasing x).
        interp_precision = np.interp(mean_recall, np.flip(recall), np.flip(precision))

        pr_curves[name].append(interp_precision)
        pr_aucs[name].append(avg_precision)

# Draw the fold-averaged curve and a +/- one-std band per classifier.
pr_fig, pr_ax = plt.subplots(figsize=(6, 6))

for name, precisions in pr_curves.items():
    color = colors.get(name, 'aqua')  # fallback colour for names missing from `colors`
    mean_precision = np.mean(precisions, axis=0)
    std_precision = np.std(precisions, axis=0)
    mean_auc = np.mean(pr_aucs[name])
    std_auc = np.std(pr_aucs[name])

    # Keep the band inside [0, 1]
    precision_upper = np.clip(mean_precision + std_precision, None, 1)
    precision_lower = np.clip(mean_precision - std_precision, 0, None)

    pr_ax.fill_between(mean_recall, precision_lower, precision_upper, color=color, alpha=.2)
    curve_label = f"{name}: {mean_auc:.2f} +/- {std_auc:.2f}"
    pr_ax.plot(mean_recall, mean_precision, color=color, label=curve_label)

# Axes cosmetics
pr_ax.set_xlim([0.0, 1.05])
pr_ax.set_ylim([0.0, 1.05])
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')

for side in ('right', 'top'):
    pr_ax.spines[side].set_visible(False)

# The legend is titled 'AUC'; the numbers shown are mean +/- std average precision.
pr_ax.legend(loc="lower right", frameon=False, title='AUC', title_fontsize=11)
pr_fig.savefig(base_path + 'ROC_Recall_1.png')
plt.show()

In [None]:

# # Initialize figure for Precision-Recall curve
# plt.figure(figsize=(6, 6))

# # Create a uniform grid for recall values
# mean_recall = np.linspace(0, 1, 100)

# # Initialize dictionaries to store interpolated precision
# interp_precision = defaultdict(list)
# average_precisions = defaultdict(list)

# # Loop through each fold
# for train_index, test_index in cv.split(X_train01, y_train01):
#     X_train01_fold, X_test_fold = X_train01.iloc[train_index], X_train01.iloc[test_index]
#     y_train01_fold, y_test_fold = y_train01.iloc[train_index], y_train01.iloc[test_index]

#     # Loop through classifiers
#     for name, clf in classifiers_optimized.items():
#         clf.fit(X_train01_fold, y_train01_fold)
#         y_pred_proba = clf.predict_proba(X_test_fold)[:, 1]

#         precision, recall, _ = precision_recall_curve(y_test_fold, y_pred_proba)
#         avg_precision = average_precision_score(y_test_fold, y_pred_proba)

#         # Interpolate precision at the points of mean_recall
#         interp_prec = np.interp(mean_recall, np.flipud(recall), np.flipud(precision))

#         interp_precision[name].append(interp_prec)
#         average_precisions[name].append(avg_precision)

# # Plot the Precision-Recall curve
# for name, precisions in interp_precision.items():
#     color = colors.get(name, 'aqua')  # Default color if not found
#     mean_precision = np.mean(precisions, axis=0)
#     mean_avg_precision = np.mean(average_precisions[name])

#     plt.plot(mean_recall, mean_precision, color=color, label=f"{name}: {mean_avg_precision:.2f}")

# # Set plot options
# plt.xlim([0.0, 1.02])
# plt.ylim([0.0, 1.02])
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.title('Precision-Recall curve with k-Fold Cross-Validation')

# # Hide the right and top spines
# plt.gca().spines['right'].set_visible(False)
# plt.gca().spines['top'].set_visible(False)

# plt.legend(loc="upper right", frameon=False, title='Average Precision', title_fontsize=11)

# plt.show()

### 타겟 01, 컬럼 CA19-9 단독

In [None]:
# Single-feature design matrix: only the 'CA19-9' column of X_train01, kept as a DataFrame.
X_train01_CA19 = X_train01[['CA19-9']]
X_train01_CA19

In [None]:
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

# Cross-validated ROC curves for every model in classifiers_optimized, trained on the
# single-column frame X_train01_CA19.

# Line colour per model name; names missing here fall back to 'aqua' below.
colors = {
    'XGBoost': '#3232FF',
    'SVM': '#5050FF',
    'K-NN': '#1E82FF',
    'LightGBM': '#E6A055',
    'Logistic Regression': '#EF904C',
    'Random Forest': '#B93232',
}

# Stratified 3-fold split
n_splits = 3
cv = StratifiedKFold(n_splits=n_splits)

# Per-classifier lists (one entry per fold) of resampled TPR curves and AUC values.
roc_curves = defaultdict(list)
roc_aucs = defaultdict(list)
mean_fpr = np.linspace(0, 1, 100)

for train_index, test_index in cv.split(X_train01_CA19, y_train01):
    X_train01_fold = X_train01_CA19.iloc[train_index]
    X_test_fold = X_train01_CA19.iloc[test_index]
    y_train01_fold = y_train01.iloc[train_index]
    y_test_fold = y_train01.iloc[test_index]

    for name, clf in classifiers_optimized.items():
        clf.fit(X_train01_fold, y_train01_fold)
        y_pred_proba = clf.predict_proba(X_test_fold)[:, 1]

        fpr, tpr, _ = roc_curve(y_test_fold, y_pred_proba)
        roc_auc = auc(fpr, tpr)

        # Resample the fold's curve onto the shared FPR grid; force it to start at TPR = 0.
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0

        roc_curves[name].append(interp_tpr)
        roc_aucs[name].append(roc_auc)

# Draw the fold-averaged curve and a +/- one-std band per classifier.
roc_fig, roc_ax = plt.subplots(figsize=(6, 6))

for name, tprs in roc_curves.items():
    color = colors.get(name, 'aqua')  # fallback colour for names missing from `colors`
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # pin the end of the averaged curve to (1, 1) before integrating
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(roc_aucs[name])
    std_tpr = np.std(tprs, axis=0)

    # Keep the band inside [0, 1]
    tprs_upper = np.clip(mean_tpr + std_tpr, None, 1)
    tprs_lower = np.clip(mean_tpr - std_tpr, 0, None)

    roc_ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color=color, alpha=.2)
    curve_label = f"{name}: {mean_auc:.2f} +/- {std_auc:.2f}"
    roc_ax.plot(mean_fpr, mean_tpr, color=color, label=curve_label)

# Axes cosmetics
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.02])
roc_ax.set_xlabel('1 - Specificity (False Positive Rate)')
roc_ax.set_ylabel('Sensitivity (True Positive Rate)')

for side in ('right', 'top'):
    roc_ax.spines[side].set_visible(False)

roc_ax.legend(loc="lower right", frameon=False, title='AUC', title_fontsize=11)
roc_fig.savefig(base_path + 'ROC2.png')
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

# Cross-validated ROC curves plus a pooled confusion-matrix heatmap for every model in
# classifiers_optimized, trained on the single-column frame X_train01_CA19.

# Line colour per model name for the ROC plot; names missing here fall back to 'aqua' below.
colors = {
    'XGBoost': '#3232FF',
    'SVM': '#5050FF',
    'K-NN': '#1E82FF',
    'LightGBM': '#E6A055',
    'Logistic Regression': '#EF904C',
    'Random Forest': '#B93232',
}

# Running sum of every confusion matrix (2x2 because the task is binary).
total_cm = np.zeros((2, 2))

# cm_dict is never filled in this cell; cms collects one matrix per (fold, classifier) pair.
cm_dict = {}
cms = []

# Number of splits
n_splits = 3
cv = StratifiedKFold(n_splits=n_splits)

# Per-classifier lists (one entry per fold) of interpolated TPR curves and AUC values.
roc_curves = defaultdict(list)
roc_aucs = defaultdict(list)
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid every fold's curve is resampled onto

# Loop through each fold
for train_index, test_index in cv.split(X_train01_CA19, y_train01):
    X_train01_fold, X_test_fold = X_train01_CA19.iloc[train_index], X_train01_CA19.iloc[test_index]
    y_train01_fold, y_test_fold = y_train01.iloc[train_index], y_train01.iloc[test_index]

    # Loop through classifiers (each one is refitted on the current training fold)
    for name, clf in classifiers_optimized.items():
        clf.fit(X_train01_fold, y_train01_fold)
        y_pred_proba = clf.predict_proba(X_test_fold)[:, 1]  # score of the second class
        y_pred = clf.predict(X_test_fold)

        fpr, tpr, _ = roc_curve(y_test_fold, y_pred_proba)
        roc_auc = auc(fpr, tpr)

        # Confusion matrix from the true labels and the hard predictions
        cm = confusion_matrix(y_test_fold, y_pred)
        cms.append(cm)
        total_cm += cm

        # Interpolate the TPR onto mean_fpr so curves from different folds can be averaged
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0

        roc_curves[name].append(interp_tpr)
        roc_aucs[name].append(roc_auc)

# --- ROC figure -------------------------------------------------------------------------
# The ROC axes are fully styled and shown BEFORE the heatmap figure is created. Previously
# the limits/labels/legend were applied after the heatmap's plt.show(), so they ended up on
# a new empty figure and the ROC curves were displayed without labels or legend.
roc_fig, roc_ax = plt.subplots(figsize=(6, 6))

for name, tprs in roc_curves.items():
    color = colors.get(name, 'aqua')  # Default color if not found
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # pin the end point of the averaged curve to (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(roc_aucs[name])
    std_tpr = np.std(tprs, axis=0)

    # +/- one standard deviation band, clipped to the valid [0, 1] range
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

    roc_ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color=color, alpha=.2)
    roc_ax.plot(mean_fpr, mean_tpr, color=color, label=f"{name}: {mean_auc:.2f} +/- {std_auc:.2f}")

roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.02])
roc_ax.set_xlabel('1 - Specificity (False Positive Rate)')
roc_ax.set_ylabel('Sensitivity (True Positive Rate)')

# Hide the right and top spines
roc_ax.spines['right'].set_visible(False)
roc_ax.spines['top'].set_visible(False)

roc_ax.legend(loc="lower right", frameon=False, title='AUC', title_fontsize=11)
plt.show()

# --- Confusion-matrix heatmap -----------------------------------------------------------
# Stack the per-(fold, classifier) matrices and take the element-wise mean / std.
# Note the statistics therefore mix all classifiers together, not just the folds.
cms = np.array(cms)
mean_cms = np.mean(cms, axis=0)
std_cms = np.std(cms, axis=0)

# Normalise each column by its column total. confusion_matrix puts true labels on rows and
# predicted labels on columns, so the denominator is the count per *predicted* class.
total_samples_per_class = np.sum(mean_cms, axis=0)
mean_percentage = (mean_cms / total_samples_per_class) * 100
# Same x100 scale as the mean (this was x10, which understated the spread tenfold).
std_percentage = (std_cms / total_samples_per_class) * 100

# Cell annotations "mean±std%"
rows, cols = mean_percentage.shape
annotations = [[f"{mean_percentage[i, j]:.0f}±{std_percentage[i, j]:.0f}%" for j in range(cols)] for i in range(rows)]

# Transposed so that heatmap rows are predicted labels and columns are true labels,
# matching the axis labels below.
# NOTE(review): tick labels assume label 0 = 'More severe', label 1 = 'Less severe' — confirm.
cm_fig, cm_ax = plt.subplots(figsize=(6, 6))
sns.heatmap(mean_percentage.T, annot=np.array(annotations).T, fmt='', cmap='coolwarm', xticklabels=['More severe', 'Less severe'], yticklabels=['More severe', 'Less severe'], annot_kws={"fontsize": 14}, cbar=False, ax=cm_ax)
cm_ax.set_xlabel('True')
cm_ax.set_ylabel('Predicted')
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

# Cross-validated precision-recall curves for every model in classifiers_optimized,
# trained on the single-column frame X_train01_CA19. Relies on `colors` from the ROC cell.

# Stratified 3-fold split
n_splits = 3
cv = StratifiedKFold(n_splits=n_splits)

# Per-classifier lists (one entry per fold): precision resampled onto a shared recall grid,
# and the fold's average-precision score.
pr_curves = defaultdict(list)
pr_aucs = defaultdict(list)
mean_recall = np.linspace(0, 1, 100)

for train_index, test_index in cv.split(X_train01_CA19, y_train01):
    X_train01_fold = X_train01_CA19.iloc[train_index]
    X_test_fold = X_train01_CA19.iloc[test_index]
    y_train01_fold = y_train01.iloc[train_index]
    y_test_fold = y_train01.iloc[test_index]

    for name, clf in classifiers_optimized.items():
        clf.fit(X_train01_fold, y_train01_fold)
        y_pred_proba = clf.predict_proba(X_test_fold)[:, 1]

        precision, recall, _ = precision_recall_curve(y_test_fold, y_pred_proba)
        avg_precision = average_precision_score(y_test_fold, y_pred_proba)

        # Both arrays are reversed before interpolating (np.interp expects increasing x).
        interp_precision = np.interp(mean_recall, np.flip(recall), np.flip(precision))

        pr_curves[name].append(interp_precision)
        pr_aucs[name].append(avg_precision)

# Draw the fold-averaged curve and a +/- one-std band per classifier.
pr_fig, pr_ax = plt.subplots(figsize=(6, 6))

for name, precisions in pr_curves.items():
    color = colors.get(name, 'aqua')  # fallback colour for names missing from `colors`
    mean_precision = np.mean(precisions, axis=0)
    std_precision = np.std(precisions, axis=0)
    mean_auc = np.mean(pr_aucs[name])
    std_auc = np.std(pr_aucs[name])

    # Keep the band inside [0, 1]
    precision_upper = np.clip(mean_precision + std_precision, None, 1)
    precision_lower = np.clip(mean_precision - std_precision, 0, None)

    pr_ax.fill_between(mean_recall, precision_lower, precision_upper, color=color, alpha=.2)
    curve_label = f"{name}: {mean_auc:.2f} +/- {std_auc:.2f}"
    pr_ax.plot(mean_recall, mean_precision, color=color, label=curve_label)

# Axes cosmetics
pr_ax.set_xlim([0.0, 1.05])
pr_ax.set_ylim([0.0, 1.05])
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')

for side in ('right', 'top'):
    pr_ax.spines[side].set_visible(False)

# The legend is titled 'AUC'; the numbers shown are mean +/- std average precision.
pr_ax.legend(loc="lower right", frameon=False, title='AUC', title_fontsize=11)
pr_fig.savefig(base_path + 'ROC_Recall_2.png')
plt.show()

### 012, 컬럼 3개

In [None]:
# Choose the modelling frame: df_t_drop when is_dropcol is set, otherwise df_t.
stage_frame = df_t_drop if is_dropcol else df_t

# Features are every column except the 'Stage(TNM)' target.
X = stage_frame.drop(columns='Stage(TNM)')
y = stage_frame['Stage(TNM)']

# 80/20 hold-out split with a fixed seed (open question from the author: use random_state=42?).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# from sklearn.model_selection import StratifiedKFold
# from collections import defaultdict
# import numpy as np
# import matplotlib.pyplot as plt

# colors = {
#     'XGBoost': '#3232FF',
#     'SVM': '#5050FF',
#     'K-NN': '#1E82FF',
#     'LightGBM': '#E6A055',
#     'Logistic Regression': '#EF904C',
#     'Random Forest': '#B93232',
# }

# # Initialize figure for ROC curve
# plt.figure(figsize=(6, 6))

# # Number of splits
# n_splits = 3
# cv = StratifiedKFold(n_splits=n_splits)

# # Initialize dictionaries to store FPR, TPR, and AUC values for each classifier
# roc_curves = defaultdict(list)
# roc_aucs = defaultdict(list)
# mean_fpr = np.linspace(0, 1, 100)

# # ... (이전 코드와 동일)

# # Number of classes in the dataset
# num_classes = 3  # Replace with your actual number of classes

# # Initialize dictionaries to store interpolated TPR for each classifier and class
# interp_tprs = defaultdict(lambda: defaultdict(list))

# # Loop through each fold
# for train_index, test_index in cv.split(X_train, y_train):
#     X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
#     y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

#     # Binarize the labels for multiclass ROC
#     y_test_fold_bin = label_binarize(y_test_fold, classes=[0, 1, 2])  # classes should be your actual classes

#     # Loop through classifiers
#     for name, clf in classifiers_optimized.items():
#         clf.fit(X_train_fold, y_train_fold)
#         y_pred_proba = clf.predict_proba(X_test_fold)

#         # Compute ROC curve and ROC area for each class
#         fpr = dict()
#         tpr = dict()
#         roc_auc = dict()

#         for i in range(num_classes):
#             fpr[i], tpr[i], _ = roc_curve(y_test_fold_bin[:, i], y_pred_proba[:, i])
#             roc_auc[i] = auc(fpr[i], tpr[i])

#             # Interpolate the TPR to be the same length as mean_fpr
#             interp_tpr = np.interp(mean_fpr, fpr[i], tpr[i])
#             interp_tpr[0] = 0.0
#             interp_tprs[name][i].append(interp_tpr)

#         roc_curves[name].append(interp_tpr)
#         roc_aucs[name].append(roc_auc)

# for name, tprs in roc_curves.items():
#     color = colors.get(name, 'aqua')  # Default color if not found
#     mean_tpr = np.mean(tprs, axis=0)
#     mean_tpr[-1] = 1.0
#     mean_auc = auc(mean_fpr, mean_tpr)
#     std_auc = np.std(roc_aucs[name])
#     std_tpr = np.std(tprs, axis=0)

#     tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
#     tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

#     plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color=color, alpha=.2)
#     plt.plot(mean_fpr, mean_tpr, color=color, label=f"{name}: {mean_auc:.2f} +/- {std_auc:.2f}")


# # Plot random chance line
# # plt.plot([0, 1], [0, 1], 'k--', label="Chance (AUC = 0.5)")

# # Set plot options
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.02])
# plt.xlabel('1 - Specificity (False Positive Rate)')
# plt.ylabel('Sensitivity (True Positive Rate)')
# # plt.title('Receiver Operating Characteristic with k-Fold Cross-Validation')

# # Hide the right and top spines
# plt.gca().spines['right'].set_visible(False)
# plt.gca().spines['top'].set_visible(False)

# plt.legend(loc="lower right", frameon=False, title='AUC', title_fontsize=11)

# plt.show()

# feature importance

In [None]:
# Feature selection via random-forest feature importance, computed on df_t01.
m = 20  # number of top-ranked features to keep (also read by the later df_t cell)

X01_feature = df_t01.drop(columns=["Stage(TNM)"])
y01_feature = df_t01['Stage(TNM)']

# Fixed seed so the importance ranking (and hence the selected columns) is reproducible;
# 0 matches the random_state used for train_test_split elsewhere in this notebook.
rf = RandomForestClassifier(random_state=0)
rf.fit(X01_feature, y01_feature)
# Accuracy on the same rows the forest was fitted on (training accuracy, not a held-out score).
print("Train ACC : %.4f" % accuracy_score(y01_feature, rf.predict(X01_feature)))
fi_df = pd.DataFrame({'feature':X01_feature.columns, 'importance':rf.feature_importances_})
selected_cols = fi_df.sort_values(by="importance", ascending=False)[:m]["feature"].values

display(selected_cols)

# Take the selected columns from df_t01, the frame the ranking and y01_feature came from.
# This previously indexed df_t, whose rows do not line up with y01_feature.
X01_feature = df_t01[selected_cols]
display(X01_feature)

In [None]:
import matplotlib.pyplot as plt

# Rank fi_df from most to least important feature.
sorted_fi_df = fi_df.sort_values(by="importance", ascending=False)

# Columns to plot
features = sorted_fi_df['feature']
importances = sorted_fi_df['importance']

# Bar chart of every feature's importance, with vertical tick labels.
fi_fig, fi_ax = plt.subplots(figsize=(10, 6))
fi_ax.bar(features, importances)
plt.setp(fi_ax.get_xticklabels(), rotation=90)
fi_ax.set_xlabel('Features')
fi_ax.set_ylabel('Importance')
fi_ax.set_title('Feature Importance')
plt.show()

In [None]:

# Feature selection via random-forest feature importance, computed on df_t.
# Uses `m` (number of features to keep) defined in the df_t01 feature-selection cell.
X012_feature = df_t.drop(columns=["Stage(TNM)"])
y012_feature = df_t['Stage(TNM)']

# Correlation between each feature and the target variable, highest first.
correlations = X012_feature.corrwith(y012_feature).sort_values(ascending=False)

# Fixed seed so the importance ranking, the selected columns and the saved figures are
# reproducible; 0 matches the random_state used for train_test_split in this notebook.
rf = RandomForestClassifier(random_state=0)
rf.fit(X012_feature, y012_feature)
# Accuracy on the same rows the forest was fitted on (training accuracy, not a held-out score).
print("Train ACC : %.4f" % accuracy_score(y012_feature, rf.predict(X012_feature)))
fi_df = pd.DataFrame({'feature':X012_feature.columns, 'importance':rf.feature_importances_})

# Sort fi_df once by descending importance; the top m rows give the selected columns.
sorted_fi_df = fi_df.sort_values(by="importance", ascending=False)
selected_cols = sorted_fi_df[:m]["feature"].values

# Reorder the correlations to follow selected_cols so both bar charts share the same x order.
sorted_correlations = correlations.loc[selected_cols]

display(selected_cols)

X012_feature = df_t[selected_cols]
display(X012_feature)

# Series consumed by the plotting cell.
features = sorted_fi_df['feature']
importances = sorted_fi_df['importance']

In [None]:
# Figure 1: importances of the first 20 ranked features; x ticks are removed entirely.
imp_fig, imp_ax = plt.subplots(figsize=(9, 2.8))
imp_ax.bar(features[:20], importances[:20], color='#778899')
imp_ax.set_ylabel('Importance')
imp_ax.set_xticks([])
imp_ax.set_title('Feature Importance')
imp_fig.savefig(base_path + 'Feature importance.png')
plt.show()

# Figure 2: feature-target correlation for the selected features, tick labels tilted 45 degrees.
corr_fig, corr_ax = plt.subplots(figsize=(9, 2.8))
corr_ax.bar(sorted_correlations.index, sorted_correlations.values, color='#778899')
plt.setp(corr_ax.get_xticklabels(), rotation=45)
corr_ax.set_ylabel('Correlation')
corr_ax.set_title('Feature Correlation')
corr_fig.savefig(base_path + 'Feature correlation.png')
plt.show()

# Figure4.

In [None]:
# Display df_vc as the cell output (df_vc is defined elsewhere in the notebook, not in this section).
df_vc

In [None]:
# Display df_vd as the cell output (df_vd is defined elsewhere in the notebook, not in this section).
df_vd