In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import matplotlib as mpl
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

In [2]:
df = pd.read_csv('./../p_lungcancer_all_add2.csv')
df2 = df.copy()

In [3]:
df.Stage.value_counts()

Stage
Stage 4    7151
Stage 1    3338
Stage 3    2765
Stage 2    1743
Stage 0       3
Name: count, dtype: int64

In [4]:
df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2

In [5]:
df.head()

Unnamed: 0,AGE,Stage,Tumor_Size,lymph_node_meta_size,Adenocarcinoma,Large_cell_carcinoma,Squamous_cell_carcinoma,TX,T0,T1,...,Death,Survival_period,EGFR_0,EGFR_1,EGFR_99,Drink_1,Drink_2,Drink_3,Drink_99,BMI
0,72,Stage 1,0.0,0.0,1,1,0,1,0,0,...,0,157,False,False,True,False,False,False,True,33.822654
1,66,Stage 4,2.0,4.5,1,0,0,0,0,1,...,0,567,False,False,True,False,False,False,True,18.984376
2,75,Stage 4,0.0,0.0,1,1,0,1,0,0,...,0,253,False,False,True,False,False,True,False,25.596524
3,77,Stage 4,2.0,8.0,1,0,1,0,0,1,...,0,266,False,True,False,False,True,False,False,31.486033
4,85,Stage 1,2.5,0.0,1,0,0,0,0,0,...,0,470,False,False,True,False,True,False,False,14.922211


In [6]:
#T0는 종양이 존재하지 않음을 의미. 즉, 종양이 없다는 정보이기 때문에 다른 T 값들과 합산하면 의미가 왜곡될 우려 있음.
#TX는 종양의 크기를 정확하게 알 수 없다는 의미, 사실상 정보가 없다고 볼 수 있음. 
# 따라서 TX를 다른 T 값들과 합산하면 제대로 된 의미를 갖지 못함.
# T0와 TX는 제외하고 T1~T4 값만 합산
df['T_sum'] = df[['T1', 'T1a', 'T1b', 'T1c', 'T2', 'T2a', 'T2b', 'T3', 'T4']].sum(axis=1)

In [7]:
# 각 T 항목에 대해 사이즈 곱하기
df['T_sized_sum'] = (df['T1'] * 2 +
                        df['T1a'] * 0.5 +
                        df['T1b'] * 1.5 +
                        df['T1c'] * 2.5 +
                        df['T2'] * 4 +
                        df['T2a'] * 3.5 +
                        df['T2b'] * 4.5 +
                        df['T3'] * 6 +
                        df['T4'] * 8.5)

In [8]:
# 각 N 항목에 대해 원하는 사이즈 곱하기 (2, 4.5, 8)
df['N_sized_sum'] = (df['N1'] * 2 +
                        df['N2'] * 4.5 +
                        df['N3'] * 8)

In [9]:
# N 항목 합산하기
df['N_sum'] = df['N1'] + df['N2'] + df['N3']

In [10]:
# M 항목 합산하기
df['M_sum'] = df['M1a'] + df['M1b'] + df['M1c']

In [11]:
# 각 T 항목에 대해 원하는 가중치 곱하기
df['T_weighted_sum'] = (df['T1'] * 1 +
                        df['T1a'] * 1 +
                        df['T1b'] * 1 +
                        df['T1c'] * 1 +
                        df['T2'] * 1 +
                        df['T2a'] * 1 +
                        df['T2b'] * 1 +
                        df['T3'] * 2 +
                        df['T4'] * 3)

In [12]:
# 각 N 항목에 대해 원하는 가중치 곱하기 (2, 2.5,3)
df['N_weighted_sum'] = (df['N1'] * 2 +
                        df['N2'] * 2.5 +
                        df['N3'] * 3)

In [13]:
# 각 M 항목에 대해 원하는 가중치 곱하기 (3,4,5)
df['M_weighted_sum'] = (df['M1a'] * 3 +
                        df['M1b'] * 4 +
                        df['M1c'] * 5)

In [14]:
df.head()

Unnamed: 0,AGE,Stage,Tumor_Size,lymph_node_meta_size,Adenocarcinoma,Large_cell_carcinoma,Squamous_cell_carcinoma,TX,T0,T1,...,Drink_99,BMI,T_sum,T_sized_sum,N_sized_sum,N_sum,M_sum,T_weighted_sum,N_weighted_sum,M_weighted_sum
0,72,Stage 1,0.0,0.0,1,1,0,1,0,0,...,True,33.822654,0,0.0,0.0,0,0,0,0.0,0
1,66,Stage 4,2.0,4.5,1,0,0,0,0,1,...,True,18.984376,1,2.0,4.5,1,1,1,2.5,4
2,75,Stage 4,0.0,0.0,1,1,0,1,0,0,...,False,25.596524,0,0.0,0.0,0,1,0,0.0,4
3,77,Stage 4,2.0,8.0,1,0,1,0,0,1,...,False,31.486033,1,2.0,10.0,2,1,1,5.0,3
4,85,Stage 1,2.5,0.0,1,0,0,0,0,0,...,False,14.922211,1,2.5,0.0,0,0,1,0.0,0


In [15]:
from sklearn.preprocessing import OrdinalEncoder

# 'stage' 컬럼에 대해 순서 지정 (1기 → 4기)
stage_order = [['Stage 0','Stage 1', 'Stage 2', 'Stage 3', 'Stage 4']]

# OrdinalEncoder 객체 생성 및 적용
encoder = OrdinalEncoder(categories=stage_order)
df['Stage'] = encoder.fit_transform(df[['Stage']])

In [16]:
df.head()

Unnamed: 0,AGE,Stage,Tumor_Size,lymph_node_meta_size,Adenocarcinoma,Large_cell_carcinoma,Squamous_cell_carcinoma,TX,T0,T1,...,Drink_99,BMI,T_sum,T_sized_sum,N_sized_sum,N_sum,M_sum,T_weighted_sum,N_weighted_sum,M_weighted_sum
0,72,1.0,0.0,0.0,1,1,0,1,0,0,...,True,33.822654,0,0.0,0.0,0,0,0,0.0,0
1,66,4.0,2.0,4.5,1,0,0,0,0,1,...,True,18.984376,1,2.0,4.5,1,1,1,2.5,4
2,75,4.0,0.0,0.0,1,1,0,1,0,0,...,False,25.596524,0,0.0,0.0,0,1,0,0.0,4
3,77,4.0,2.0,8.0,1,0,1,0,0,1,...,False,31.486033,1,2.0,10.0,2,1,1,5.0,3
4,85,1.0,2.5,0.0,1,0,0,0,0,0,...,False,14.922211,1,2.5,0.0,0,0,1,0.0,0


In [17]:
#Smoke에 원핫 인코딩하기
df = pd.get_dummies(data = df, columns=['Smoke'], drop_first=True)

In [18]:
#범주형을 제외한 컬럼에 정규화
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['Tumor_Size', 'lymph_node_meta_size','Height','Weight','FEV1_FVC_P',
        'DLCO_VA_P','AGE','Survival_period','BMI', 
        'T_sum','N_sum','M_sum','T_weighted_sum','N_weighted_sum','M_weighted_sum','T_sized_sum','N_sized_sum']] = scaler.fit_transform(df[['Tumor_Size', 'lymph_node_meta_size','Height','Weight',
                           'FEV1_FVC_P','DLCO_VA_P','AGE','Survival_period','BMI',
                          'T_sum','N_sum','M_sum',
                           'T_weighted_sum','N_weighted_sum','M_weighted_sum','T_sized_sum','N_sized_sum']])

In [19]:
#df 컬럼 확인
df.columns

Index(['AGE', 'Stage', 'Tumor_Size', 'lymph_node_meta_size', 'Adenocarcinoma',
       'Large_cell_carcinoma', 'Squamous_cell_carcinoma', 'TX', 'T0', 'T1',
       'T1a', 'T1b', 'T1c', 'T2', 'T2a', 'T2b', 'T3', 'T4', 'N0', 'N1', 'N2',
       'N3', 'M0', 'M1a', 'M1b', 'M1c', 'Height', 'Weight', 'FEV1_FVC_P',
       'DLCO_VA_P', 'Operation', 'Chemotherapy', 'Radiation_Therapy', 'Death',
       'Survival_period', 'EGFR_0', 'EGFR_1', 'EGFR_99', 'Drink_1', 'Drink_2',
       'Drink_3', 'Drink_99', 'BMI', 'T_sum', 'T_sized_sum', 'N_sized_sum',
       'N_sum', 'M_sum', 'T_weighted_sum', 'N_weighted_sum', 'M_weighted_sum',
       'Smoke_1', 'Smoke_2'],
      dtype='object')

In [20]:
#Train data set 분리
df_train = df.iloc[0:10000]

In [21]:
#Test data set 분리
df_test = df.iloc[10000:15000]

In [23]:
# Test data set에 대한 reset index 실행
df_test.reset_index(drop=True, inplace=True)

In [24]:
df_test.head()

Unnamed: 0,AGE,Stage,Tumor_Size,lymph_node_meta_size,Adenocarcinoma,Large_cell_carcinoma,Squamous_cell_carcinoma,TX,T0,T1,...,T_sum,T_sized_sum,N_sized_sum,N_sum,M_sum,T_weighted_sum,N_weighted_sum,M_weighted_sum,Smoke_1,Smoke_2
0,-1.902632,4.0,0.736174,-0.829312,1,0,0,0,0,0,...,0.558768,0.736174,-0.817569,-1.022336,1.047668,0.058894,-0.994531,1.072939,True,False
1,-0.186232,2.0,-0.30227,-0.118879,1,0,0,0,0,0,...,0.558768,-0.30227,-0.128025,0.852134,-0.954501,0.058894,0.513629,-0.919212,False,False
2,-1.216072,4.0,-0.925337,-0.118879,1,1,1,1,0,0,...,-1.789653,-0.925337,-0.128025,0.852134,1.047668,-1.244074,0.513629,1.570977,False,False
3,-2.177256,4.0,-0.925337,-0.118879,0,0,0,1,0,0,...,-1.789653,-0.925337,-0.128025,0.852134,1.047668,-1.244074,0.513629,0.574902,True,False
4,-0.529512,3.0,-0.717648,2.01242,0,0,1,0,0,0,...,0.558768,-0.717648,1.940607,0.852134,-0.954501,0.058894,1.26771,-0.919212,True,False


In [25]:
df_train2 = df_train.copy()

In [26]:
df_train.Death.value_counts()

Death
0    7897
1    2103
Name: count, dtype: int64

In [27]:
df_test.Death.value_counts()

Death
0    3089
1    1911
Name: count, dtype: int64

In [28]:
#Train Classifier data set - X
train_C_x = df_train.drop(columns=['Death','Drink_1','Drink_2','Drink_3','Drink_99','EGFR_99',
                                   'T1', 'T1a', 'T1b', 'T1c', 'T2', 'T2a', 'T2b', 'T3', 'T4', 'N1', 'N2', 'N3', 'M1a', 'M1b', 'M1c' ], axis=1)

In [29]:
#Train Classifier data set - Target
train_C_y = df_train.Death.values

In [30]:
train_C_x.shape

(10000, 32)

In [31]:
#Test Classifier data set - X
test_C_x = df_test.drop(columns=['Death','Drink_1','Drink_2','Drink_3','Drink_99','EGFR_99',
                                 'T1', 'T1a', 'T1b', 'T1c', 'T2', 'T2a', 'T2b', 'T3', 'T4', 'N1', 'N2', 'N3', 'M1a', 'M1b', 'M1c' ], axis=1)

In [32]:
##Test Classifier data set - target
test_C_y = df_test.Death.values