In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import matplotlib as mpl
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

In [2]:
df = pd.read_csv('./../p_lungcancer_all_add2.csv')
df2 = df.copy()

In [3]:
df.Stage.value_counts()

Stage
Stage 4    7151
Stage 1    3338
Stage 3    2765
Stage 2    1743
Stage 0       3
Name: count, dtype: int64

In [4]:
'''
def encode_T_stage(row):
    if row['TX'] == 1:
        return 0  # 종양 발견 불가
    elif row['T0'] == 1:
        return 1  # 종양 없음
    elif row['T1a'] == 1:
        return 2  # T1a
    elif row['T1b'] == 1:
        return 3  # T1b
    elif row['T1c'] == 1:
        return 4  # T1c
    elif row['T2a'] == 1:
        return 5  # T2a
    elif row['T2b'] == 1:
        return 6  # T2b
    elif row['T3'] == 1:
        return 7  # T3
    elif row['T4'] == 1:
        return 8  # T4
    return 0  # Default 값

# 새 컬럼 생성
df['T_stage'] = df.apply(encode_T_stage, axis=1)
'''

"\ndef encode_T_stage(row):\n    if row['TX'] == 1:\n        return 0  # 종양 발견 불가\n    elif row['T0'] == 1:\n        return 1  # 종양 없음\n    elif row['T1a'] == 1:\n        return 2  # T1a\n    elif row['T1b'] == 1:\n        return 3  # T1b\n    elif row['T1c'] == 1:\n        return 4  # T1c\n    elif row['T2a'] == 1:\n        return 5  # T2a\n    elif row['T2b'] == 1:\n        return 6  # T2b\n    elif row['T3'] == 1:\n        return 7  # T3\n    elif row['T4'] == 1:\n        return 8  # T4\n    return 0  # Default 값\n\n# 새 컬럼 생성\ndf['T_stage'] = df.apply(encode_T_stage, axis=1)\n"

In [5]:
'''
# N단계 인코딩 함수
def encode_N_stage(row):
    if row['N1'] == 1:
        return 1  # N1: 동측 폐문 또는 폐 내부 림프절 전이
    elif row['N2'] == 1:
        return 2  # N2: 동측 종격림프절 전이
    elif row['N3'] == 1:
        return 3  # N3: 반대측 림프절 전이
    return 0  # N0: 림프절 전이 없음

# 새 N_stage 컬럼 생성
df['N_stage'] = df.apply(encode_N_stage, axis=1)
'''

"\n# N단계 인코딩 함수\ndef encode_N_stage(row):\n    if row['N1'] == 1:\n        return 1  # N1: 동측 폐문 또는 폐 내부 림프절 전이\n    elif row['N2'] == 1:\n        return 2  # N2: 동측 종격림프절 전이\n    elif row['N3'] == 1:\n        return 3  # N3: 반대측 림프절 전이\n    return 0  # N0: 림프절 전이 없음\n\n# 새 N_stage 컬럼 생성\ndf['N_stage'] = df.apply(encode_N_stage, axis=1)\n"

In [6]:
'''
# M단계 인코딩 함수
def encode_M_stage(row):
    if row['M1a'] == 1:
        return 1  # M1a: 흉막, 심낭 전이 또는 반대측 폐 전이
    elif row['M1b'] == 1:
        return 2  # M1b: 단일 원격 장기 전이
    elif row['M1c'] == 1:
        return 3  # M1c: 다수의 원격 장기 전이
    return 0  # M0: 원격 전이 없음

# 새 M_stage 컬럼 생성
df['M_stage'] = df.apply(encode_M_stage, axis=1)
'''

"\n# M단계 인코딩 함수\ndef encode_M_stage(row):\n    if row['M1a'] == 1:\n        return 1  # M1a: 흉막, 심낭 전이 또는 반대측 폐 전이\n    elif row['M1b'] == 1:\n        return 2  # M1b: 단일 원격 장기 전이\n    elif row['M1c'] == 1:\n        return 3  # M1c: 다수의 원격 장기 전이\n    return 0  # M0: 원격 전이 없음\n\n# 새 M_stage 컬럼 생성\ndf['M_stage'] = df.apply(encode_M_stage, axis=1)\n"

In [7]:
df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2

In [8]:
df.head()

Unnamed: 0,AGE,Stage,Tumor_Size,lymph_node_meta_size,Adenocarcinoma,Large_cell_carcinoma,Squamous_cell_carcinoma,TX,T0,T1,...,Death,Survival_period,EGFR_0,EGFR_1,EGFR_99,Drink_1,Drink_2,Drink_3,Drink_99,BMI
0,72,Stage 1,0.0,0.0,1,1,0,1,0,0,...,0,157,False,False,True,False,False,False,True,33.822654
1,66,Stage 4,2.0,4.5,1,0,0,0,0,1,...,0,567,False,False,True,False,False,False,True,18.984376
2,75,Stage 4,0.0,0.0,1,1,0,1,0,0,...,0,253,False,False,True,False,False,True,False,25.596524
3,77,Stage 4,2.0,8.0,1,0,1,0,0,1,...,0,266,False,True,False,False,True,False,False,31.486033
4,85,Stage 1,2.5,0.0,1,0,0,0,0,0,...,0,470,False,False,True,False,True,False,False,14.922211


In [9]:
#T0는 종양이 존재하지 않음을 의미. 즉, 종양이 없다는 정보이기 때문에 다른 T 값들과 합산하면 의미가 왜곡될 우려 있음.
#TX는 종양의 크기를 정확하게 알 수 없다는 의미, 사실상 정보가 없다고 볼 수 있음. 
# 따라서 TX를 다른 T 값들과 합산하면 제대로 된 의미를 갖지 못함.
# T0와 TX는 제외하고 T1~T4 값만 합산
df['T_sum'] = df[['T1', 'T1a', 'T1b', 'T1c', 'T2', 'T2a', 'T2b', 'T3', 'T4']].sum(axis=1)

In [10]:
# 각 T 항목에 대해 원하는 가중치 곱하기
df['T_weighted_sum'] = (df['T1'] * 2 +
                        df['T1a'] * 0.5 +
                        df['T1b'] * 1.5 +
                        df['T1c'] * 2.5 +
                        df['T2'] * 4 +
                        df['T2a'] * 3.5 +
                        df['T2b'] * 4.5 +
                        df['T3'] * 6 +
                        df['T4'] * 8.5)

In [11]:
# 각 N 항목에 대해 원하는 가중치 곱하기 (2, 4.5, 8)
df['N_weighted_sum'] = (df['N1'] * 2 +
                        df['N2'] * 4.5 +
                        df['N3'] * 8)

In [12]:
# N 항목 합산하기
df['N_sum'] = df['N1'] + df['N2'] + df['N3']

In [13]:
# M 항목 합산하기
df['M_sum'] = df['M1a'] + df['M1b'] + df['M1c']

In [14]:
df.head()

Unnamed: 0,AGE,Stage,Tumor_Size,lymph_node_meta_size,Adenocarcinoma,Large_cell_carcinoma,Squamous_cell_carcinoma,TX,T0,T1,...,Drink_1,Drink_2,Drink_3,Drink_99,BMI,T_sum,T_weighted_sum,N_weighted_sum,N_sum,M_sum
0,72,Stage 1,0.0,0.0,1,1,0,1,0,0,...,False,False,False,True,33.822654,0,0.0,0.0,0,0
1,66,Stage 4,2.0,4.5,1,0,0,0,0,1,...,False,False,False,True,18.984376,1,2.0,4.5,1,1
2,75,Stage 4,0.0,0.0,1,1,0,1,0,0,...,False,False,True,False,25.596524,0,0.0,0.0,0,1
3,77,Stage 4,2.0,8.0,1,0,1,0,0,1,...,False,True,False,False,31.486033,1,2.0,10.0,2,1
4,85,Stage 1,2.5,0.0,1,0,0,0,0,0,...,False,True,False,False,14.922211,1,2.5,0.0,0,0


In [15]:
from sklearn.preprocessing import OrdinalEncoder

# 'stage' 컬럼에 대해 순서 지정 (1기 → 4기)
stage_order = [['Stage 0','Stage 1', 'Stage 2', 'Stage 3', 'Stage 4']]

# OrdinalEncoder 객체 생성 및 적용
encoder = OrdinalEncoder(categories=stage_order)
df['Stage'] = encoder.fit_transform(df[['Stage']])

In [16]:
df.head()

Unnamed: 0,AGE,Stage,Tumor_Size,lymph_node_meta_size,Adenocarcinoma,Large_cell_carcinoma,Squamous_cell_carcinoma,TX,T0,T1,...,Drink_1,Drink_2,Drink_3,Drink_99,BMI,T_sum,T_weighted_sum,N_weighted_sum,N_sum,M_sum
0,72,1.0,0.0,0.0,1,1,0,1,0,0,...,False,False,False,True,33.822654,0,0.0,0.0,0,0
1,66,4.0,2.0,4.5,1,0,0,0,0,1,...,False,False,False,True,18.984376,1,2.0,4.5,1,1
2,75,4.0,0.0,0.0,1,1,0,1,0,0,...,False,False,True,False,25.596524,0,0.0,0.0,0,1
3,77,4.0,2.0,8.0,1,0,1,0,0,1,...,False,True,False,False,31.486033,1,2.0,10.0,2,1
4,85,1.0,2.5,0.0,1,0,0,0,0,0,...,False,True,False,False,14.922211,1,2.5,0.0,0,0


In [17]:
df = pd.get_dummies(data = df, columns=['Smoke'], drop_first=True)

In [21]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['Tumor_Size', 'lymph_node_meta_size','Height','Weight','FEV1_FVC_P',
        'DLCO_VA_P','AGE','Survival_period','BMI', 
        'T_sum','N_sum','M_sum','T_weighted_sum','N_weighted_sum']] = scaler.fit_transform(df[['Tumor_Size', 'lymph_node_meta_size','Height','Weight',
                           'FEV1_FVC_P','DLCO_VA_P','AGE','Survival_period','BMI',
                          'T_sum','N_sum','M_sum',
                           'T_weighted_sum','N_weighted_sum']])

In [22]:
df.columns

Index(['AGE', 'Stage', 'Tumor_Size', 'lymph_node_meta_size', 'Adenocarcinoma',
       'Large_cell_carcinoma', 'Squamous_cell_carcinoma', 'TX', 'T0', 'T1',
       'T1a', 'T1b', 'T1c', 'T2', 'T2a', 'T2b', 'T3', 'T4', 'N0', 'N1', 'N2',
       'N3', 'M0', 'M1a', 'M1b', 'M1c', 'Height', 'Weight', 'FEV1_FVC_P',
       'DLCO_VA_P', 'Operation', 'Chemotherapy', 'Radiation_Therapy', 'Death',
       'Survival_period', 'EGFR_0', 'EGFR_1', 'EGFR_99', 'Drink_1', 'Drink_2',
       'Drink_3', 'Drink_99', 'BMI', 'T_sum', 'T_weighted_sum',
       'N_weighted_sum', 'N_sum', 'M_sum', 'Smoke_1', 'Smoke_2'],
      dtype='object')

In [23]:
df_train = df.iloc[0:10000]

In [24]:
df_test = df.iloc[10000:15000]

In [25]:
df_train.head()

Unnamed: 0,AGE,Stage,Tumor_Size,lymph_node_meta_size,Adenocarcinoma,Large_cell_carcinoma,Squamous_cell_carcinoma,TX,T0,T1,...,Drink_3,Drink_99,BMI,T_sum,T_weighted_sum,N_weighted_sum,N_sum,M_sum,Smoke_1,Smoke_2
0,0.63764,1.0,-0.925967,-0.829312,1,1,0,1,0,0,...,False,True,1.577004,-1.789653,-0.925337,-0.817569,-1.022336,-0.954501,False,True
1,0.225704,4.0,-0.095256,0.769162,1,0,0,0,0,1,...,False,True,-0.98825,0.558768,-0.094582,0.733905,0.852134,1.047668,False,True
2,0.843608,4.0,-0.925967,-0.829312,1,1,0,1,0,0,...,True,False,0.154863,-1.789653,-0.925337,-0.817569,-1.022336,1.047668,False,True
3,0.980921,4.0,-0.095256,2.01242,1,0,1,0,0,1,...,False,False,1.173047,0.558768,-0.094582,2.630151,2.726603,1.047668,False,True
4,1.530169,1.0,0.112422,-0.829312,1,0,0,0,0,0,...,False,False,-1.690521,0.558768,0.113107,-0.817569,-1.022336,-0.954501,False,True
