#### 폐암 사망 데이터 분석
- 사용 데이터 : lung_cancer_mortality_data_large_v2.csv
- 피처/속성 : 15개
- 타겟/라벨 : survived
- 학습-방법 : 지도학습 > 분류> 이진분류
- 학습 알고리즘 : 인공신경망(ANN) -> 심층 신경망 (MLP, DNN) : 은닉층이 많은 구성
- 프레임워크 : Pytorch

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *

from imblearn.over_sampling import SMOTE

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler 
from torchmetrics.classification import BinaryF1Score, BinaryConfusionMatrix, BinaryAccuracy, BinaryRecall, BinaryPrecision, BinarySpecificity
from torchinfo import summary

#### 1. 데이터 불러오기

In [2]:
data = './lung_cancer_mortality_data_large_v2.csv'
lungDF = pd.read_csv(data)
lungDF.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,beginning_of_treatment_date,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Female,Germany,2016-04-07,Stage III,2016-04-21,No,Never Smoked,31.1,257,1,1,0,0,Combined,2017-11-15,0
1,2,50.0,Male,Czech Republic,2023-04-22,Stage III,2023-05-02,Yes,Passive Smoker,25.9,208,1,0,0,0,Radiation,2024-04-25,0
2,3,65.0,Male,Romania,2023-04-07,Stage IV,2023-04-12,No,Never Smoked,18.9,193,0,0,0,0,Surgery,2025-03-11,0
3,4,51.0,Female,Latvia,2016-02-07,Stage III,2016-02-13,Yes,Former Smoker,34.6,249,1,1,1,0,Surgery,2017-04-14,1
4,5,37.0,Male,Greece,2023-12-01,Stage I,2023-12-03,Yes,Never Smoked,40.2,262,0,0,0,0,Chemotherapy,2024-09-20,0


In [3]:
lungDF = lungDF.drop(['id','country'], axis=1)

In [4]:
lungDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3250000 entries, 0 to 3249999
Data columns (total 16 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   age                          float64
 1   gender                       object 
 2   diagnosis_date               object 
 3   cancer_stage                 object 
 4   beginning_of_treatment_date  object 
 5   family_history               object 
 6   smoking_status               object 
 7   bmi                          float64
 8   cholesterol_level            int64  
 9   hypertension                 int64  
 10  asthma                       int64  
 11  cirrhosis                    int64  
 12  other_cancer                 int64  
 13  treatment_type               object 
 14  end_treatment_date           object 
 15  survived                     int64  
dtypes: float64(2), int64(6), object(8)
memory usage: 396.7+ MB


#### 2. 데이터 전처리

In [5]:
lungDF['age'] = lungDF['age'].astype('int')
lungDF['beginning_of_treatment_date'] = pd.to_datetime(lungDF['beginning_of_treatment_date'])
lungDF['end_treatment_date'] = pd.to_datetime(lungDF['end_treatment_date'])
lungDF['diagnosis_date'] = pd.to_datetime(lungDF['diagnosis_date'])

In [6]:
lungDF['cancer_stage'] = lungDF['cancer_stage'].replace({'Stage I':1,'Stage II':2, 'Stage III':3, 'Stage IV':4})
lungDF['gender'] = lungDF['gender'].replace({'Male':0, 'Female':1})
lungDF['family_history'] = lungDF['family_history'].replace({'No':0, 'Yes':1})
lungDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3250000 entries, 0 to 3249999
Data columns (total 16 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   age                          int32         
 1   gender                       int64         
 2   diagnosis_date               datetime64[ns]
 3   cancer_stage                 int64         
 4   beginning_of_treatment_date  datetime64[ns]
 5   family_history               int64         
 6   smoking_status               object        
 7   bmi                          float64       
 8   cholesterol_level            int64         
 9   hypertension                 int64         
 10  asthma                       int64         
 11  cirrhosis                    int64         
 12  other_cancer                 int64         
 13  treatment_type               object        
 14  end_treatment_date           datetime64[ns]
 15  survived                     int64         
dtype

In [7]:
lungDF['smoking_status'].value_counts()

smoking_status
Passive Smoker    814289
Former Smoker     812264
Current Smoker    811824
Never Smoked      811623
Name: count, dtype: int64

In [8]:
lungDF['treatment_type'].value_counts()

treatment_type
Surgery         813480
Chemotherapy    812702
Radiation       812263
Combined        811555
Name: count, dtype: int64

In [9]:
endcoder = LabelEncoder()

In [10]:
lungDF['smoking_status'] = lungDF['smoking_status'].replace({'Never Smoked':0,'Passive Smoker':1,'Former Smoker':2,'Current Smoker':3})

In [11]:
endcoder.fit(lungDF['treatment_type'])
lungDF['treatment_type'] = endcoder.transform(lungDF['treatment_type'])

In [12]:
lungDF['start_days'] = (lungDF['beginning_of_treatment_date']-lungDF['diagnosis_date']).dt.days
lungDF['treatment_days'] = (lungDF['end_treatment_date'] - lungDF['beginning_of_treatment_date']).dt.days
lungDF.head()

Unnamed: 0,age,gender,diagnosis_date,cancer_stage,beginning_of_treatment_date,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived,start_days,treatment_days
0,64,1,2016-04-07,3,2016-04-21,0,0,31.1,257,1,1,0,0,1,2017-11-15,0,14,573
1,50,0,2023-04-22,3,2023-05-02,1,1,25.9,208,1,0,0,0,2,2024-04-25,0,10,359
2,65,0,2023-04-07,4,2023-04-12,0,0,18.9,193,0,0,0,0,3,2025-03-11,0,5,699
3,51,1,2016-02-07,3,2016-02-13,1,2,34.6,249,1,1,1,0,3,2017-04-14,1,6,426
4,37,0,2023-12-01,1,2023-12-03,1,0,40.2,262,0,0,0,0,0,2024-09-20,0,2,292


In [13]:
cancerDF = lungDF.drop(['diagnosis_date','beginning_of_treatment_date','end_treatment_date'], axis=1)
cancerDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3250000 entries, 0 to 3249999
Data columns (total 15 columns):
 #   Column             Dtype  
---  ------             -----  
 0   age                int32  
 1   gender             int64  
 2   cancer_stage       int64  
 3   family_history     int64  
 4   smoking_status     int64  
 5   bmi                float64
 6   cholesterol_level  int64  
 7   hypertension       int64  
 8   asthma             int64  
 9   cirrhosis          int64  
 10  other_cancer       int64  
 11  treatment_type     int32  
 12  survived           int64  
 13  start_days         int64  
 14  treatment_days     int64  
dtypes: float64(1), int32(2), int64(12)
memory usage: 347.1 MB


3. 모델 클래스 설계 및 정의
- - -
- 클래스 목적 : lungcancer 데이터 학습 및 추론 목적
- 클래스 이름 : CancerModel
- 부모 클래스 : nn.Module
- 매 개 변 수 : 층별 입출력 개수 고정하므로 필요 없음
- 속성 / 필드 :
- 기능 / 역할 : __init__() : 모델 구조 설정, forward() : 순방향 학습 <=오버라이딩(상속관계에서만 가능)
- 클래스 구조
    * 입력층 : 입력 20개   / 출력 1000개
    * 은닉층 : 입력 1000개 / 출력 500개
    * 은닉층 : 입력 500개  / 출력 200개
    * 은닉층 : 입력 200개  / 출력 100개
    * 은닉층 : 입력 100개  / 출력 50개
    * 출력층 : 입력 50개  / 출력 1개(이진분류)
- - -
- 손실함수 / 활성화 함수
    * 클래스 형태 ==> nn.BCELoss, nn.leakyReLU ==> __init__() 메서드
    * 함수 형태 ==> torch.nn.fuctional 아래에 ==> forward() 메서드

In [14]:
# class CancerModel(nn.Module):
#     def __init__(self,in_out=5,perceptrons = []) :
#         super().__init__()
#         self.i_layer = nn.Linear(14,perceptrons[0] if len(perceptrons)>0 else in_out)
        
#         self.h_layers = nn.ModuleList()
#         for idx in range(len(perceptrons)-1) :
#             self.h_layers.append(nn.Linear(perceptrons[idx], perceptrons[idx+1]))

#         self.o_layer = nn.Linear(perceptrons[-1] if len(perceptrons)>0 else in_out,1)

#     def forward(self, x):
#         # 입력층
#         y = F.relu(self.i_layer(x))

#         # 은닉층
#         for layer in self.h_layers:
#             y = F.relu(layer(y))
        
#         # 출력층
#         return self.o_layer(y)

In [15]:
# Dropout 실시

class CancerModel(nn.Module):
    def __init__(self,in_out=5,perceptrons = []) :
        super(CancerModel, self).__init__()
        self.i_layer = nn.Linear(14,perceptrons[0] if len(perceptrons)>0 else in_out)
        
        self.h_layers = nn.ModuleList()
        for idx in range(len(perceptrons)-1) :
            self.h_layers.append(nn.Linear(perceptrons[idx], perceptrons[idx+1]))
        self.dropout_prob = 0.5

        self.o_layer = nn.Linear(perceptrons[-1] if len(perceptrons)>0 else in_out,1)

    def forward(self, x):
        # 입력층
        y = F.relu(self.i_layer(x))

        # 은닉층
        for layer in self.h_layers:
            y = F.relu(layer(y))
            y = F.dropout(y, p=self.dropout_prob)
        # 출력층
        return self.o_layer(y)

In [16]:
model = CancerModel(perceptrons=[32,64,128,64,32,16])
print(model)

CancerModel(
  (i_layer): Linear(in_features=14, out_features=32, bias=True)
  (h_layers): ModuleList(
    (0): Linear(in_features=32, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=128, bias=True)
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): Linear(in_features=32, out_features=16, bias=True)
  )
  (o_layer): Linear(in_features=16, out_features=1, bias=True)
)


In [17]:
# 모델 사용 메모리 정보 확인
summary(model, input_size=(1000000,14))

Layer (type:depth-idx)                   Output Shape              Param #
CancerModel                              [1000000, 1]              --
├─Linear: 1-1                            [1000000, 32]             480
├─ModuleList: 1-2                        --                        --
│    └─Linear: 2-1                       [1000000, 64]             2,112
│    └─Linear: 2-2                       [1000000, 128]            8,320
│    └─Linear: 2-3                       [1000000, 64]             8,256
│    └─Linear: 2-4                       [1000000, 32]             2,080
│    └─Linear: 2-5                       [1000000, 16]             528
├─Linear: 1-3                            [1000000, 1]              17
Total params: 21,793
Trainable params: 21,793
Non-trainable params: 0
Total mult-adds (G): 21.79
Input size (MB): 56.00
Forward/backward pass size (MB): 2696.00
Params size (MB): 0.09
Estimated Total Size (MB): 2752.09

4. 데이터셋 클래스 설계 및 정의
- - -
- 데이터셋 : lung_cancer_mortality_data_large_v2.csv
- 피처 개수 : 15개
- 타겟 개수 : 1개
- 클래스 이름 : CancerDataset
- 부모 클래스 : utils.data.Dataset
- 속성 / 필드 : featureDF, targetDF, n_rows, n_features
- 필수 메서드
    * _ _init_ _(self)
    * _ _len_ _(self)
    * _ _getitem_ _(self, index) : 특정 인덱스의 피처와 타겟 반환

In [18]:
class CancerDataset(Dataset):
    def __init__(self, featureDF, targetDF):
        self.featureDF = featureDF
        self.targetDF = targetDF
        self.n_rows = featureDF.shape[0]
        self.n_features = featureDF.shape[1]

    def __len__(self):
        return self.n_rows
    
    def __getitem__(self, index):
        featureTS = torch.FloatTensor(self.featureDF.iloc[index].values)
        targetTS = torch.FloatTensor(self.targetDF.iloc[index].values)

        return featureTS, targetTS

In [19]:
## 데이터셋 인스턴스 생성
# 피처와 타겟 추출
featureDF = cancerDF.drop('survived',axis=1)
targetDF = cancerDF[['survived']]

# 커스텀 데이터셋 인스턴스 생성
cancerDS = CancerDataset(featureDF, targetDF)

# 데이터 로더 인스턴스 생성
cancerDL = DataLoader(cancerDS)
# for feature, target in cancerDL:
#     print(feature.shape, target.shape, feature, target, sep='\n')
#     break

5. 학습 준비

In [20]:
### 학습 진행 관련 설정
EPOCH = 100
BATCH_SIZE = 64
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LR = 0.001

In [21]:
# 모델 인스턴스 생성
model = CancerModel(perceptrons=[32,64,16,8]).to(DEVICE)

In [22]:
# 학습/검증/테스트용 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(featureDF, targetDF, random_state=7, stratify=targetDF)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=7, stratify=y_train)
print(f'{X_train.shape} {X_test.shape} {X_val.shape}')
print(f'{y_train.shape} {y_test.shape} {y_val.shape}')

# # 오버 샘플링하기 - SMOTE 이용 ------------------------------------------
# smote = SMOTE(sampling_strategy=0.6, random_state=23)
# X_train, y_train = smote.fit_resample(X_train, y_train)

# 학습/검증/테스트용 데이터셋 생성
trainDS = CancerDataset(X_train, y_train)
valDS = CancerDataset(X_val, y_val)
testDS = CancerDataset(X_test, y_test)

# 학습용 데이터로더 인스턴스
trainDL = DataLoader(trainDS, batch_size=BATCH_SIZE)

(1828125, 14) (812500, 14) (609375, 14)
(1828125, 1) (812500, 1) (609375, 1)


In [29]:
# 클래스 가중치
values = y_train.value_counts().values.tolist()
weight = values[0]/sum(values)
Weight = torch.FloatTensor([weight]).to(DEVICE)

0.2196004102564103


In [24]:
# 최적화 인스턴스
optimizer = optim.RMSprop(model.parameters(), lr=LR)

# 최적화 스케줄링 인스턴스 생성
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',patience=5, verbose=True)

# 손실함수 인스턴스 : BCELoss / 예측값은 확률값으로 전달 ==> sigmoid() AF 처리 후 전달
reqLoss = nn.BCEWithLogitsLoss(weight=Weight)



6. 학습 진행

In [25]:
### models 폴더 아래 프로젝트 폴더 아래 모델 파일 저장
import os

# 저장 경로
SAVE_PATH = '../DeepLearning/models/project/'
# 저장 파일명
SAVE_FILE = 'model_train_wb.pth'

# 모델 구조 및 파라미터 모두 저장 파일명
SAVE_MODEL = 'model_all.pth'

In [26]:
# 경로상 폴더 존재 여부 체크
if not os.path.exists(SAVE_PATH) : os.makedirs(SAVE_PATH)   # 폴더 / 폴더 / ...  하위폴더까지 생성

In [27]:
# 학습 효과 확인
Loss_History, Score_History = [[],[],[]],[[],[],[]]
CNT = cancerDS.n_rows/BATCH_SIZE

for epoch in range(EPOCH):
    # 학습 모드로 모델 설정
    model.train()

    # 배치크기만큼 데이터 로딩 후 학습 진행
    total_loss, total_score = 0,0

    for featureTS, targetTS in trainDL:
        # 학습 진행
        pre_y = model(featureTS)

        # 손실 계산
        loss = reqLoss(pre_y,targetTS)
        total_loss += loss.item()

        # 성능평가 계산
        score = BinarySpecificity(threshold=weight)(pre_y, targetTS)
        total_score += score.item()

        # 최적화 진행
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # 에포크 당 검증
    # 검증 모드로 모델 설정
    model.eval()

    with torch.no_grad():
        # 검증용 데이터셋 생성
        val_feature_TS = torch.FloatTensor(valDS.featureDF.values)
        val_target_TS = torch.FloatTensor(valDS.targetDF.values)

        # 평가
        pre_val = model(val_feature_TS)
        
        # 손실 계산
        loss_val = reqLoss(pre_val, val_target_TS)
        
        # 성능 평가
        score_val = BinarySpecificity(threshold=weight)(pre_val, val_target_TS)
        

    # 테스트 데이터 확인
    model.eval()

    with torch.no_grad():
        # 테스트용 데이터셋 생성
        test_featureTS = torch.FloatTensor(testDS.featureDF.values)
        test_target_TS = torch.FloatTensor(testDS.targetDF.values)

        # 평가
        pre_test = model(test_featureTS)

        # 손실 계산
        loss_test = reqLoss(pre_test, test_target_TS)

        # 성능 평가
        score_test = BinarySpecificity(threshold=weight)(pre_test, test_target_TS)
    
    # 에포크 당 손실과 성능 평가 값 저장
    Loss_History[0].append(total_loss/CNT)
    Score_History[0].append(total_score/CNT)

    Loss_History[1].append(loss_val)
    Score_History[1].append(score_val)
    
    Loss_History[2].append(loss_test)
    Score_History[2].append(score_test)
    

    print(f'[{epoch+1}/{EPOCH}]\n- [TRAIN] LOSS : {Loss_History[0][-1]} SCORE : {Score_History[0][-1]}')
    print(f'- [VALID] LOSS : {Loss_History[1][-1]} SCORE : {Score_History[1][-1]}')
    print(f'- [TEST]  LOSS : {Loss_History[2][-1]} SCORE : {Score_History[2][-1]}')

    # 최적화 스케줄러 인스턴스 업데이트
    scheduler.step(score_val)
    print(f'scheduler.num_bad_epochs => {scheduler.num_bad_epochs}')

    # 성능이 좋은 학습 가중치 저장
    # SAVE_FILE = f'model_train_wb{epoch}_{score_val:.2f}.pth'  # 성능이 좋아진 에포크, 스코어마다 파일 새로 저장
    if len(Score_History[1]) == 1:
        # 첫번째라서 무조건 모델 파라미터 저장
        torch.save(model.state_dict(), SAVE_PATH+SAVE_FILE)
        # 모델 전체 저장
        torch.save(model, SAVE_PATH+SAVE_MODEL)
    else : 
        if Score_History[1][-1] > max(Score_History[1][:-1]) :
            torch.save(model.state_dict(), SAVE_PATH+SAVE_FILE)
            torch.save(model, SAVE_PATH+SAVE_MODEL)

    # 손실감소(또는 성능개선)가 안되는 경우 조기 종료
    if scheduler.num_bad_epochs >= scheduler.patience :
        print(f'성능 개선이 없어서 {scheduler.patience} EPOCH에 조기 종료함!')
        break

[1/100]
- [TRAIN] LOSS : 0.23146880982149565 SCORE : 0.5625055027184119
- [VALID] LOSS : 0.4108080565929413 SCORE : 1.0
- [TEST]  LOSS : 0.4108094871044159 SCORE : 1.0
scheduler.num_bad_epochs => 0
[2/100]
- [TRAIN] LOSS : 0.23109488361740113 SCORE : 0.5625107692307693
- [VALID] LOSS : 0.4108528792858124 SCORE : 1.0
- [TEST]  LOSS : 0.4108756184577942 SCORE : 1.0
scheduler.num_bad_epochs => 1
[3/100]
- [TRAIN] LOSS : 0.23108625784536507 SCORE : 0.5625107692307693
- [VALID] LOSS : 0.41080835461616516 SCORE : 1.0
- [TEST]  LOSS : 0.4108092188835144 SCORE : 1.0
scheduler.num_bad_epochs => 2
[4/100]
- [TRAIN] LOSS : 0.23108598473064715 SCORE : 0.5625107692307693
- [VALID] LOSS : 0.41080835461616516 SCORE : 1.0
- [TEST]  LOSS : 0.4108092188835144 SCORE : 1.0
scheduler.num_bad_epochs => 3
[5/100]
- [TRAIN] LOSS : 0.23108598473064715 SCORE : 0.5625107692307693
- [VALID] LOSS : 0.41080835461616516 SCORE : 1.0
- [TEST]  LOSS : 0.4108092188835144 SCORE : 1.0
scheduler.num_bad_epochs => 4
[6/100]