# **Classification model**

학습하기에 들어가기 전, 앞서 선택한 최종 변수들을 사용하여 학습/시험용 데이터를 생성하겠습니다.

In [None]:
# Final predictor columns chosen in the preceding variable-selection step.
feature_columns = [
    '성별코드', '연령대 코드(5세단위)', '신장(5Cm단위)', '체중(5Kg 단위)', '허리둘레',
    '시력(좌)', '시력(우)', '청력(좌)', '청력(우)', '식전혈당(공복혈당)', '혈색소',
    '요단백', '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT', '감마 지티피',
    '흡연상태', '음주여부', '구강검진 수검여부',
]
x_data = df[feature_columns]

# Double brackets keep the target as a single-column DataFrame.
target_columns = ['혈압상태']
y_data = df[target_columns]

7:3의 비율로 학습 데이터와 시험 데이터를 분리합니다.

In [None]:
# 70/30 split. Stratifying on the label keeps the class ratios equal in both
# parts, and the fixed seed makes the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    stratify=y_data,
    random_state=0,
)

scale이 각각 다른 연속형 변수들의 값을 통일시키기 위해 표준화 작업을 수행합니다.

In [None]:
# Columns that are standardized (zero mean, unit variance).
cols_n = ['신장(5Cm단위)', '체중(5Kg 단위)', '허리둘레', '시력(좌)', '시력(우)',
          '청력(좌)', '청력(우)', '식전혈당(공복혈당)', '혈색소', '요단백',
          '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT', '감마 지티피']

# train_test_split hands back row subsets of x_data. Work on explicit copies so
# the column assignments below modify independent frames and do not raise
# pandas' SettingWithCopyWarning.
x_train = x_train.copy()
x_test = x_test.copy()

# Learn mean/std from the training rows only, then apply the same
# transformation to the test rows so no test-set statistics leak into training.
# Whole-column assignment (rather than .loc) lets integer columns become float.
scaler = StandardScaler()
x_train[cols_n] = scaler.fit_transform(x_train[cols_n])
x_test[cols_n] = scaler.transform(x_test[cols_n])


TensorFlow와 PyTorch 기반의 심층 신경망 모델을 구축합니다.

### **Pytorch**

In [None]:
%%capture
!pip install transformers
!pip install datasets

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
from datasets import load_metric
from tqdm import tqdm

In [None]:
class DNN(nn.Module):
    """Fully connected classifier with two hidden ReLU layers and dropout.

    The defaults reproduce the original fixed architecture
    (19 -> 256 -> 128 -> 3, dropout 0.2), so ``DNN()`` behaves as before.

    Args:
        in_features: Size of the last dimension of the input tensor.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after each hidden activation.
    """

    def __init__(self, in_features: int = 19, hidden1: int = 256,
                 hidden2: int = 128, num_classes: int = 3,
                 dropout: float = 0.2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)
        # A single Dropout module is reused after both hidden layers.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the output of the final linear layer (no softmax applied)."""
        x = self.dropout(nn.functional.relu(self.fc1(x)))
        x = self.dropout(nn.functional.relu(self.fc2(x)))
        return self.fc3(x)

In [None]:
class MyDataset(Dataset):
    """Map-style dataset over an indexable collection of two-item samples."""

    def __init__(self, data):
        # `data` must support len() and integer indexing.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Unpacking enforces that every entry holds exactly two items and
        # always hands the pair back as a tuple.
        features, label = self.data[index]
        return (features, label)

In [None]:
# Instantiate the classifier network defined above.
model = DNN()
# Bare expression: the notebook displays the module's layer summary.
model

DNN(
  (fc1): Linear(in_features=19, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
# Hold out 20% of the rows to validate the PyTorch model. Stratifying on the
# label keeps the class proportions equal in both parts, consistent with the
# stratified x_train/x_test split made earlier in this notebook.
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=0, stratify=df['혈압상태']
)

# The network should see the same standardized inputs described for the
# x_train/x_test split; previously this path fed raw, differently scaled values
# (e.g. height ~165 next to visual acuity ~1.2). Statistics come from the
# training rows only so nothing leaks from the validation set. A dedicated
# scaler is used so the `scaler` fitted on x_train stays untouched.
train_df = train_df.copy()
val_df = val_df.copy()
torch_scaler = StandardScaler().fit(train_df[cols_n])
train_df[cols_n] = torch_scaler.transform(train_df[cols_n])
val_df[cols_n] = torch_scaler.transform(val_df[cols_n])

train_df, val_df

(        성별코드  연령대 코드(5세단위)  신장(5Cm단위)  체중(5Kg 단위)   허리둘레  시력(좌)  시력(우)  청력(좌)  \
 50211      1            11        165          55   75.1    1.2    1.2    1.0   
 467405     1             9        170          85   91.0    1.5    1.5    1.0   
 174239     2            10        155          55   83.3    0.7    0.7    1.0   
 236900     2             9        155          50   78.0    1.5    1.5    1.0   
 157759     2            14        160          65   74.0    0.8    0.4    1.0   
 ...      ...           ...        ...         ...    ...    ...    ...    ...   
 153314     2            11        160          75   82.0    0.7    0.6    1.0   
 970050     2            13        140          70  100.0    0.4    0.5    1.0   
 118727     2             9        155          50   76.0    1.2    1.2    1.0   
 438802     2            10        165          50   65.6    0.9    0.8    1.0   
 307755     1            11        175          70   81.0    1.0    1.2    1.0   
 
         청력(우)

In [None]:
## Build the (features, label) sample lists consumed by the data loaders

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)


def _frame_to_samples(frame, label_col='혈압상태'):
    """Convert a DataFrame into a list of ``(feature_tensor, label)`` pairs.

    Every column except ``label_col`` goes into one float64 tensor that is then
    split row by row, which replaces the former per-row ``iloc`` loop (minutes
    of pandas overhead) with a single bulk conversion. Labels stay NumPy
    scalars, exactly as indexing the label column by position produced before.
    """
    features = torch.tensor(frame.drop(columns=label_col).to_numpy(dtype='float64'))
    labels = frame[label_col].to_numpy()
    return list(zip(features, labels))


train_data = _frame_to_samples(train_df)
val_data = _frame_to_samples(val_df)

train_data[:5], val_data[:5], len(train_data), len(val_data)

100%|██████████| 784106/784106 [13:34<00:00, 962.72it/s]
100%|██████████| 196027/196027 [03:19<00:00, 984.59it/s]


([(tensor([  1.0000,  11.0000, 165.0000,  55.0000,  75.1000,   1.2000,   1.2000,
             1.0000,   1.0000,  81.0000,  15.8000,   1.0000,   0.8000,  38.0000,
            26.0000, 136.0000,   3.0000,   1.0000,   1.0000], dtype=torch.float64),
   0),
  (tensor([  1.0000,   9.0000, 170.0000,  85.0000,  91.0000,   1.5000,   1.5000,
             1.0000,   1.0000, 100.0000,  16.2000,   1.0000,   1.0000,  23.0000,
            48.0000,  43.0000,   2.0000,   0.0000,   1.0000], dtype=torch.float64),
   1),
  (tensor([  2.0000,  10.0000, 155.0000,  55.0000,  83.3000,   0.7000,   0.7000,
             1.0000,   1.0000,  94.0000,  12.8000,   2.0000,   0.7000,  27.0000,
            28.0000,  13.0000,   1.0000,   0.0000,   0.0000], dtype=torch.float64),
   2),
  (tensor([  2.0000,   9.0000, 155.0000,  50.0000,  78.0000,   1.5000,   1.5000,
             1.0000,   1.0000,  89.0000,  13.2000,   1.0000,   0.8000,  19.0000,
            11.0000,   4.0000,   1.0000,   0.0000,   0.0000], dtype=torch.float

다음과 같은 하이퍼 파라미터를 시도했습니다.

optimizer 
  - Adam
  - AdamW (채택)
  - SGD

lr_scheduler
  - StepLR (채택)
  - MultiStepLR



In [None]:
batch_size = 128

# Training batches are reshuffled every epoch.
train_dataset = MyDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation only measures accuracy, so sample order is irrelevant; leaving
# shuffle off makes every evaluation pass see identical batches.
val_dataset = MyDataset(val_data)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# AdamW is the optimizer that was retained; the alternatives that were tried
# are kept below for reference.
optimizer = torch.optim.AdamW(model.parameters(), betas=(0.9, 0.999), lr=1e-4, weight_decay=1e-6, eps=1e-08)
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-6)


# StepLR multiplies the learning rate by `gamma` every `step_size` scheduler
# steps; the training loop steps the scheduler once per epoch.
# NOTE(review): starting from 1e-4 this puts the rate at or below 1e-7 from
# epoch 16 onward, so most of a 50-epoch run barely updates the weights --
# confirm this decay is intended.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=5,
    gamma=0.1,
)

# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
#     optimizer, 
#     milestones=[20,40], 
#     gamma=0.1
# )



In [None]:
# Cross-entropy over the raw model outputs and integer class labels.
loss_fn = torch.nn.CrossEntropyLoss()


def accuracy(outputs, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    ``outputs`` is reduced along dim 1 to pick the predicted class per row;
    the number of matches is divided by ``labels.size(0)``.
    """
    hits = outputs.argmax(dim=1).eq(labels).sum().item()
    return hits / labels.size(0)

In [None]:
# Use the first CUDA GPU when one is available, otherwise stay on the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

# Move the model's parameters to the chosen device; as the cell's last
# expression this also displays the module.
model.to(device)

cuda:0


DNN(
  (fc1): Linear(in_features=19, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
## Start training

from time import time

num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    print("epoch_step ==>", epoch+1)
    epoch_start = time()

    # Accumulate correct predictions and sample counts so the epoch accuracy is
    # a true per-sample average even when the last batch is smaller than the
    # others (averaging per-batch accuracies over-weights that short batch).
    train_correct = 0.0
    train_seen = 0
    for step, (b_input, b_labels) in enumerate(train_loader):
        # The model's weights are float32, so cast the features while moving
        # them to the device.
        features = b_input.to(device=device, dtype=torch.float32)
        labels = b_labels.to(device)

        optimizer.zero_grad()  # reset gradients from the previous step

        logits = model(features)
        loss = loss_fn(logits, labels)

        batch_n = labels.size(0)
        train_correct += accuracy(logits, labels) * batch_n
        train_seen += batch_n

        if step % 1000 == 0:
            print("step :", step, "/", len(train_loader), "      loss :", loss.item())

        loss.backward()  # backpropagation
        optimizer.step()  # parameter update

    lr_scheduler.step()  # advance the learning-rate schedule once per epoch
    print("lr: ", optimizer.param_groups[0]['lr'])

    # Evaluation: eval() switches dropout off and no_grad() skips gradient
    # tracking.
    model.eval()

    val_correct = 0.0
    val_seen = 0
    with torch.no_grad():
        for val_input, val_labels in val_loader:
            features = val_input.to(device=device, dtype=torch.float32)
            labels = val_labels.to(device)

            logits = model(features)

            batch_n = labels.size(0)
            val_correct += accuracy(logits, labels) * batch_n
            val_seen += batch_n

    train_acc = train_correct / train_seen
    val_acc = val_correct / val_seen

    print(f'\nEpoch {epoch+1}: Train Acc = {train_acc:.2f}, Val Acc = {val_acc:.2f}')
    epoch_end = time()

    epoch_elapsed = epoch_end - epoch_start
    print('Elapsed time is %f seconds.' % epoch_elapsed, "\n")

epoch_step ==> 1
step : 0 / 6126       loss : 4.77826452255249
step : 1000 / 6126       loss : 1.0904873609542847
step : 2000 / 6126       loss : 1.0632644891738892
step : 3000 / 6126       loss : 1.0517127513885498
step : 4000 / 6126       loss : 1.0394359827041626
step : 5000 / 6126       loss : 1.0095802545547485
step : 6000 / 6126       loss : 1.010966181755066
lr:  0.0001

Epoch 1: Train Acc = 0.46, Val Acc = 0.49
Elapsed time is 16.956844 seconds. 

epoch_step ==> 2
step : 0 / 6126       loss : 1.0452593564987183
step : 1000 / 6126       loss : 1.0397653579711914
step : 2000 / 6126       loss : 1.0580270290374756
step : 3000 / 6126       loss : 1.0117261409759521
step : 4000 / 6126       loss : 1.0977792739868164
step : 5000 / 6126       loss : 1.0195575952529907
step : 6000 / 6126       loss : 1.0315979719161987
lr:  0.0001

Epoch 2: Train Acc = 0.48, Val Acc = 0.49
Elapsed time is 16.613703 seconds. 

epoch_step ==> 3
step : 0 / 6126       loss : 1.0409446954727173
step : 1000 