In [1]:
import torch
from torch import nn
import torch.optim as optim # Adam, SGD
from torch.utils.data import DataLoader # 
from torch.utils.data import Dataset
from torchvision import datasets,transforms
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import os
#import matplotlib.pyplot as plt

*** 데이터 작업을 위한 기본 요소 ***

1. torch.utils.data.DataLoader -> DataLoader는 Dataset을 반복 가능한 객체(iterable)로 감쌉니다.

2. torch.utils.data.Dataset -> Dataset은 샘플과 정답(label)을 저장합니다.

torchvision.datasets 모듈은 CIFAR, COCO등과 같은 다양한 실제 비전(vision)데이터에 대한 Dataset을 포함하고 있습니다.



데이터 전처리 순서


1. Name, Cabin, Ticket 열 삭제
2. NaN 처리 (Age는 평균값으로 채우고, 그 외 결측값이 남은 행은 삭제)
3. 성별/항구를 숫자로 인코딩

성별(Sex) — 0:female, 1:male

항구(Embarked) — 0:S, 1:C, 2:Q

In [2]:
# Load the Titanic training CSV; the first CSV column is used as the DataFrame index.
# The path is relative to the kernel's working directory.
TRAIN_CSV_PATH = 'Desktop/titanic/train.csv'
train_set = pd.read_csv(TRAIN_CSV_PATH, index_col=0)

In [3]:
def preprocessing(data):
    """Clean a raw Titanic passenger table and encode its categorical columns.

    The frame is modified in place and the same object is returned.

    Steps:
      1. Drop the ``Name``, ``Cabin`` and ``Ticket`` columns.
      2. Fill missing ``Age`` values with the column mean, then drop every row
         that still contains a missing value.
      3. Encode ``Sex`` (female -> 0, male -> 1) and ``Embarked``
         (S -> 0, C -> 1, Q -> 2). Values outside these mappings become NaN.

    Args:
        data: pandas DataFrame containing at least the columns ``Name``,
            ``Cabin``, ``Ticket``, ``Age``, ``Sex`` and ``Embarked``.

    Returns:
        The same DataFrame object that was passed in, after cleaning.

    Raises:
        KeyError: if a required column is missing, e.g. when the function is
            called a second time on an already processed frame.
    """
    # 1. Drop the name, cabin and ticket columns.
    data.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)

    # 2. Handle missing values.
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # data['Age']: that chained in-place form acts on a temporary object and does
    # not update `data` when pandas Copy-on-Write is active, which would make the
    # dropna() below discard every passenger with a missing age.
    data['Age'] = data['Age'].fillna(data['Age'].mean())
    data.dropna(inplace=True)

    # 3. Encode sex and port of embarkation as integers.
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})
    data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    return data

In [4]:
# Clean the frame loaded from the CSV. `preprocessing` drops columns in place, so running this
# cell a second time without re-loading the CSV raises KeyError (the columns are already gone).
train_set = preprocessing(train_set)

# DataFrame.info() prints its summary itself and returns None; wrapping it in print() only
# appended a stray "None" line to the output.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 1 to 891
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    int64  
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 62.5 KB
None


In [5]:
class TrainDataset(Dataset):
    """Map-style dataset over a preprocessed passenger table.

    Every column except ``Survived`` is treated as a feature; ``Survived`` is the label.

    Attributes:
        data: the DataFrame passed to the constructor.
        x_data: DataFrame of feature columns (``data`` without ``Survived``).
        y_data: Series holding the ``Survived`` column.

    The feature and label columns are converted to NumPy arrays once at construction
    time; changes made to ``x_data`` / ``y_data`` afterwards are not reflected in the
    samples returned by ``__getitem__``.
    """

    def __init__(self, data):
        """Split ``data`` into features and labels and cache them as arrays.

        Args:
            data: pandas DataFrame with a ``Survived`` column; all other columns
                must be convertible to float32.

        Raises:
            KeyError: if ``data`` has no ``Survived`` column.
        """
        self.data = data
        self.x_data = self.data.drop('Survived', axis=1)
        self.y_data = self.data['Survived']
        # Convert once up front. Building a tensor straight from a pandas row relied on
        # integer lookup into a string-labelled Series (deprecated positional fallback)
        # and repeated the pandas row extraction for every single sample.
        self._features = self.x_data.to_numpy(dtype='float32')
        self._labels = self.y_data.to_numpy()

    def __len__(self):
        """Return the number of samples (rows)."""
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample.

        Returns:
            Tuple ``(x, y)``: ``x`` is a new 1-D float32 tensor with the feature values
            of the row, ``y`` is the row's label as a NumPy scalar.
        """
        x = torch.tensor(self._features[idx])  # torch.tensor copies, so callers may mutate x freely
        y = self._labels[idx]
        return x, y

In [6]:
# Two-stage split of the cleaned table. Stage 1 holds out 20% of all rows for validation;
# stage 2 holds out 10% of the remaining 80% for testing. The resulting proportions are
# therefore roughly 72% / 20% / 8% (train / valid / test), not 70 / 20 / 10.
train, valid = train_test_split(train_set, test_size=0.2, random_state=42)
train, test = train_test_split(train, test_size=0.1, random_state=42)

train_data = TrainDataset(train)
valid_data = TrainDataset(valid)
test_data = TrainDataset(test)

BATCH_SIZE = 10

# Reshuffle the training samples every epoch so mini-batches (and the batch statistics
# seen by BatchNorm) differ between epochs. Evaluation loaders keep a fixed order.
trainloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
validloader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
testloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [7]:
# Report (rows, columns) of each split, in train / valid / test order.
for split_frame in (train, valid, test):
    print(split_frame.shape)

(639, 8)
(178, 8)
(72, 8)


In [8]:
# Set device: use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

 Hyperparameters

 input_size = 10
 num_classes = 2
 learning_rate = 0.001
 batch_size = 10
 num_epoch = 1

In [9]:
# Create Fully Connected Network
class DNN(nn.Module):
    """Fully connected binary classifier for rows of 7 features.

    Seven stages of Linear -> BatchNorm1d -> ReLU -> Dropout(0.2) with widths
    7 -> 20 -> 20 -> 30 -> 20 -> 20 -> 10 -> 10, followed by a single
    Linear(10, 1) whose output is passed through a sigmoid. The result is a
    (batch, 1) tensor of values in [0, 1], as expected by ``nn.BCELoss``.
    """

    @staticmethod
    def _stage(in_features, out_features):
        """Build one Linear -> BatchNorm1d -> ReLU -> Dropout(0.2) stage."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

    def __init__(self):
        super().__init__()

        # Attribute names and creation order are significant: they determine the
        # state_dict keys (and the order in which random initial weights are drawn).
        self.inputlayer = self._stage(7, 20)
        self.hiddenlayer1 = self._stage(20, 20)
        self.hiddenlayer2 = self._stage(20, 30)
        self.hiddenlayer3 = self._stage(30, 20)
        self.hiddenlayer4 = self._stage(20, 20)
        self.hiddenlayer5 = self._stage(20, 10)
        self.hiddenlayer6 = self._stage(10, 10)

        self.out = nn.Sequential(nn.Linear(10, 1))

    def forward(self, x):
        """Map a (batch, 7) float tensor to (batch, 1) sigmoid outputs."""
        stages = (
            self.inputlayer,
            self.hiddenlayer1,
            self.hiddenlayer2,
            self.hiddenlayer3,
            self.hiddenlayer4,
            self.hiddenlayer5,
            self.hiddenlayer6,
        )
        for stage in stages:
            x = stage(x)
        logits = self.out(x)
        # Sigmoid is applied here because the loss used with this model is BCELoss.
        return torch.sigmoid(logits)


In [10]:
## 네트워크 저장
# train을 마친 네트워크 저장
# net : 네트워크 파라미터, optim 두개를 dict 형태로 저장
def save(ckpt_dir, model, optimizer, epoch):
    """Write a checkpoint for ``epoch`` into ``ckpt_dir``.

    The checkpoint is a dict with the keys ``'model'`` and ``'optimizer'`` holding
    the respective ``state_dict()``. It is written to
    ``<ckpt_dir>/model_epoch<epoch>.pth``; the directory is created if needed.

    Args:
        ckpt_dir: directory for checkpoint files (relative or absolute path).
        model: module whose ``state_dict()`` is stored.
        optimizer: optimizer whose ``state_dict()`` is stored.
        epoch: integer used in the file name.
    """
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the directory in between.
    os.makedirs(ckpt_dir, exist_ok=True)

    # Join the path instead of formatting "./<ckpt_dir>/...": prefixing "./" turned
    # an absolute ckpt_dir into a different, relative location.
    ckpt_path = os.path.join(ckpt_dir, "model_epoch%d.pth" % epoch)
    torch.save({'model': model.state_dict(), 'optimizer': optimizer.state_dict()},
               ckpt_path)


## 네트워크 불러오기
def load(ckpt_dir, model, optimizer):
    """Restore ``model`` and ``optimizer`` from the newest checkpoint in ``ckpt_dir``.

    Only files named ``model_epoch<N>.pth`` are considered; the one with the
    largest ``N`` is loaded. Other directory entries are ignored.

    Args:
        ckpt_dir: directory containing checkpoint files (relative or absolute).
        model: module that receives the stored ``'model'`` state dict.
        optimizer: optimizer that receives the stored ``'optimizer'`` state dict.

    Returns:
        Tuple ``(model, optimizer, epoch)``. ``epoch`` is the ``N`` of the loaded
        file, or 0 (with both objects untouched) when ``ckpt_dir`` is not a
        directory or holds no checkpoint file.
    """
    if not os.path.isdir(ckpt_dir):  # nothing saved yet: hand the inputs back unchanged
        return model, optimizer, 0

    # Index checkpoint files by epoch number. Filtering on the exact naming scheme
    # keeps stray entries (e.g. ".DS_Store", ".ipynb_checkpoints") from breaking the
    # epoch parsing, and an empty directory from raising IndexError.
    prefix, suffix = 'model_epoch', '.pth'
    found = {}
    for fname in os.listdir(ckpt_dir):
        if fname.startswith(prefix) and fname.endswith(suffix):
            stem = fname[len(prefix):-len(suffix)]
            if stem.isdecimal():
                found[int(stem)] = fname

    if not found:
        return model, optimizer, 0

    epoch = max(found)
    # map_location='cpu' lets a checkpoint written on a GPU machine be read on a
    # CPU-only one; load_state_dict then copies the values onto whatever device the
    # model parameters live on.
    dict_model = torch.load(os.path.join(ckpt_dir, found[epoch]), map_location='cpu')

    model.load_state_dict(dict_model['model'])
    optimizer.load_state_dict(dict_model['optimizer'])

    return model, optimizer, epoch

def test(dataloader):
    """Evaluate the checkpoint selected by ``load`` on ``dataloader`` and print the result.

    Uses the notebook-level names ``ckpt_dir``, ``model``, ``load`` and ``check_model``.
    ``load`` restores the weights into ``model`` itself, which is then switched to
    eval mode before the metrics are computed.

    Args:
        dataloader: iterable of ``(inputs, labels)`` batches accepted by ``check_model``.
    """
    # load() needs an optimizer to restore state into; this one is not used afterwards.
    scratch_optimizer = optim.Adam(model.parameters(), lr=0.001)
    restored, _, last_epoch = load(ckpt_dir, model, scratch_optimizer)
    restored.eval()

    loss_value, acc_value = check_model(dataloader, restored)
    shown_epoch = last_epoch + 1
    print(f'{shown_epoch}/{shown_epoch} [==============================] - test_loss: {float(loss_value):.4f} - test_acc: {float(acc_value):.4f}' )

In [14]:
# Loss and optimizer.
# NOTE: `model` must already exist when this cell runs. As the notebook is laid out, the
# "Initialize network" cell comes after this one, so run that cell first.
criterion = nn.BCELoss()  # binary cross-entropy on probabilities; DNN.forward already applies the sigmoid
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [12]:
# Initialize network: build the classifier, then move its parameters and buffers to `device`.
model = DNN()
model = model.to(device)

In [22]:
# Train Network
# Requires `model`, `criterion`, `optimizer`, the data loaders, `save` and `check_model`
# to be defined before this cell runs.
ckpt_dir = 'Desktop/titanic/checkpoint'
end_epoch = 1000
print(f'Train on {len(train_data)} samples, validate on {len(valid_data)} samples')

for epoch in range(end_epoch):
    running_loss = []
    model.train()  # training mode: Dropout active, BatchNorm uses batch statistics

    for inputs, labels in trainloader:
        # Always move the batch to the model's device (a no-op on CPU), the same way
        # check_model does, instead of guarding on cuda availability.
        inputs = inputs.to(device)
        # BCELoss needs float targets with the same (batch, 1) shape as the outputs.
        labels = labels.to(device).unsqueeze(1).float()

        # Forward pass.
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        running_loss.append(loss.item())

        # Backward pass: clear stale gradients, back-propagate, take one optimizer step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    save(ckpt_dir, model, optimizer, epoch)

    # Validate in eval mode. Without this the model was still in training mode, so
    # Dropout randomly zeroed activations during validation and BatchNorm both
    # normalised with per-batch statistics and updated its running statistics from
    # validation data. model.train() at the top of the loop switches back.
    model.eval()
    valid_loss, valid_acc = check_model(validloader, model)

    print(f'Epoch {epoch+1}/{end_epoch}')
    print(f'{len(train_data)}/{len(train_data)} [==============================] - loss: {np.mean(running_loss):.4f} - val_loss: {float(valid_loss):.4f} - val_acc: {float(valid_acc):.4f}' )
    

Train on 639 samples, validate on 178 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/10

In [16]:
# check acc and test
def check_model(loader, model):
    """Compute mean loss and accuracy of ``model`` over ``loader``.

    The model is evaluated in eval mode (Dropout off, BatchNorm using running
    statistics) with gradients disabled; whatever mode it was in before the call is
    restored afterwards. Uses the notebook-level ``device`` and ``criterion``.

    Args:
        loader: iterable yielding ``(x, y)`` batches; ``y`` is a 1-D tensor of 0/1 labels.
        model: module returning a ``(batch, 1)`` tensor of probabilities.

    Returns:
        Tuple ``(loss, acc)``: ``loss`` is the mean of the per-batch criterion values,
        ``acc`` is the percentage (0-100) of correct predictions as a 0-d tensor.
    """
    num_correct = 0
    num_samples = 0
    running_loss = []

    # Metrics are only meaningful in eval mode; remember the caller's mode so that
    # calling this from inside a training loop does not silently change it.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device)
                y = y.to(device)

                outputs = model(x)

                # Threshold the probabilities at 0.5 and flatten (batch, 1) -> (batch,).
                predictions = (outputs >= 0.5).float().squeeze(1)

                # Accuracy bookkeeping.
                num_correct += (predictions == y).sum()
                num_samples += y.size(0)

                # Loss: the criterion needs float targets shaped like the outputs.
                y = y.unsqueeze(1)
                y = y.float()

                running_loss += [criterion(outputs, y).item()]

            acc = (num_correct / num_samples) * 100
            loss = np.mean(running_loss)
    finally:
        model.train(was_training)

    return (loss, acc)

In [15]:
# Parameter arithmetic for reading the table printed below:
#   Linear(in, out)  -> in*out weights + out biases  (Linear(7, 20): 7*20 + 20 = 160)
#   BatchNorm1d(n)   -> n scales + n shifts          (BatchNorm1d(20): 2*20 = 40)

from torchsummary import summary

input_shape = (7,)  # shape of one sample: 7 features
summary(model, input_shape)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 20]             160
       BatchNorm1d-2                   [-1, 20]              40
              ReLU-3                   [-1, 20]               0
           Dropout-4                   [-1, 20]               0
            Linear-5                   [-1, 20]             420
       BatchNorm1d-6                   [-1, 20]              40
              ReLU-7                   [-1, 20]               0
           Dropout-8                   [-1, 20]               0
            Linear-9                   [-1, 30]             630
      BatchNorm1d-10                   [-1, 30]              60
             ReLU-11                   [-1, 30]               0
          Dropout-12                   [-1, 30]               0
           Linear-13                   [-1, 20]             620
      BatchNorm1d-14                   

In [None]:
# class TestDataset(Dataset):
#     def __init__(self, data): #데이터셋의 전처리를 해주는 부분
#         self.x_data = data
    
#     def __len__(self):  #데이터셋의 길이. 즉, 총 샘플의 수를 적어주는 부분
#         return len(self.x_data)
    
#     def __getitem__(self, idx):    #데이터셋에서 특정 1개의 샘플을 가져오는 함수
#         x = torch.FloatTensor(self.x_data.iloc[idx])
#         return x