<a href="https://colab.research.google.com/github/highway92/machine_learning/blob/main/year_dream/do/MLP%EB%A5%BC_%EC%9D%B4%EC%9A%A9%ED%95%9C_%EA%B8%88%EC%9C%B5%EB%8D%B0%EC%9D%B4%ED%84%B0_%EB%B6%84%EC%84%9D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix


import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
import torch
from torch import nn, optim

from torch.utils.data import DataLoader, Dataset

import torch.nn.functional as F

# 1. 데이터 로드 및 전처리
- feature selection
- MinMax Scaler

In [3]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
submission = pd.read_csv('./sample_submission.csv')

In [4]:
X = train_df.iloc[:, :-1]
y = train_df.iloc[:, -1]


In [5]:
sel = VarianceThreshold(threshold=0.01)
X = sel.fit_transform(X)


X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)
print('training set length :', len(X_train))
print('validation set length :', len(X_valid))

training set length : 80000
validation set length : 20000


In [6]:
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_valid = scaler.fit_transform(X_valid)

# 2. MLP 구현 with pytorch

In [7]:
class TensorData(Dataset):
  def __init__(self, x_data, y_data):
    self.x_data = torch.FloatTensor(x_data)
    self.y_data = torch.LongTensor(y_data.to_numpy())
    self.len = self.y_data.shape[0]

  def __getitem__(self, index):
    return self.x_data[index], self.y_data[index]

  def __len__(self):
    return self.len

In [8]:
trainsets = TensorData(X_train, y_train)
trainloader = torch.utils.data.DataLoader(trainsets, batch_size = 32, shuffle = True)

testsets = TensorData(X_valid, y_valid)
testloader = torch.utils.data.DataLoader(testsets, batch_size = 32, shuffle = False)

In [9]:
class MLP(nn.Module):
  def __init__(self):
    super().__init__()
    self.dropout = nn.Dropout(p=0.2)
    self.linear1 = nn.Linear(63, 126)
    self.linear2 = nn.Linear(126, 48)
    self.linear3 = nn.Linear(48, 24)
    self.linear4 = nn.Linear(24, 2)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    mlp1 = F.relu(self.linear1(x))
    # print(mlp1.shape)
    mlp1 = self.dropout(mlp1)
    mlp2 = F.relu(self.linear2(mlp1))
    mlp2 = self.dropout(mlp2)
    mlp3 = F.relu(self.linear3(mlp2))
    mlp3 = self.dropout(mlp3)
    # print(mlp3.shape)
    mlp4 = F.relu(self.linear4(mlp3))
    # print(mlp4.shape)
    output = self.sigmoid(mlp4)
    return output


In [10]:
# model = MLP()
# input = torch.Tensor(1, 63)

# ot = model(input)
# print(ot)

In [11]:
# model = nn.Sequential(
#     nn.Linear(63, 126),
#     nn.ReLU(),
#     nn.Dropout(0.2),
#     nn.Linear(126, 48),
#     nn.ReLU(),
#     nn.Dropout(0.2),
#     nn.Linear(48, 24),
#     nn.ReLU(),
#     nn.Dropout(0.2),
#     nn.Linear(24, 2),
#     nn.Sigmoid()
# )

In [12]:
# X_trrain_tensor = torch.tensor(X_train, dtype = torch.float32)
# y_train_tensor = torch.tensor(y_train, dtype = torch.int64)

In [13]:
EPOCHS = 200
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = MLP().to(DEVICE)

criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [14]:
loss_list = []
acc_list = []
for epoch in range(EPOCHS):
    for i, (X_batch, y_batch) in enumerate(trainloader):
        #Forward 
        X_batch = X_batch.to(DEVICE)
        y_batch = y_batch.to(DEVICE)
        y_output = model(X_batch)

        loss = criterion(y_output, y_batch) 

        #Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #misc (acc 계산, etc) 
        y_pred = torch.max(y_output, 1)[1]
        acc = accuracy_score(y_pred.data.cpu(), y_batch.data.cpu())
        loss_list.append(loss.item())
        acc_list.append(acc)

    if (epoch+1) % 10 == 0:
        print('Epoch [{}/{}] Step [{}/{}] Loss: [{:.4f}] Train ACC [{:.2f}%]'.format(epoch+1, EPOCHS, \
                                                                                   i+1, len(trainloader), loss.item(), acc*100))

Epoch [10/200] Step [2500/2500] Loss: [0.6541] Train ACC [62.50%]
Epoch [20/200] Step [2500/2500] Loss: [0.6948] Train ACC [50.00%]
Epoch [30/200] Step [2500/2500] Loss: [0.6079] Train ACC [75.00%]
Epoch [40/200] Step [2500/2500] Loss: [0.5967] Train ACC [75.00%]
Epoch [50/200] Step [2500/2500] Loss: [0.6079] Train ACC [68.75%]
Epoch [60/200] Step [2500/2500] Loss: [0.5455] Train ACC [84.38%]
Epoch [70/200] Step [2500/2500] Loss: [0.6092] Train ACC [75.00%]
Epoch [80/200] Step [2500/2500] Loss: [0.5882] Train ACC [78.12%]
Epoch [90/200] Step [2500/2500] Loss: [0.5827] Train ACC [75.00%]
Epoch [100/200] Step [2500/2500] Loss: [0.5935] Train ACC [78.12%]
Epoch [110/200] Step [2500/2500] Loss: [0.5773] Train ACC [81.25%]
Epoch [120/200] Step [2500/2500] Loss: [0.6354] Train ACC [68.75%]
Epoch [130/200] Step [2500/2500] Loss: [0.5649] Train ACC [84.38%]
Epoch [140/200] Step [2500/2500] Loss: [0.5362] Train ACC [87.50%]
Epoch [150/200] Step [2500/2500] Loss: [0.5124] Train ACC [90.62%]
Epoc

In [34]:
test_y_pred = []
test_acc_list = []
with torch.no_grad():

    for X_batch, y_batch in testloader:    
        #Forward
        X_batch = X_batch.to(DEVICE)
        y_batch = y_batch.to(DEVICE)
        y_output = model(X_batch)
        
        #misc (acc 계산, etc) 
        y_pred = torch.max(y_output, 1)[1]
        test_y_pred.extend(y_pred.data.cpu()) ##
        
        acc = accuracy_score(y_pred.data.cpu(), y_batch.data.cpu())
        test_acc_list.append(acc)
    test_acc = np.mean(test_acc_list)
print('Test ACC: [{:.2f}%]'.format(test_acc*100))

Test ACC: [72.42%]


In [39]:
from sklearn.metrics import f1_score
print(f1_score(list(y_valid),test_y_pred, average='macro'))

0.6259546514091968


In [26]:
print(len(list(y_valid)))

20000


In [23]:
len(test_y_pred)

20000

In [28]:
list(y_valid)[:3]

[1, 0, 1]