In [1]:
# !pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1



[notice] A new release of pip is available: 23.0 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [225]:
import random
import pandas as pd
import numpy as np
import os
import time
from tqdm.auto import tqdm
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from torch import optim

from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings(action='ignore')

In [226]:
# Random Seed
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(69) # Seed 고정

In [227]:
GPU_NUM = 0 # 원하는 GPU 번호 입력
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

## Data Load

In [248]:
train = pd.read_csv('./data/train_data.csv')
test = pd.read_csv('./data/test_data.csv')

## EDA

## Pre-Processing

#### One Hot Encoding

In [249]:
# int -> str
train['type'] = train['type'].astype(str)
test['type'] = test['type'].astype(str)

In [250]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [251]:
train.shape

(2463, 15)

#### Drop columns

In [252]:
# train.drop('out_pressure', axis=1, inplace=True)
# test.drop('out_pressure', axis=1, inplace=True)

## Train

In [253]:
CFG = {
    'SR':16000,
    'EPOCHS': 100,
    'batch_size': 64,
    'learning_rate': 4e-4,
    'weight_decay': 1e-4,
}

In [267]:
# 데이터셋 정의
class CustomDataset(Dataset):
    def __init__(self, data):
        self.x = np.array(data)
        
    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x[idx])
        return x
        
    def __len__(self):
        return len(self.x)

# dataset
train_dataset = CustomDataset(train)
test_dataset = CustomDataset(test)

# dataloader
train_loader = DataLoader(dataset=train_dataset, batch_size=CFG['batch_size'], shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=CFG['batch_size'], shuffle=False)

In [255]:
train.shape

(2463, 15)

In [256]:
# 모델 정의
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        
        # encoder
        self.encoder = nn.Sequential(
            nn.Linear(15, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 7),
            nn.ReLU(),
            nn.Linear(7, 3),
            nn.ReLU(),
            nn.Linear(3, 1),
            nn.ReLU(),
        )
        
        # decoder
        self.decoder = nn.Sequential(
            nn.Linear(1, 3),
            nn.ReLU(),
            nn.Linear(3, 7),
            nn.ReLU(),
            nn.Linear(7, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 15),
            nn.ReLU(),
        )
        
    def forward(self, x):
        
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        
        return encoded, decoded

In [257]:
model = AutoEncoder().to(device)
loss_func = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=CFG['learning_rate'], weight_decay = CFG['weight_decay'])
# summary(model, [(1,2463)] )

In [258]:
# train
def training(model, train_loader, optimizer):
    
    model.train()
    
    running_loss = 0.0
    len_data = len(train_loader.dataset)

    for x in train_loader:
        x = x.to(device)
        
        encoded, decoded = model(x)
#         loss = loss_func(x, decoded)
        loss = loss_func(x, decoded)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    return running_loss/len_data

In [259]:
# run
loss_history = {'train': []}
start_time = time.time()

for epoch in range(1, CFG['EPOCHS'] + 1):
    
    print('Epoch {}/{}'.format(epoch, CFG['EPOCHS']))
    
    train_loss = training(model, train_loader, optimizer)
    loss_history['train'].append(train_loss)
    
    print('train loss: %.6f, time: %.4f min' %(train_loss, (time.time()-start_time)/60))
    print('-'*10)

Epoch 1/100
train loss: 6448.405907, time: 0.0026 min
----------
Epoch 2/100
train loss: 6448.461213, time: 0.0055 min
----------
Epoch 3/100
train loss: 6437.585325, time: 0.0084 min
----------
Epoch 4/100
train loss: 6431.513639, time: 0.0115 min
----------
Epoch 5/100
train loss: 6393.254859, time: 0.0141 min
----------
Epoch 6/100
train loss: 6327.265492, time: 0.0165 min
----------
Epoch 7/100
train loss: 6179.946965, time: 0.0199 min
----------
Epoch 8/100
train loss: 5875.816522, time: 0.0241 min
----------
Epoch 9/100
train loss: 5364.982998, time: 0.0265 min
----------
Epoch 10/100
train loss: 4526.201247, time: 0.0291 min
----------
Epoch 11/100
train loss: 3371.982694, time: 0.0322 min
----------
Epoch 12/100
train loss: 2089.282151, time: 0.0351 min
----------
Epoch 13/100
train loss: 1072.436417, time: 0.0382 min
----------
Epoch 14/100
train loss: 605.326833, time: 0.0405 min
----------
Epoch 15/100
train loss: 510.344335, time: 0.0437 min
----------
Epoch 16/100
train lo

## Predict

In [240]:
def eval(model, dataloader):
    scores = []
    model.eval()
    print('Testing...')
    with torch.no_grad():
        for x in dataloader:
            x = x.to(device)
            x_hat, z = model(x)
            score = torch.mean( torch.abs(x - x_hat) , axis=1)
            scores.extend(score.cpu().numpy())
            
    return np.array(scores), z

In [241]:
def get_pred_label(model_pred, t):
    # (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred <= t, 0, model_pred)
    model_pred = np.where(model_pred > t, 1, model_pred)
    return model_pred

In [242]:
with torch.no_grad():
    for j, x in enumerate(train_loader):
        x = x.to(device)
        optimizer.zero_grad()
        output, z = model.forward(x)
        break

In [243]:
scores, z = eval(model, train_loader)
# Train data (정상 데이터)에서 발견할 수 있는 score의 최댓값인 t를 임계치로 설정
# 정상데이터 관찰할 수 있는 관측치 중 가장 큰 값이므로, 임계치 이하의 값은 
# 정상 데이터일 것이라는 가정
t=scores.max()

print(scores.shape)

Testing...
(2463,)


## Inference

In [244]:
train_pred = get_pred_label(scores, t)
Counter(train_pred)

Counter({0.0: 2463})

In [245]:
scores_, z_ = eval(model, test_loader)

Testing...


In [246]:
pred_test = get_pred_label(scores_, t)

In [247]:
# 정상 (0), 이상 (1)
list(pred_test).count(0), list(pred_test).count(1)

(7389, 0)

## Submission

In [263]:
submit = pd.read_csv('./data/answer_sample.csv')

In [264]:
submit['label'] = pred_test
submit.head()

Unnamed: 0,type,label
0,0,0
1,0,0
2,0,1
3,0,1
4,0,1


In [265]:
submit.to_csv('./submit/kmeans.csv', index=False)