In [1]:
# !pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1



[notice] A new release of pip is available: 23.0 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [124]:
import random
import pandas as pd
import numpy as np
import os
import time
from tqdm.auto import tqdm
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from torch import optim

from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings(action='ignore')

In [125]:
# Random Seed
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(69) # Seed 고정

In [126]:
GPU_NUM = 0 # 원하는 GPU 번호 입력
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

## Data Load

In [137]:
train = pd.read_csv('./data/train_data.csv')
test = pd.read_csv('./data/test_data.csv')

## EDA

## Pre-Processing

#### One Hot Encoding

In [138]:
# int -> str
train['type'] = train['type'].astype(str)
test['type'] = test['type'].astype(str)

In [139]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

## Train

In [140]:
CFG = {
    'SR':16000,
    'EPOCHS': 100,
    'batch_size': 64,
    'learning_rate': 4e-4,
    'weight_decay': 1e-4,
}

In [132]:
# 데이터셋 정의
class CustomDataset(Dataset):
    def __init__(self, data):
        self.x = data
    
    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x[idx])
        return x
    
    def __len__(self):
        return len(self.x)
    
train_dataset = CustomDataset(train)
test_dataset = CustomDataset(test)

In [147]:
# 데이터 로더
train_loader = DataLoader(dataset=train_dataset, batch_size=CFG['batch_size'], shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=CFG['batch_size'], shuffle=False)

In [142]:
# 모델 정의
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
    
        # encoder
        self.encoder = nn.Sequential(
            nn.Linear(2463, 2000),
            nn.ReLU(),
            nn.Linear(2000, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
        )
        
        # decoder
        self.decoder = nn.Sequential(
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, 2000),
            nn.ReLU(),
            nn.Linear(2000, 2463),
            nn.Sigmoid(),
        )
    
    def forward(self, x):
        
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        
        return encoded, decoded

In [143]:
model = AutoEncoder().to(device)
loss_func = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=CFG['learning_rate'], weight_decay = CFG['weight_decay'])
# summary(model, [(1,2463)] )

In [144]:
# train
def train(model, train_loader, optimizer):
    
    model.train()
    
    running_loss = 0.0
    len_data = len(train_loader.dataset)
    
    for x in train_loader:
        x = x.to(device)
        
        x_hat, _ = model(x)
        loss = loss_func(x, x_hat)
        
        opt.zero_grad()
        loss.backward()
        opt.step()
        
        running_loss += loss.item()
    
    return running_loss/len_data

In [146]:
# run
for epoch in range(1, CFG['EPOCHS'] + 1):
    print(train(model, train_loader, optimizer))

KeyError: 2395

## Predict

In [None]:
with torch.no_grad():
    
    for j,x in enumerate(train_loader):
        x = x.to(device)
        opt.zero_grad()
        output, z = model.forward(x)
        break

In [None]:
def eval(model, dataloader):
    """Testing the Deep SVDD model"""

    scores = []
    model.eval()
    print('Testing...')
    with torch.no_grad():
        for x in dataloader:
            x = x.to(device)
            x_hat, z = model(x)
            score = torch.mean( torch.abs(x - x_hat) , axis=1)
            scores.extend(score.cpu().numpy())
            
    return np.array(scores), z

In [None]:
scores, z = eval(model, train_loader)
# Train data (정상 데이터)에서 발견할 수 있는 score의 최댓값인 t를 임계치로 설정
# 정상데이터 관찰할 수 있는 관측치 중 가장 큰 값이므로, 임계치 이하의 값은 
# 정상 데이터일 것이라는 가정
t=scores.max()

print(scores.shape)

In [None]:
scores_, z_ = eval(model, test_loader)

In [None]:
def get_pred_label(model_pred, t):
    # (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred <= t, 0, model_pred)
    model_pred = np.where(model_pred > t, 1, model_pred)
    return model_pred

In [None]:
train_pred = get_pred_label(scores, t)
Counter(train_pred)

## Submission

In [None]:
pred_test = get_pred_label(scores_, t)

In [None]:
Counter(pred_test)

In [263]:
submit = pd.read_csv('./data/answer_sample.csv')

In [264]:
submit['label'] = pred_test
submit.head()

Unnamed: 0,type,label
0,0,0
1,0,0
2,0,1
3,0,1
4,0,1


In [265]:
submit.to_csv('./submit/kmeans.csv', index=False)