https://dacon.io/competitions/official/235930/codeshare/5508?page=2&dtype=recent

## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

In [19]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# import random
# import pandas as pd
# import numpy as np
# import os
# import librosa
# from sklearn.model_selection import train_test_split
# from sklearn import preprocessing
# from sklearn.ensemble import IsolationForest
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import f1_score
# from sklearn.metrics import classification_report

# import matplotlib.pyplot as plt

# from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [3]:
EPOCHS = 400
LR = 1e-2
BS = 16384
SEED = 41

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED) # Seed 고정

## Data Pre-processing

In [5]:
import os
os.chdir('/Users/lhs/Desktop/Machine_Sound_Data')
# os.chdir('/content/drive/MyDrive/YDS/DACON/230116_Machine_Error_Sound')

In [6]:
train = pd.read_csv('./train_.csv').drop(columns=['Unnamed: 0']) # 모두 정상 Sample
# train_df = train_df.iloc[:,2:]
test = pd.read_csv('./test_.csv').drop(columns=['Unnamed: 0'])
# test_df = test_df.iloc[:,2:]

In [9]:
from sklearn.model_selection import train_test_split

X_train, x_val, Y_train, y_val = train_test_split(train, train['LABEL'], test_size=0.2)

In [11]:
class MyDataset(Dataset):
    def __init__(self, df, eval_mode):
        self.df = df
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = self.df['LABEL'].values
            self.df = self.df.drop(columns=['LABEL']).values
        else:
            self.df = self.df.values
        
    def __getitem__(self, index):
        if self.eval_mode:
            self.x = self.df[index]
            self.y = self.labels[index]
            return torch.Tensor(self.x), self.y
        else:
            self.x = self.df[index]
            return torch.Tensor(self.x)
        
    def __len__(self):
        return len(self.df)

In [13]:
train_dataset = MyDataset(df = X_train, eval_mode=False)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=6)

val_dataset = MyDataset(df = x_val, eval_mode=True)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False, num_workers=6)

# 1D AutoEncoder

In [14]:
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.Encoder = nn.Sequential(
            nn.Linear(30,64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(64,128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
        )
        self.Decoder = nn.Sequential(
            nn.Linear(128,64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(64,30),
        )
        
    def forward(self, x):
        x = self.Encoder(x)
        x = self.Decoder(x)
        return x

# Train

In [17]:
class Trainer():
    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function
        self.criterion = nn.L1Loss().to(self.device)
        
    def fit(self, ):
        self.model.to(self.device)
        best_score = 0
        for epoch in range(EPOCHS):
            self.model.train()
            train_loss = []
            for x in iter(self.train_loader):
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                loss = self.criterion(x, _x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                torch.save(model.module.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
    
    def validation(self, eval_model, thr):
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in iter(self.val_loader):
                x = x.float().to(self.device)

                _x = self.model(x)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

In [None]:
model = nn.DataParallel(AutoEncoder())
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/lhs/miniforge3/envs/lhs/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/lhs/miniforge3/envs/lhs/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'MyDataset' on <module '__main__' (built-in)>


## Submission

In [14]:
submit = pd.read_csv('./sample_submission.csv')

In [17]:
submit['LABEL'] = temp_result
submit.head()

Unnamed: 0,SAMPLE_ID,LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0


In [18]:
submit.LABEL.value_counts()

0    1325
1     189
Name: LABEL, dtype: int64

In [19]:
import datetime

In [20]:
path = '/Users/lhs/Desktop/GitHub/Dacon/230116_Machine_Error_Sound/result/'

now = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
submit.to_csv(f'{path}{now}.csv',encoding='utf-8', index=False)