In [1]:
cd /home/work/

/home/work


In [2]:
!nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
Wed Nov 23 11:42:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  CUDA GPU            Off  | 00000000:81:00.0 Off |                  Off |
| 30%   26C    P8     5W / 230W |     48MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+------------

In [3]:
import re 
import os 
import gc
import math
import time
import copy
import random
import pickle
from glob import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import torch
import librosa
import torchaudio
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
from torch.nn import functional as F
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import datasets, models, transforms

import warnings
warnings.filterwarnings(action='ignore')

%matplotlib inline

cudnn.benchmark = True
plt.ion()   # 대화형 모드

In [4]:
train = pd.read_csv('Tcae_apply/new_train.csv')
valid = pd.read_csv('Tcae_apply/new_valid.csv')
gender= pd.read_csv('Tcae_apply/all_gender.csv')

train = train.merge(gender[['vocal', 'gender']], on='vocal', how='left')
valid = valid.merge(gender[['vocal', 'gender']], on='vocal', how='left')

In [5]:
train.shape, valid.shape, gender.shape

((3825, 5), (675, 5), (4500, 3))

In [6]:
train.head()

Unnamed: 0,vocal,label,title,npy_path,gender
0,Final_dataset/정인_The End.wav,4412,정인_The End,/home/work/mel_dataset7/정인_The End.npy,1
1,Final_dataset/비_내여자 (Acoustic Ver.).wav,4111,비_내여자 (Acoustic Ver.),/home/work/mel_dataset4/비_내여자 (Acoustic Ver.).npy,0
2,Final_dataset/폴킴_Goodbye Days.wav,4458,폴킴_Goodbye Days,/home/work/mel_dataset8/폴킴_Goodbye Days.npy,0
3,Final_dataset/스탠딩 에그_사랑한다는 말.wav,4242,스탠딩 에그_사랑한다는 말,/home/work/mel_dataset4/스탠딩 에그_사랑한다는 말.npy,0
4,Final_dataset/송가인_사랑의 꽃씨.wav,4479,송가인_사랑의 꽃씨,/home/work/mel_dataset4/송가인_사랑의 꽃씨.npy,1


In [7]:
import os
import unicodedata

train['lfcc_path'] = train['npy_path'].apply(lambda x : unicodedata.normalize('NFC', x.replace('mel_dataset', 'lfcc_dataset')))
valid['lfcc_path'] = valid['npy_path'].apply(lambda x : unicodedata.normalize('NFC', x.replace('mel_dataset', 'lfcc_dataset')))

In [8]:
train.head()

Unnamed: 0,vocal,label,title,npy_path,gender,lfcc_path
0,Final_dataset/정인_The End.wav,4412,정인_The End,/home/work/mel_dataset7/정인_The End.npy,1,/home/work/lfcc_dataset7/정인_The End.npy
1,Final_dataset/비_내여자 (Acoustic Ver.).wav,4111,비_내여자 (Acoustic Ver.),/home/work/mel_dataset4/비_내여자 (Acoustic Ver.).npy,0,/home/work/lfcc_dataset4/비_내여자 (Acoustic Ver.)...
2,Final_dataset/폴킴_Goodbye Days.wav,4458,폴킴_Goodbye Days,/home/work/mel_dataset8/폴킴_Goodbye Days.npy,0,/home/work/lfcc_dataset8/폴킴_Goodbye Days.npy
3,Final_dataset/스탠딩 에그_사랑한다는 말.wav,4242,스탠딩 에그_사랑한다는 말,/home/work/mel_dataset4/스탠딩 에그_사랑한다는 말.npy,0,/home/work/lfcc_dataset4/스탠딩 에그_사랑한다는 말.npy
4,Final_dataset/송가인_사랑의 꽃씨.wav,4479,송가인_사랑의 꽃씨,/home/work/mel_dataset4/송가인_사랑의 꽃씨.npy,1,/home/work/lfcc_dataset4/송가인_사랑의 꽃씨.npy


In [9]:
valid.head()

Unnamed: 0,vocal,label,title,npy_path,gender,lfcc_path
0,Final_dataset/이효리_One Two Three N'Four.wav,4331,이효리_One Two Three N'Four,/home/work/mel_dataset6/이효리_One Two Three N'Fo...,1,/home/work/lfcc_dataset6/이효리_One Two Three N'F...
1,Final_dataset/소향_마지막 약속.wav,4490,소향_마지막 약속,/home/work/mel_dataset4/소향_마지막 약속.npy,1,/home/work/lfcc_dataset4/소향_마지막 약속.npy
2,Final_dataset/박시환_고래의 꿈.wav,4449,박시환_고래의 꿈,/home/work/mel_dataset3/박시환_고래의 꿈.npy,0,/home/work/lfcc_dataset3/박시환_고래의 꿈.npy
3,Final_dataset/신해철_재즈 카페.wav,4337,신해철_재즈 카페,/home/work/mel_dataset4/신해철_재즈 카페.npy,0,/home/work/lfcc_dataset4/신해철_재즈 카페.npy
4,Final_dataset/자우림_Anna.wav,4418,자우림_Anna,/home/work/mel_dataset6/자우림_Anna.npy,1,/home/work/lfcc_dataset6/자우림_Anna.npy


In [10]:
# import os
# import unicodedata

# lst = []

# for path in tqdm(train['lfcc_path']):
#     old_file = path
#     new_file = unicodedata.normalize('NFC', path)
#     try:
#         os.rename(old_file, new_file)
#     except:
#         lst.append(old_file)
        
# for path in tqdm(valid['lfcc_path']):
#     old_file = path
#     new_file = unicodedata.normalize('NFC', path)
#     try:
#         os.rename(old_file, new_file)
#     except:
#         lst.append(old_file)

In [11]:
# import shutil

# os.makedirs('data/train/male', exist_ok=True)
# os.makedirs('data/train/female', exist_ok=True)
# os.makedirs('data/valid/male', exist_ok=True)
# os.makedirs('data/valid/female', exist_ok=True)

# for i in tqdm(range(len(train['lfcc_path']))):
#     file_source = train['lfcc_path'][i]
#     if train['gender'][i]==0:
#         file_destination = 'data/train/male'
#     else:
#         file_destination = 'data/train/female'
#     shutil.copy(file_source, file_destination)

# for i in tqdm(range(len(valid['lfcc_path']))):
#     file_source = valid['lfcc_path'][i]
#     if valid['gender'][i]==0:
#         file_destination = 'data/valid/male'
#     else:
#         file_destination = 'data/valid/female'
#     shutil.copy(file_source, file_destination)

In [12]:
data_dir = 'data/'

data_transforms = {
    'train': transforms.Compose([
        transforms.ToPILImage(),
        transforms.Lambda(lambda image: image.convert('RGB')),
        transforms.ToTensor(),
    ]),
    'valid': transforms.Compose([
        transforms.ToPILImage(),
        transforms.Lambda(lambda image: image.convert('RGB')),
        transforms.ToTensor(),
    ]),
}

def npy_loader(path):
    sample = torch.from_numpy(np.load(path))
    return sample

image_datasets = {x: datasets.DatasetFolder(root=os.path.join(data_dir, x),
                                           loader=npy_loader,
                                           extensions='.npy',
                                           transform=data_transforms[x])
                  for x in ['train', 'valid']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], 
                                              batch_size=16,
                                              shuffle=True, 
                                              num_workers=0,
                                              drop_last=True)
              for x in ['train', 'valid']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'valid']}
class_names = image_datasets['train'].classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [13]:
image_datasets['train'][0]

(tensor([[[0.0353, 0.9216, 0.1451,  ..., 0.1961, 0.1961, 0.1961],
          [0.3412, 0.1098, 0.6314,  ..., 0.0000, 0.0000, 0.0000],
          [0.2941, 0.0784, 0.4510,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.5765, 0.0706, 0.0431,  ..., 0.0000, 0.0000, 0.0000],
          [0.7059, 0.9255, 0.5529,  ..., 0.0000, 0.0000, 0.0000],
          [0.6667, 0.8980, 0.3765,  ..., 0.0000, 0.0000, 0.0000]],
 
         [[0.0353, 0.9216, 0.1451,  ..., 0.1961, 0.1961, 0.1961],
          [0.3412, 0.1098, 0.6314,  ..., 0.0000, 0.0000, 0.0000],
          [0.2941, 0.0784, 0.4510,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.5765, 0.0706, 0.0431,  ..., 0.0000, 0.0000, 0.0000],
          [0.7059, 0.9255, 0.5529,  ..., 0.0000, 0.0000, 0.0000],
          [0.6667, 0.8980, 0.3765,  ..., 0.0000, 0.0000, 0.0000]],
 
         [[0.0353, 0.9216, 0.1451,  ..., 0.1961, 0.1961, 0.1961],
          [0.3412, 0.1098, 0.6314,  ..., 0.0000, 0.0000, 0.0000],
          [0.2941, 0.0784, 0.4510,  ...,

In [14]:
image_datasets['train'][0][0].shape

torch.Size([3, 256, 10336])

### 모델 학습하기
- learning rate scheduling

### Model

In [15]:
# make conv_block
def conv_block(shape, in_, out_, kernel_size, stride=(1, 1), padding=0):
    block = torch.nn.Sequential(
        nn.Conv2d(in_, out_, kernel_size, stride=stride, padding=padding, bias=False),
        nn.ELU(),
        nn.BatchNorm2d(out_),
        nn.Dropout(0.2)
    )
    shape1 = int(np.floor((shape[0] - kernel_size[0] + 2 * padding) / stride[0]) + 1)
    shape2 = int(np.floor((shape[1] - kernel_size[1] + 2 * padding) / stride[1]) + 1)
    return block, shape1, shape2


def fc_block(in_, out_):
    block = torch.nn.Sequential(
      torch.nn.Linear(in_, out_, bias=False),
      torch.nn.BatchNorm1d(out_),
      nn.ReLU()
    )
    return block

In [16]:
class CRNN2D_elu(nn.Module):
    def __init__(self):
        super().__init__()
        
        #self.Bn0 = nn.BatchNorm2d(input_size)
        
        self.conv_block1, self.shape1, self.shape2 = conv_block((256, 10336), 3, 8, (3, 3), padding=1)
        self.conv_block2, self.shape1, self.shape2 = conv_block((self.shape1, self.shape2), 8, 16, (3, 3), stride=(2, 2))
        self.conv_block3, self.shape1, self.shape2 = conv_block((self.shape1, self.shape2), 16, 32, (3, 3), stride=(2, 2))
        self.conv_block4, self.shape1, self.shape2 = conv_block((self.shape1, self.shape2), 32, 64, (3, 3), stride=(2, 2))
        self.conv_block5, self.shape1, self.shape2 = conv_block((self.shape1, self.shape2), 64, 128, (3, 3), stride=(1, 1))
        self.conv_block6, self.shape1, self.shape2 = conv_block((self.shape1, self.shape2), 128, 256, (3, 3), stride=(2, 2))
        self.conv_block7, self.shape1, self.shape2 = conv_block((self.shape1, self.shape2), 256, 512, (3, 3), stride=(2, 2))
        self.conv_block8, self.shape1, self.shape2 = conv_block((self.shape1, self.shape2), 512, 512, (3, 3), stride=(4, 4))
        
        self.gru1 = nn.GRU(512, 64, num_layers=1, batch_first=True)
        self.gru2 = nn.GRU(64, 64, num_layers=1, batch_first=True)
        self.gru3 = nn.GRU(64, 64, num_layers=1, batch_first=True)
        
        self.drop = nn.Dropout(p=0.3)
        
        self.fc_block1 = fc_block(5120, 64)
        self.fc_last = nn.Linear(64, 2)
        
        
    def forward(self, x):
        h = torch.randn(1, x.size(0), 64).cuda()
        
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = self.conv_block5(x)
        x = self.conv_block6(x)
        x = self.conv_block7(x)
        x = self.conv_block8(x)
#         x = self.conv_block9(x)
        
        x = x.transpose(1, 3)
        x = torch.reshape(x, (x.size(0), x.size(1), -1))
        
        x, h = self.gru1(x, h)
        x, h = self.gru2(x, h)
        x, h = self.gru3(x, h)
        x = self.drop(x)
        
        x = torch.reshape(x, (x.size(0), -1))
                
        x = self.fc_block1(x)
#         x = self.fc_block2(x)
#         x = self.fc_block3(x)
        x = self.fc_last(x)
        return x

In [17]:
batch_size = 16

model_ft = CRNN2D_elu()
model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# 모든 매개변수들이 최적화되었는지 관찰
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# 7 에폭마다 0.1씩 학습률 감소
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [18]:
#!pip install -q torchsummary

In [19]:
from torchsummary import summary
summary(model_ft, input_size=(3, 256, 10336))  # 40배 차이

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1        [-1, 8, 256, 10336]             216
               ELU-2        [-1, 8, 256, 10336]               0
       BatchNorm2d-3        [-1, 8, 256, 10336]              16
           Dropout-4        [-1, 8, 256, 10336]               0
            Conv2d-5        [-1, 16, 127, 5167]           1,152
               ELU-6        [-1, 16, 127, 5167]               0
       BatchNorm2d-7        [-1, 16, 127, 5167]              32
           Dropout-8        [-1, 16, 127, 5167]               0
            Conv2d-9         [-1, 32, 63, 2583]           4,608
              ELU-10         [-1, 32, 63, 2583]               0
      BatchNorm2d-11         [-1, 32, 63, 2583]              64
          Dropout-12         [-1, 32, 63, 2583]               0
           Conv2d-13         [-1, 64, 31, 1291]          18,432
              ELU-14         [-1, 64, 3

In [20]:
epochs = 20
train_loader = dataloaders['train']
valid_loader = dataloaders['valid']
optimizer = optimizer_ft
model = model_ft

In [21]:
train_loss_arr = []
valid_loss_arr = []

for epoch in tqdm(range(epochs)):
    start = time.time()
    train_avg_loss = 0
    train_acc = 0
    model.train()
    for image, label in tqdm(train_loader):
        # ------- assign train data
        image = image.to(device)
        label = label.to(device)
        # ------- forward prop
        optimizer.zero_grad()
        output = model(image)
        # ------- backward prop
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        # ------- get train performance
        batch_acc = ((output.argmax(dim=1) == label).float().mean())
        train_acc += batch_acc / len(train_loader)
        train_avg_loss += loss / len(train_loader)
    train_loss_arr.append(train_avg_loss)
    print(f'Epoch : {epoch+1}/{epochs}, train_acc : {train_acc:.4f}, train_loss : {train_avg_loss:.4f}', end=' / ')

    model.eval()
    with torch.no_grad():
        valid_acc=0
        valid_avg_loss =0
        for image, label in valid_loader:
            # ------- assign valid data
            image = image.to(device)
            label = label.to(device)
            # ------- forward prop
            val_output = model(image)
            val_loss = criterion(val_output, label)
            # ------- get valid performance
            val_batch_acc = ((val_output.argmax(dim=1) == label).float().mean()) # acc = 맞춘 개수 / 배치사이즈
            valid_acc += val_batch_acc / len(valid_loader) # acc / total_Iteration 
            valid_avg_loss += val_loss / len(valid_loader) # val_loss / total_Iteration
        valid_loss_arr.append(valid_avg_loss) 
        print(f'valid_acc : {valid_acc:.4f}, val_loss : {valid_avg_loss:.4f}, takes {time.time() - start}secs')

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/239 [00:00<?, ?it/s]

Epoch : 1/20, train_acc : 0.5839, train_loss : 0.6940 / valid_acc : 0.5536, val_loss : 0.6846, takes 202.2200150489807secs


  0%|          | 0/239 [00:00<?, ?it/s]

Epoch : 2/20, train_acc : 0.6187, train_loss : 0.6605 / valid_acc : 0.4881, val_loss : 0.6995, takes 182.3018093109131secs


  0%|          | 0/239 [00:00<?, ?it/s]

Epoch : 3/20, train_acc : 0.6598, train_loss : 0.6307 / valid_acc : 0.5164, val_loss : 0.7119, takes 190.74477100372314secs


  0%|          | 0/239 [00:00<?, ?it/s]

Epoch : 4/20, train_acc : 0.6789, train_loss : 0.5970 / valid_acc : 0.5223, val_loss : 0.7470, takes 194.81108164787292secs


  0%|          | 0/239 [00:00<?, ?it/s]

Epoch : 5/20, train_acc : 0.7293, train_loss : 0.5367 / valid_acc : 0.5357, val_loss : 0.8081, takes 194.248685836792secs


  0%|          | 0/239 [00:00<?, ?it/s]

Epoch : 6/20, train_acc : 0.7762, train_loss : 0.4752 / valid_acc : 0.5655, val_loss : 0.8119, takes 189.55034518241882secs


  0%|          | 0/239 [00:00<?, ?it/s]

Epoch : 7/20, train_acc : 0.8203, train_loss : 0.4060 / valid_acc : 0.5342, val_loss : 0.8741, takes 189.3623652458191secs


  0%|          | 0/239 [00:00<?, ?it/s]

Epoch : 8/20, train_acc : 0.8342, train_loss : 0.3649 / valid_acc : 0.5491, val_loss : 0.8696, takes 185.96666479110718secs


  0%|          | 0/239 [00:00<?, ?it/s]

Epoch : 9/20, train_acc : 0.8774, train_loss : 0.2979 / 

KeyboardInterrupt: 

In [None]:
plt.plot(train_loss_arr, label='train')
plt.plot(valid_loss_arr, label='valid')
plt.legend()
plt.show()

In [35]:
torch.save(model_ft.state_dict(), 'resnet50.pt')

In [11]:
model = torch.load('Tcae_apply/gender_classification_model.pt')
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [23]:
from PIL import Image
from torch.autograd import Variable

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dic = {'0': 'male', '1' : 'female'}
img_array = np.load('mel_dataset5/안예은_가자.npy').squeeze(0)

transformation = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Lambda(lambda image: image.convert('RGB')),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

image_tensor = transformation(img_array).float()
image_tensor = image_tensor.unsqueeze_(0)

img = Variable(image_tensor).to(device)
output = model(img).to(device)
index = output.cpu().data.numpy().argmax()

dic[str(index)]

'male'