In [1]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from datetime import datetime 

In [2]:
import keras
from keras.layers import Dense, Conv2D, SeparableConv2D, Convolution2D, AveragePooling2D
from keras.layers import MaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling2D, Activation, Dropout, BatchNormalization, Flatten, Input
from keras.models import Model, Sequential
from keras.applications.mobilenet_v2 import MobileNetV2

from keras.callbacks import ModelCheckpoint 

In [4]:
# Read the feature (X) and label (y) arrays for both splits from .npy files
# located in the current working directory.
X_train, X_test = np.load("./X_train_44100.npy"), np.load("./X_test_44100.npy")
y_train, y_test = np.load("./y_train_44100.npy"), np.load("./y_test_44100.npy")

In [5]:
# Display the shapes of the training features and labels as a sanity check.
X_train.shape, y_train.shape

((80345, 128, 128, 1), (80345,))

In [6]:
def samplewise_normalize_audio_X(X):
    """Rescale every sample of ``X`` in place; nothing is returned.

    Each ``X[i]`` is first shifted so that its minimum becomes 0 and is then
    divided by ``max(|X[i]|) + 1.0``.
    """
    for idx in range(len(X)):
        lowest = np.min(X[idx])
        X[idx] -= lowest
        # The added 1.0 keeps the divisor at least 1, even for a constant sample.
        divisor = np.max(np.abs(X[idx])) + 1.0
        X[idx] /= divisor

In [7]:
from keras.utils import to_categorical

def load_audio_datafiles(X_or_XX_file, y_file, normalize, num_classes=None):
    """One-hot encode the labels and optionally normalise the features.

    Despite the ``*_file`` parameter names, the arrays themselves are
    expected here (``.ndim`` is read from ``X_or_XX_file``), not file paths.

    Parameters
    ----------
    X_or_XX_file : numpy.ndarray
        Feature array. A 5-D array is treated as a collection of feature
        sets ("XX") and each set is normalised separately; any other rank is
        treated as a single feature set ("X").
    y_file : array-like of int
        Class indices, forwarded to ``keras.utils.to_categorical``.
    normalize : bool
        When truthy, every sample is normalised IN PLACE via
        ``samplewise_normalize_audio_X`` -- the caller's array is modified.
    num_classes : int, optional
        Width of the one-hot encoding. With the default ``None`` the width
        is inferred from the largest label in ``y_file``, which can differ
        between splits if one of them lacks the highest class; pass the true
        class count to get consistent shapes.

    Returns
    -------
    tuple
        ``(X_or_XX_file, y_one_hot)``. The first element is the same object
        that was passed in (normalised in place when requested).
    """
    y = to_categorical(y_file, num_classes=num_classes)
    if normalize:
        print(' normalize samplewise')
        if X_or_XX_file.ndim == 5:
            for X in X_or_XX_file:  # XX: normalise each feature set in turn
                samplewise_normalize_audio_X(X)
        else:
            samplewise_normalize_audio_X(X_or_XX_file)  # plain X
    return X_or_XX_file, y

In [6]:
# 3. Load all dataset & normalize
# load_audio_datafiles() normalises X in place and one-hot encodes y, so
# running it a second time on the same variables would re-scale the features
# and re-encode the already encoded labels. Only process a split whose labels
# are still a 1-D vector of class indices, which makes this cell safe to re-run.
if y_train.ndim == 1:
    X_train, y_train = load_audio_datafiles(X_train, y_train, normalize=True)
if y_test.ndim == 1:
    X_test, y_test = load_audio_datafiles(X_test, y_test, normalize=True)
print('Loaded train:test = {}:{} samples.'.format(len(X_train), len(X_test)))

 normalize samplewise
 normalize samplewise
Loaded train:test = 80345:34435 samples.


In [15]:
# MobileNetV2 backbone (alpha=0.35) with no pretrained weights, taking
# 128x128 single-channel inputs and omitting the stock classification top.
base_model = MobileNetV2(weights=None, input_shape=(128,128,1), include_top=False, alpha=0.35)

# Open question: weights trained to classify what kind of sound it is
# (feature extraction) vs. features suited to classifying mood?
# Optional: load weights from an existing checkpoint, matching layers by name.
# base_model.load_weights("./mobilenetv2_fsd2018_41cls.h5", by_name=True, skip_mismatch=True)

# Optional: freeze the backbone so that only the new head is trained.
# for layer in base_model.layers:
    # layer.trainable=False

# Classification head: global average pooling -> Dense(1024, relu) -> 5-way softmax.
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(5, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
model.compile(loss="categorical_crossentropy",
              optimizer=keras.optimizers.Adam(learning_rate=0.0001),
              metrics=["accuracy"])


model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 128, 128, 1) 0                                            
__________________________________________________________________________________________________
Conv1_pad (ZeroPadding2D)       (None, 129, 129, 1)  0           input_2[0][0]                    
__________________________________________________________________________________________________
Conv1 (Conv2D)                  (None, 64, 64, 16)   144         Conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_Conv1 (BatchNormalization)   (None, 64, 64, 16)   64          Conv1[0][0]                      
_______________________________________________________________________________________

In [11]:
# Training hyperparameters: number of epochs and mini-batch size.
num_epochs, num_batch_size = 10, 32

In [None]:
# Timestamp taken just before training, used to report the elapsed time.
start = datetime.now()

filename = 'checkpoint-epoch-{}-batch-{}-trial-001.h5'.format(num_epochs, num_batch_size)

# Checkpoint callback: writes the model to `filename` whenever the monitored
# validation loss improves on the best value seen so far.
checkpoint = ModelCheckpoint(
    filename,             # output file name
    monitor='val_loss',   # quantity that decides whether a save happens
    verbose=1,            # log each save
    save_best_only=True,  # keep only the best model
    mode='auto',          # let Keras pick min/max for the monitored quantity
)

history = model.fit(
    X_train,
    y_train,
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=[checkpoint],
)

duration = datetime.now() - start
print("Training completed in time: ", duration)

# Save the model as it stands after the last epoch (separate from the checkpoint file).
model.save("./mobilenetv2_youtube_5cls.h5")
print("save model!")

Epoch 1/50
   4/1256 [..............................] - ETA: 18:55 - loss: 1.6767 - accuracy: 0.1836

# Using torch

In [2]:
import torch

# Device setup: use CUDA when PyTorch reports it as available, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1650 Ti


In [3]:
# NOTE: X_train / X_test / y_train / y_test must already exist in this kernel;
# the recorded run of this cell raised NameError because they did not.
#
# torch.as_tensor reuses the NumPy buffer where possible instead of copying it
# (torch.tensor always copies), and returns an existing tensor unchanged, so
# this cell does not duplicate the arrays in memory and can be re-run safely.
# The resulting tensors share memory with the source arrays.
X_train = torch.as_tensor(X_train)
X_test = torch.as_tensor(X_test)
y_train = torch.as_tensor(y_train)
y_test = torch.as_tensor(y_test)

NameError: name 'X_train' is not defined

In [4]:
import torchvision.models as models

In [11]:
import torch

# Repeats the device selection done at the start of the torch section:
# fall back to the CPU unless PyTorch reports CUDA as available.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1650 Ti


In [6]:
# Instantiate torchvision's MobileNetV2 without loading pretrained weights,
# then print its layer structure.
model = models.mobilenet_v2(
    pretrained=False,
    progress=True,
)
print(model)

MobileNetV2(
  (features): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=Tr

In [14]:
# Swap the second classifier module for a 5-output linear layer that keeps
# the input width of the layer it replaces, then show the resulting head.
model.classifier[1] = torch.nn.Linear(
    in_features=model.classifier[1].in_features,
    out_features=5,
)
print(model.classifier)

Sequential(
  (0): Dropout(p=0.2, inplace=False)
  (1): Linear(in_features=1280, out_features=5, bias=True)
)


In [15]:
# Print the full model again, now that the classifier layer has been replaced.
print(model)

MobileNetV2(
  (features): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=Tr