# process

1. 데이터 경로 정하기
2. base 디렉토리 설정
3. 모듈 임포트
4. 추론
  1. 모델 로드
  2. 모델을 메모리에 적재 (변수에 할당)
  3. 오디오 입력 방식 결정: **오디오 파일로 받을지(예상)**, request로 들어온 오디오 데이터를 그대로 사용할지
  4. 오디오 파일 경로
  5. 모델에 입력 -> 아웃풋을 get_likely_index에 입력 -> 우리가 원하는 라벨(텍스트) 값으로 변환하여 반환
  6. 클라이언트로 반환

In [14]:
import torch
import torchaudio
import torchaudio.transforms as T
import torch.nn as nn
import soundfile

import numpy as np

In [2]:
working_dir = '/home/team2/workspace'

# Server-side locations of the audio corpus.
ZIP_BASE_DIR = '/data/team2/audio/'
# Has no trailing slash: entries are appended to it by plain string
# concatenation, so they are presumably expected to start with '/' -- confirm.
EXTRACT_BASE_DIR = f'{ZIP_BASE_DIR}Training/data/remote/PROJECT/AI학습데이터/KoreanSpeech/data'

# NOTE(review): presumably the path layout of audio files received at serving
# time -- confirm: /media/{case_pk}/{audio_pk}.wav

In [3]:
# Number of samples every clip is fitted to before feature extraction.
audio_standard_length = 160000


def get_speech(file_path):
    """Load an audio file and fit it into a fixed-length, single-channel buffer.

    Clips shorter than ``audio_standard_length`` samples are centred in a
    zero-filled buffer. Longer clips are centre-cropped to that length
    (previously they raised a shape-mismatch error). Multi-channel audio is
    averaged down to one channel (previously it also failed on assignment).

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A ``(result, sample_rate)`` tuple where ``result`` is a tensor of shape
        ``(1, audio_standard_length)`` and ``sample_rate`` is the rate reported
        by the loader.
    """
    waveform, sample_rate = torchaudio.backend.soundfile_backend.load(file_path)

    # The output buffer has a single row, so collapse extra channels first.
    # Dimension 0 is treated as channels and dimension 1 as time, matching the
    # use of size(1) as the clip length below.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    length = waveform.size(1)
    if length > audio_standard_length:
        # Keep the middle of the clip, mirroring the centred padding below.
        start = (length - audio_standard_length) // 2
        waveform = waveform[:, start:start + audio_standard_length]
        length = audio_standard_length

    result = torch.zeros((1, audio_standard_length))
    idx = (audio_standard_length - length) // 2  # left padding; 0 when cropped

    result[0, idx:idx + length] = waveform[0]

    return result, sample_rate

def make_melspectogram(file_path):
    """Compute the mel spectrogram of the fixed-length waveform for a file.

    The waveform and sample rate come from ``get_speech(file_path)``; the
    transform is built with the sample rate that call returns.

    Args:
        file_path: Path of the audio file, forwarded to ``get_speech``.

    Returns:
        The output of ``T.MelSpectrogram`` applied to the waveform
        (presumably shaped (1, n_mels, frames) -- confirm).
    """
    padded_waveform, rate = get_speech(file_path)

    # Spectrogram settings gathered in one place.
    settings = {
        "n_fft": 512,
        "win_length": 512,
        "hop_length": 256,
        "center": True,
        "pad_mode": "reflect",
        "power": 2.0,
        "norm": "slaney",
        "onesided": True,
        "n_mels": 128,
        "mel_scale": "htk",
    }
    to_mel = T.MelSpectrogram(sample_rate=rate, **settings)

    # Resulting spectrogram
    return to_mel(padded_waveform)

In [4]:
# 추론 확인용 데이터
# Sample inputs for checking inference: the first tab-separated field of each
# of the first four lines of the listing file.
inference_data = []
with open('../audio_data_test', 'r') as listing:
    inference_data = [row.split('\t')[0] for row in listing.readlines()[:4]]

In [22]:
# One mel spectrogram per listed file; each entry is appended to
# EXTRACT_BASE_DIR by plain string concatenation.
mel_specs = tuple(
    make_melspectogram(EXTRACT_BASE_DIR + relative_path)
    for relative_path in inference_data
)

# Stack the per-file spectrograms along a new leading (batch) dimension.
mel_specs_dataset = torch.stack(mel_specs)





## MobileNet V2

In [12]:
# Number of output classes passed to the classifier head.
NUM_CLASSES = 6

In [15]:
class CustomMobilenetV2(nn.Module):
    """MobileNetV2 feature extractor adapted to single-channel input.

    A 1x1 convolution expands the single input channel to the three channels
    the MobileNetV2 stem expects. The backbone's final feature block is
    dropped, and a single linear layer maps the flattened feature map to
    ``num_classes`` scores.

    Attribute names (``features``, ``classifier``) are kept as-is so existing
    state dicts keep loading.

    Args:
        num_classes: Number of output scores produced per sample.
    """

    # Flattened size of the feature map fed to the classifier (25600, the same
    # value as the previous ``1280*20`` / ``320*4*20`` literals).
    # NOTE(review): presumably 320 channels x 4 x 20 spatial positions for the
    # spectrogram size used in this file -- confirm.
    _FLAT_FEATURES = 320 * 4 * 20

    def __init__(self, num_classes):
        super().__init__()
        # Randomly initialised backbone; weights are expected to be loaded
        # separately (pretrained=False).
        mobilenet = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=False)

        self.features = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=3, kernel_size=(1, 1)),
            *list(mobilenet.features)[:-1])
        self.classifier = nn.Linear(self._FLAT_FEATURES, num_classes)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``.

        Raises:
            RuntimeError: If the flattened feature size does not match the
                classifier's input size.
        """
        # Flatten per sample. The former ``view(-1, N)`` silently folded extra
        # feature elements into the batch dimension when the input size
        # differed, returning the wrong number of predictions instead of
        # failing.
        x = torch.flatten(self.features(x), start_dim=1)
        return self.classifier(x)

In [16]:
# Build the network with NUM_CLASSES outputs; the backbone is created with
# pretrained=False, so weights must be loaded before inference.
model = CustomMobilenetV2(NUM_CLASSES)

Using cache found in /home/team2/.cache/torch/hub/pytorch_vision_v0.10.0


In [18]:
def load_model(model, path='../model_state_dict.pt', map_location='cpu'):
    """Load a saved state dict into ``model`` in place.

    Args:
        model: Module whose parameters are overwritten.
        path: Checkpoint file to read. Defaults to the location used
            previously, so existing ``load_model(model)`` calls are unchanged.
        map_location: Forwarded to ``torch.load``. Defaults to ``'cpu'`` so a
            checkpoint saved from a GPU still loads on a host without CUDA;
            ``load_state_dict`` copies values into the model's own parameters
            regardless of where the loaded tensors live.

    Returns:
        None.
    """
    # SECURITY: torch.load unpickles the file -- only load trusted checkpoints.
    state_dict = torch.load(path, map_location=map_location)
    model.load_state_dict(state_dict)

In [19]:
# Overwrite the freshly built model's parameters with the saved state dict.
load_model(model)

In [None]:
# Put the module in evaluation mode (nn.Module.eval) before running inference.
model.eval()

In [23]:
# Inference only: run the forward pass without autograd so no graph is built
# and the result carries no grad_fn.
with torch.no_grad():
    output = model(mel_specs_dataset)
print(output)

tensor([[1.4972, 0.9863, 1.3125, 0.7184, 1.8279, 0.6440],
        [1.5952, 1.0205, 1.1982, 0.8567, 1.9987, 0.6770],
        [1.6216, 0.8704, 1.1452, 1.1122, 2.1213, 0.3271],
        [1.5131, 0.9181, 1.1979, 0.8749, 2.1032, 0.7557]],
       grad_fn=<AddmmBackward0>)


In [24]:
# The position of the largest score in each computed result row (e.g. 1x6)
# is the predicted label.
def get_likely_index(tensor):
    """Return the index of the maximum value along the last dimension."""
    return torch.argmax(tensor, dim=-1)

In [26]:
# Predicted class index for each row of the model output.
result_label = get_likely_index(output)
print(result_label)

tensor([4, 4, 4, 4])


In [28]:
# Class index -> region label text returned to the caller.
label_dict = {
    0: '서울,경기',
    1: '강원',
    2: '충청',
    3: '경상',
    4: '전라',
    5: '제주',
}

# Translate every predicted index into its label text.
result = [label_dict[class_index] for class_index in result_label.tolist()]
print(result)

['전라', '전라', '전라', '전라']
