# 지역방언 음성 분류 AI

## 데이터 준비
- 압축 풀기
- 라벨 데이터 처리
- 라벨 별 오디오 파일 개수 확인

In [1]:
# Modules, Configs
from glob import glob


# Directory that holds the compressed archive files
ZIP_BASE_DIR = 'D:/ssafy_ai/한국인 대화 음성/'

# Path prefix that is prepended to each audio_path found in the label data
EXTRACT_BASE_DIR = './Training/data/remote/PROJECT/AI학습데이터/KoreanSpeech/data'

### 압축 풀기

In [None]:
# `os` is otherwise only imported in a much later cell; importing it here
# keeps this cell runnable when the notebook is executed top to bottom.
import os

train_zips = os.listdir(ZIP_BASE_DIR + 'Training')
valid_zips = os.listdir(ZIP_BASE_DIR + 'Validation')

# A real list (not a one-shot filter iterator) so it can be inspected or
# iterated more than once.
train_label_zips = [name for name in train_zips if '[라벨]' in name]

for train_label_zip in train_label_zips:
    # The audio archive name is derived from the label archive name by
    # swapping the '[라벨]' tag for '[원천]'.
    train_audio_zip = train_label_zip.replace('[라벨]', '[원천]')

    # Extraction is currently commented out. Re-enabling it also requires
    # `import tarfile`, which is not imported in the visible part of the file.
    # label_tar = tarfile.open(ZIP_BASE_DIR + 'Training/' + train_label_zip)
    # label_tar.extractall('./Training')
    # label_tar.close()

    # audio_tar = tarfile.open(ZIP_BASE_DIR + 'Training/' + train_audio_zip)
    # audio_tar.extractall('./Training')
    # audio_tar.close()

    # break

### 라벨 데이터 처리

In [246]:
GLOB_PATH = './Training/**/*.txt'


def parse_metadata_line(line):
    """Parse one '|'-separated metadata line.

    Args:
        line: A raw line read from a metadata text file.

    Returns:
        ``(audio_path, dialect)`` where ``audio_path`` is field 0 with trailing
        whitespace removed and ``dialect`` is field 6 as an int
        (1: Seoul/Gyeonggi, 2: Gangwon, 3: Chungcheong, 4: Gyeongsang,
        5: Jeolla, 6: Jeju, 9: other), or ``None`` when the line has fewer
        than seven fields or field 6 is not an integer.
    """
    fields = line.split('|')
    try:
        return fields[0].rstrip(), int(fields[6])
    except (IndexError, ValueError):
        # Malformed line: skipped on purpose, but only for these two expected
        # failure modes; anything else is allowed to surface.
        return None


file_list = glob(GLOB_PATH, recursive=True)
metadata_list = [path for path in file_list if 'metadata' in path]

labels = []

for metadata in metadata_list:
    with open(metadata, 'r', encoding='UTF-8') as f:
        # Iterate the file lazily instead of loading every line at once.
        for line in f:
            parsed = parse_metadata_line(line)
            if parsed is not None:
                labels.append(parsed)

In [247]:
# Count audio files per dialect code (1-6 are the dialect classes, 9 is "other").
label_counter = {code: 0 for code in (1, 2, 3, 4, 5, 6, 9)}
for _, label in labels:
    # .get() keeps an unexpected code from aborting the whole count with a
    # KeyError; such a code simply shows up as an extra key in the printout.
    label_counter[label] = label_counter.get(label, 0) + 1

# Size of the smallest dialect class. Codes 1-6 are selected by key, so the
# result does not depend on dict insertion order to leave out code 9.
MIN_AUDIO_LENGTH = min(label_counter[code] for code in range(1, 7))

print(label_counter)
print(MIN_AUDIO_LENGTH)

In [250]:
# Shuffle the label list in place (temporary)
from random import shuffle
shuffle(labels)

## 데이터 처리
- 데이터를 20개씩 라벨별로 분리
- Train, Test Dataset 분리

In [252]:
# Modules, Configs
from copy import deepcopy

# Label data is distributed evenly in groups of 20 per label
MAX_LENGTH = 20

In [253]:
# Split the audio paths into a list of chunks. Each chunk maps every dialect
# label (1-6) to a list holding at most MAX_LENGTH audio paths.
label_source = {i: [] for i in range(1, 7)}
label_dataset = [deepcopy(label_source) for _ in range(MIN_AUDIO_LENGTH // MAX_LENGTH)]
# Per label: index of the chunk that is currently being filled.
label_idxs = {i: 0 for i in range(1, 7)}
for audio_path, label in labels:
    # Label 9 and any other code outside 1-6 is not a class; skip it instead
    # of failing with a KeyError on the lookup below.
    if label not in label_idxs:
        continue
    idx = label_idxs[label]
    # Every chunk for this label is already full: drop the surplus sample.
    # (Explicit bounds check instead of swallowing IndexError with a bare except.)
    if idx >= len(label_dataset):
        continue
    chunk = label_dataset[idx][label]
    chunk.append(audio_path)
    if len(chunk) >= MAX_LENGTH:
        label_idxs[label] += 1


In [254]:
# Try training on an arbitrary subset: chunks 0-499 become the train split and
# chunks 500-999 the test split (the original note gives 10000 clips per label,
# Train : 60000, Test : 60000).
def _write_split(chunks, out_path):
    """Write one ``<audio_path>\\t<label>`` row per clip in ``chunks`` to ``out_path``.

    Args:
        chunks: Iterable of dicts mapping a label to a list of audio paths.
        out_path: File that receives the rows, joined by newlines (UTF-8).

    Returns:
        The list of rows that was written.
    """
    rows = [
        audio_path + '\t' + str(label)
        for chunk in chunks
        for label, audio_paths in chunk.items()
        for audio_path in audio_paths
    ]
    with open(out_path, 'w', encoding='utf8') as f:
        f.write('\n'.join(rows))
    return rows


# Slicing (rather than indexing 0..999) does not raise IndexError when fewer
# than 1000 chunks exist; the affected split is then simply smaller.

# Temporarily store audio path + label for training
temp_dataset = _write_split(label_dataset[:500], './audio_data_train')

# Temporarily store audio path + label for testing
temp_dataset = _write_split(label_dataset[500:1000], './audio_data_test')

In [503]:
# Collects the length of every clip, needed to decide on zero padding / cutting
# during preprocessing. This took very long locally, so it was run on the server.
audio_length = []
for audio_path, _ in labels:
    try:
        waveform, sample_rate = get_speech(EXTRACT_BASE_DIR + audio_path.replace('06.경제', '6.경제'))
    except Exception:
        # Best effort: skip clips that cannot be loaded. Unlike a bare
        # `except:`, this does not swallow KeyboardInterrupt, so the
        # long-running loop can still be interrupted.
        continue
    # Size of the last dimension of the loaded waveform.
    audio_length.append(waveform.size()[-1])

## 모델 학습
- PyTorch Dataset, DataLoader 정의
- MobileNet V2
- 모델 커스터마이징
- 학습, 테스트

In [None]:
# Modules, Configs, Utils
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import torchaudio.transforms as T

# !pip install SoundFile
import soundfile
import os
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import numpy as np

# Number of classes
NUM_CLASSES = 6
# NOTE(review): presumably switches torchaudio to its non-legacy soundfile
# interface — confirm against the installed torchaudio version.
torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False

# Load audio data
def get_speech(file_path):
    """Load the audio file at ``file_path`` through torchaudio's soundfile backend.

    Returns whatever ``torchaudio.backend.soundfile_backend.load`` returns;
    callers in this file unpack it as ``(waveform, sample_rate)``.
    """
    load_audio = torchaudio.backend.soundfile_backend.load
    loaded = load_audio(file_path)
    return loaded

n_fft = 512
win_length = 512
hop_length = 256
n_mels = 128

# MelSpectrogram transforms keyed by sample rate. Building the transform on
# every call is wasted work, so each one is created once and reused.
# Re-running this cell resets the cache, so edits to the constants above
# still take effect.
_mel_transforms = {}


# Mel spectrogram conversion
def make_melspectogram(audio_path):
    """Load an audio file and convert it to a mel spectrogram.

    Args:
        audio_path: Path passed on to ``get_speech``.

    Returns:
        A ``(True, melspec)`` tuple where ``melspec`` is the transform output
        converted to a NumPy array. The flag is always ``True``: loading or
        transform errors are not caught here and propagate to the caller.
    """
    waveform, sample_rate = get_speech(audio_path)

    mel_spectrogram = _mel_transforms.get(sample_rate)
    if mel_spectrogram is None:
        mel_spectrogram = T.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            center=True,
            pad_mode="reflect",
            power=2.0,
            norm="slaney",
            onesided=True,
            n_mels=n_mels,
            mel_scale="htk",
        )
        _mel_transforms[sample_rate] = mel_spectrogram

    melspec = mel_spectrogram(waveform)             # resulting spectrogram

    return True, melspec.numpy()

In [280]:
# Version check (the server uses 0.11.0 with CUDA)
torchaudio.__version__

'0.11.0+cpu'

### Dataset 정의

In [483]:
class DialectAudioDataset(Dataset):
    """Dataset of (mel spectrogram, dialect label) pairs listed in a label file.

    Every non-blank line of ``label_file`` holds an audio path and a label
    separated by a tab. Labels are stored 1-based in the file and returned
    0-based.

    Args:
        label_file: Path of the tab-separated label file (UTF-8).
        root_dir: Prefix that is concatenated in front of each audio path.
        transform: Optional callable applied to the spectrogram.
        target_transform: Optional callable applied to the label tensor.
        n_frames: Number of frames (last axis) every spectrogram is cropped
            or zero-padded to. Defaults to 20, the previously hard-coded crop.
    """

    def __init__(self, label_file, root_dir, transform=None, target_transform=None, n_frames=20):
        with open(label_file, 'r', encoding='UTF-8') as f:
            # Blank lines (e.g. a trailing newline) are skipped; they would
            # otherwise become entries without a label column.
            self.audio_datas = [line.rstrip().split('\t') for line in f if line.strip()]
        self.root_dir = root_dir
        self.transform = transform
        self.target_transform = target_transform
        self.n_frames = n_frames

    def __len__(self):
        return len(self.audio_datas)

    @staticmethod
    def _fit_frames(melspec, n_frames):
        """Crop or right-zero-pad the last axis of ``melspec`` to ``n_frames``."""
        melspec = melspec[..., :n_frames]
        missing = n_frames - melspec.shape[-1]
        if missing > 0:
            # Pad only the end of the last axis so every item has the same
            # shape and can be stacked into a batch.
            pad_width = [(0, 0)] * (melspec.ndim - 1) + [(0, missing)]
            melspec = np.pad(melspec, pad_width)
        return melspec

    def __getitem__(self, idx):
        audio_path = self.root_dir + self.audio_datas[idx][0]

        # The label data contains a wrong directory name; correct it here.
        audio_path = audio_path.replace('06.경제', '6.경제')

        # The mel spectrogram is computed at load time. The original note says
        # clips as short as about 25 frames were observed, hence the small
        # default frame count.
        audio = self._fit_frames(make_melspectogram(audio_path)[1], self.n_frames)

        # Labels start at 0
        label = torch.tensor([int(self.audio_datas[idx][1]) - 1], dtype=torch.long)

        # Hook for further padding / augmentation (the original note mentions
        # noise padding as still needed).
        if self.transform:
            audio = self.transform(audio)
        if self.target_transform:
            label = self.target_transform(label)

        return audio, label

In [484]:
# Load the data: build the train and test datasets from the saved label files
training_data = DialectAudioDataset('./audio_data_train', EXTRACT_BASE_DIR)
testing_data = DialectAudioDataset('./audio_data_test', EXTRACT_BASE_DIR)

### 데이터로더

In [485]:
# DataLoaders over both datasets: batch size 64, shuffle=True for each
train_loader = DataLoader(training_data, batch_size=64, shuffle=True)
test_loader = DataLoader(testing_data, batch_size=64, shuffle=True)

### MobileNet V2

In [487]:
# !pip install pillow
# Fetch the mobilenet_v2 model definition from torch.hub with pretrained=False
mobilenet = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=False)

Using cache found in C:\Users\sosin/.cache\torch\hub\pytorch_vision_v0.10.0


In [488]:
# Print the MobileNet structure for inspection
print(mobilenet)

MobileNetV2(
  (features): Sequential(
    (0): ConvBNActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=1e-05, momen

### 모델 커스터마이징

In [489]:
# Our data has 1 channel with a 128 x audio_length layout.
# MobileNet starts with in_channels=3 (typical for vision models: RGB),
# so a conv layer that takes 1 channel and expands it to 3 is prepended.

# MobileNet's stock classifier has 1000 outputs (the model was built for a
# 1000-class image classification competition).
# This is a 6-class model (capital area, Jeolla, Chungcheong, Jeju, Gyeongsang,
# Gangwon), so the classifier part is redefined.

class CustomModel(nn.Module):
    """Wrap a backbone's feature extractor for 1-channel input.

    Args:
        originalModel: Model exposing an iterable ``features`` attribute; all
            of its layers except the last one are reused.
        num_classes: Number of output classes of the new linear classifier.
    """

    def __init__(self, originalModel, num_classes):
        super().__init__()

        # Prepend the 1 -> 3 channel conv, reuse the backbone minus its last
        # feature layer, and define a fresh classifier.
        backbone_layers = list(originalModel.features)[:-1]
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=3, kernel_size=(1, 1)),
            *backbone_layers
        )
        self.classifier = nn.Linear(1280, num_classes)

    def forward(self, x):
        # Flatten per sample. The previous `view(-1, 320*4)` silently changed
        # the batch dimension whenever the feature map did not hold exactly
        # 1280 values per sample; now such an input fails loudly in the
        # classifier instead of mixing samples.
        x = torch.flatten(self.features(x), start_dim=1)
        return self.classifier(x)

In [490]:
# Build the model
model = CustomModel(mobilenet, NUM_CLASSES)

# Loss function: cross entropy loss
creterion = nn.CrossEntropyLoss()

# Optimizer: Adam
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)  # multiply the learning rate by 0.1 every 20 scheduler steps


### 모델 학습, 테스트

In [None]:
# Modules, Configs
import torch.nn.functional as F

# !pip install tqdm
from tqdm import tqdm

# Check which device the model will train on
# (CUDA on the GPU server)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [493]:
# Training function
def train(model, epoch, log_interval):
    """Run one training epoch over ``train_loader``.

    Uses the module-level ``train_loader``, ``device``, ``creterion``,
    ``optimizer``, ``pbar``, ``pbar_update`` and ``losses``.

    Args:
        model: Model to train.
        epoch: Epoch number, used only in the log line.
        log_interval: A log line is printed every ``log_interval`` batches.
    """
    # The batches are moved to `device` below, so the parameters must live
    # there as well; otherwise the forward pass fails on a CUDA machine.
    model.to(device)
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        # Flatten the targets to (batch,). Unlike squeeze(), this keeps the
        # batch dimension intact when a batch holds a single sample.
        target = target.to(device).reshape(-1)

        # apply transform and model on whole batch directly on device
        # data = transform(data)
        output = model(data)

        # cross entropy loss on (batch, NUM_CLASSES) scores and (batch,) class indices
        loss = creterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print training stats
        if batch_idx % log_interval == 0:
            print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}")

        # update progress bar
        pbar.update(pbar_update)
        # record loss
        losses.append(loss.item())

In [494]:
# Number of correct predictions
def number_of_correct(pred, target):
    """Count the positions where ``pred`` and ``target`` agree.

    Both tensors are flattened first so that ``(batch,)`` predictions are
    compared element-wise with ``(batch, 1)`` targets. Without this the
    comparison broadcasts to ``batch x batch`` and over-counts.

    Returns:
        The number of matching positions as a Python int.
    """
    return pred.reshape(-1).eq(target.reshape(-1)).sum().item()

# The predicted label is the position of the largest score in the computed result
def get_likely_index(tensor):
    """Return the index of the largest value along the last dimension."""
    best_index = torch.argmax(tensor, dim=-1)
    return best_index

def test(model, epoch):
    """Evaluate ``model`` on ``test_loader`` and print the accuracy.

    Uses the module-level ``test_loader``, ``device``, ``pbar`` and
    ``pbar_update``.

    Args:
        model: Model to evaluate.
        epoch: Epoch number, used only in the printed summary.
    """
    # Keep the parameters on the same device as the batches.
    model.to(device)
    model.eval()
    correct = 0
    # Evaluation needs no gradients; no_grad avoids building autograd graphs.
    with torch.no_grad():
        for data, target in test_loader:

            data = data.to(device)
            # Flatten to (batch,) so the comparison with the (batch,)
            # predictions is element-wise rather than broadcast.
            target = target.to(device).reshape(-1)

            # apply transform and model on whole batch directly on device
            # data = transform(data)
            output = model(data)

            pred = get_likely_index(output)
            correct += number_of_correct(pred, target)

            # update progress bar
            pbar.update(pbar_update)

    print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n")

In [497]:
# Print a training log line every `log_interval` batches; run `n_epoch` epochs.
log_interval = 20
n_epoch = 2

# Progress-bar increment per processed batch: one epoch (all train batches plus
# all test batches) advances the bar by exactly 1.
pbar_update = 1 / (len(train_loader) + len(test_loader))
# Per-batch training losses, appended to by train().
losses = []

with tqdm(total=n_epoch) as pbar:
    for epoch in range(1, n_epoch + 1):
        train(model, epoch, log_interval)
        test(model, epoch)
        # Advance the learning-rate schedule once per epoch.
        scheduler.step()

  0%|          | 0/2 [00:00<?, ?it/s]

torch.Size([64, 6])


  0%|          | 0.0005330490405117271/2 [00:02<3:02:53, 5488.21s/it]

torch.Size([64, 6])


  0%|          | 0.0010660980810234541/2 [00:05<3:02:52, 5489.19s/it]

torch.Size([64, 6])


  0%|          | 0.0015991471215351812/2 [00:08<2:57:03, 5315.80s/it]

torch.Size([64, 6])


  0%|          | 0.0021321961620469083/2 [00:11<2:55:02, 5256.71s/it]

torch.Size([64, 6])


  0%|          | 0.0026652452025586353/2 [00:14<2:53:00, 5197.00s/it]

torch.Size([64, 6])


  0%|          | 0.0031982942430703624/2 [00:16<2:54:39, 5247.92s/it]

torch.Size([64, 6])


  0%|          | 0.0037313432835820895/2 [00:19<2:55:22, 5271.00s/it]

torch.Size([64, 6])


  0%|          | 0.0042643923240938165/2 [00:22<2:55:18, 5270.59s/it]

torch.Size([64, 6])


  0%|          | 0.004797441364605543/2 [00:25<2:53:47, 5226.30s/it] 

torch.Size([64, 6])


  0%|          | 0.00533049040511727/2 [00:28<2:55:58, 5293.15s/it] 

torch.Size([64, 6])


  0%|          | 0.0058635394456289965/2 [00:31<2:58:17, 5364.63s/it]

torch.Size([64, 6])


  0%|          | 0.006396588486140723/2 [00:33<2:53:59, 5236.57s/it] 

torch.Size([64, 6])


  0%|          | 0.00692963752665245/2 [00:36<2:52:44, 5200.47s/it] 

torch.Size([64, 6])


  0%|          | 0.007462686567164176/2 [00:39<2:53:43, 5231.03s/it]

torch.Size([64, 6])


  0%|          | 0.007995735607675903/2 [00:42<2:51:54, 5178.16s/it]

torch.Size([64, 6])


  0%|          | 0.00852878464818763/2 [00:44<2:54:30, 5257.71s/it] 

torch.Size([64, 6])


  0%|          | 0.009061833688699356/2 [00:47<2:56:09, 5308.58s/it]

torch.Size([64, 6])


  0%|          | 0.009594882729211083/2 [00:50<2:56:22, 5316.68s/it]

torch.Size([64, 6])


  1%|          | 0.01012793176972281/2 [00:53<2:55:07, 5280.61s/it] 

torch.Size([64, 6])


  1%|          | 0.010660980810234536/2 [00:56<2:58:42, 5389.95s/it]

torch.Size([64, 6])


  1%|          | 0.011194029850746263/2 [00:59<3:00:47, 5454.35s/it]



  1%|          | 0.011194029850746263/2 [00:59<2:57:34, 5357.31s/it]


KeyboardInterrupt: 

## 추론