In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import random
import torch
import torchaudio
from glob import glob
from torch.utils.data import DataLoader, Dataset, ConcatDataset

In [3]:
class SingerDataset(Dataset):
    def __init__(self, folder, class_to_idx):
        self.fpaths = glob(folder + '/*.wav')    
        random.seed(42)
        random.shuffle(self.fpaths)
        self.labels = [class_to_idx[fpath.split('/')[-1].split('_')[-2]] for fpath in self.fpaths]
        
    def __len__(self): 
        return len(self.fpaths)

    def __getitem__(self, idx):
        vocal = self.fpaths[idx]
        label = self.labels[idx]
        return vocal, label

In [4]:
folder = '/content/drive/MyDrive/Colab Notebooks/D&A_Conference/Vocal_data'

paths = glob(folder + '/*/')
labels = [path.split('/')[-2] for path in paths]

In [5]:
class_to_idx = {}

for label, idx in zip(labels, range(1, 150+1)):
    class_to_idx[label] = idx

In [6]:
# class_to_idx = {'SINGER_01': 0, 'SINGER_02': 1, 'SINGER_03': 2, 'SINGER_04': 3, 'SINGER_05': 4, 'SINGER_06': 5, 'SINGER_07': 6, 'SINGER_08': 7,
#                 'SINGER_09': 8, 'SINGER_10': 9, 'SINGER_11': 10, 'SINGER_12': 11, 'SINGER_13': 12, 'SINGER_14': 13, 'SINGER_15': 14, 'SINGER_16': 15,
#                 'SINGER_17': 16, 'SINGER_18': 17, 'SINGER_19': 18, 'SINGER_20': 19, 'SINGER_21': 20, 'SINGER_22': 21, 'SINGER_23': 22, 'SINGER_24': 23,
#                 'SINGER_25': 24, 'SINGER_26': 25, 'SINGER_27': 26, 'SINGER_28': 27, 'SINGER_29': 28, 'SINGER_30': 29, 'SINGER_31': 30, 'SINGER_32': 31,
#                 'SINGER_33': 32, 'SINGER_34': 33, 'SINGER_35': 34, 'SINGER_36': 35, 'SINGER_37': 36, 'SINGER_38': 37, 'SINGER_39': 38, 'SINGER_40': 39,
#                 'SINGER_41': 40, 'SINGER_42': 41, 'SINGER_43': 42, 'SINGER_44': 43, 'SINGER_45': 44, 'SINGER_46': 45, 'SINGER_47': 46, 'SINGER_48': 47,
#                 'SINGER_49': 48, 'SINGER_50': 49, 'SINGER_51': 50, 'SINGER_52': 51, 'SINGER_53': 52, 'SINGER_54': 53, 'SINGER_55': 54, 'SINGER_56': 55,
#                 'SINGER_57': 56, 'SINGER_58': 57, 'SINGER_59': 58, 'SINGER_60': 59, 'SINGER_61': 60, 'SINGER_62': 61, 'SINGER_63': 62, 'SINGER_64': 63,
#                 'SINGER_65': 64, 'SINGER_66': 65, 'SINGER_67': 66, 'SINGER_68': 67, 'SINGER_69': 68, 'SINGER_70': 69, 'SINGER_71': 70, 'SINGER_72': 71,
#                 'SINGER_73': 72, 'SINGER_74': 73, 'SINGER_75': 74, 'SINGER_76': 75, 'SINGER_77': 76, 'SINGER_78': 77, 'SINGER_79': 78, 'SINGER_80': 79,
#                 'SINGER_81': 80, 'SINGER_82': 81, 'SINGER_83': 82, 'SINGER_84': 83, 'SINGER_85': 84, 'SINGER_86': 85, 'SINGER_87': 86, 'SINGER_88': 87,
#                 'SINGER_89': 88, 'SINGER_90': 89, 'SINGER_91': 90, 'SINGER_92': 91}

class_names = tuple(class_to_idx.keys())

In [7]:
# train_dir = '/content/drive/MyDrive/Colab Notebooks/D&A_Conference/Guide Vocal Data/data/1.Training'
# valid_dir = '/content/drive/MyDrive/Colab Notebooks/D&A_Conference/Guide Vocal Data/data/2.Validation'
dir = '/content/drive/MyDrive/Colab Notebooks/D&A_Conference/Final_dataset/'

In [8]:
# train_dataset = SingerDataset(train_dir, class_to_idx)
# valid_dataset = SingerDataset(valid_dir, class_to_idx)
dataset = SingerDataset(dir, class_to_idx)

In [9]:
dataset

<__main__.SingerDataset at 0x7fe7dca392d0>

In [10]:
# len(train_dataset), len(valid_dataset)
len(dataset)

7633

In [11]:
# train_data = []
# train_label = []

# valid_data = []
# valid_label = []

data = []
label = []

In [12]:
# for i in range(len(train_dataset)):
#     train_data.append(train_dataset[i][0])
#     train_label.append(train_dataset[i][1])

# for i in range(len(valid_dataset)):
#     valid_data.append(valid_dataset[i][0])
#     valid_label.append(valid_dataset[i][1])

for i in range(len(dataset)):
    data.append(dataset[i][0])
    label.append(dataset[i][1])

In [13]:
import pandas as pd

# train = pd.DataFrame([train_data, train_label]).T
# valid = pd.DataFrame([valid_data, valid_label]).T
train = pd.DataFrame([data, label]).T
train.columns = ['vocal', 'label']

In [14]:
train['label'].value_counts()
# zion.T, 선미

124    85
134    85
137    79
114    78
102    78
       ..
65     30
71     30
107    30
139    28
128    26
Name: label, Length: 150, dtype: int64

In [15]:
class_to_idx

{'정승환': 1,
 '홍대광': 2,
 '태진아': 3,
 '아이유': 4,
 '이진아': 5,
 '린': 6,
 '치즈': 7,
 '에릭남': 8,
 '에일리': 9,
 '조용필': 10,
 '김종국': 11,
 '김동률': 12,
 '카더가든': 13,
 '나윤권': 14,
 '서문탁': 15,
 '소찬휘': 16,
 '윤상': 17,
 '허각': 18,
 '이은미': 19,
 '나비': 20,
 '케이윌': 21,
 '규현': 22,
 '김연우': 23,
 '백아연': 24,
 '김조한': 25,
 '왁스': 26,
 '남진': 27,
 '더원': 28,
 '김완선': 29,
 '루시드폴': 30,
 '김나영': 31,
 '바비 킴': 32,
 '김연자': 33,
 '신해철': 34,
 '박완규': 35,
 '박정현': 36,
 '박진영': 37,
 '백예린': 38,
 '보아': 39,
 '샘김': 40,
 '선우정아': 41,
 '성시경': 42,
 '스텔라장': 43,
 '케이시': 44,
 'JUNIEL': 45,
 '윤도현': 46,
 '싸이': 47,
 '아이비': 48,
 '알리': 49,
 '윤종신': 50,
 '거미': 51,
 '이문세': 52,
 '임재범': 53,
 '이선희': 54,
 '스탠딩 에그': 55,
 '송가인': 56,
 '정인': 57,
 '이정': 58,
 '이석훈': 59,
 'XIA (준수)': 60,
 '박화요비': 61,
 '잔나비': 62,
 

In [16]:
from sklearn.model_selection import train_test_split

train_data, valid_data, train_label, valid_label = train_test_split(train['vocal'], train['label'], 
                                                                    test_size=0.3, random_state=42,
                                                                    stratify=train['label'])

In [17]:
len(train_data), len(valid_data), len(train_label), len(valid_label)

(5343, 2290, 5343, 2290)

In [18]:
train = pd.DataFrame([train_data, train_label]).T.reset_index(drop=True)
valid = pd.DataFrame([valid_data, valid_label]).T.reset_index(drop=True)

In [25]:
train['vocal'][0]

'/content/drive/MyDrive/Colab Notebooks/D&A_Conference/Final_dataset/이정_빈자리.wav'

In [20]:
train.to_csv('/content/drive/MyDrive/Colab Notebooks/D&A_Conference/Files/train.csv', index=False)
valid.to_csv('/content/drive/MyDrive/Colab Notebooks/D&A_Conference/Files/valid.csv', index=False)

In [None]:
cd /content/drive/MyDrive/Colab Notebooks/D&A_Conference/

In [None]:
# import shutil
# import os

# file_source = glob('Final_datset/*.wav')
# file_destination = 'Final_dataset/train/'

# for source in file_source:
#     get_files = os.listdir(source)
#     if train['vocal']
#     for g in get_files:
#         shutil.move(source+g, file_destination)

In [22]:
len(glob('/content/drive/MyDrive/Colab Notebooks/D&A_Conference/Final_dataset/*'))

7635