# 데이터 전처리 과정

ESC 50과 VISC 데이터셋을 사용하여, [Indoor, Outdoor, In_Vehicle, Unknown]으로 정리

폴더 구조
```bash
.
|-ESC-50-master/
|    |-meta/
|        |-esc50.csv
|    |-audio/
|        |-audio files
|-custom_dataset/
|    |-indoor
|    |-outdoor
|    |-in_vehicle
|    |-unknown
|- prep_data.ipynb
```

## ESC 50

https://github.com/karolpiczak/ESC-50

ESC 50은 50개의 카테고리로 .csv파일에 레이블링 되어있습니다. 아래 코드는 원래의 50개의 레이블을 사용할 4개의 분류로 재분류하는 코드입니다.

ESC-50-master 폴더 내에 저장된 음성파일들을 분류에 따라 custom_dataset 폴더에 저장합니다.

In [None]:
import csv
import shutil
import os
import torchaudio
import torch

: 

In [None]:
# Names of the four classes used by the custom dataset.
target = ["indoor", "outdoor", "in_vehicle", "unknown"]

# The lists below sort every fine-grained ESC-50 category into one of the
# custom dataset's classes.
src_indoor = [
    "clock_tick",
    "keyboard_typing",
    "mouse_click",
    "washing_machine",
    "vacuum_cleaner",
    "door_wood_knock",
    "door_wood_creaks",
    "can_opening",
    "glass_breaking",
    "clock_alarm",
    "toilet_flush",
    "brushing_teeth",
    "snoring",
    "drinking_sipping",
    "coughing",
    "sneezing",
    "crying_baby",
    "laughing",
    "clapping",
    "breathing",
]
src_outdoor = [
    "chirping_birds",
    "crickets",
    "wind",
    "sea_waves",
    "rain",
    "thunderstorm",
    "helicopter",
    "chainsaw",
    "siren",
    "car_horn",
    "church_bells",
    "fireworks",
    "hand_saw",
]
src_vehicle = [
    "engine",
    "airplane",
    "train",
]
src_unknown = [
    "water_drops",
    "pouring_water",
    "footsteps",
    "crackling_fire",
    "dog",
    "pig",
    "cow",
    "frog",
    "cat",
    "hen",
    "insects",
    "sheep",
    "crow",
    "rooster",
]

# Lookup table (ESC-50 category -> custom class); starts empty here.
mapping = dict()

In [None]:
# Fill the lookup table with (ESC-50 category -> custom class) entries,
# one group of categories at a time.
for custom_label, esc_categories in (
    ('indoor', src_indoor),
    ('outdoor', src_outdoor),
    ('in_vehicle', src_vehicle),
    ('unknown', src_unknown),
):
    mapping.update(dict.fromkeys(esc_categories, custom_label))

In [None]:
# Input locations inside the ESC-50 checkout: the audio folder and the
# metadata CSV that labels each audio file.
SOURCE_FOLDER = "./ESC-50-master/audio"
CSV_FILE_PATH = "./ESC-50-master/meta/esc50.csv"

# Output folder for each custom class, keyed by class name.
DESTINATION_FOLDER_DICT = {
    label: f"./custom_dataset/{label}"
    for label in ("indoor", "outdoor", "in_vehicle", "unknown")
}

In [None]:
def move_esc50_files(csv_path, source_folder, destination_folders, category_mapping):
    """Move the audio files listed in a metadata CSV into per-class folders.

    Args:
        csv_path: Path of the metadata CSV. Its header must contain the
            ``filename`` and ``category`` columns.
        source_folder: Folder that holds the audio files named in the CSV.
        destination_folders: Dict mapping a custom class name to the folder
            its files are moved into. Missing folders are created.
        category_mapping: Dict mapping a CSV category to a custom class name.

    Returns:
        The number of files that were actually moved.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    moved_count = 0

    with open(csv_path, mode='r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)

        # A missing column is a property of the header, so check it once
        # instead of failing (and then using undefined values) on every row.
        if not {'filename', 'category'}.issubset(reader.fieldnames or ()):
            print("there is no filename and category in csv file")
            return moved_count

        # shutil.move does not create missing parent folders.
        for folder in destination_folders.values():
            os.makedirs(folder, exist_ok=True)

        for row in reader:
            filename = row['filename']
            category = row['category']

            # A short row yields None values, and a category may be absent
            # from the mapping; skip such rows instead of aborting the run.
            destination_folder = destination_folders.get(category_mapping.get(category))
            if not filename or destination_folder is None:
                print(f"No destination for row (filename={filename!r}, category={category!r}). Skipping.")
                continue

            source_path = os.path.join(source_folder, filename)
            destination_path = os.path.join(destination_folder, filename)

            try:
                shutil.move(source_path, destination_path)
            except shutil.Error as e:
                # shutil.Error subclasses OSError, so it must be caught first.
                print(f"Error moving {filename} (e.g., file already exists): {e}. Skipping.")
            except OSError as e:
                print(f"IOError (e.g., permissions) moving {filename}: {e}. Skipping.")
            else:
                print(f"moved: {filename} -> {destination_folder}")
                moved_count += 1

    return moved_count


# Sort the ESC-50 audio files into the custom dataset folders.
try:
    move_esc50_files(CSV_FILE_PATH, SOURCE_FOLDER, DESTINATION_FOLDER_DICT, mapping)
except FileNotFoundError:
    print(f"FATAL ERROR: The CSV file was not found at: {CSV_FILE_PATH}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

## VISC

https://zenodo.org/records/5606504

VISC 데이터셋은 전부 차량 내부의 소음을 녹음한 데이터셋입니다. ESC50과는 다르게 48kHz로 샘플링되어 있기 때문에 16kHz로 리샘플링을 거친 뒤, 전부 in_vehicle 폴더에 저장합니다.

In [None]:
# Sample rate (Hz) that every VISC clip is saved at.
TARGET_SAMPLE_RATE = 16_000
# Folder holding the raw VISC recordings.
VISC_DATASET_FOLDER = "./VISC_Dataset_SON/VISC Dataset SON"
# Folder the converted clips are written to.
TARGET_FOLDER = "./custom_dataset/in_vehicle"

In [None]:
# Resample every VISC recording to TARGET_SAMPLE_RATE and store it in
# TARGET_FOLDER under its original file name.

# Make sure the output folder exists before the first save.
os.makedirs(TARGET_FOLDER, exist_ok=True)

# One Resample transform per source sample rate, so it is built once and
# reused instead of being rebuilt for every file.
resamplers = {}

for filename in os.listdir(VISC_DATASET_FOLDER):
    source_path = os.path.join(VISC_DATASET_FOLDER, filename)
    target_path = os.path.join(TARGET_FOLDER, filename)

    try:
        waveform, sr = torchaudio.load(source_path)

        if sr != TARGET_SAMPLE_RATE:
            if sr not in resamplers:
                resamplers[sr] = torchaudio.transforms.Resample(
                    orig_freq=sr, new_freq=TARGET_SAMPLE_RATE
                )
            waveform = resamplers[sr](waveform)

        torchaudio.save(target_path, waveform, TARGET_SAMPLE_RATE)

    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"ERROR on processing {filename}: {e}")
    else:
        # Report success only when loading, resampling and saving all worked.
        print(f"Processed: {filename}")