In [14]:
import os
import shutil
import random
# Seed Python's global RNG so the random.sample / random.random draws used below repeat across runs.
random.seed(42)

def clear_folder(folder_path):
    """Make sure ``folder_path`` exists and contains nothing.

    The directory is created when it is missing. Every entry directly inside
    it is then removed: regular files and symbolic links are unlinked, real
    directories are deleted recursively. A failure on one entry is printed
    and does not stop the remaining entries from being processed.

    Args:
        folder_path: Path of the directory to create and/or empty.
    """
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    entry_paths = [os.path.join(folder_path, name) for name in os.listdir(folder_path)]
    for file_path in entry_paths:
        try:
            if os.path.islink(file_path) or os.path.isfile(file_path):
                # File or symlink (even one pointing at a directory): remove the entry itself.
                os.unlink(file_path)
                continue
            if os.path.isdir(file_path):
                # Real directory: remove it together with everything below it.
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')



# Randomly copy a fraction (1/10 by default) of the data as a mini dataset; background noise is handled separately
def copy_random_files(src_dir, dst_dir, ratio=0.1, exclude=("_background_noise_",)):
    """Copy a random fraction of the files of each class folder into ``dst_dir``.

    ``dst_dir`` is first emptied (or created) with ``clear_folder``. Then, for
    every immediate sub-directory of ``src_dir`` whose name is not listed in
    ``exclude``, a same-named folder is created under ``dst_dir`` and
    ``int(n * ratio)`` of its ``n`` files, picked with ``random.sample``, are
    copied into it.

    Args:
        src_dir: Directory whose immediate sub-directories hold the files.
        dst_dir: Destination directory; its previous content is deleted.
        ratio: Fraction of each folder's files to copy (``1`` copies all).
        exclude: Names of sub-directories to skip. Defaults to the
            ``_background_noise_`` folder, which this function never copied.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``ratio`` yields a
            sample size that is negative or larger than the number of files.
    """
    clear_folder(dst_dir)

    # A bare string would otherwise be split into single characters by set().
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped = set(exclude)

    # Only the immediate children of src_dir are class folders; walking the
    # whole tree would resolve deeper directories against src_dir and fail.
    # sorted(): os.listdir order is arbitrary, so sorting is what makes a
    # seeded RNG pick the same files on every machine.
    for folder in sorted(os.listdir(src_dir)):
        src_folder_path = os.path.join(src_dir, folder)
        if folder in skipped or not os.path.isdir(src_folder_path):
            continue

        dst_folder_path = os.path.join(dst_dir, folder)
        os.makedirs(dst_folder_path, exist_ok=True)

        # Sample among regular files only; shutil.copy cannot copy directories.
        sub_files = sorted(
            name for name in os.listdir(src_folder_path)
            if os.path.isfile(os.path.join(src_folder_path, name))
        )
        files_to_copy = random.sample(sub_files, int(len(sub_files) * ratio))
        for file in files_to_copy:
            src_file_path = os.path.join(src_folder_path, file)
            dst_file_path = os.path.join(dst_folder_path, file)
            shutil.copy(src_file_path, dst_file_path)



In [17]:
# Usage example: with ratio=1 the sample size equals the file count, so every
# file of every class folder (all but "_background_noise_") is copied.
source_directory = './data/speech_commands_v0.01'
dsttination_directory = './data/data_clean'
copy_random_files(src_dir=source_directory, dst_dir=dsttination_directory, ratio=1)

In [15]:
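# Split the mini dataset into the specified categories
# and balance the "Unknown" class by keeping only a random subset of the other folders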
import random
def merge_specific_cate(src_dir, dst_dir, cate=None, unknown_prob=0.0625):
    """Regroup the class folders of ``src_dir`` into the categories in ``cate``.

    ``dst_dir`` is first emptied (or created) with ``clear_folder`` and one
    folder per entry of ``cate`` is created in it. Each sub-directory of
    ``src_dir`` is then handled by its name with the first letter upper-cased:

    * if that name is in ``cate``, all its files are copied to
      ``dst_dir/<Name>``;
    * otherwise one ``random.random()`` draw is made, and only when the draw
      is ``<= unknown_prob`` are its files copied to
      ``dst_dir/<last category>/<Name>``. Folders that are not drawn are
      skipped entirely.

    NOTE(review): the draw is made once per folder, not once per file --
    confirm this is the intended way to balance the catch-all class.

    Args:
        src_dir: Directory containing one sub-directory per source class.
        dst_dir: Destination directory; its previous content is deleted.
        cate: Category names to keep. The last entry is the catch-all for
            folders that match no category. ``None`` or empty means there is
            no catch-all, so non-matching folders are never copied.
        unknown_prob: Probability with which a non-category folder is kept.
    """
    categories = list(cate) if cate else []

    clear_folder(dst_dir)

    for name in categories:
        os.makedirs(os.path.join(dst_dir, name), exist_ok=True)

    default_folder = categories[-1] if categories else None

    # sorted(): os.listdir order is arbitrary, and each draw below is tied to
    # the folder being visited, so a fixed order keeps seeded runs repeatable.
    for folder in sorted(os.listdir(src_dir)):
        src_folder_path = os.path.join(src_dir, folder)
        if not os.path.isdir(src_folder_path):
            continue  # stray files next to the class folders are not classes

        dst_folder = folder[:1].upper() + folder[1:]
        if dst_folder in categories:
            dst_folder_path = os.path.join(dst_dir, dst_folder)
        elif default_folder is not None and random.random() <= unknown_prob:
            dst_folder_path = os.path.join(dst_dir, default_folder, dst_folder)
            os.makedirs(dst_folder_path, exist_ok=True)
        else:
            # Not selected: skip the folder. Falling through here would copy
            # its files into whatever destination the previous folder used.
            continue

        for file in os.listdir(src_folder_path):
            src_file_path = os.path.join(src_folder_path, file)
            if not os.path.isfile(src_file_path):
                continue  # shutil.copy cannot copy directories
            dst_file_path = os.path.join(dst_folder_path, file)
            shutil.copy(src_file_path, dst_file_path)

            
        

In [18]:
# Regroup the cleaned data into the listed categories. Paths are built with
# os.path.join so the same cell works on Windows and POSIX, and points at the
# same 'data/data_clean' folder that the copy step above writes to.
src_directory = os.path.join('data', 'data_clean')
dst_directory = os.path.join('data', 'data_balance')
# The last entry ("Unknown") is the catch-all that merge_specific_cate uses
# for source folders whose capitalised name is not in this list.
cate = ["Yes", "No", "Up", "Down", "Left", "Right", "On", "Off", "Stop", "Go", "Silence", "Unknown"]
merge_specific_cate(src_directory, dst_directory, cate)