# gigaspeech dataset 불러오기

In [1]:
import pandas as pd
import os
import torch
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# GigaSpeech metadata root directory
metadata_root = '/home/nas4/DB/gigaspeech/data/metadata'

def load_metadata(metadata_name):
    dir_path = os.path.join(metadata_root, metadata_name)
    csv_files = [os.path.join(dir_path, f)
                 for f in os.listdir(dir_path) if f.endswith('.csv')]
    dfs = []
    for csv_path in csv_files:
        df = pd.read_csv(csv_path)
        dfs.append(df)
    if len(dfs) > 0:
        return pd.concat(dfs, ignore_index=True)
    else:
        return pd.DataFrame()  # Return empty dataframe if no data

# 지정된 메타데이터 로드
dev_metadata = load_metadata('dev_metadata')
m_metadata = load_metadata('m_metadata_additional')
s_metadata = load_metadata('s_metadata_additional')
test_metadata = load_metadata('test_metadata')
xs_metadata = load_metadata('xs_metadata')


In [2]:
xs_metadata.head()

Unnamed: 0,sid,speaker,text_tn,begin_time,end_time,title,url,path,aid,source,codec,channels,md5,speaker.1,category
0,YOU0000000315_S0000660,,AS THEY'RE LEAVING <COMMA> CAN KASH PULL ZAHRA...,2941.89,2945.07,Return to Vasselheim | Critical Role: VOX MACH...,https://www.youtube.com/watch?v=zr2n1fLVasU,audio/youtube/P0004/YOU0000000315.opus,YOU0000000315,youtube,s16le,1,0b848c605910ebeeac71e719a421dd5e,,Gaming
1,AUD0000001043_S0000775,,SIX TOMATOES <PERIOD>,3673.96,3675.26,Asteroid of Fear,http//www.archive.org/download/asteroid_of_fea...,audio/audiobook/P0011/AUD0000001043.opus,AUD0000001043,audiobook,s16le,1,46be639d055b736fcc183ee245578f5e,,audiobook
2,AUD0000001043_S0000942,,AND SOMETHING BROUGHT BACK RESTORED FROM THE R...,4393.0,4397.11,Asteroid of Fear,http//www.archive.org/download/asteroid_of_fea...,audio/audiobook/P0011/AUD0000001043.opus,AUD0000001043,audiobook,s16le,1,46be639d055b736fcc183ee245578f5e,,audiobook
3,YOU0000000299_S0000629,,TO HELP SCREEN READER USERS IN THE MIDST OF DI...,2553.74,2557.77,Making Accessible Web Apps Using HTML5 and Chr...,https://www.youtube.com/watch?v=x18vEEfpK3g,audio/youtube/P0003/YOU0000000299.opus,YOU0000000299,youtube,s16le,1,13091ee7f381dfbe4a1ed97da72813e2,,Science and Technology
4,AUD0000000468_S0000084,,FOR ALICE HAD READ SEVERAL NICE LITTLE STORIES...,525.56,534.42,Alice's Adventures Underground,http//www.archive.org/download/alice_undergrou...,audio/audiobook/P0005/AUD0000000468.opus,AUD0000000468,audiobook,s16le,1,17567b04ff9e8930735d1daf0ee3405e,,audiobook


In [3]:
# train_metadata (m_metadata, s_metadata, xs_metadata) , dev_metadata , test_metadata
# df_merged
# matadata 다시 만들기

import glob

default_path = '/home/nas4/DB/gigaspeech/data/audio'
data_type = 'xs_files'
search_path = os.path.join(default_path, data_type)

# 모든 .wav 파일 경로를 한 번에 가져와서 sid: path 딕셔너리로 만듦
with tqdm(total=1, desc='Searching wav files') as pbar:
    wav_paths = glob.glob(os.path.join(search_path, '**', '*.wav'), recursive=True)
    pbar.update(1)
wav_sid_dict = {}
for wav_path in wav_paths:
    # 파일명에서 sid 추출 (확장자 제거)
    sid = os.path.splitext(os.path.basename(wav_path))[0]
    wav_sid_dict[sid] = wav_path

success_count = 0
fail_count = 0
success_list = []
fail_list = []

with tqdm(total=len(xs_metadata), desc=f"Success: {success_count}, Fail: {fail_count}") as pbar:
    for i in range(len(xs_metadata)):
        sid = xs_metadata.at[i, 'sid']
        found_path = wav_sid_dict.get(sid, None)
        xs_metadata.at[i, "audio_path"] = found_path
        if found_path is not None:
            success_count += 1
        else:
            fail_count += 1
        success_list.append(success_count)
        fail_list.append(fail_count)
        pbar.set_description(f"Success: {success_count}, Fail: {fail_count}")
        pbar.update(1)

cols_to_save = ['sid', 'audio_path', 'begin_time', 'end_time', 'category', 'codec', 'channels', 'text_tn']
xs_metadata[cols_to_save].to_csv('xs_metadata_path.csv', index=False)


Searching wav files: 100%|██████████| 1/1 [00:06<00:00,  6.04s/it]
Success: 9389, Fail: 0: 100%|██████████| 9389/9389 [00:11<00:00, 810.00it/s]


In [4]:
# train_metadata (m_metadata, s_metadata, xs_metadata) , dev_metadata , test_metadata
# df_merged
# matadata 다시 만들기

import glob

default_path = '/home/nas4/DB/gigaspeech/data/audio'
data_type = 's_files_additional'
search_path = os.path.join(default_path, data_type)

# 모든 .wav 파일 경로를 한 번에 가져와서 sid: path 딕셔너리로 만듦
with tqdm(total=1, desc='Searching wav files') as pbar:
    wav_paths = glob.glob(os.path.join(search_path, '**', '*.wav'), recursive=True)
    pbar.update(1)
wav_sid_dict = {}
for wav_path in wav_paths:
    # 파일명에서 sid 추출 (확장자 제거)
    sid = os.path.splitext(os.path.basename(wav_path))[0]
    wav_sid_dict[sid] = wav_path

success_count = 0
fail_count = 0
success_list = []
fail_list = []

with tqdm(total=len(s_metadata), desc=f"Success: {success_count}, Fail: {fail_count}") as pbar:
    for i in range(len(s_metadata)):
        sid = s_metadata.at[i, 'sid']
        found_path = wav_sid_dict.get(sid, None)
        s_metadata.at[i, "audio_path"] = found_path
        if found_path is not None:
            success_count += 1
        else:
            fail_count += 1
        success_list.append(success_count)
        fail_list.append(fail_count)
        pbar.set_description(f"Success: {success_count}, Fail: {fail_count}")
        pbar.update(1)

cols_to_save = ['sid', 'audio_path', 'begin_time', 'end_time', 'category', 'codec', 'channels', 'text_tn']
s_metadata[cols_to_save].to_csv('s_metadata_path.csv', index=False)

Searching wav files: 100%|██████████| 1/1 [01:37<00:00, 97.67s/it]
Success: 26522, Fail: 0:  12%|█▏        | 26521/220679 [00:31<03:50, 841.55it/s]


KeyboardInterrupt: 

In [None]:
# train_metadata (m_metadata, s_metadata, xs_metadata) , dev_metadata , test_metadata
# df_merged
# metadata 다시 만들기

import glob

default_path = '/home/nas4/DB/gigaspeech/data/audio'
data_type = 'm_files_additional'
search_path = os.path.join(default_path, data_type)

# 모든 .wav 파일 경로를 한 번에 가져와서 sid: path 딕셔너리로 만듦
with tqdm(total=1, desc='Searching wav files') as pbar:
    wav_paths = glob.glob(os.path.join(search_path, '**', '*.wav'), recursive=True)
    pbar.update(1)
wav_sid_dict = {}
for wav_path in wav_paths:
    # 파일명에서 sid 추출 (확장자 제거)
    sid = os.path.splitext(os.path.basename(wav_path))[0]
    wav_sid_dict[sid] = wav_path

success_count = 0
fail_count = 0
success_list = []
fail_list = []

with tqdm(total=len(m_metadata), desc=f"Success: {success_count}, Fail: {fail_count}") as pbar:
    for i in range(len(m_metadata)):
        sid = m_metadata.at[i, 'sid']
        found_path = wav_sid_dict.get(sid, None)
        m_metadata.at[i, "audio_path"] = found_path
        if found_path is not None:
            success_count += 1
        else:
            fail_count += 1
        success_list.append(success_count)
        fail_list.append(fail_count)
        pbar.set_description(f"Success: {success_count}, Fail: {fail_count}")
        pbar.update(1)

cols_to_save = ['sid', 'audio_path', 'begin_time', 'end_time', 'category', 'codec', 'channels', 'text_tn']
m_metadata[cols_to_save].to_csv('m_metadata_path.csv', index=False)

Searching wav files: 100%|██████████| 1/1 [04:18<00:00, 258.27s/it]
Success: 680072, Fail: 0: 100%|██████████| 680072/680072 [14:13<00:00, 796.72it/s] 


In [None]:
# train_metadata (m_metadata, s_metadata, xs_metadata) , dev_metadata , test_metadata
# df_merged
# metadata 다시 만들기

import glob

default_path = '/home/nas4/DB/gigaspeech/data/audio'
data_type = 'dev_files'
search_path = os.path.join(default_path, data_type)

# 모든 .wav 파일 경로를 한 번에 가져와서 sid: path 딕셔너리로 만듦
with tqdm(total=1, desc='Searching wav files') as pbar:
    wav_paths = glob.glob(os.path.join(search_path, '**', '*.wav'), recursive=True)
    pbar.update(1)
wav_sid_dict = {}
for wav_path in wav_paths:
    # 파일명에서 sid 추출 (확장자 제거)
    sid = os.path.splitext(os.path.basename(wav_path))[0]
    wav_sid_dict[sid] = wav_path

success_count = 0
fail_count = 0
success_list = []
fail_list = []

with tqdm(total=len(dev_metadata), desc=f"Success: {success_count}, Fail: {fail_count}") as pbar:
    for i in range(len(dev_metadata)):
        sid = dev_metadata.at[i, 'sid']
        found_path = wav_sid_dict.get(sid, None)
        dev_metadata.at[i, "audio_path"] = found_path
        if found_path is not None:
            success_count += 1
        else:
            fail_count += 1
        success_list.append(success_count)
        fail_list.append(fail_count)
        pbar.set_description(f"Success: {success_count}, Fail: {fail_count}")
        pbar.update(1)

cols_to_save = ['sid', 'audio_path', 'begin_time', 'end_time', 'category', 'codec', 'channels', 'text_tn']
dev_metadata[cols_to_save].to_csv('dev_metadata_path.csv', index=False)

Searching wav files: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s]
Success: 6750, Fail: 0: 100%|██████████| 6750/6750 [00:08<00:00, 807.86it/s]


In [None]:
# train_metadata (m_metadata, s_metadata, xs_metadata) , dev_metadata , test_metadata
# df_merged
# metadata 다시 만들기

import glob

default_path = '/home/nas4/DB/gigaspeech/data/audio'
data_type = 'test_files'
search_path = os.path.join(default_path, data_type)

# 모든 .wav 파일 경로를 한 번에 가져와서 sid: path 딕셔너리로 만듦
with tqdm(total=1, desc='Searching wav files') as pbar:
    wav_paths = glob.glob(os.path.join(search_path, '**', '*.wav'), recursive=True)
    pbar.update(1)
wav_sid_dict = {}
for wav_path in wav_paths:
    # 파일명에서 sid 추출 (확장자 제거)
    sid = os.path.splitext(os.path.basename(wav_path))[0]
    wav_sid_dict[sid] = wav_path

success_count = 0
fail_count = 0
success_list = []
fail_list = []

with tqdm(total=len(test_metadata), desc=f"Success: {success_count}, Fail: {fail_count}") as pbar:
    for i in range(len(test_metadata)):
        sid = test_metadata.at[i, 'sid']
        found_path = wav_sid_dict.get(sid, None)
        test_metadata.at[i, "audio_path"] = found_path
        if found_path is not None:
            success_count += 1
        else:
            fail_count += 1
        success_list.append(success_count)
        fail_list.append(fail_count)
        pbar.set_description(f"Success: {success_count}, Fail: {fail_count}")
        pbar.update(1)

cols_to_save = ['sid', 'audio_path', 'begin_time', 'end_time', 'category', 'codec', 'channels', 'text_tn']
test_metadata[cols_to_save].to_csv('test_metadata_path.csv', index=False)

Searching wav files: 100%|██████████| 1/1 [00:26<00:00, 26.74s/it]
Success: 25619, Fail: 0: 100%|██████████| 25619/25619 [00:32<00:00, 796.54it/s] 


# SpeechCraft dataset 파악

https://github.com/thuhcsi/SpeechCraft

In [None]:
df_labels = pd.read_csv('/home/nas4/DB/gigaspeech/EN_labels.csv')
df_descriptions = pd.read_csv('/home/nas4/DB/gigaspeech/EN_description.csv', on_bad_lines='skip')
df_instructions = pd.read_csv('/home/nas4/DB/gigaspeech/EN_instruction.csv')

In [None]:
print(df_labels.head(1))
print('--------------------------------')
print(df_descriptions.head(1))
print('--------------------------------')
print(df_instructions.head(1))

                                   Key  Gender          Age Speed   Pitch  \
0  librittsr_8123_275209_000004_000007  female  Young Adult  slow  normal   

  Energy  Emotion   Category   Transcript  
0    low  natural  audiobook  Dear! dear!  
--------------------------------
                                   Key  \
0  librittsr_8123_275209_000004_000007   

                                                 Des  
0  With a natural emotion, a young female with a ...  
--------------------------------
                                   Key  \
0  librittsr_8123_275209_000004_000007   

                                                 Ins  
0  With a normal pitch and low volume, a young fe...  


In [None]:
# label_keys 중에서 3개 데이터프레임에 모두 존재하지 않는 Key 값만 추출
label_keys = set(df_labels['Key'])
description_keys = set(df_descriptions['Key'])
instruction_keys = set(df_instructions['Key'])

# 3개 데이터프레임에 모두 존재하는 Key 집합
common_keys = label_keys & description_keys & instruction_keys
print("그런 Key 개수:", len(common_keys))

# 그렇지 않은 Key 값 (label_keys에서 common_keys를 뺌)
diff_keys = label_keys - common_keys
print("3개 데이터프레임에 모두 존재하지 않는 label_keys:", diff_keys)
print("그 Key 개수:", len(diff_keys))


그런 Key 개수: 1022325
3개 데이터프레임에 모두 존재하지 않는 label_keys: {'gigaspeech_AUD0000000756_S0008116', 'librittsr_4110_11528_000036_000002', 'gigaspeech_YOU0000013256_S0000066', 'gigaspeech_POD0000006698_S0000178', 'librittsr_4110_11528_000062_000001', 'librittsr_5538_224746_000038_000000'}
그 Key 개수: 6


In [None]:
# 세 데이터프레임에 모두 존재하는 Key만 대상으로 inner join 수행
df_merged = df_labels.merge(df_descriptions, on='Key', how='inner') \
                     .merge(df_instructions, on='Key', how='inner')

print("Inner join 결과 shape:", df_merged.shape)
print("Key의 unique 개수 (inner join):", df_merged['Key'].nunique())

Inner join 결과 shape: (1022325, 11)
Key의 unique 개수 (inner join): 1022325


In [None]:
# gigaspeech만 살리기: Key에 'gigaspeech'가 포함된 row만 남긴다.
df_merged = df_merged[df_merged['Key'].str.contains('gigaspeech')]
df_merged.reset_index(drop=True, inplace=True)
print(df_merged['Key'].unique())
print("남은 gigaspeech Key 개수:", df_merged['Key'].nunique())


['gigaspeech_YOU0000004134_S0000088' 'gigaspeech_POD0000012099_S0000074'
 'gigaspeech_POD0000003784_S0000084' ...
 'gigaspeech_AUD0000000882_S0004175' 'gigaspeech_AUD0000001768_S0001270'
 'gigaspeech_AUD0000001677_S0001402']
남은 gigaspeech Key 개수: 670067


In [None]:
df_merged.head()

Unnamed: 0,Key,Gender,Age,Speed,Pitch,Energy,Emotion,Category,Transcript,Des,Ins
0,gigaspeech_YOU0000004134_S0000088,female,Elderly,normal,normal,high,sad,Education,TWO-THIRDS OF THE VEHICLES THE EPA TESTS THEMS...,Sharing a heartfelt observation in the field o...,Sharing a heartfelt observation in the field o...
1,gigaspeech_POD0000012099_S0000074,male,Elderly,slow,normal,high,sad,News and Politics,THE DISEASES FOR WHICH THERE ARE PREVENTIVE ME...,Speaking with a slow pace and a hint of sadnes...,Speaking with a slow pace and a hint of sadnes...
2,gigaspeech_POD0000003784_S0000084,female,Middle-aged,normal,high,high,happy,News and Politics,AND THEY'RE MORE CAUTIOUS WENDY WAS ABLE TO CO...,"Happily states an adult female, high-pitched a...","""AND THEY'RE MORE CAUTIOUS WENDY WAS ABLE TO C..."
3,gigaspeech_YOU0000012125_S0000863,female,Young Adult,slow,high,low,neutral,News and Politics,I KNOW <PERIOD>,Reflecting on a topic in the realm of News and...,Reflecting on a topic in the realm of News and...
4,gigaspeech_POD0000003364_S0000299,male,Teenager,fast,normal,high,happy,News and Politics,IT'S TIME YOU GOT TOGETHER I R L <PERIOD> YOU ...,Expressing happiness in the realm of News and ...,Expressing happiness in the realm of News and ...


# gigaspeech, speechcraft 연결

In [None]:
import pandas as pd
import os
import torch
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

df_xs = pd.read_csv('/home/nas4/DB/gigaspeech/xs_metadata_path.csv')
df_s = pd.read_csv('/home/nas4/DB/gigaspeech/s_metadata_path.csv')
df_m = pd.read_csv('/home/nas4/DB/gigaspeech/m_metadata_path.csv')
df_dev = pd.read_csv('/home/nas4/DB/gigaspeech/dev_metadata_path.csv')
df_test = pd.read_csv('/home/nas4/DB/gigaspeech/test_metadata_path.csv')

print(len(df_xs))
print(len(df_s))
print(len(df_m))
print(len(df_dev))
print(len(df_test))
print(len(df_merged))

9389
220679
680072
6750
25619
670067


In [None]:
# gigaspeech_ 빼고 sid 추출
df_merged['sid_no_prefix'] = df_merged['Key'].str.replace('gigaspeech_', '', regex=False)

# df_xs의 audio_path에서 sid 추출 함수
def extract_sid(audio_path):
    # 예: /path/to/POD0000003364_S0000299.wav
    base = os.path.basename(audio_path)
    sid = os.path.splitext(base)[0]
    return sid

df_xs['sid_from_path'] = df_xs['audio_path'].apply(extract_sid)

# df_xs의 sid와 df_merged의 sid_no_prefix가 일치하는지 확인
df_xs['exists_in_merged'] = df_xs['sid_from_path'].isin(df_merged['sid_no_prefix'])

print("df_xs에서 df_merged Key에 일치하는 sid(=Key) 개수:", df_xs['exists_in_merged'].sum())
print(df_xs[df_xs['exists_in_merged']].head())


df_xs에서 df_merged Key에 일치하는 sid(=Key) 개수: 0
Empty DataFrame
Columns: [sid, audio_path, begin_time, end_time, category, codec, channels, text_tn, sid_from_path, exists_in_merged]
Index: []


In [None]:
df_s['sid_from_path'] = df_s['audio_path'].apply(extract_sid)

# df_xs의 sid와 df_merged의 sid_no_prefix가 일치하는지 확인
df_s['exists_in_merged'] = df_s['sid_from_path'].isin(df_merged['sid_no_prefix'])

print("df_s에서 df_merged Key에 일치하는 sid(=Key) 개수:", df_s['exists_in_merged'].sum())
print(df_s[df_s['exists_in_merged']].head())


df_xs에서 df_merged Key에 일치하는 sid(=Key) 개수: 0
Empty DataFrame
Columns: [sid, audio_path, begin_time, end_time, category, codec, channels, text_tn, sid_from_path, exists_in_merged]
Index: []


In [None]:
df_m['sid_from_path'] = df_m['audio_path'].apply(extract_sid)

# df_xs의 sid와 df_merged의 sid_no_prefix가 일치하는지 확인
df_m['exists_in_merged'] = df_m['sid_from_path'].isin(df_merged['sid_no_prefix'])

print("df_m에서 df_merged Key에 일치하는 sid(=Key) 개수:", df_m['exists_in_merged'].sum())
print(df_m[df_m['exists_in_merged']].head())

df_xs에서 df_merged Key에 일치하는 sid(=Key) 개수: 670067
                      sid                                         audio_path  \
0  POD0000005680_S0000426  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...   
1  YOU0000006180_S0000494  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...   
2  POD0000008564_S0000055  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...   
3  YOU0000008223_S0001191  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...   
4  YOU0000002475_S0000597  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...   

   begin_time  end_time             category  codec  channels  \
0     1849.09   1851.22                Crime  s16le         1   
1     2977.70   2978.63  News  and  Politics  s16le         1   
2      328.15    335.77    News and Politics  s16le         1   
3     6667.10   6668.06               Gaming  s16le         1   
4     3388.23   3389.82               Comedy  s16le         1   

                                             text_tn           sid_from_path  \

In [None]:
df_dev['sid_from_path'] = df_dev['audio_path'].apply(extract_sid)

# df_xs의 sid와 df_merged의 sid_no_prefix가 일치하는지 확인
df_dev['exists_in_merged'] = df_dev['sid_from_path'].isin(df_merged['sid_no_prefix'])

print("df_m에서 df_merged Key에 일치하는 sid(=Key) 개수:", df_dev['exists_in_merged'].sum())
print(df_dev[df_dev['exists_in_merged']].head())

df_m에서 df_merged Key에 일치하는 sid(=Key) 개수: 0
Empty DataFrame
Columns: [sid, audio_path, begin_time, end_time, category, codec, channels, text_tn, sid_from_path, exists_in_merged]
Index: []


In [None]:
df_test['sid_from_path'] = df_test['audio_path'].apply(extract_sid)

# df_xs의 sid와 df_merged의 sid_no_prefix가 일치하는지 확인
df_test['exists_in_merged'] = df_test['sid_from_path'].isin(df_merged['sid_no_prefix'])

print("df_m에서 df_merged Key에 일치하는 sid(=Key) 개수:", df_test['exists_in_merged'].sum())
print(df_test[df_test['exists_in_merged']].head())

df_m에서 df_merged Key에 일치하는 sid(=Key) 개수: 0
Empty DataFrame
Columns: [sid, audio_path, begin_time, end_time, category, codec, channels, text_tn, sid_from_path, exists_in_merged]
Index: []


In [None]:
# df_merged와 df_m을 inner join하여 둘 다에 있는 경우만 추출합니다.
df_merged_m_inner = pd.merge(
    df_merged,
    df_m,
    left_on='sid_no_prefix',
    right_on='sid',
    how='inner',
    suffixes=('_merged', '_m')
)

# 필요한 컬럼만 선택
final_df = df_merged_m_inner[['sid', 'audio_path', 'Gender', 'Age', 'Speed', 'Pitch', 'Energy', 'Emotion', 'Category', 'Transcript', 'Des', 'Ins']]
print(f"최종 df shape: {final_df.shape}")
print(final_df.head())

# 랜덤하게 train/val로 일정 비율로 나눔 (예: 90% train, 10% val)
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(final_df, test_size=0.1, random_state=42, shuffle=True)

print(f"Train shape: {train_df.shape}, Val shape: {val_df.shape}")
print("Train example:")
print(train_df.head())
print("Val example:")
print(val_df.head())

# 파일로 저장
train_df.to_csv('SpeechCraft_final_train.csv', index=False)
val_df.to_csv('SpeechCraft_final_val.csv', index=False)


최종 df shape: (670067, 12)
                      sid                                         audio_path  \
0  YOU0000004134_S0000088  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...   
1  POD0000012099_S0000074  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...   
2  POD0000003784_S0000084  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...   
3  YOU0000012125_S0000863  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...   
4  POD0000003364_S0000299  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...   

   Gender          Age   Speed   Pitch Energy  Emotion             Category  \
0  female      Elderly  normal  normal   high      sad            Education   
1    male      Elderly    slow  normal   high      sad    News and Politics   
2  female  Middle-aged  normal    high   high    happy    News and Politics   
3  female  Young Adult    slow    high    low  neutral  News  and  Politics   
4    male     Teenager    fast  normal   high    happy    News and Politics   

                  

In [5]:
# 이미 저장되어 있는 @dataset/SpeechCraft_final_df.csv 파일을 읽어서 train/val로 나눠 저장하는 코드
import pandas as pd
from sklearn.model_selection import train_test_split

# @dataset/SpeechCraft_final_df.csv에서 데이터프레임 로드
df = pd.read_csv('SpeechCraft_final_df.csv')

# train/val로 나누기 (예: 90% train, 10% val)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)

print(f"Train shape: {train_df.shape}, Val shape: {val_df.shape}")
print("Train example:")
print(train_df.head())
print("Val example:")
print(val_df.head())

# 분할된 데이터 저장
train_df.to_csv('SpeechCraft_final_train.csv', index=False)
val_df.to_csv('SpeechCraft_final_val.csv', index=False)

Train shape: (603060, 12), Val shape: (67007, 12)
Train example:
                           sid  \
348617  YOU0000007231_S0000296   
435180  YOU0000006857_S0000333   
192695  YOU0000009235_S0000174   
398294  POD0000010219_S0000078   
517749  AUD0000001562_S0003871   

                                               audio_path  Gender  \
348617  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...  female   
435180  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...    male   
192695  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...  female   
398294  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...    male   
517749  /home/nas4/DB/gigaspeech/data/audio/m_files_ad...    male   

                Age   Speed   Pitch Energy  Emotion            Category  \
348617  Young Adult  normal  normal    low  neutral           Education   
435180  Middle-aged    fast     low   high  neutral  People  and  Blogs   
192695  Middle-aged    slow  normal    low    angry                 NaN   
398294  Middle-