In [8]:
import pandas as pd
import numpy as np

In [9]:
# 1. Load the curated training annotations from CSV and preview the first rows
df = pd.read_csv("input/train_curated.csv")
df.head(n=5)

Unnamed: 0,fname,labels
0,0006ae4e.wav,Bark
1,0019ef41.wav,Raindrop
2,001ec0ad.wav,Finger_snapping
3,0026c7cb.wav,Run
4,0026f116.wav,Finger_snapping


In [10]:
# 2. Split each comma-separated label string into a list of whitespace-trimmed names
df['label_list'] = df['labels'].apply(
    lambda raw: [piece.strip() for piece in raw.split(',')]
)

# 3. Collect every distinct label across all rows, in sorted order
all_labels = sorted(set().union(*df['label_list']))

# 4. Map each label name to its position in the sorted vocabulary
label_to_idx = dict(zip(all_labels, range(len(all_labels))))

# 5. Convert a list of label names into a multi-hot vector
def labels_to_onehot(label_list):
    """Encode a list of label names as a 0/1 integer vector.

    Args:
        label_list: iterable of label names; each must be a key of the
            module-level ``label_to_idx`` mapping.

    Returns:
        A 1-D integer numpy array of length ``len(all_labels)`` holding 1 at
        the index of every label in ``label_list`` and 0 elsewhere.

    Raises:
        KeyError: if a label is not present in ``label_to_idx``.
    """
    # np.fromiter with an explicit integer dtype also handles an empty label list.
    positions = np.fromiter(
        (label_to_idx[name] for name in label_list), dtype=np.intp
    )
    encoded = np.zeros(len(all_labels), dtype=int)
    encoded[positions] = 1
    return encoded

# Encode every row's label list as a 0/1 vector
df['onehot'] = df['label_list'].apply(labels_to_onehot)

# 6. Stack the per-row vectors into one matrix of shape (n_samples, n_classes)
X = np.vstack(df['onehot'].tolist())

# Show the label -> index mapping, the matrix shape, and the first five rows
print("标签索引：", label_to_idx)
print("one-hot 矩阵形状：", X.shape)
print(X[0:5])

标签索引： {'Accelerating_and_revving_and_vroom': 0, 'Accordion': 1, 'Acoustic_guitar': 2, 'Applause': 3, 'Bark': 4, 'Bass_drum': 5, 'Bass_guitar': 6, 'Bathtub_(filling_or_washing)': 7, 'Bicycle_bell': 8, 'Burping_and_eructation': 9, 'Bus': 10, 'Buzz': 11, 'Car_passing_by': 12, 'Cheering': 13, 'Chewing_and_mastication': 14, 'Child_speech_and_kid_speaking': 15, 'Chink_and_clink': 16, 'Chirp_and_tweet': 17, 'Church_bell': 18, 'Clapping': 19, 'Computer_keyboard': 20, 'Crackle': 21, 'Cricket': 22, 'Crowd': 23, 'Cupboard_open_or_close': 24, 'Cutlery_and_silverware': 25, 'Dishes_and_pots_and_pans': 26, 'Drawer_open_or_close': 27, 'Drip': 28, 'Electric_guitar': 29, 'Fart': 30, 'Female_singing': 31, 'Female_speech_and_woman_speaking': 32, 'Fill_(with_liquid)': 33, 'Finger_snapping': 34, 'Frying_(food)': 35, 'Gasp': 36, 'Glockenspiel': 37, 'Gong': 38, 'Gurgling': 39, 'Harmonica': 40, 'Hi-hat': 41, 'Hiss': 42, 'Keys_jangling': 43, 'Knock': 44, 'Male_singing': 45, 'Male_speech_and_man_speaking': 46, '

In [11]:
# Wrap the stacked one-hot rows in a DataFrame with one column per label.
# Reusing df.index keys each encoded row identically to its source row in `df`,
# so index-aligned operations (e.g. pd.concat(..., axis=1) with columns of `df`)
# pair rows correctly even when `df` does not carry a default 0..n-1 index.
onehot_df = pd.DataFrame(
    np.vstack(df['onehot'].values),
    columns=all_labels,
    index=df.index,
)
onehot_df.head()

Unnamed: 0,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,Burping_and_eructation,...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Put the file name column side by side with the one-hot label columns
final_df = pd.concat(
    objs=[df.loc[:, ['fname']], onehot_df],
    axis="columns",
)
final_df.head(n=5)

Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,0006ae4e.wav,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0019ef41.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,001ec0ad.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0026c7cb.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0026f116.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Write the combined table to CSV, leaving out the DataFrame index
final_df.to_csv(path_or_buf='data_onehot.csv', index=False)

print("✅ 已保存为 data_onehot.csv")

✅ 已保存为 data_onehot.csv
