# Download raw audio files from xeno-canto

In [None]:
import requests
import os
import csv
import time

# Root directory that holds one sub-folder per "<vocalization> - <common name>" pair
ROOT_DOWNLOAD_FOLDER = "E:/AMR/DA/Projekt/data/Audio_files_ori"
CSV_PATH = os.path.join(ROOT_DOWNLOAD_FOLDER, "audio_file_counts.csv")

# Maps the vocalization label used in folder names to the value of the
# xeno-canto `type:"..."` search filter ("+" stands for a space in the URL)
vocal_type_map = {
    "Song": "song",
    "Flight call": "flight+call",
    "Begging call": "begging+call",
    "Alarm call": "alarm+call",
    "Call": "call"
}

# Bird dictionary: scientific name (as a "+"-joined query term) -> common name
# used as the folder-name suffix
birds = {
    "Aegithalos+caudatus": "Long-tailed Tit",
    "Poecile+palustris": "Marsh Tit",
    "Chroicocephalus+ridibundus": "Black-headed Gull",
    "Corvus+frugilegus": "Rook",
    "Turdus+iliacus": "Redwing",
    "Alcedo+atthis": "Common Kingfisher",
    "Prunella+modularis": "Dunnock",
    "Coccothraustes+coccothraustes": "Hawfinch",
    "Sitta+europaea": "Eurasian Nuthatch",
    "Columba+oenas": "Stock Dove",
    "Larus+fuscus+heuglini": "Lesser Black-backed Gull"
}

# Read the CSV of per-folder file counts and collect the folders whose count is 0
empty_folders = []
with open(CSV_PATH, mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    # Skip the header row; the default keeps an empty file from raising StopIteration
    next(reader, None)
    for row in reader:
        # csv.reader yields [] for blank lines, which would break the unpack below
        if not row:
            continue
        folder_name, file_count = row
        if int(file_count) == 0:
            empty_folders.append(folder_name)

# Collects (name, name_for_folder, vocal, folder_name) tuples for every
# request or file download that still fails
failed_downloads = []

def download_audio(name, name_for_folder, vocal, folder_name, max_download=50):
    """Download recordings of one species/vocalization pair from xeno-canto.

    Queries the xeno-canto API (v2) for ``name`` restricted to the
    vocalization type ``vocal`` and saves up to ``max_download`` of the
    returned recordings into ``ROOT_DOWNLOAD_FOLDER/folder_name``.

    Each file is streamed to a temporary ``*.part`` file and only moved to
    its final name once the transfer has finished, so an interrupted download
    never leaves a truncated audio file behind.

    Args:
        name: Scientific name used as the query term (words joined by ``+``).
        name_for_folder: Common name; used in log messages and failure records.
        vocal: Value for the ``type:"..."`` search filter.
        folder_name: Sub-folder of ``ROOT_DOWNLOAD_FOLDER`` to write into
            (created if missing).
        max_download: Maximum number of files to save in this call.

    Returns:
        None. Failures are appended to the module-level ``failed_downloads``
        list as ``(name, name_for_folder, vocal, folder_name)`` tuples.
    """
    search_query = f"{name}+type:%22{vocal}%22"
    api_url = f"https://www.xeno-canto.org/api/2/recordings?query={search_query}"

    download_folder = os.path.join(ROOT_DOWNLOAD_FOLDER, folder_name)
    os.makedirs(download_folder, exist_ok=True)

    print(f"\n📌 重新下载: {folder_name}")

    failure_record = (name, name_for_folder, vocal, folder_name)

    try:
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on requests versions where
        # they are not a RequestException subclass.
        print(f"❌ 请求失败: {name_for_folder} - {vocal}: {e}")
        failed_downloads.append(failure_record)
        return

    recordings = data.get("recordings", [])

    if not recordings:
        print(f"⚠️ 仍然没有找到 {vocal} 的音频，跳过...")
        failed_downloads.append(failure_record)
        return

    print(f"🎧 找到 {len(recordings)} 个符合条件的 {vocal} 录音，开始下载...")

    count = 0
    for rec in recordings:
        if count >= max_download:
            print(f"🎯 已达到 {max_download} 个文件的下载限制，停止下载。")
            break

        file_url = f"https://www.xeno-canto.org/{rec['id']}/download"
        # NOTE(review): the file is always saved with a .wav extension without
        # checking the format actually served — confirm downstream readers cope.
        filename = f"XC{rec['id']}_{rec['gen']} {rec['sp']}.wav"
        filepath = os.path.join(download_folder, filename)
        temp_path = filepath + ".part"

        try:
            with requests.get(file_url, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(temp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            # Publish the file under its final name only after a complete transfer.
            os.replace(temp_path, filepath)
        except requests.RequestException as e:
            print(f"❌ 下载失败: {filename}: {e}")
            failed_downloads.append(failure_record)
        else:
            count += 1
            print(f"✅ 下载完成 ({count}/{max_download}): {filename}")
        finally:
            # After a successful os.replace the temp file is gone; anything
            # still present here is a partial download and must not be kept.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    print(f"🎉 {vocal} 的音频下载完成！")

# **Parse each empty folder's name and trigger its download**
for folder in empty_folders:
    try:
        # Folder names look like "Call - Redwing"; any other shape makes the
        # two-value unpack raise ValueError, handled below.
        pieces = folder.split(" - ")
        vocal_folder, bird_name = pieces

        # First scientific name whose common name matches (None when absent)
        bird_latin_name = next(
            (latin for latin, common in birds.items() if common == bird_name),
            None,
        )

        if bird_latin_name and vocal_folder in vocal_type_map:
            # Translate the folder label into the API query value and download
            download_audio(
                bird_latin_name,
                bird_name,
                vocal_type_map[vocal_folder],
                folder,
            )
        else:
            print(f"⚠️ 无法解析文件夹: {folder}, 跳过...")

    except ValueError:
        print(f"⚠️ 无法解析文件夹: {folder}, 跳过...")

# **Final report of everything that still failed**
if not failed_downloads:
    print("\n✅ 所有空文件夹的音频已成功下载！")
else:
    print("\n⚠️ 以下音频仍然无法下载：")
    for _latin, failed_bird, failed_vocal, _folder in failed_downloads:
        print(f"   - {failed_bird} - {failed_vocal}")


# 统一补充文件夹中数量不够100

In [None]:
import requests
import os
import csv
import time

# Root directory that holds one sub-folder per "<vocalization> - <common name>" pair
ROOT_DOWNLOAD_FOLDER = "E:/AMR/DA/Projekt/data/Audio_files_ori"
CSV_PATH = os.path.join(ROOT_DOWNLOAD_FOLDER, "audio_file_counts.csv")

# Maps the vocalization label used in folder names to the value of the
# xeno-canto `type:"..."` search filter; commented-out entries are disabled
vocal_type_map = {
    "Song": "song",
    # "Flight call": "flight+call",
    # "Begging call": "begging+call",
    # "Alarm call": "alarm+call",
    "Call": "call"
}

# Bird dictionary: scientific name (as a "+"-joined query term) -> common name
# used as the folder-name suffix. Commented-out species are skipped; every
# active entry carries a trailing "#".
birds = {
    #"Aegithalos+caudatus": "Long-tailed Tit",
    #"Poecile+palustris": "Marsh Tit",
    "Chroicocephalus+ridibundus": "Black-headed Gull", #
    #"Corvus+frugilegus": "Rook",
    #"Turdus+iliacus": "Redwing",
    "Alcedo+atthis": "Common Kingfisher", #
    #"Prunella+modularis": "Dunnock",
    #"Coccothraustes+coccothraustes": "Hawfinch",
    #"Sitta+europaea": "Eurasian Nuthatch",
    "Columba+oenas": "Stock Dove", #
    "Branta+canadensis": "Canada Goose", #
    "Corvus+corone": "Carrion Crow", #
    #"Turdus+merula": "Common Blackbird",
    #"Fringilla+coelebs": "Common Chaffinch",
    "Phoenicurus+phoenicurus+phoenicurus": "Common Redstart", #
    "Sylvia+atricapilla": "Eurasian Blackcap", #
    #"Cyanistes+caeruleus": "Eurasian Blue tit",
    #"Pyrrhula+pyrrhula": "Eurasian Bullfinch",
    "Fulica+atra": "Eurasian Coot", #
    #"Oriolus+oriolus": "Eurasian Golden Oriole",
    "Garrulus+glandarius": "Eurasian Jay", #
    "Spinus+spinus": "Eurasian Siskin", #
    "Certhia+familiaris": "Eurasian Treecreeper", #
    "Troglodytes+troglodytes": "Eurasian Wren", #
    #"Carduelis+carduelis": "European Goldfinch",
    #"Erithacus+rubecula": "European Robin",
    #"Regulus+regulus": "Goldcrest",
    #"Dendrocopos+major": "Great Spotted Woodpecker",
    #"Parus+major": "Great Tit",
    "Corvus+cornix": "Hooded Crow", #
    "Anas+platyrhynchos": "Mallard", #
    "Certhia+brachydactyla": "Short-toed Treecreeper", #
}

# **1️⃣ Read the CSV of per-folder file counts and find folders that need topping up**
# Maps folder name -> number of files still missing to reach 100
empty_folders = {}
with open(CSV_PATH, mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    # Skip the header row; the default keeps an empty file from raising StopIteration
    next(reader, None)
    for row in reader:
        # csv.reader yields [] for blank lines, which would break the unpack below
        if not row:
            continue
        folder_name, file_count = row
        file_count = int(file_count)
        if file_count < 100:  # only folders holding fewer than 100 audio files
            empty_folders[folder_name] = 100 - file_count  # files still needed

# **Collects (name, name_for_folder, vocal, folder_name) tuples for failed downloads**
failed_downloads = []

def download_audio(name, name_for_folder, vocal, folder_name, needed_count):
    """Top up one folder with up to ``needed_count`` new xeno-canto recordings.

    Queries the xeno-canto API (v2) for ``name`` restricted to the
    vocalization type ``vocal`` and downloads recordings that are not yet in
    ``ROOT_DOWNLOAD_FOLDER/folder_name`` until ``needed_count`` new files
    have been saved or the result list is exhausted.

    Each file is streamed to a temporary ``*.part`` file and only moved to
    its final name once the transfer has finished. A truncated file would
    otherwise be treated as "already existing" and skipped on the next run.

    Args:
        name: Scientific name used as the query term (words joined by ``+``).
        name_for_folder: Common name; used in log messages and failure records.
        vocal: Value for the ``type:"..."`` search filter.
        folder_name: Sub-folder of ``ROOT_DOWNLOAD_FOLDER`` to write into
            (created if missing).
        needed_count: Number of additional files to download.

    Returns:
        None. Failures are appended to the module-level ``failed_downloads``
        list as ``(name, name_for_folder, vocal, folder_name)`` tuples.
    """
    search_query = f"{name}+type:%22{vocal}%22"
    api_url = f"https://www.xeno-canto.org/api/2/recordings?query={search_query}"

    download_folder = os.path.join(ROOT_DOWNLOAD_FOLDER, folder_name)
    os.makedirs(download_folder, exist_ok=True)

    print(f"\n📌 下载 {folder_name}，需要补充 {needed_count} 个文件...")

    failure_record = (name, name_for_folder, vocal, folder_name)

    try:
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on requests versions where
        # they are not a RequestException subclass.
        print(f"❌ 请求失败: {name_for_folder} - {vocal}: {e}")
        failed_downloads.append(failure_record)
        return

    recordings = data.get("recordings", [])

    if not recordings:
        print(f"⚠️ 没有找到 {vocal} 的音频，跳过...")
        failed_downloads.append(failure_record)
        return

    print(f"🎧 找到 {len(recordings)} 个符合条件的 {vocal} 录音，开始下载...")

    # Snapshot of the files already present, used to avoid re-downloading them
    existing_files = set(os.listdir(download_folder))
    count = 0

    for rec in recordings:
        if count >= needed_count:
            print("🎯 已达到目标文件数，停止下载。")
            break

        file_id = rec['id']
        # NOTE(review): the file is always saved with a .wav extension without
        # checking the format actually served — confirm downstream readers cope.
        filename = f"XC{file_id}_{rec['gen']} {rec['sp']}.wav"
        filepath = os.path.join(download_folder, filename)
        temp_path = filepath + ".part"

        if filename in existing_files:
            print(f"🔄 文件已存在，跳过: {filename}")
            continue

        try:
            with requests.get(f"https://www.xeno-canto.org/{file_id}/download", stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(temp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            # Publish the file under its final name only after a complete transfer.
            os.replace(temp_path, filepath)
        except requests.RequestException as e:
            print(f"❌ 下载失败: {filename}: {e}")
            failed_downloads.append(failure_record)
        else:
            count += 1
            print(f"✅ 下载完成 ({count}/{needed_count}): {filename}")
        finally:
            # After a successful os.replace the temp file is gone; anything
            # still present here is a partial download and must not be kept.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    print(f"🎉 {vocal} 的音频下载完成！")

# **Parse each under-filled folder's name and trigger its download**
for folder, needed_count in empty_folders.items():
    try:
        # Folder names look like "Call - Redwing"; any other shape makes the
        # two-value unpack raise ValueError, handled below.
        pieces = folder.split(" - ")
        vocal_folder, bird_name = pieces

        # First scientific name whose common name matches (None when absent)
        bird_latin_name = next(
            (latin for latin, common in birds.items() if common == bird_name),
            None,
        )

        if bird_latin_name and vocal_folder in vocal_type_map:
            # Translate the folder label into the API query value and download
            download_audio(
                bird_latin_name,
                bird_name,
                vocal_type_map[vocal_folder],
                folder,
                needed_count,
            )
        else:
            print(f"⚠️ 无法解析文件夹: {folder}, 跳过...")

    except ValueError:
        print(f"⚠️ 无法解析文件夹: {folder}, 跳过...")

# **Final report of everything that still failed**
if not failed_downloads:
    print("\n✅ 所有需要补充的音频已成功下载！")
else:
    print("\n⚠️ 以下音频仍然无法下载：")
    for _latin, failed_bird, failed_vocal, _folder in failed_downloads:
        print(f"   - {failed_bird} - {failed_vocal}")


⚠️ 无法解析文件夹: Alarm call - Black-headed Gull, 跳过...
⚠️ 无法解析文件夹: Alarm call - Canada Goose, 跳过...
⚠️ 无法解析文件夹: Alarm call - Carrion Crow, 跳过...
⚠️ 无法解析文件夹: Alarm call - Common Blackbird, 跳过...
⚠️ 无法解析文件夹: Alarm call - Common Chaffinch, 跳过...
⚠️ 无法解析文件夹: Alarm call - Common Kingfisher, 跳过...
⚠️ 无法解析文件夹: Alarm call - Dunnock, 跳过...
⚠️ 无法解析文件夹: Alarm call - Eurasian Blue tit, 跳过...
⚠️ 无法解析文件夹: Alarm call - Eurasian Bullfinch, 跳过...
⚠️ 无法解析文件夹: Alarm call - Eurasian Coot, 跳过...
⚠️ 无法解析文件夹: Alarm call - Eurasian Jay, 跳过...
⚠️ 无法解析文件夹: Alarm call - Eurasian Nuthatch, 跳过...
⚠️ 无法解析文件夹: Alarm call - European Robin, 跳过...
⚠️ 无法解析文件夹: Alarm call - Great Tit, 跳过...
⚠️ 无法解析文件夹: Alarm call - Hawfinch, 跳过...
⚠️ 无法解析文件夹: Alarm call - Hooded Crow, 跳过...
⚠️ 无法解析文件夹: Alarm call - Lesser Black-backed Gull, 跳过...
⚠️ 无法解析文件夹: Alarm call - Long-tailed Tit, 跳过...
⚠️ 无法解析文件夹: Alarm call - Marsh Tit, 跳过...
⚠️ 无法解析文件夹: Alarm call - Redwing, 跳过...
⚠️ 无法解析文件夹: Alarm call - Rook, 跳过...
⚠️ 无法解析文件夹: Alarm call - Short-t

# Counter for Audio_files_ori (raw audio files)

In [4]:
import os

# Root directory that stores the audio folders
ROOT_DOWNLOAD_FOLDER = "E:/AMR/DA/Projekt/data/Audio_files_ori"

# Maps sub-folder name -> number of regular files directly inside it
folder_counts = {}

# Walk the immediate children of the root directory
for folder_name in os.listdir(ROOT_DOWNLOAD_FOLDER):
    folder_path = os.path.join(ROOT_DOWNLOAD_FOLDER, folder_name)

    # Anything that is not a directory (e.g. a CSV in the root) is ignored
    if not os.path.isdir(folder_path):
        continue

    # Count only regular files; nested directories are not counted
    folder_counts[folder_name] = sum(
        1
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )

# **Print the statistics**
print("\n📊 音频文件统计结果：")
for folder in folder_counts:
    print(f" {folder}: {folder_counts[folder]} 个文件")

# # **如果你希望以 CSV 格式导出**
# import csv

# csv_path = os.path.join(ROOT_DOWNLOAD_FOLDER, "audio_file_counts.csv")
# with open(csv_path, mode="w", newline="", encoding="utf-8") as file:
#     writer = csv.writer(file)
#     writer.writerow(["Folder Name", "File Count"])  # 表头
#     for folder, count in folder_counts.items():
#         writer.writerow([folder, count])
# 
# print(f"\n✅ 统计数据已保存到 {csv_path}")



📊 音频文件统计结果：
 Alarm call - Black-headed Gull: 50 个文件
 Alarm call - Canada Goose: 40 个文件
 Alarm call - Carrion Crow: 50 个文件
 Alarm call - Common Blackbird: 57 个文件
 Alarm call - Common Chaffinch: 50 个文件
 Alarm call - Common Kingfisher: 46 个文件
 Alarm call - Common Redstart: 25 个文件
 Alarm call - Common Wood Pigeon: 3 个文件
 Alarm call - Dunnock: 50 个文件
 Alarm call - Eurasian Blackcap: 50 个文件
 Alarm call - Eurasian Blue tit: 54 个文件
 Alarm call - Eurasian Bullfinch: 18 个文件
 Alarm call - Eurasian Coot: 37 个文件
 Alarm call - Eurasian Golden Oriole: 50 个文件
 Alarm call - Eurasian Jay: 50 个文件
 Alarm call - Eurasian Nuthatch: 50 个文件
 Alarm call - Eurasian Siskin: 26 个文件
 Alarm call - Eurasian Treecreeper: 45 个文件
 Alarm call - Eurasian Wren: 50 个文件
 Alarm call - European Goldfinch: 31 个文件
 Alarm call - European Robin: 50 个文件
 Alarm call - Goldcrest: 48 个文件
 Alarm call - Great Spotted Woodpecker: 50 个文件
 Alarm call - Great Tit: 50 个文件
 Alarm call - Hawfinch: 46 个文件
 Alarm call - Hooded Crow: 50 个文件
 Al

# 只补充需要的鸟类

In [1]:
import requests
import os
import time

# Root directory under which the downloaded audio folders are created
ROOT_DOWNLOAD_FOLDER = "E:/AMR/DA/Projekt/data/Audio_files_ori"

# Vocalization mapping: folder-name prefix -> value of the xeno-canto
# `type:"..."` search filter
vocal_type_map = {
    "Song": "song",
    "Call": "call"
}

# Mapping: scientific name (as a "+"-joined query term) -> English common name
# used as the folder-name suffix. Only the species to be downloaded are listed;
# dict order is the order in which they are downloaded.
birds = {
    "Chroicocephalus+ridibundus": "Black-headed Gull",
    "Alcedo+atthis": "Common Kingfisher",
    "Columba+oenas": "Stock Dove",
    "Branta+canadensis": "Canada Goose",
    "Corvus+corone": "Carrion Crow",
    "Phoenicurus+phoenicurus+phoenicurus": "Common Redstart",
    "Sylvia+atricapilla": "Eurasian Blackcap",
    "Fulica+atra": "Eurasian Coot",
    "Garrulus+glandarius": "Eurasian Jay",
    "Spinus+spinus": "Eurasian Siskin",
    "Certhia+familiaris": "Eurasian Treecreeper",
    "Troglodytes+troglodytes": "Eurasian Wren",
    "Corvus+cornix": "Hooded Crow",
    "Anas+platyrhynchos": "Mallard",
    "Certhia+brachydactyla": "Short-toed Treecreeper",
}

# Failure log: one (scientific name, common name, vocalization) tuple is
# appended for every failed API request or file download
failed_downloads = []

def download_audio(name, bird_name, vocal, save_folder, max_download=100):
    """Download up to ``max_download`` new recordings for one bird/vocalization.

    Queries the xeno-canto API (v2) for ``name`` restricted to the
    vocalization type ``vocal`` and saves recordings that do not yet exist in
    ``save_folder``. Only files downloaded in this call count towards
    ``max_download``; files already on disk are skipped without counting.

    Each file is streamed to a temporary ``*.part`` file and only moved to
    its final name once the transfer has finished. A truncated file would
    otherwise be treated as already downloaded and skipped on the next run.

    Args:
        name: Scientific name used as the query term (words joined by ``+``).
        bird_name: Common name; used in log messages and failure records.
        vocal: Value for the ``type:"..."`` search filter.
        save_folder: Directory to write into (created if missing).
        max_download: Maximum number of new files to save in this call.

    Returns:
        None. Failures are appended to the module-level ``failed_downloads``
        list as ``(name, bird_name, vocal)`` tuples.
    """
    query = f"{name}+type:%22{vocal}%22"
    url = f"https://www.xeno-canto.org/api/2/recordings?query={query}"

    os.makedirs(save_folder, exist_ok=True)
    print(f"\n🎯 下载 {bird_name} ({vocal})，保存至 {save_folder}")

    failure_record = (name, bird_name, vocal)

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        # Deliberately broad: this is a best-effort batch job, so any failure
        # for one bird is logged and the caller moves on to the next.
        print(f"❌ 请求失败: {bird_name} - {vocal}: {e}")
        failed_downloads.append(failure_record)
        return

    recordings = data.get("recordings", [])
    print(f"🔎 找到 {len(recordings)} 条录音")

    count = 0
    for rec in recordings:
        if count >= max_download:
            break
        file_id = rec['id']
        # NOTE(review): the file is always saved with a .wav extension without
        # checking the format actually served — confirm downstream readers cope.
        filename = f"XC{file_id}_{rec['gen']} {rec['sp']}.wav"
        filepath = os.path.join(save_folder, filename)
        temp_path = filepath + ".part"

        if os.path.exists(filepath):
            continue

        try:
            with requests.get(f"https://www.xeno-canto.org/{file_id}/download", stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(temp_path, "wb") as f:
                    for chunk in r.iter_content(8192):
                        f.write(chunk)
            # Publish the file under its final name only after a complete transfer.
            os.replace(temp_path, filepath)
        except Exception as e:
            print(f"⚠️ 下载失败: {filename} - {e}")
            failed_downloads.append(failure_record)
        else:
            print(f"✅ 下载成功: {filename}")
            count += 1
        finally:
            # After a successful os.replace the temp file is gone; anything
            # still present here is a partial download and must not be kept.
            if os.path.exists(temp_path):
                os.remove(temp_path)

# Download every configured vocalization type for every configured bird
for latin_name, common_name in birds.items():
    for vocal_folder in vocal_type_map:
        # Target folder is "<vocalization> - <common name>" under the root
        target_dir = os.path.join(
            ROOT_DOWNLOAD_FOLDER, f"{vocal_folder} - {common_name}"
        )
        download_audio(
            latin_name, common_name, vocal_type_map[vocal_folder], target_dir
        )

# Report the failures, if any
if not failed_downloads:
    print("\n✅ 所有音频下载完成！")
else:
    print("\n⚠️ 以下下载失败：")
    for failure in failed_downloads:
        print(f" - {failure}")



🎯 下载 Black-headed Gull (song)，保存至 E:/AMR/DA/Projekt/data/Audio_files_ori\Song - Black-headed Gull
🔎 找到 32 条录音

🎯 下载 Black-headed Gull (call)，保存至 E:/AMR/DA/Projekt/data/Audio_files_ori\Call - Black-headed Gull
🔎 找到 500 条录音
✅ 下载成功: XC985869_Chroicocephalus ridibundus.wav
✅ 下载成功: XC985414_Chroicocephalus ridibundus.wav
✅ 下载成功: XC772956_Chroicocephalus ridibundus.wav
✅ 下载成功: XC772796_Chroicocephalus ridibundus.wav
✅ 下载成功: XC765369_Chroicocephalus ridibundus.wav
✅ 下载成功: XC765368_Chroicocephalus ridibundus.wav
✅ 下载成功: XC754807_Chroicocephalus ridibundus.wav
✅ 下载成功: XC749629_Chroicocephalus ridibundus.wav
✅ 下载成功: XC749626_Chroicocephalus ridibundus.wav
✅ 下载成功: XC748959_Chroicocephalus ridibundus.wav
✅ 下载成功: XC748958_Chroicocephalus ridibundus.wav
✅ 下载成功: XC743393_Chroicocephalus ridibundus.wav
✅ 下载成功: XC742715_Chroicocephalus ridibundus.wav
✅ 下载成功: XC742340_Chroicocephalus ridibundus.wav
✅ 下载成功: XC739025_Chroicocephalus ridibundus.wav
✅ 下载成功: XC737676_Chroicocephalus ridibundus.wav
✅ 下载成功: X