In [1]:
import requests
import json
import pandas as pd
import os
from pathlib import Path
import math
from bs4 import BeautifulSoup
import pprint
from itertools import compress

# self-definition function
from func4crawl import src01 

In [2]:
# Current working directory and the project root (the directory above it)
path_cd = os.getcwd()
path_root = Path(path_cd).parent

# All downloaded data lives under <root>/data
path_data = os.path.join(path_root, "./data")

# One folder per sound category (category_00 ... category_09)
# plus a single-element list holding the metadata folder
b = range(10)
path_metadata = [os.path.join(path_data, "./metadata")]
path_category = [f'{path_data}/category_{i:02}' for i in b]

---

# target website 01

[BBC Rewind - Sound Effects](https://sound-effects.bbcrewind.co.uk/)

網站觀察與動作:

- 有 JS 渲染 (javascript render): 用 Chrome Extension - Quick Javascript switch 檢查首頁可知。
- AJAX類型網頁: 滑鼠右鍵檢視搜尋結果的網頁原始碼時，會完全找不到網頁上所顯示的內容，由此可知。
- 進一步用 F12 開發人員工具，檢查 Network，會發現有一個 search 的呼叫，這個API回傳結果，就是我們要的東西。
- API POST: 用 POSTMAN 測試可成功。見 POSTMAN 的 source01。
- 要做進一步的 filter 
  - keyword
  - duration

不知道 filter 的機制，因為有些聽起來很不準。  
e.g. dog barking 搜尋結果比 dog bark 多

---

**作法01:** 

發現失敗XD~  
網頁抓下來是沒有 JS 渲染的結果

- 先用動態載入網頁: `https://sound-effects.bbcrewind.co.uk/search?q=dog+bark`
- 知道篩選條件下的筆數
- 重新動態載入網頁，並新增筆數參數: `https://sound-effects.bbcrewind.co.uk/search?q=dog+bark&resultSize=200`
- 解析網頁程式碼

**作法02:**


- 搜尋
  - 用 requests 模組，加上 POST 方法，對 API 抓取資料。
  - 確認筆數再搜尋: 先給篩選條件 -> 取得所有筆數 -> 再一次 query 資料
- 篩選(optional)
  - 進一步篩選搜尋結果的 metadata。e.g. description, tags 的內容
- 資料清理
  - 因為可能有多組關鍵字，然後會搜尋到一樣的結果，所以要做 unique
- 下載資料
  - 抓出 id
  - 製造 url `https://sound-effects-media.bbcrewind.co.uk/mp3/NHU05099205.mp3` or `https://sound-effects-media.bbcrewind.co.uk/wav/NHU05099205.wav`

---

**Conclusion:**

- keyword
- duration < 300 sec
- check keyword in 'description' and 'tags'

---

## Barking

方法02

- %20 = ' '
- %2C = ','

e.g. `https://sound-effects.bbcrewind.co.uk/search?durations=0-9&q=dog%20bark`

In [3]:
# Search API endpoint that the queries are sent to
url = 'https://sound-effects-api.bbcrewind.co.uk/api/sfx/search'

# Keyword phrases to search for
query_kws = ['dog bark', 'dog barking']

# Parent folder in which this category's files are saved
path_category_save = path_category[0]

In [4]:
# Collects the keyword-filtered search hits of every phrase in query_kws.
res_all = []
for i_kws in query_kws:
    print(i_kws)
    query_cond = src01.query_cond(i_kws)

    # ---
    # First POST: send the search criteria only to learn the total hit count.
    r = requests.post(url=url, json=query_cond, timeout=60)
    # Fail loudly on an HTTP error instead of a confusing JSON/KeyError below.
    r.raise_for_status()
    response_json = r.json()
    query_cond['criteria']['size'] = response_json['total']

    # ---
    # Second POST: request all hits at once now that the total is known.
    r = requests.post(url=url, json=query_cond, timeout=60)
    r.raise_for_status()
    r_json = r.json()

    # ---
    # Secondary keyword filter: gather the masks produced for every single word.
    # NOTE(review): src01.cond_filter presumably returns a list of boolean masks
    # (one per inspected metadata field) aligned with r_json['results'] - confirm.
    cond_list = []
    for kw in i_kws.split():
        cond_list.extend(src01.cond_filter(keyword=kw, response_json=r_json))

    # Keep a result as soon as any mask flags it.
    cond = [any(flags) for flags in zip(*cond_list)]
    res_all.extend(compress(r_json['results'], cond))


dog bark
dog barking


In [5]:
# Drop repeated results while keeping first-seen order
# (membership is checked with `in`, i.e. by equality with an already kept item)
res_uni = []
for record in res_all:
    if record in res_uni:
        continue
    res_uni.append(record)

In [6]:
# Collect the id of every unique result and report how many files there are
wav_ids = [x['id'] for x in res_uni]
# Zero-padding width = number of decimal digits in the file count.
# len(str(n)) equals int(math.log10(n)) + 1 for every n >= 1, and unlike
# math.log10 it does not raise ValueError when the search returned no results.
digits = len(str(len(wav_ids)))
print(len(wav_ids))

# - file names: "s01-" + zero-padded running index + ".wav"
# - save paths: file name joined onto the category folder
# - download URLs: built from the result ids
all_filename = ["s01-" + str(i).zfill(digits) + ".wav" for i in range(len(wav_ids))]
all_path = [os.path.join(path_category_save, x) for x in all_filename]
all_url = [f'https://sound-effects-media.bbcrewind.co.uk/wav/{x}.wav' for x in wav_ids]
# MP3 alternative: f'https://sound-effects-media.bbcrewind.co.uk/mp3/{wav_id}.mp3'

319


In [7]:
%%time

csv_filename = []
csv_url = []
for i in range(len(wav_ids)):
# for i in range(5):
    print(i, '\t', all_filename[i], '\t', wav_ids[i])
    src01.download_wav(all_url[i], wav_ids[i], all_path[i])
    csv_filename.append(all_filename[i])
    csv_url.append(all_url[i])

0 	 s01-000.wav 	 NHU05099205
1 	 s01-001.wav 	 NHU05102196
2 	 s01-002.wav 	 NHU05099203
3 	 s01-003.wav 	 NHU05099204
4 	 s01-004.wav 	 NHU05085050
5 	 s01-005.wav 	 NHU05085051
6 	 s01-006.wav 	 NHU05018209
7 	 s01-007.wav 	 NHU05081081
8 	 s01-008.wav 	 07037490
9 	 s01-009.wav 	 07037489
10 	 s01-010.wav 	 07020114
11 	 s01-011.wav 	 07054125
12 	 s01-012.wav 	 07020115
13 	 s01-013.wav 	 NHU05003026
14 	 s01-014.wav 	 NHU05013070
15 	 s01-015.wav 	 07023376
16 	 s01-016.wav 	 07037475
17 	 s01-017.wav 	 07037474
18 	 s01-018.wav 	 NHU05017160
19 	 s01-019.wav 	 07020072
20 	 s01-020.wav 	 07042106
21 	 s01-021.wav 	 07037436
22 	 s01-022.wav 	 NHU05014104
23 	 s01-023.wav 	 NHU9710965
24 	 s01-024.wav 	 NHU9710971
25 	 s01-025.wav 	 07020097
26 	 s01-026.wav 	 07020082
27 	 s01-027.wav 	 NHU05017201
28 	 s01-028.wav 	 NHU05012171
29 	 s01-029.wav 	 07020131
30 	 s01-030.wav 	 07037437
31 	 s01-031.wav 	 07037473
32 	 s01-032.wav 	 NHU05079186
33 	 s01-033.wav 	 NHU05060001
34 	 s

272 	 s01-272.wav 	 07020063
273 	 s01-273.wav 	 07020099
274 	 s01-274.wav 	 07020086
275 	 s01-275.wav 	 07037447
276 	 s01-276.wav 	 07042107
277 	 s01-277.wav 	 NHU05017029
278 	 s01-278.wav 	 NHU05042001
279 	 s01-279.wav 	 07020132
280 	 s01-280.wav 	 NHU05102085
281 	 s01-281.wav 	 NHU05093036
282 	 s01-282.wav 	 NHU05050109
283 	 s01-283.wav 	 NHU05009088
284 	 s01-284.wav 	 NHU05098177
285 	 s01-285.wav 	 NHU05042005
286 	 s01-286.wav 	 NHU05017048
287 	 s01-287.wav 	 NHU05007148
288 	 s01-288.wav 	 NHU05019012
289 	 s01-289.wav 	 NHU05077154
290 	 s01-290.wav 	 NHU05082053
291 	 s01-291.wav 	 07050178
292 	 s01-292.wav 	 NHU05092050
293 	 s01-293.wav 	 NHU05019008
294 	 s01-294.wav 	 07050185
295 	 s01-295.wav 	 NHU05017200
296 	 s01-296.wav 	 NHU05040134
297 	 s01-297.wav 	 NHU05040129
298 	 s01-298.wav 	 NHU05041062
299 	 s01-299.wav 	 07050188
300 	 s01-300.wav 	 NHU05040132
301 	 s01-301.wav 	 NHU05029027
302 	 s01-302.wav 	 NHU05002147
303 	 s01-303.wav 	 NHU05058117
304

#### save metadata

In [8]:
# Write a CSV mapping each recorded file name to its source URL.
# The file is named after the category folder and stored in the metadata folder.
csv_name = os.path.basename(path_category_save) + '.csv'
path_csv = os.path.join(path_metadata[0], csv_name)
csv_columns = {'filename': csv_filename, 'url': csv_url}
data_csv = pd.DataFrame(csv_columns)
data_csv.to_csv(path_csv, index=False, encoding='utf-8')

In [9]:
# Pair every planned file name with the metadata record of its search result
# and store the list as JSON in the metadata folder, named after the category folder.
json_name = os.path.basename(path_category_save) + '.json'
path_json = os.path.join(path_metadata[0], json_name)
res_uni2json = [
    {'filename': name, 'metadata': meta}
    for name, meta in zip(all_filename, res_uni)
]
with open(path_json, 'w', encoding='utf-8') as jsonfile:
    # ensure_ascii=False writes non-ASCII characters unescaped
    json.dump(res_uni2json, jsonfile, ensure_ascii=False)

---

## Howling

In [10]:
# Search API endpoint that the queries are sent to
url = 'https://sound-effects-api.bbcrewind.co.uk/api/sfx/search'

# Keyword phrases to search for
query_kws = ['dog howl', 'dog howling']

# Parent folder in which this category's files are saved
path_category_save = path_category[1]

In [11]:
# Collects the keyword-filtered search hits of every phrase in query_kws.
res_all = []
for i_kws in query_kws:
    print(i_kws)
    query_cond = src01.query_cond(i_kws)

    # ---
    # First POST: send the search criteria only to learn the total hit count.
    r = requests.post(url=url, json=query_cond, timeout=60)
    # Fail loudly on an HTTP error instead of a confusing JSON/KeyError below.
    r.raise_for_status()
    response_json = r.json()
    query_cond['criteria']['size'] = response_json['total']

    # ---
    # Second POST: request all hits at once now that the total is known.
    r = requests.post(url=url, json=query_cond, timeout=60)
    r.raise_for_status()
    r_json = r.json()

    # ---
    # Secondary keyword filter: gather the masks produced for every single word.
    # NOTE(review): src01.cond_filter presumably returns a list of boolean masks
    # (one per inspected metadata field) aligned with r_json['results'] - confirm.
    cond_list = []
    for kw in i_kws.split():
        cond_list.extend(src01.cond_filter(keyword=kw, response_json=r_json))

    # Keep a result as soon as any mask flags it.
    cond = [any(flags) for flags in zip(*cond_list)]
    res_all.extend(compress(r_json['results'], cond))


dog howl
dog howling


In [12]:
# Drop repeated results while keeping first-seen order
# (membership is checked with `in`, i.e. by equality with an already kept item)
res_uni = []
for record in res_all:
    if record in res_uni:
        continue
    res_uni.append(record)

In [13]:
# Collect the id of every unique result and report how many files there are
wav_ids = [x['id'] for x in res_uni]
# Zero-padding width = number of decimal digits in the file count.
# len(str(n)) equals int(math.log10(n)) + 1 for every n >= 1, and unlike
# math.log10 it does not raise ValueError when the search returned no results.
digits = len(str(len(wav_ids)))
print(len(wav_ids))

# - file names: "s01-" + zero-padded running index + ".wav"
# - save paths: file name joined onto the category folder
# - download URLs: built from the result ids
all_filename = ["s01-" + str(i).zfill(digits) + ".wav" for i in range(len(wav_ids))]
all_path = [os.path.join(path_category_save, x) for x in all_filename]
all_url = [f'https://sound-effects-media.bbcrewind.co.uk/wav/{x}.wav' for x in wav_ids]
# MP3 alternative: f'https://sound-effects-media.bbcrewind.co.uk/mp3/{wav_id}.mp3'

217


In [14]:
%%time

csv_filename = []
csv_url = []
for i in range(len(wav_ids)):
# for i in range(5):
    print(i, '\t', all_filename[i], '\t', wav_ids[i])
    src01.download_wav(all_url[i], wav_ids[i], all_path[i])
    csv_filename.append(all_filename[i])
    csv_url.append(all_url[i])

0 	 s01-000.wav 	 NHU05077145
1 	 s01-001.wav 	 NHU9751742
2 	 s01-002.wav 	 NHU05100047
3 	 s01-003.wav 	 NHU05076112
4 	 s01-004.wav 	 NHU05017201
5 	 s01-005.wav 	 NHU05089171
6 	 s01-006.wav 	 NHU05089189
7 	 s01-007.wav 	 07037490
8 	 s01-008.wav 	 07037489
9 	 s01-009.wav 	 07020114
10 	 s01-010.wav 	 07054125
11 	 s01-011.wav 	 07020115
12 	 s01-012.wav 	 07023376
13 	 s01-013.wav 	 07037475
14 	 s01-014.wav 	 07037474
15 	 s01-015.wav 	 NHU05089194
16 	 s01-016.wav 	 NHU05089172
17 	 s01-017.wav 	 NHU05089180
18 	 s01-018.wav 	 07020072
19 	 s01-019.wav 	 07042106
20 	 s01-020.wav 	 07037436
21 	 s01-021.wav 	 NHU9710965
22 	 s01-022.wav 	 NHU9710971
23 	 s01-023.wav 	 07020097
24 	 s01-024.wav 	 07020082
25 	 s01-025.wav 	 NHU05012171
26 	 s01-026.wav 	 07020131
27 	 s01-027.wav 	 07037437
28 	 s01-028.wav 	 07037473
29 	 s01-029.wav 	 NHU05079186
30 	 s01-030.wav 	 NHU05060001
31 	 s01-031.wav 	 07020074
32 	 s01-032.wav 	 NHU9710969
33 	 s01-033.wav 	 07023377
34 	 s01-034.w

ConnectionError: HTTPSConnectionPool(host='sound-effects-media.bbcrewind.co.uk', port=443): Max retries exceeded with url: /wav/NHU05018209.wav (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000028011705BC8>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

#### save metadata

In [15]:
# Write a CSV mapping each recorded file name to its source URL.
# The file is named after the category folder and stored in the metadata folder.
csv_name = os.path.basename(path_category_save) + '.csv'
path_csv = os.path.join(path_metadata[0], csv_name)
csv_columns = {'filename': csv_filename, 'url': csv_url}
data_csv = pd.DataFrame(csv_columns)
data_csv.to_csv(path_csv, index=False, encoding='utf-8')

In [16]:
# Pair every planned file name with the metadata record of its search result
# and store the list as JSON in the metadata folder, named after the category folder.
json_name = os.path.basename(path_category_save) + '.json'
path_json = os.path.join(path_metadata[0], json_name)
res_uni2json = [
    {'filename': name, 'metadata': meta}
    for name, meta in zip(all_filename, res_uni)
]
with open(path_json, 'w', encoding='utf-8') as jsonfile:
    # ensure_ascii=False writes non-ASCII characters unescaped
    json.dump(res_uni2json, jsonfile, ensure_ascii=False)

---

## Crying

In [17]:
# Search API endpoint that the queries are sent to
url = 'https://sound-effects-api.bbcrewind.co.uk/api/sfx/search'

# Keyword phrases to search for
query_kws = ['dog cry', 'dog crying']

# Parent folder in which this category's files are saved
path_category_save = path_category[2]

In [18]:
# Collects the keyword-filtered search hits of every phrase in query_kws.
res_all = []
for i_kws in query_kws:
    print(i_kws)
    query_cond = src01.query_cond(i_kws)

    # ---
    # First POST: send the search criteria only to learn the total hit count.
    r = requests.post(url=url, json=query_cond, timeout=60)
    # Fail loudly on an HTTP error instead of a confusing JSON/KeyError below.
    r.raise_for_status()
    response_json = r.json()
    query_cond['criteria']['size'] = response_json['total']

    # ---
    # Second POST: request all hits at once now that the total is known.
    r = requests.post(url=url, json=query_cond, timeout=60)
    r.raise_for_status()
    r_json = r.json()

    # ---
    # Secondary keyword filter: gather the masks produced for every single word.
    # NOTE(review): src01.cond_filter presumably returns a list of boolean masks
    # (one per inspected metadata field) aligned with r_json['results'] - confirm.
    cond_list = []
    for kw in i_kws.split():
        cond_list.extend(src01.cond_filter(keyword=kw, response_json=r_json))

    # Keep a result as soon as any mask flags it.
    cond = [any(flags) for flags in zip(*cond_list)]
    res_all.extend(compress(r_json['results'], cond))


dog cry
dog crying


In [19]:
# Drop repeated results while keeping first-seen order
# (membership is checked with `in`, i.e. by equality with an already kept item)
res_uni = []
for record in res_all:
    if record in res_uni:
        continue
    res_uni.append(record)

In [20]:
# Collect the id of every unique result and report how many files there are
wav_ids = [x['id'] for x in res_uni]
# Zero-padding width = number of decimal digits in the file count.
# len(str(n)) equals int(math.log10(n)) + 1 for every n >= 1, and unlike
# math.log10 it does not raise ValueError when the search returned no results.
digits = len(str(len(wav_ids)))
print(len(wav_ids))

# - file names: "s01-" + zero-padded running index + ".wav"
# - save paths: file name joined onto the category folder
# - download URLs: built from the result ids
all_filename = ["s01-" + str(i).zfill(digits) + ".wav" for i in range(len(wav_ids))]
all_path = [os.path.join(path_category_save, x) for x in all_filename]
all_url = [f'https://sound-effects-media.bbcrewind.co.uk/wav/{x}.wav' for x in wav_ids]
# MP3 alternative: f'https://sound-effects-media.bbcrewind.co.uk/mp3/{wav_id}.mp3'

216


In [21]:
%%time

csv_filename = []
csv_url = []
for i in range(len(wav_ids)):
# for i in range(5):
    print(i, '\t', all_filename[i], '\t', wav_ids[i])
    src01.download_wav(all_url[i], wav_ids[i], all_path[i])
    csv_filename.append(all_filename[i])
    csv_url.append(all_url[i])

0 	 s01-000.wav 	 NHU05084153
1 	 s01-001.wav 	 NHU05083210
2 	 s01-002.wav 	 NHU05091090
3 	 s01-003.wav 	 NHU9711485
4 	 s01-004.wav 	 07005157
5 	 s01-005.wav 	 07074096
6 	 s01-006.wav 	 NHU05099201
7 	 s01-007.wav 	 NHU05025103
8 	 s01-008.wav 	 07037490
9 	 s01-009.wav 	 07037489
10 	 s01-010.wav 	 07005013
11 	 s01-011.wav 	 07020114
12 	 s01-012.wav 	 07054125
13 	 s01-013.wav 	 07045240
14 	 s01-014.wav 	 07020115
15 	 s01-015.wav 	 07023376
16 	 s01-016.wav 	 07037475
17 	 s01-017.wav 	 07037474
18 	 s01-018.wav 	 07020072
19 	 s01-019.wav 	 07042106
20 	 s01-020.wav 	 07037436
21 	 s01-021.wav 	 NHU9710965
22 	 s01-022.wav 	 NHU9710971
23 	 s01-023.wav 	 07020097
24 	 s01-024.wav 	 07020082
25 	 s01-025.wav 	 NHU05012171
26 	 s01-026.wav 	 07020131
27 	 s01-027.wav 	 07037437
28 	 s01-028.wav 	 NHU05099198
29 	 s01-029.wav 	 NHU05090064
30 	 s01-030.wav 	 07037473
31 	 s01-031.wav 	 NHU05079186
32 	 s01-032.wav 	 NHU05060001
33 	 s01-033.wav 	 07020074
34 	 s01-034.wav 	 NHU

#### save metadata

In [22]:
# Write a CSV mapping each recorded file name to its source URL.
# The file is named after the category folder and stored in the metadata folder.
csv_name = os.path.basename(path_category_save) + '.csv'
path_csv = os.path.join(path_metadata[0], csv_name)
csv_columns = {'filename': csv_filename, 'url': csv_url}
data_csv = pd.DataFrame(csv_columns)
data_csv.to_csv(path_csv, index=False, encoding='utf-8')

In [23]:
# Pair every planned file name with the metadata record of its search result
# and store the list as JSON in the metadata folder, named after the category folder.
json_name = os.path.basename(path_category_save) + '.json'
path_json = os.path.join(path_metadata[0], json_name)
res_uni2json = [
    {'filename': name, 'metadata': meta}
    for name, meta in zip(all_filename, res_uni)
]
with open(path_json, 'w', encoding='utf-8') as jsonfile:
    # ensure_ascii=False writes non-ASCII characters unescaped
    json.dump(res_uni2json, jsonfile, ensure_ascii=False)

---

## CO_Smoke

---

## GlassBreaking

In [24]:
# Search API endpoint that the queries are sent to
url = 'https://sound-effects-api.bbcrewind.co.uk/api/sfx/search'

# Keyword phrases to search for
query_kws = ['glass crash', 'glass crashing']

# Parent folder in which this category's files are saved
path_category_save = path_category[4]

In [25]:
# Collects the keyword-filtered search hits of every phrase in query_kws.
res_all = []
for i_kws in query_kws:
    print(i_kws)
    query_cond = src01.query_cond(i_kws)

    # ---
    # First POST: send the search criteria only to learn the total hit count.
    r = requests.post(url=url, json=query_cond, timeout=60)
    # Fail loudly on an HTTP error instead of a confusing JSON/KeyError below.
    r.raise_for_status()
    response_json = r.json()
    query_cond['criteria']['size'] = response_json['total']

    # ---
    # Second POST: request all hits at once now that the total is known.
    r = requests.post(url=url, json=query_cond, timeout=60)
    r.raise_for_status()
    r_json = r.json()

    # ---
    # Secondary keyword filter: gather the masks produced for every single word.
    # NOTE(review): src01.cond_filter presumably returns a list of boolean masks
    # (one per inspected metadata field) aligned with r_json['results'] - confirm.
    cond_list = []
    for kw in i_kws.split():
        cond_list.extend(src01.cond_filter(keyword=kw, response_json=r_json))

    # Keep a result as soon as any mask flags it.
    cond = [any(flags) for flags in zip(*cond_list)]
    res_all.extend(compress(r_json['results'], cond))


glass crash
glass crashing


In [26]:
# Drop repeated results while keeping first-seen order
# (membership is checked with `in`, i.e. by equality with an already kept item)
res_uni = []
for record in res_all:
    if record in res_uni:
        continue
    res_uni.append(record)

In [27]:
# Collect the id of every unique result and report how many files there are
wav_ids = [x['id'] for x in res_uni]
# Zero-padding width = number of decimal digits in the file count.
# len(str(n)) equals int(math.log10(n)) + 1 for every n >= 1, and unlike
# math.log10 it does not raise ValueError when the search returned no results.
digits = len(str(len(wav_ids)))
print(len(wav_ids))

# - file names: "s01-" + zero-padded running index + ".wav"
# - save paths: file name joined onto the category folder
# - download URLs: built from the result ids
all_filename = ["s01-" + str(i).zfill(digits) + ".wav" for i in range(len(wav_ids))]
all_path = [os.path.join(path_category_save, x) for x in all_filename]
all_url = [f'https://sound-effects-media.bbcrewind.co.uk/wav/{x}.wav' for x in wav_ids]
# MP3 alternative: f'https://sound-effects-media.bbcrewind.co.uk/mp3/{wav_id}.mp3'

331


In [28]:
%%time

csv_filename = []
csv_url = []
for i in range(len(wav_ids)):
# for i in range(5):
    print(i, '\t', all_filename[i], '\t', wav_ids[i])
    src01.download_wav(all_url[i], wav_ids[i], all_path[i])
    csv_filename.append(all_filename[i])
    csv_url.append(all_url[i])

0 	 s01-000.wav 	 07022493
1 	 s01-001.wav 	 07022392
2 	 s01-002.wav 	 07022389
3 	 s01-003.wav 	 07022376
4 	 s01-004.wav 	 07022488
5 	 s01-005.wav 	 07022393
6 	 s01-006.wav 	 07022390
7 	 s01-007.wav 	 07022377
8 	 s01-008.wav 	 07022494
9 	 s01-009.wav 	 07022489
10 	 s01-010.wav 	 07022487
11 	 s01-011.wav 	 07022485
12 	 s01-012.wav 	 07022483
13 	 s01-013.wav 	 07022394
14 	 s01-014.wav 	 07022380
15 	 s01-015.wav 	 07022492
16 	 s01-016.wav 	 07022486
17 	 s01-017.wav 	 07022484
18 	 s01-018.wav 	 07022387
19 	 s01-019.wav 	 07022491
20 	 s01-020.wav 	 07022490
21 	 s01-021.wav 	 07022391
22 	 s01-022.wav 	 07022388
23 	 s01-023.wav 	 07022381
24 	 s01-024.wav 	 07022396
25 	 s01-025.wav 	 07022385
26 	 s01-026.wav 	 07022384
27 	 s01-027.wav 	 07022386
28 	 s01-028.wav 	 07022395
29 	 s01-029.wav 	 07022504
30 	 s01-030.wav 	 07022382
31 	 s01-031.wav 	 07022379
32 	 s01-032.wav 	 07065176
33 	 s01-033.wav 	 07022378
34 	 s01-034.wav 	 07022506
35 	 s01-035.wav 	 07022496
36

287 	 s01-287.wav 	 NHU05010165
288 	 s01-288.wav 	 07050078
289 	 s01-289.wav 	 NHU05006110
290 	 s01-290.wav 	 NHU05029024
291 	 s01-291.wav 	 07022317
292 	 s01-292.wav 	 07022323
293 	 s01-293.wav 	 07022319
294 	 s01-294.wav 	 07022325
295 	 s01-295.wav 	 07022318
296 	 s01-296.wav 	 07022322
297 	 s01-297.wav 	 07022321
298 	 s01-298.wav 	 07022316
299 	 s01-299.wav 	 07022327
300 	 s01-300.wav 	 07022326
301 	 s01-301.wav 	 07022324
302 	 s01-302.wav 	 07022320
303 	 s01-303.wav 	 07022374
304 	 s01-304.wav 	 NHU9707956
305 	 s01-305.wav 	 07044067
306 	 s01-306.wav 	 07005140
307 	 s01-307.wav 	 NHU05100185
308 	 s01-308.wav 	 NHU05074124
309 	 s01-309.wav 	 07071068
310 	 s01-310.wav 	 NHU05020081
311 	 s01-311.wav 	 NHU05031085
312 	 s01-312.wav 	 07075062
313 	 s01-313.wav 	 NHU05018183
314 	 s01-314.wav 	 NHU05074126
315 	 s01-315.wav 	 NHU05017041
316 	 s01-316.wav 	 NHU05040141
317 	 s01-317.wav 	 NHU05040140
318 	 s01-318.wav 	 NHU05012142
319 	 s01-319.wav 	 NHU05013135

#### save metadata

In [29]:
# Write a CSV mapping each recorded file name to its source URL.
# The file is named after the category folder and stored in the metadata folder.
csv_name = os.path.basename(path_category_save) + '.csv'
path_csv = os.path.join(path_metadata[0], csv_name)
csv_columns = {'filename': csv_filename, 'url': csv_url}
data_csv = pd.DataFrame(csv_columns)
data_csv.to_csv(path_csv, index=False, encoding='utf-8')

In [30]:
# Pair every planned file name with the metadata record of its search result
# and store the list as JSON in the metadata folder, named after the category folder.
json_name = os.path.basename(path_category_save) + '.json'
path_json = os.path.join(path_metadata[0], json_name)
res_uni2json = [
    {'filename': name, 'metadata': meta}
    for name, meta in zip(all_filename, res_uni)
]
with open(path_json, 'w', encoding='utf-8') as jsonfile:
    # ensure_ascii=False writes non-ASCII characters unescaped
    json.dump(res_uni2json, jsonfile, ensure_ascii=False)

---

## Other

Other, Vacuum, Blender, Electrics, Cat, Dishes

In [31]:
# Search API endpoint that the queries are sent to
url = 'https://sound-effects-api.bbcrewind.co.uk/api/sfx/search'

# Keyword phrases to search for
query_kws = ['vacuum', 'electric', 'mower', 'wave', 'electronic noise', 'noise']

# Parent folder in which this category's files are saved
path_category_save = path_category[5]

In [32]:
# Collects the keyword-filtered search hits of every phrase in query_kws.
res_all = []
for i_kws in query_kws:
    print(i_kws)
    query_cond = src01.query_cond(i_kws)

    # ---
    # First POST: send the search criteria only to learn the total hit count.
    r = requests.post(url=url, json=query_cond, timeout=60)
    # Fail loudly on an HTTP error instead of a confusing JSON/KeyError below.
    r.raise_for_status()
    response_json = r.json()
    query_cond['criteria']['size'] = response_json['total']

    # ---
    # Second POST: request all hits at once now that the total is known.
    r = requests.post(url=url, json=query_cond, timeout=60)
    r.raise_for_status()
    r_json = r.json()

    # ---
    # Secondary keyword filter: gather the masks produced for every single word.
    # NOTE(review): src01.cond_filter presumably returns a list of boolean masks
    # (one per inspected metadata field) aligned with r_json['results'] - confirm.
    cond_list = []
    for kw in i_kws.split():
        cond_list.extend(src01.cond_filter(keyword=kw, response_json=r_json))

    # Keep a result as soon as any mask flags it.
    cond = [any(flags) for flags in zip(*cond_list)]
    res_all.extend(compress(r_json['results'], cond))


vacuum
electric
mower
wave
electronic noise
noise


In [33]:
# Drop repeated results while keeping first-seen order
# (membership is checked with `in`, i.e. by equality with an already kept item)
res_uni = []
for record in res_all:
    if record in res_uni:
        continue
    res_uni.append(record)

In [34]:
# Collect the id of every unique result and report how many files there are
wav_ids = [x['id'] for x in res_uni]
# Zero-padding width = number of decimal digits in the file count.
# len(str(n)) equals int(math.log10(n)) + 1 for every n >= 1, and unlike
# math.log10 it does not raise ValueError when the search returned no results.
digits = len(str(len(wav_ids)))
print(len(wav_ids))

# - file names: "s01-" + zero-padded running index + ".wav"
# - save paths: file name joined onto the category folder
# - download URLs: built from the result ids
all_filename = ["s01-" + str(i).zfill(digits) + ".wav" for i in range(len(wav_ids))]
all_path = [os.path.join(path_category_save, x) for x in all_filename]
all_url = [f'https://sound-effects-media.bbcrewind.co.uk/wav/{x}.wav' for x in wav_ids]
# MP3 alternative: f'https://sound-effects-media.bbcrewind.co.uk/mp3/{wav_id}.mp3'

952


In [35]:
%%time

csv_filename = []
csv_url = []
for i in range(len(wav_ids)):
# for i in range(5):
    print(i, '\t', all_filename[i], '\t', wav_ids[i])
    src01.download_wav(all_url[i], wav_ids[i], all_path[i])
    csv_filename.append(all_filename[i])
    csv_url.append(all_url[i])

0 	 s01-000.wav 	 07042264
1 	 s01-001.wav 	 0009071
2 	 s01-002.wav 	 07059042
3 	 s01-003.wav 	 07002002
4 	 s01-004.wav 	 07027212
5 	 s01-005.wav 	 07002006
6 	 s01-006.wav 	 07041177
7 	 s01-007.wav 	 07059041
8 	 s01-008.wav 	 07002007
9 	 s01-009.wav 	 07002001
10 	 s01-010.wav 	 07002008
11 	 s01-011.wav 	 07002004
12 	 s01-012.wav 	 07041005
13 	 s01-013.wav 	 07002005
14 	 s01-014.wav 	 07002003
15 	 s01-015.wav 	 07059039
16 	 s01-016.wav 	 07059040
17 	 s01-017.wav 	 NHU05004117
18 	 s01-018.wav 	 07045154
19 	 s01-019.wav 	 07042275
20 	 s01-020.wav 	 NHU05004119
21 	 s01-021.wav 	 07042276
22 	 s01-022.wav 	 07004187
23 	 s01-023.wav 	 07032092
24 	 s01-024.wav 	 07042277
25 	 s01-025.wav 	 07010191
26 	 s01-026.wav 	 07027194
27 	 s01-027.wav 	 07055170
28 	 s01-028.wav 	 NHU05088005
29 	 s01-029.wav 	 NHU05004115
30 	 s01-030.wav 	 NHU05004116
31 	 s01-031.wav 	 07027216
32 	 s01-032.wav 	 07046063
33 	 s01-033.wav 	 07045153
34 	 s01-034.wav 	 07049028
35 	 s01-035.wav

286 	 s01-286.wav 	 07032324
287 	 s01-287.wav 	 07032318
288 	 s01-288.wav 	 07070198
289 	 s01-289.wav 	 07032313
290 	 s01-290.wav 	 07025090
291 	 s01-291.wav 	 07032320
292 	 s01-292.wav 	 07041089
293 	 s01-293.wav 	 07032316
294 	 s01-294.wav 	 07032306
295 	 s01-295.wav 	 07074009
296 	 s01-296.wav 	 07032283
297 	 s01-297.wav 	 07059029
298 	 s01-298.wav 	 07059005
299 	 s01-299.wav 	 07032282
300 	 s01-300.wav 	 07059006
301 	 s01-301.wav 	 07059002
302 	 s01-302.wav 	 07001028
303 	 s01-303.wav 	 07032279
304 	 s01-304.wav 	 07032278
305 	 s01-305.wav 	 07059012
306 	 s01-306.wav 	 07059010
307 	 s01-307.wav 	 07059003
308 	 s01-308.wav 	 07032289
309 	 s01-309.wav 	 07032287
310 	 s01-310.wav 	 07002288
311 	 s01-311.wav 	 07032288
312 	 s01-312.wav 	 07059001
313 	 s01-313.wav 	 07032285
314 	 s01-314.wav 	 07059008
315 	 s01-315.wav 	 07032280
316 	 s01-316.wav 	 07032281
317 	 s01-317.wav 	 07059009
318 	 s01-318.wav 	 07032286
319 	 s01-319.wav 	 07002287
320 	 s01-320.

568 	 s01-568.wav 	 07057067
569 	 s01-569.wav 	 07023321
570 	 s01-570.wav 	 07053023
571 	 s01-571.wav 	 NHU05004097
572 	 s01-572.wav 	 07043373
573 	 s01-573.wav 	 07074224
574 	 s01-574.wav 	 07074231
575 	 s01-575.wav 	 NHU05097031
576 	 s01-576.wav 	 NHU05009112
577 	 s01-577.wav 	 07072006
578 	 s01-578.wav 	 07049106
579 	 s01-579.wav 	 NHU05021054
580 	 s01-580.wav 	 07050142
581 	 s01-581.wav 	 07047203
582 	 s01-582.wav 	 07074235
583 	 s01-583.wav 	 07063002
584 	 s01-584.wav 	 NHU05009114
585 	 s01-585.wav 	 07034186
586 	 s01-586.wav 	 07045074
587 	 s01-587.wav 	 07050107
588 	 s01-588.wav 	 07023322
589 	 s01-589.wav 	 07000112
590 	 s01-590.wav 	 07000110
591 	 s01-591.wav 	 07000107
592 	 s01-592.wav 	 07000104
593 	 s01-593.wav 	 07000103
594 	 s01-594.wav 	 07000102
595 	 s01-595.wav 	 07049107
596 	 s01-596.wav 	 NHU05064138
597 	 s01-597.wav 	 NHU05007121
598 	 s01-598.wav 	 NHU05074086
599 	 s01-599.wav 	 NHU05076018
600 	 s01-600.wav 	 07062064
601 	 s01-601.wa

834 	 s01-834.wav 	 07063066
835 	 s01-835.wav 	 NHU05068138
836 	 s01-836.wav 	 NHU05021062
837 	 s01-837.wav 	 NHU05017221
838 	 s01-838.wav 	 NHU05018056
839 	 s01-839.wav 	 NHU05015027
840 	 s01-840.wav 	 NHU05050024
841 	 s01-841.wav 	 NHU05094071
842 	 s01-842.wav 	 NHU05019093
843 	 s01-843.wav 	 NHU05103031
844 	 s01-844.wav 	 NHU05037028
845 	 s01-845.wav 	 NHU05021194
846 	 s01-846.wav 	 NHU05016051
847 	 s01-847.wav 	 NHU05042036
848 	 s01-848.wav 	 NHU05007058
849 	 s01-849.wav 	 NHU05008127
850 	 s01-850.wav 	 NHU05056009
851 	 s01-851.wav 	 NHU05044057
852 	 s01-852.wav 	 NHU05030029
853 	 s01-853.wav 	 NHU05044008
854 	 s01-854.wav 	 NHU05021063
855 	 s01-855.wav 	 NHU05015185
856 	 s01-856.wav 	 07048139
857 	 s01-857.wav 	 NHU05068128
858 	 s01-858.wav 	 NHU05085049
859 	 s01-859.wav 	 NHU05006106
860 	 s01-860.wav 	 NHU05060122
861 	 s01-861.wav 	 07062066
862 	 s01-862.wav 	 NHU05089030
863 	 s01-863.wav 	 NHU05004071
864 	 s01-864.wav 	 NHU05014132
865 	 s01-865.wav

#### save metadata

In [36]:
# Write a CSV mapping each recorded file name to its source URL.
# The file is named after the category folder and stored in the metadata folder.
csv_name = os.path.basename(path_category_save) + '.csv'
path_csv = os.path.join(path_metadata[0], csv_name)
csv_columns = {'filename': csv_filename, 'url': csv_url}
data_csv = pd.DataFrame(csv_columns)
data_csv.to_csv(path_csv, index=False, encoding='utf-8')

In [37]:
# Pair every planned file name with the metadata record of its search result
# and store the list as JSON in the metadata folder, named after the category folder.
json_name = os.path.basename(path_category_save) + '.json'
path_json = os.path.join(path_metadata[0], json_name)
res_uni2json = [
    {'filename': name, 'metadata': meta}
    for name, meta in zip(all_filename, res_uni)
]
with open(path_json, 'w', encoding='utf-8') as jsonfile:
    # ensure_ascii=False writes non-ASCII characters unescaped
    json.dump(res_uni2json, jsonfile, ensure_ascii=False)

---

## Doorbell

In [38]:
# Search API endpoint that the queries are sent to
url = 'https://sound-effects-api.bbcrewind.co.uk/api/sfx/search'

# Keyword phrases to search for
query_kws = ['door bell', 'bell', 'doorbell', 'chime']

# Parent folder in which this category's files are saved
path_category_save = path_category[6]

In [39]:
# Collects the keyword-filtered search hits of every phrase in query_kws.
res_all = []
for i_kws in query_kws:
    print(i_kws)
    query_cond = src01.query_cond(i_kws)

    # ---
    # First POST: send the search criteria only to learn the total hit count.
    r = requests.post(url=url, json=query_cond, timeout=60)
    # Fail loudly on an HTTP error instead of a confusing JSON/KeyError below.
    r.raise_for_status()
    response_json = r.json()
    query_cond['criteria']['size'] = response_json['total']

    # ---
    # Second POST: request all hits at once now that the total is known.
    r = requests.post(url=url, json=query_cond, timeout=60)
    r.raise_for_status()
    r_json = r.json()

    # ---
    # Secondary keyword filter: gather the masks produced for every single word.
    # NOTE(review): src01.cond_filter presumably returns a list of boolean masks
    # (one per inspected metadata field) aligned with r_json['results'] - confirm.
    cond_list = []
    for kw in i_kws.split():
        cond_list.extend(src01.cond_filter(keyword=kw, response_json=r_json))

    # Keep a result as soon as any mask flags it.
    cond = [any(flags) for flags in zip(*cond_list)]
    res_all.extend(compress(r_json['results'], cond))


door bell
bell
doorbell
chime


In [40]:
# Drop repeated results while keeping first-seen order
# (membership is checked with `in`, i.e. by equality with an already kept item)
res_uni = []
for record in res_all:
    if record in res_uni:
        continue
    res_uni.append(record)

In [41]:
# Collect the id of every unique result and report how many files there are
wav_ids = [x['id'] for x in res_uni]
# Zero-padding width = number of decimal digits in the file count.
# len(str(n)) equals int(math.log10(n)) + 1 for every n >= 1, and unlike
# math.log10 it does not raise ValueError when the search returned no results.
digits = len(str(len(wav_ids)))
print(len(wav_ids))

# - file names: "s01-" + zero-padded running index + ".wav"
# - save paths: file name joined onto the category folder
# - download URLs: built from the result ids
all_filename = ["s01-" + str(i).zfill(digits) + ".wav" for i in range(len(wav_ids))]
all_path = [os.path.join(path_category_save, x) for x in all_filename]
all_url = [f'https://sound-effects-media.bbcrewind.co.uk/wav/{x}.wav' for x in wav_ids]
# MP3 alternative: f'https://sound-effects-media.bbcrewind.co.uk/mp3/{wav_id}.mp3'

832


In [42]:
%%time

csv_filename = []
csv_url = []
for i in range(len(wav_ids)):
# for i in range(5):
    print(i, '\t', all_filename[i], '\t', wav_ids[i])
    src01.download_wav(all_url[i], wav_ids[i], all_path[i])
    csv_filename.append(all_filename[i])
    csv_url.append(all_url[i])

0 	 s01-000.wav 	 07012082
1 	 s01-001.wav 	 07012067
2 	 s01-002.wav 	 07012068
3 	 s01-003.wav 	 07012066
4 	 s01-004.wav 	 NHU05014051
5 	 s01-005.wav 	 07012073
6 	 s01-006.wav 	 07012072
7 	 s01-007.wav 	 07043044
8 	 s01-008.wav 	 07012071
9 	 s01-009.wav 	 07012070
10 	 s01-010.wav 	 07012069
11 	 s01-011.wav 	 07012074
12 	 s01-012.wav 	 07050201
13 	 s01-013.wav 	 07047196
14 	 s01-014.wav 	 07027081
15 	 s01-015.wav 	 07015058
16 	 s01-016.wav 	 07037546
17 	 s01-017.wav 	 07027086
18 	 s01-018.wav 	 07027080
19 	 s01-019.wav 	 07037354
20 	 s01-020.wav 	 07011092
21 	 s01-021.wav 	 07037424
22 	 s01-022.wav 	 07011096
23 	 s01-023.wav 	 07037507
24 	 s01-024.wav 	 07012087
25 	 s01-025.wav 	 07011093
26 	 s01-026.wav 	 07037425
27 	 s01-027.wav 	 07037422
28 	 s01-028.wav 	 07011094
29 	 s01-029.wav 	 07022328
30 	 s01-030.wav 	 07027096
31 	 s01-031.wav 	 07037477
32 	 s01-032.wav 	 07037552
33 	 s01-033.wav 	 07037519
34 	 s01-034.wav 	 07037621
35 	 s01-035.wav 	 07037555

287 	 s01-287.wav 	 07037537
288 	 s01-288.wav 	 07037512
289 	 s01-289.wav 	 07025159
290 	 s01-290.wav 	 07043245
291 	 s01-291.wav 	 07061006
292 	 s01-292.wav 	 07025151
293 	 s01-293.wav 	 07032063
294 	 s01-294.wav 	 07032034
295 	 s01-295.wav 	 07072159
296 	 s01-296.wav 	 07023184
297 	 s01-297.wav 	 07000042
298 	 s01-298.wav 	 07012016
299 	 s01-299.wav 	 07005121
300 	 s01-300.wav 	 07027104
301 	 s01-301.wav 	 07025136
302 	 s01-302.wav 	 07037605
303 	 s01-303.wav 	 07037601
304 	 s01-304.wav 	 07037596
305 	 s01-305.wav 	 07043095
306 	 s01-306.wav 	 07012075
307 	 s01-307.wav 	 07011132
308 	 s01-308.wav 	 07004014
309 	 s01-309.wav 	 07025137
310 	 s01-310.wav 	 07037599
311 	 s01-311.wav 	 07037592
312 	 s01-312.wav 	 07040184
313 	 s01-313.wav 	 07040183
314 	 s01-314.wav 	 07023185
315 	 s01-315.wav 	 07011128
316 	 s01-316.wav 	 07011087
317 	 s01-317.wav 	 07004015
318 	 s01-318.wav 	 07027103
319 	 s01-319.wav 	 07037603
320 	 s01-320.wav 	 07037602
321 	 s01-321.

569 	 s01-569.wav 	 07027094
570 	 s01-570.wav 	 07063146
571 	 s01-571.wav 	 07037516
572 	 s01-572.wav 	 07037511
573 	 s01-573.wav 	 07054093
574 	 s01-574.wav 	 07012005
575 	 s01-575.wav 	 07013055
576 	 s01-576.wav 	 07038174
577 	 s01-577.wav 	 07039369
578 	 s01-578.wav 	 07042286
579 	 s01-579.wav 	 07001099
580 	 s01-580.wav 	 07011082
581 	 s01-581.wav 	 07011043
582 	 s01-582.wav 	 07011012
583 	 s01-583.wav 	 07037550
584 	 s01-584.wav 	 NHU05103069
585 	 s01-585.wav 	 07022070
586 	 s01-586.wav 	 07032064
587 	 s01-587.wav 	 07043270
588 	 s01-588.wav 	 07022047
589 	 s01-589.wav 	 07022040
590 	 s01-590.wav 	 07022037
591 	 s01-591.wav 	 07046036
592 	 s01-592.wav 	 07011101
593 	 s01-593.wav 	 07008155
594 	 s01-594.wav 	 07022270
595 	 s01-595.wav 	 07027111
596 	 s01-596.wav 	 07058156
597 	 s01-597.wav 	 07065039
598 	 s01-598.wav 	 NHU05013113
599 	 s01-599.wav 	 07022072
600 	 s01-600.wav 	 07022068
601 	 s01-601.wav 	 07022066
602 	 s01-602.wav 	 07022063
603 	 s0

#### save metadata

In [43]:
# Write a CSV mapping each recorded file name to its source URL.
# The file is named after the category folder and stored in the metadata folder.
csv_name = os.path.basename(path_category_save) + '.csv'
path_csv = os.path.join(path_metadata[0], csv_name)
csv_columns = {'filename': csv_filename, 'url': csv_url}
data_csv = pd.DataFrame(csv_columns)
data_csv.to_csv(path_csv, index=False, encoding='utf-8')

In [44]:
# Pair every planned file name with the metadata record of its search result
# and store the list as JSON in the metadata folder, named after the category folder.
json_name = os.path.basename(path_category_save) + '.json'
path_json = os.path.join(path_metadata[0], json_name)
res_uni2json = [
    {'filename': name, 'metadata': meta}
    for name, meta in zip(all_filename, res_uni)
]
with open(path_json, 'w', encoding='utf-8') as jsonfile:
    # ensure_ascii=False writes non-ASCII characters unescaped
    json.dump(res_uni2json, jsonfile, ensure_ascii=False)

---

## Bird

In [45]:
# Search API endpoint that the queries are sent to
url = 'https://sound-effects-api.bbcrewind.co.uk/api/sfx/search'

# Keyword phrases to search for
query_kws = ['bird']

# Parent folder in which this category's files are saved
path_category_save = path_category[7]

In [46]:
# Collects the keyword-filtered search hits of every phrase in query_kws.
res_all = []
for i_kws in query_kws:
    print(i_kws)
    query_cond = src01.query_cond(i_kws)

    # ---
    # First POST: send the search criteria only to learn the total hit count.
    r = requests.post(url=url, json=query_cond, timeout=60)
    # Fail loudly on an HTTP error instead of a confusing JSON/KeyError below.
    r.raise_for_status()
    response_json = r.json()
    query_cond['criteria']['size'] = response_json['total']

    # ---
    # Second POST: request all hits at once now that the total is known.
    r = requests.post(url=url, json=query_cond, timeout=60)
    r.raise_for_status()
    r_json = r.json()

    # ---
    # Secondary keyword filter: gather the masks produced for every single word.
    # NOTE(review): src01.cond_filter presumably returns a list of boolean masks
    # (one per inspected metadata field) aligned with r_json['results'] - confirm.
    cond_list = []
    for kw in i_kws.split():
        cond_list.extend(src01.cond_filter(keyword=kw, response_json=r_json))

    # Keep a result as soon as any mask flags it.
    cond = [any(flags) for flags in zip(*cond_list)]
    res_all.extend(compress(r_json['results'], cond))


bird


In [47]:
# Drop repeated results while keeping first-seen order
# (membership is checked with `in`, i.e. by equality with an already kept item)
res_uni = []
for record in res_all:
    if record in res_uni:
        continue
    res_uni.append(record)

In [48]:
# Collect the id of every unique result and report how many files there are
wav_ids = [x['id'] for x in res_uni]
# Zero-padding width = number of decimal digits in the file count.
# len(str(n)) equals int(math.log10(n)) + 1 for every n >= 1, and unlike
# math.log10 it does not raise ValueError when the search returned no results.
digits = len(str(len(wav_ids)))
print(len(wav_ids))

# - file names: "s01-" + zero-padded running index + ".wav"
# - save paths: file name joined onto the category folder
# - download URLs: built from the result ids
all_filename = ["s01-" + str(i).zfill(digits) + ".wav" for i in range(len(wav_ids))]
all_path = [os.path.join(path_category_save, x) for x in all_filename]
all_url = [f'https://sound-effects-media.bbcrewind.co.uk/wav/{x}.wav' for x in wav_ids]
# MP3 alternative: f'https://sound-effects-media.bbcrewind.co.uk/mp3/{wav_id}.mp3'

1910


In [49]:
%%time

csv_filename = []
csv_url = []
for i in range(len(wav_ids)):
# for i in range(5):
    print(i, '\t', all_filename[i], '\t', wav_ids[i])
    src01.download_wav(all_url[i], wav_ids[i], all_path[i])
    csv_filename.append(all_filename[i])
    csv_url.append(all_url[i])

0 	 s01-0000.wav 	 NHU05060101
1 	 s01-0001.wav 	 NHU05096030
2 	 s01-0002.wav 	 NHU05056179
3 	 s01-0003.wav 	 NHU05058154
4 	 s01-0004.wav 	 NHU05050147
5 	 s01-0005.wav 	 NHU05049139
6 	 s01-0006.wav 	 NHU05050146
7 	 s01-0007.wav 	 NHU05096011
8 	 s01-0008.wav 	 NHU05056180
9 	 s01-0009.wav 	 NHU05058038
10 	 s01-0010.wav 	 NHU05056095
11 	 s01-0011.wav 	 NHU05052020
12 	 s01-0012.wav 	 NHU05097170
13 	 s01-0013.wav 	 NHU05096037
14 	 s01-0014.wav 	 NHU05058042
15 	 s01-0015.wav 	 NHU05054149
16 	 s01-0016.wav 	 NHU05005121
17 	 s01-0017.wav 	 NHU05060102
18 	 s01-0018.wav 	 NHU05088201
19 	 s01-0019.wav 	 NHU05011096
20 	 s01-0020.wav 	 NHU05006074
21 	 s01-0021.wav 	 NHU05008042
22 	 s01-0022.wav 	 NHU05008043
23 	 s01-0023.wav 	 NHU05046083
24 	 s01-0024.wav 	 NHU05046084
25 	 s01-0025.wav 	 NHU05078124
26 	 s01-0026.wav 	 NHU05011072
27 	 s01-0027.wav 	 NHU05011097
28 	 s01-0028.wav 	 NHU05011098
29 	 s01-0029.wav 	 NHU05011100
30 	 s01-0030.wav 	 NHU05083064
31 	 s01-0031.wav 

252 	 s01-0252.wav 	 NHU05007045
253 	 s01-0253.wav 	 NHU05007050
254 	 s01-0254.wav 	 NHU05009072
255 	 s01-0255.wav 	 NHU05010120
256 	 s01-0256.wav 	 NHU05003118
257 	 s01-0257.wav 	 NHU05003119
258 	 s01-0258.wav 	 NHU05003121
259 	 s01-0259.wav 	 NHU05004090
260 	 s01-0260.wav 	 NHU05072031
261 	 s01-0261.wav 	 NHU05056104
262 	 s01-0262.wav 	 NHU05032082
263 	 s01-0263.wav 	 07012175
264 	 s01-0264.wav 	 NHU05070075
265 	 s01-0265.wav 	 NHU05070076
266 	 s01-0266.wav 	 NHU05070096
267 	 s01-0267.wav 	 NHU05099041
268 	 s01-0268.wav 	 NHU05100146
269 	 s01-0269.wav 	 NHU05090057
270 	 s01-0270.wav 	 NHU05050044
271 	 s01-0271.wav 	 NHU05074135
272 	 s01-0272.wav 	 NHU05075019
273 	 s01-0273.wav 	 NHU05102011
274 	 s01-0274.wav 	 NHU05102012
275 	 s01-0275.wav 	 NHU05102088
276 	 s01-0276.wav 	 NHU05102107
277 	 s01-0277.wav 	 NHU05034078
278 	 s01-0278.wav 	 NHU05033030
279 	 s01-0279.wav 	 NHU05019105
280 	 s01-0280.wav 	 NHU05019106
281 	 s01-0281.wav 	 NHU05019107
282 	 s01-028

502 	 s01-0502.wav 	 NHU05003104
503 	 s01-0503.wav 	 NHU05003108
504 	 s01-0504.wav 	 NHU05004044
505 	 s01-0505.wav 	 NHU05019049
506 	 s01-0506.wav 	 NHU05012063
507 	 s01-0507.wav 	 NHU05028118
508 	 s01-0508.wav 	 NHU05004003
509 	 s01-0509.wav 	 NHU05006120
510 	 s01-0510.wav 	 NHU05064020
511 	 s01-0511.wav 	 NHU05087001
512 	 s01-0512.wav 	 NHU05084034
513 	 s01-0513.wav 	 NHU05087002
514 	 s01-0514.wav 	 NHU05083181
515 	 s01-0515.wav 	 NHU9679405
516 	 s01-0516.wav 	 NHU05072125
517 	 s01-0517.wav 	 NHU05073093
518 	 s01-0518.wav 	 NHU05073132
519 	 s01-0519.wav 	 NHU05007121
520 	 s01-0520.wav 	 NHU05012049
521 	 s01-0521.wav 	 NHU05092005
522 	 s01-0522.wav 	 NHU05089066
523 	 s01-0523.wav 	 NHU05056106
524 	 s01-0524.wav 	 NHU05047074
525 	 s01-0525.wav 	 NHU05047075
526 	 s01-0526.wav 	 NHU05048083
527 	 s01-0527.wav 	 NHU05018166
528 	 s01-0528.wav 	 NHU05018167
529 	 s01-0529.wav 	 NHU05021025
530 	 s01-0530.wav 	 NHU05021027
531 	 s01-0531.wav 	 NHU05021033
532 	 s01-0

752 	 s01-0752.wav 	 NHU05079033
753 	 s01-0753.wav 	 NHU05011149
754 	 s01-0754.wav 	 NHU05064023
755 	 s01-0755.wav 	 NHU05051061
756 	 s01-0756.wav 	 NHU05047040
757 	 s01-0757.wav 	 NHU05049135
758 	 s01-0758.wav 	 NHU05016096
759 	 s01-0759.wav 	 NHU05004053
760 	 s01-0760.wav 	 NHU05033044
761 	 s01-0761.wav 	 NHU05040116
762 	 s01-0762.wav 	 NHU05059167
763 	 s01-0763.wav 	 NHU05050078
764 	 s01-0764.wav 	 NHU05011146
765 	 s01-0765.wav 	 NHU05040140
766 	 s01-0766.wav 	 NHU05040141
767 	 s01-0767.wav 	 NHU05028004
768 	 s01-0768.wav 	 NHU05028041
769 	 s01-0769.wav 	 NHU05014055
770 	 s01-0770.wav 	 NHU05008060
771 	 s01-0771.wav 	 NHU05087009
772 	 s01-0772.wav 	 NHU05080054
773 	 s01-0773.wav 	 NHU10405655
774 	 s01-0774.wav 	 NHU05028030
775 	 s01-0775.wav 	 NHU05051154
776 	 s01-0776.wav 	 NHU05051156
777 	 s01-0777.wav 	 NHU05008095
778 	 s01-0778.wav 	 NHU05069104
779 	 s01-0779.wav 	 NHU05038099
780 	 s01-0780.wav 	 NHU05038052
781 	 s01-0781.wav 	 NHU05101034
782 	 s01-

1001 	 s01-1001.wav 	 NHU05016198
1002 	 s01-1002.wav 	 NHU05062203
1003 	 s01-1003.wav 	 NHU05092022
1004 	 s01-1004.wav 	 NHU05050075
1005 	 s01-1005.wav 	 NHU05050079
1006 	 s01-1006.wav 	 NHU05011150
1007 	 s01-1007.wav 	 NHU05034021
1008 	 s01-1008.wav 	 NHU05031074
1009 	 s01-1009.wav 	 NHU05033045
1010 	 s01-1010.wav 	 NHU05018204
1011 	 s01-1011.wav 	 NHU05015038
1012 	 s01-1012.wav 	 NHU05016035
1013 	 s01-1013.wav 	 NHU05006083
1014 	 s01-1014.wav 	 NHU05006101
1015 	 s01-1015.wav 	 NHU05007122
1016 	 s01-1016.wav 	 NHU05046067
1017 	 s01-1017.wav 	 NHU05023026
1018 	 s01-1018.wav 	 NHU05064253
1019 	 s01-1019.wav 	 NHU05064329
1020 	 s01-1020.wav 	 NHU05097116
1021 	 s01-1021.wav 	 NHU05078300
1022 	 s01-1022.wav 	 NHU05039131
1023 	 s01-1023.wav 	 NHU05017199
1024 	 s01-1024.wav 	 NHU05045041
1025 	 s01-1025.wav 	 NHU05004004
1026 	 s01-1026.wav 	 NHU05098120
1027 	 s01-1027.wav 	 NHU05018181
1028 	 s01-1028.wav 	 NHU05018197
1029 	 s01-1029.wav 	 NHU05032110
1030 	 s01-103

1243 	 s01-1243.wav 	 NHU05072093
1244 	 s01-1244.wav 	 NHU05034051
1245 	 s01-1245.wav 	 NHU05027083
1246 	 s01-1246.wav 	 NHU05021097
1247 	 s01-1247.wav 	 NHU05054187
1248 	 s01-1248.wav 	 NHU05047067
1249 	 s01-1249.wav 	 NHU05049002
1250 	 s01-1250.wav 	 NHU05019077
1251 	 s01-1251.wav 	 NHU05019081
1252 	 s01-1252.wav 	 NHU05017067
1253 	 s01-1253.wav 	 NHU05018031
1254 	 s01-1254.wav 	 NHU05018034
1255 	 s01-1255.wav 	 NHU05014194
1256 	 s01-1256.wav 	 NHU05015092
1257 	 s01-1257.wav 	 NHU05015113
1258 	 s01-1258.wav 	 NHU05016036
1259 	 s01-1259.wav 	 NHU05013142
1260 	 s01-1260.wav 	 NHU05014092
1261 	 s01-1261.wav 	 NHU05005151
1262 	 s01-1262.wav 	 NHU05006018
1263 	 s01-1263.wav 	 NHU05008078
1264 	 s01-1264.wav 	 NHU05008086
1265 	 s01-1265.wav 	 NHU05003107
1266 	 s01-1266.wav 	 NHU05040027
1267 	 s01-1267.wav 	 NHU05040144
1268 	 s01-1268.wav 	 NHU05042040
1269 	 s01-1269.wav 	 NHU05062282
1270 	 s01-1270.wav 	 NHU05064025
1271 	 s01-1271.wav 	 NHU05074076
1272 	 s01-127

1485 	 s01-1485.wav 	 NHU05021105
1486 	 s01-1486.wav 	 NHU05036001
1487 	 s01-1487.wav 	 NHU05030040
1488 	 s01-1488.wav 	 NHU05007089
1489 	 s01-1489.wav 	 NHU05009012
1490 	 s01-1490.wav 	 NHU05042025
1491 	 s01-1491.wav 	 NHU05011137
1492 	 s01-1492.wav 	 NHU05011188
1493 	 s01-1493.wav 	 NHU05021104
1494 	 s01-1494.wav 	 NHU05019174
1495 	 s01-1495.wav 	 NHU05018049
1496 	 s01-1496.wav 	 NHU05015070
1497 	 s01-1497.wav 	 NHU05014136
1498 	 s01-1498.wav 	 NHU05009132
1499 	 s01-1499.wav 	 NHU05038079
1500 	 s01-1500.wav 	 NHU05060156
1501 	 s01-1501.wav 	 NHU05096169
1502 	 s01-1502.wav 	 NHU05089064
1503 	 s01-1503.wav 	 NHU05041017
1504 	 s01-1504.wav 	 NHU05028008
1505 	 s01-1505.wav 	 NHU05018024
1506 	 s01-1506.wav 	 NHU05018053
1507 	 s01-1507.wav 	 NHU05015072
1508 	 s01-1508.wav 	 NHU05015202
1509 	 s01-1509.wav 	 NHU05014130
1510 	 s01-1510.wav 	 NHU05006174
1511 	 s01-1511.wav 	 NHU05007123
1512 	 s01-1512.wav 	 NHU05021170
1513 	 s01-1513.wav 	 NHU05022054
1514 	 s01-151

1726 	 s01-1726.wav 	 NHU05047043
1727 	 s01-1727.wav 	 NHU05047044
1728 	 s01-1728.wav 	 NHU05011020
1729 	 s01-1729.wav 	 NHU05011091
1730 	 s01-1730.wav 	 NHU05021081
1731 	 s01-1731.wav 	 NHU05016085
1732 	 s01-1732.wav 	 NHU05011182
1733 	 s01-1733.wav 	 NHU05007030
1734 	 s01-1734.wav 	 NHU05016239
1735 	 s01-1735.wav 	 NHU05018109
1736 	 s01-1736.wav 	 NHU05006166
1737 	 s01-1737.wav 	 NHU05006167
1738 	 s01-1738.wav 	 NHU05011119
1739 	 s01-1739.wav 	 NHU9987127
1740 	 s01-1740.wav 	 NHU05048036
1741 	 s01-1741.wav 	 NHU05021063
1742 	 s01-1742.wav 	 NHU05017174
1743 	 s01-1743.wav 	 NHU05015114
1744 	 s01-1744.wav 	 NHU05015185
1745 	 s01-1745.wav 	 NHU05015200
1746 	 s01-1746.wav 	 NHU05015201
1747 	 s01-1747.wav 	 NHU05008176
1748 	 s01-1748.wav 	 NHU05021093
1749 	 s01-1749.wav 	 NHU05021095
1750 	 s01-1750.wav 	 NHU05039129
1751 	 s01-1751.wav 	 NHU05096174
1752 	 s01-1752.wav 	 NHU05011060
1753 	 s01-1753.wav 	 NHU05011061
1754 	 s01-1754.wav 	 NHU05011118
1755 	 s01-1755

#### save metadata

In [50]:
# Metadata CSV: local file name -> source URL, written to the metadata
# folder and named after the category folder.
csv_name = os.path.basename(path_category_save) + '.csv'
path_csv = os.path.join(path_metadata[0], csv_name)
data_csv = pd.DataFrame({'filename': csv_filename, 'url': csv_url})
data_csv.to_csv(path_csv, index=False, encoding='utf-8')

In [51]:
# Metadata JSON: pair each generated file name with the full API record it
# was derived from, written next to the CSV and named after the category.
res_uni2json = [
    dict(filename=fname, metadata=record)
    for fname, record in zip(all_filename, res_uni)
]
json_name = os.path.basename(path_category_save) + '.json'
path_json = os.path.join(path_metadata[0], json_name)
with open(path_json, 'w', encoding='utf-8') as jsonfile:
    # ensure_ascii=False keeps non-ASCII text as-is instead of \u escapes.
    jsonfile.write(json.dumps(res_uni2json, ensure_ascii=False))

---

## Music_Instrument

---

## Laugh_Shout_Scream

In [52]:
# Search keywords for this category (one API query per entry)
query_kws = [
    'laugh', 'laughs',
    'shout', 'shouts',
    'cheer',
    'crowd', 'crowds',
]

# Parent folder where this category's audio files are saved
path_category_save = path_category[9]

# Search API endpoint to query
url = 'https://sound-effects-api.bbcrewind.co.uk/api/sfx/search'

In [53]:
# Query the search API once per keyword phrase and collect every hit that
# passes the secondary keyword filter into `res_all`.

# Seconds to wait on the API before giving up, so a stalled connection
# cannot hang the cell indefinitely.
REQUEST_TIMEOUT_S = 60

res_all = []
for i_kws in query_kws:
    print(i_kws)
    # Request payload for this phrase; its 'criteria' -> 'size' entry is
    # overwritten below with the real hit count.
    query_cond = src01.query_cond(i_kws)

    # ---
    # First request: only used to learn how many hits the query has.
    r = requests.post(url=url, json=query_cond, timeout=REQUEST_TIMEOUT_S)
    r.raise_for_status()  # fail here, not later with a confusing KeyError
    response_json = r.json()
    if response_json['total'] == 0:
        # Nothing matched this phrase; skip the pointless second request.
        continue
    query_cond['criteria']['size'] = response_json['total']

    # ---
    # Second request: fetch all hits in a single page.
    r = requests.post(url=url, json=query_cond, timeout=REQUEST_TIMEOUT_S)
    r.raise_for_status()
    r_json = r.json()

    # ---
    # Secondary keyword filter: collect the masks produced for each word of
    # the phrase. NOTE(review): src01.cond_filter presumably returns a list
    # of per-result boolean masks -- confirm in func4crawl.src01.
    cond_list = []
    for kw in i_kws.split():
        cond_list.extend(src01.cond_filter(keyword=kw, response_json=r_json))

    # A hit is kept when any collected mask flags it.
    cond = [any(flags) for flags in zip(*cond_list)]
    res_all.extend(compress(r_json['results'], cond))


laugh
laughs
shout
shouts
cheer
crowd
crowds


In [54]:
# De-duplicate the hits gathered across all keyword queries, keeping the
# first occurrence of each recording and preserving first-seen order.
# Keyed on the record 'id' (the value later used to build the download URL),
# so a recording is kept once even if two queries return it with slightly
# different metadata, and the lookup is O(1) instead of a linear scan that
# compares whole dicts.
unique_by_id = {}
for hit in res_all:
    unique_by_id.setdefault(hit['id'], hit)
res_uni = list(unique_by_id.values())

In [55]:
# Extract the recording ids and report how many files will be fetched.
wav_ids = [x['id'] for x in res_uni]
# Zero-padding width = number of decimal digits in the file count.
# len(str(n)) matches int(log10(n)) + 1 for n >= 1, but does not raise
# "math domain error" when the search returned nothing (n == 0).
digits = len(str(len(wav_ids)))
print(len(wav_ids))

# - Build the local file names (s01-<zero-padded index>.wav)
# - Build the destination paths inside the category folder
# - Build the download URLs from the recording ids
all_filename = [f"s01-{i:0{digits}d}.wav" for i in range(len(wav_ids))]
all_path = [os.path.join(path_category_save, x) for x in all_filename]
all_url = [f'https://sound-effects-media.bbcrewind.co.uk/wav/{x}.wav' for x in wav_ids]
# mp3 alternative: f'https://sound-effects-media.bbcrewind.co.uk/mp3/{wav_id}.mp3'

686


In [56]:
%%time

csv_filename = []
csv_url = []
for i in range(len(wav_ids)):
# for i in range(5):
    print(i, '\t', all_filename[i], '\t', wav_ids[i])
    src01.download_wav(all_url[i], wav_ids[i], all_path[i])
    csv_filename.append(all_filename[i])
    csv_url.append(all_url[i])

0 	 s01-000.wav 	 07043056
1 	 s01-001.wav 	 07051206
2 	 s01-002.wav 	 07043054
3 	 s01-003.wav 	 07071024
4 	 s01-004.wav 	 07039076
5 	 s01-005.wav 	 NHU05007005
6 	 s01-006.wav 	 07003076
7 	 s01-007.wav 	 07044072
8 	 s01-008.wav 	 07072176
9 	 s01-009.wav 	 NHU05069117
10 	 s01-010.wav 	 NHU05019085
11 	 s01-011.wav 	 07028108
12 	 s01-012.wav 	 07073020
13 	 s01-013.wav 	 07055025
14 	 s01-014.wav 	 NHU05079097
15 	 s01-015.wav 	 07013086
16 	 s01-016.wav 	 07030109
17 	 s01-017.wav 	 07044073
18 	 s01-018.wav 	 07039003
19 	 s01-019.wav 	 07075054
20 	 s01-020.wav 	 07017116
21 	 s01-021.wav 	 07038110
22 	 s01-022.wav 	 07038111
23 	 s01-023.wav 	 NHU05079096
24 	 s01-024.wav 	 07038112
25 	 s01-025.wav 	 07005029
26 	 s01-026.wav 	 NHU05015174
27 	 s01-027.wav 	 07005018
28 	 s01-028.wav 	 07054133
29 	 s01-029.wav 	 NHU05014107
30 	 s01-030.wav 	 07030108
31 	 s01-031.wav 	 07062093
32 	 s01-032.wav 	 NHU05008097
33 	 s01-033.wav 	 07052028
34 	 s01-034.wav 	 07052020
35 	 s

285 	 s01-285.wav 	 07054147
286 	 s01-286.wav 	 07058143
287 	 s01-287.wav 	 07015005
288 	 s01-288.wav 	 07050214
289 	 s01-289.wav 	 NHU05013113
290 	 s01-290.wav 	 NHU05015170
291 	 s01-291.wav 	 NHU05014072
292 	 s01-292.wav 	 NHU05015169
293 	 s01-293.wav 	 07017038
294 	 s01-294.wav 	 07028040
295 	 s01-295.wav 	 07028039
296 	 s01-296.wav 	 07058145
297 	 s01-297.wav 	 07047049
298 	 s01-298.wav 	 07028047
299 	 s01-299.wav 	 07057031
300 	 s01-300.wav 	 07015006
301 	 s01-301.wav 	 07026095
302 	 s01-302.wav 	 07052040
303 	 s01-303.wav 	 07052015
304 	 s01-304.wav 	 07028041
305 	 s01-305.wav 	 07017046
306 	 s01-306.wav 	 07017043
307 	 s01-307.wav 	 07017034
308 	 s01-308.wav 	 07017032
309 	 s01-309.wav 	 07017031
310 	 s01-310.wav 	 07028016
311 	 s01-311.wav 	 07028006
312 	 s01-312.wav 	 07050014
313 	 s01-313.wav 	 07017036
314 	 s01-314.wav 	 07017035
315 	 s01-315.wav 	 07017045
316 	 s01-316.wav 	 07017044
317 	 s01-317.wav 	 07017037
318 	 s01-318.wav 	 07017030
31

565 	 s01-565.wav 	 07057047
566 	 s01-566.wav 	 07064063
567 	 s01-567.wav 	 07064061
568 	 s01-568.wav 	 07063050
569 	 s01-569.wav 	 07063038
570 	 s01-570.wav 	 07017049
571 	 s01-571.wav 	 07017013
572 	 s01-572.wav 	 07039007
573 	 s01-573.wav 	 07043068
574 	 s01-574.wav 	 07043059
575 	 s01-575.wav 	 07055119
576 	 s01-576.wav 	 07053084
577 	 s01-577.wav 	 07019098
578 	 s01-578.wav 	 07050209
579 	 s01-579.wav 	 07035141
580 	 s01-580.wav 	 07058140
581 	 s01-581.wav 	 07057045
582 	 s01-582.wav 	 07057032
583 	 s01-583.wav 	 07017021
584 	 s01-584.wav 	 07064059
585 	 s01-585.wav 	 07063044
586 	 s01-586.wav 	 07039049
587 	 s01-587.wav 	 07043060
588 	 s01-588.wav 	 07043049
589 	 s01-589.wav 	 07060016
590 	 s01-590.wav 	 07057038
591 	 s01-591.wav 	 07057018
592 	 s01-592.wav 	 07003057
593 	 s01-593.wav 	 07003053
594 	 s01-594.wav 	 07017048
595 	 s01-595.wav 	 07017014
596 	 s01-596.wav 	 07035143
597 	 s01-597.wav 	 07039052
598 	 s01-598.wav 	 07039009
599 	 s01-599.

ConnectionError: HTTPSConnectionPool(host='sound-effects-media.bbcrewind.co.uk', port=443): Max retries exceeded with url: /wav/07003044.wav (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000028017870C48>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

#### save metadata

In [57]:
# Metadata CSV: local file name -> source URL, written to the metadata
# folder and named after the category folder.
csv_name = os.path.basename(path_category_save) + '.csv'
path_csv = os.path.join(path_metadata[0], csv_name)
data_csv = pd.DataFrame({'filename': csv_filename, 'url': csv_url})
data_csv.to_csv(path_csv, index=False, encoding='utf-8')

In [58]:
# Metadata JSON: pair each generated file name with the full API record it
# was derived from, written next to the CSV and named after the category.
res_uni2json = [
    dict(filename=fname, metadata=record)
    for fname, record in zip(all_filename, res_uni)
]
json_name = os.path.basename(path_category_save) + '.json'
path_json = os.path.join(path_metadata[0], json_name)
with open(path_json, 'w', encoding='utf-8') as jsonfile:
    # ensure_ascii=False keeps non-ASCII text as-is instead of \u escapes.
    jsonfile.write(json.dumps(res_uni2json, ensure_ascii=False))

# END

## archive

### 爬取-方法01

---

### 進一步篩選

---

### 爬取迴圈中-檔名路徑

---

# reference

- [URL Encode Decode - URL Percent Encoding and Decoding.](https://www.url-encode-decode.com/)
- [URL Decoder/Encoder](https://meyerweb.com/eric/tools/dencoder/)