In [1]:
import re
import sys
import os
import xml.etree.ElementTree as ET
import xml.dom.minidom
import xml
import requests
import time
import json
from pathlib import Path
import PyPDF2 as ppdf
import string
import pickle
from tqdm import tqdm
from ratelimit import limits, sleep_and_retry

In [2]:
# Tribe names, kept in alphabetical order. Each name is used both as a
# sub-folder of XML_FOLDER and as the stem of that tribe's XML file.
# NOTE(review): 'Puyma' may be a misspelling of 'Puyuma'; it has to match the
# folder/file names on disk, so confirm before changing it.
NAMES = [
    'Amis', 'Atayal', 'Paiwan', 'Bunun',
    'Puyma', 'Rukai', 'Tsou', 'Saisiyat',
    'Yami', 'Thao', 'Kavalan', 'Truku',
    'Sakizaya', 'Seediq', 'Saaroa', 'Kanakanavu',
]
NAMES.sort()

# Folder, looked up relative to the current working directory, that holds one
# sub-folder per tribe.
XML_FOLDER = '.PanglossXML'

In [3]:
def getRoot(tr: str):
    """Parse one tribe's XML file and return its root element.

    `tr` is the tribe name with proper capitalization. The file is read from
    <cwd>/<XML_FOLDER>/<tr>/<tr>.xml, where <cwd> is the working directory at
    call time.
    """
    xmlPath = os.path.join(os.getcwd(), XML_FOLDER, tr, f"{tr}.xml")
    return ET.parse(xmlPath).getroot()

In [4]:
# folder setup: make sure every tribe folder has an 'Audio' sub-folder
for tr in NAMES:
    audioDir = os.path.join(os.getcwd(), XML_FOLDER, tr, 'Audio')
    # exist_ok=True keeps the cell safe to re-run and avoids the
    # check-then-create race of a separate os.path.exists() test
    os.makedirs(audioDir, exist_ok=True)

In [5]:
def dlHelper(url, audioPath, dlRate=1024*1024*5, maxRetries=10, retryDelay=5):
    """Download `url` to `audioPath`, retrying when an attempt fails.

    The response body is streamed into a temporary '<audioPath>.part' file and
    only renamed to `audioPath` once it has been written completely, so a
    transfer that breaks half-way never leaves a truncated file that a later
    os.path.exists() check would mistake for a finished download.

    Parameters:
        url: address of the file to fetch.
        audioPath: destination path of the downloaded file.
        dlRate: chunk size in bytes used while streaming (default 5 MiB).
        maxRetries: maximum number of download attempts.
        retryDelay: seconds to wait between two attempts.

    Returns:
        True when the file was downloaded, False when every attempt failed
        (in which case 'Skipped <file name>' is printed).

    KeyboardInterrupt and SystemExit are not caught, so a running download
    loop can still be interrupted.
    """
    partPath = os.fspath(audioPath) + '.part'

    for attempt in range(1, maxRetries + 1):
        try:
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(partPath, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=dlRate):
                        file.write(chunk)
            # publish the file only after the whole body is on disk
            os.replace(partPath, audioPath)
            return True
        except Exception:
            # best effort: any network or file error just triggers a retry
            try:
                os.remove(partPath)
            except OSError:
                pass  # nothing was written yet; the next attempt recreates it
            # no point in waiting once the last attempt has failed
            if attempt < maxRetries:
                time.sleep(retryDelay)

    print(f'Skipped {os.path.basename(audioPath)}')
    return False

In [8]:
# Tribes that have already been processed. Listing a name here keeps it out
# of toDo, so the download loop can be run in segments.
done = ['Amis', 'Atayal', 'Bunun']

# Every entry of NAMES that is not marked as done, in NAMES order.
toDo = [
    name
    for name in NAMES
    if name not in done
]

In [40]:
def getTotals():
    """Print the number of audio URLs per tribe, followed by the grand total.

    For every name in NAMES the file <cwd>/<XML_FOLDER>/<name>/<name>.xml is
    parsed and its entries are collected into an {xml id: audio url} mapping;
    the size of that mapping is the tribe's count. Nothing is returned.
    """
    baseDir = os.path.join(os.getcwd(), XML_FOLDER)
    grandTotal = 0

    for tr in NAMES:
        xmlPath = os.path.join(baseDir, tr, tr + ".xml")
        entries = ET.parse(xmlPath).getroot()

        # the third child of each entry carries the 'url' attribute
        urlById = {}
        for entry in entries:
            entryId = entry.attrib['id']
            urlById[entryId] = entry[2].attrib['url']

        count = len(urlById)
        grandTotal += count
        # only the first six characters of the name are printed
        # (presumably to keep the tab-separated column aligned)
        print(f"{tr[:6]}\t{count}")

    print(f"\nTotal\t{grandTotal}")

In [None]:
# mp3 downloading loop
for tr in toDo:
    # per-tribe folder: holds <tr>.xml, the Audio folder and the output xml
    trDir = os.path.join(os.getcwd(), XML_FOLDER, tr)

    # get xml file (same loader the rest of the notebook uses)
    root = getRoot(tr)

    # parse into dict of {xml id: url to audio}; the third child of every
    # entry carries the 'url' attribute
    urlPairs = {k.attrib['id']: k[2].attrib['url'] for k in root}

    # downloading loop
    print(f"Processing {tr}")
    for i, (sentenceID, url) in enumerate(tqdm(urlPairs.items())):
        # check: the dict has to line up one-to-one with the xml entries,
        # which it will not do if two entries share an id
        assert root[i].attrib['id'] == sentenceID, \
            f"{tr}: entry {i} does not match id {sentenceID!r} (duplicate ids?)"
        assert root[i][2].attrib['url'] == url, \
            f"{tr}: entry {i} does not match the url stored for {sentenceID!r}"

        # get path + download
        audioPath = os.path.join(trDir, 'Audio', sentenceID + '.mp3')
        dlHelper(url, audioPath)

    # record, next to each url, where the mp3 lives relative to the tribe folder
    for elem in root:
        relaudioPath = os.path.join(".", "Audio", elem.attrib['id'] + '.mp3')
        elem[2].attrib['file'] = relaudioPath

    # save new file
    xml_str = ET.tostring(root, encoding='utf-8').decode('utf8')
    newFile = os.path.join(trDir, tr + '_filePaths.xml')

    with open(newFile, 'w', encoding='utf-8') as f:
        f.write(xml_str)

In [30]:
# failchecking + redownloading
for tr in NAMES:
    trDir = os.path.join(os.getcwd(), XML_FOLDER, tr)

    # get xml w/ filepaths + urls
    root = ET.parse(os.path.join(trDir, tr + "_filePaths.xml")).getroot()

    # {xml id: element carrying the 'file' and 'url' attributes}
    filePairs = {k.attrib['id']: k[2] for k in root}

    # check if file exists; remember the exact path that was checked so the
    # retry writes to the same place
    fails = []
    missingPaths = {}
    for sent, duo in filePairs.items():
        # 'file' is stored relative to the tribe folder (e.g. ./Audio/<id>.mp3);
        # normpath removes the leading "./" without assuming it is present
        fpath = os.path.normpath(os.path.join(trDir, duo.attrib['file']))
        if not os.path.exists(fpath):
            fails.append(sent)
            missingPaths[sent] = fpath

    # redownload
    if fails:
        print(f"Fails in {tr}, trying to fix...")
        for sent in fails:
            dlHelper(filePairs[sent].attrib['url'], missingPaths[sent])

        # report what the retry could not recover
        stillMissing = [sent for sent in fails if not os.path.exists(missingPaths[sent])]
        if stillMissing:
            print(f"{len(stillMissing)} file(s) still missing in {tr}")
