In [55]:
import re
import sys
import os
import xml.etree.ElementTree as ET
import xml
import requests
import json
from pathlib import Path
import PyPDF2 as ppdf
import string
import pickle
from tqdm import tqdm
from ratelimit import limits, sleep_and_retry

In [56]:
# define vars
# Endpoint that getData() posts its dictionary queries to.
URL = "https://e-dictionary.ilrdf.org.tw/wsReDictionary.htm"

# API tribe code -> tribe name (the code is what gets sent as "TribesCode").
original_dict = {
    2: 'Amis', 6: 'Atayal', 24: 'Paiwan', 22: 'Bunun',
    38: 'Puyma', 28: 'Rukai', 35: 'Tsou', 13: 'Saisiyat',
    42: 'Yami', 14: 'Thao', 34: 'Kavalan', 33: 'Truku',
    43: 'Sakizaya', 16: 'Seediq', 37: 'Saaroa', 36: 'Kanakanavu',
}

# Reverse lookup: tribe name -> API tribe code.
TRIBES = dict(zip(original_dict.values(), original_dict.keys()))

# Alphabetically ordered tribe names; this is the order the tribes are processed in.
NAMES = sorted(original_dict.values())

# Number of API requests between two checkpoint pickles.
INTERVAL = 1000

# Tribe name -> its position within NAMES.
NAME_TO_IDX = dict(zip(NAMES, range(len(NAMES))))

In [57]:
# rate vars
# Throttle applied to getData(): at most RATE_LIMIT calls per RATE_PERIOD.
# The API itself answers slowly, so this cap is rarely the limiting factor.
RATE_LIMIT, RATE_PERIOD = 25, 1

In [58]:
def getWords(index: int) -> list: # scrape, do once -> provides wordlist
    """Scrape the headword list from the PDF stored in the index-th data folder.

    The non-hidden sub-folders of the current working directory are listed in
    sorted order and the one at position ``index`` is selected; the first
    (alphabetically) ``*.pdf`` file inside it is parsed with PyPDF2.

    Args:
        index: Position of the wanted folder in the sorted folder listing.
            The caller passes ``NAME_TO_IDX[tribe]``, so the folders are
            presumably named so that they sort like the tribe names -- verify
            against the actual directory layout.

    Returns:
        One string per extracted line that contains the "★" marker: the
        leading run of ``[a-zA-Z'ʉ0-9\\s]`` characters with spaces removed.
        Duplicates and blank results are NOT filtered here.

    Raises:
        IndexError: if ``index`` is out of range or the folder holds no PDF.
    """
    # Get path
    cwd = os.getcwd()
    # os.listdir() returns entries in arbitrary order; sort so that a given
    # index always selects the same folder, on every platform and run.
    folders = sorted(
        entry for entry in os.listdir(cwd)
        if os.path.isdir(os.path.join(cwd, entry)) and not entry.startswith(".")
    )
    folder = folders[index] # index

    # Only accept names that END in .pdf (a plain regex search would also
    # accept e.g. "notes.pdf.bak").
    pdfs = sorted(
        name for name in os.listdir(os.path.join(cwd, folder))
        if name.lower().endswith(".pdf")
    )
    if not pdfs:
        raise IndexError(f"no PDF file found in folder {folder!r}")
    filepath = os.path.join(cwd, folder, pdfs[0])

    # Open it + scrape
    with open(filepath, 'rb') as f:
        reader = ppdf.PdfReader(f)
        all_tx = [page.extract_text() or "" for page in reader.pages]

    # Join pages with a newline so the last line of one page is not fused
    # with the first line of the next one before splitting into lines.
    fullstring = "\n".join(all_tx)

    # Split + identify words
    phrase = re.compile(r"^([a-zA-Z'ʉ0-9\s]+)")
    marked_lines = (line for line in fullstring.split("\n") if "★" in line)
    return [
        result.group().replace(" ", "")
        for line in marked_lines
        if (result := phrase.search(line))
    ]

In [59]:
@sleep_and_retry # rate limiting: wait and retry when the call budget below is exhausted
@limits(calls=RATE_LIMIT, period=RATE_PERIOD)
def getData(tribeName: str, qw: str, timeout: float = 30) -> 'str or dict':
    """Query the e-dictionary API for one word of one tribe.

    Args:
        tribeName: Key of ``TRIBES``; an unknown name raises ``KeyError``.
        qw: The query word, sent as the ``qw`` form field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``response["GenericData"]["DATA"]`` of the decoded JSON reply, or the
        string ``"FAIL"`` when the request fails, the HTTP status is not 200,
        the body is not valid JSON, or the expected keys are missing.
    """
    ask = {
        "FMT": 1,
        # NOTE(review): credential-like value hard-coded in the notebook; it can
        # be overridden through the ILRDF_ACCOUNT environment variable.
        "account": os.environ.get("ILRDF_ACCOUNT", "E202403005"),
        "TribesCode": TRIBES[tribeName],
        "qw": qw
    }

    try:
        # Without a timeout a stalled connection would hang the scrape forever.
        jsn_response = requests.post(URL, data=ask, timeout=timeout)
    except requests.RequestException:
        return "FAIL"

    if jsn_response.status_code != 200:
        return "FAIL"

    try:
        return json.loads(jsn_response.text)["GenericData"]['DATA']
    except (ValueError, KeyError, TypeError):
        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover a
        # reply whose structure is not {"GenericData": {"DATA": ...}}.
        return "FAIL"

In [60]:
def extractSentences(entry: dict) -> 'str or dict': # helper for processRequest
    """Pull the first example sentence pair out of one dictionary entry.

    Args:
        entry: Mapping with a ``"Name"`` and an ``"Explanation"`` key. A
            missing key raises ``KeyError`` (these lookups are unguarded).
            ``Explanation`` may be a dict or a list (only its first element
            is used); its ``"Sentence"`` may again be a dict or a list (only
            the first element is used).

    Returns:
        ``{name: (original, chinese)}``. Both strings are empty when the
        explanation or its sentence is neither a dict nor a list. Returns the
        string ``"FAIL"`` when the expected structure is broken (empty list,
        missing key, wrong element type).
    """
    word = entry["Name"]
    explanation = entry['Explanation']
    fr, zh = '', ''

    # Only structural problems are turned into "FAIL"; KeyboardInterrupt and
    # other non-data errors are deliberately allowed to propagate.
    try:
        if isinstance(explanation, list):
            explanation = explanation[0] # only the first explanation is used
        elif not isinstance(explanation, dict):
            return {word: (fr, zh)}

        sentence = explanation['Sentence']
        if isinstance(sentence, list):
            first = sentence[0] # only the first example sentence is used
            fr, zh = first['Original'], first['Chinese']
        elif isinstance(sentence, dict):
            fr, zh = sentence['Original'], sentence['Chinese']

        return {word: (fr, zh)}
    except (KeyError, IndexError, TypeError):
        return "FAIL"

In [61]:
def processRequest(response):  # response into dict of {word: (fr, zh)}
    """Convert an API payload into sentence pairs plus a list of failures.

    Args:
        response: Either a list of entry dicts, a single entry dict, or
            anything else (which yields two empty containers).

    Returns:
        ``(ret, fails)``: ``ret`` merges every ``{word: (fr, zh)}`` mapping
        produced by ``extractSentences``; ``fails`` holds the ``"Name"`` of
        each entry for which ``extractSentences`` returned ``"FAIL"``.
    """
    extracted, failed = {}, []

    # A single entry is processed exactly like a one-element list.
    if isinstance(response, list): # multiple entries
        entries = response
    elif isinstance(response, dict): # only 1 entry
        entries = [response]
    else:
        entries = []

    for entry in entries:
        outcome = extractSentences(entry)
        headword = entry["Name"]
        if outcome == "FAIL":
            print(headword + " has failed in extraction.") # comment out if annoying 
            failed.append(headword)
        else:
            extracted.update(outcome)

    return extracted, failed

In [62]:
# setup: set path var + make folders + set done var
# All pickles are written below <cwd>/.PickleScrapes/<tribe name>/.
base_path = os.path.join(os.getcwd(), '.PickleScrapes')

for name in NAMES:
    # makedirs also creates the '.PickleScrapes' parent on a fresh checkout
    # (os.mkdir would raise FileNotFoundError) and is a no-op on re-runs.
    os.makedirs(os.path.join(base_path, name), exist_ok=True)

# Tribes listed here are skipped by the scraping loop.
done = []


In [63]:
# # Uncomment and adjust accordingly if scraping loop got stopped
# done = ['Amis', 'Atayal', 'Bunun',
#         'Kanakanavu', 'Kavalan', 'Paiwan',
#         'Puyma', 'Rukai', 'Saaroa', 
#         'Saisiyat', 'Sakizaya', 'Seediq',
#         'Thao', 'Tsou']

# splittribe = 'ENTER TRIBE NAME HERE'
# last_ckpt = CHANGE TO NUMBER OF LARGEST CHECKPOINT

In [64]:
# Scrape every tribe that is not listed in `done`: build its wordlist from the
# PDF, query the API once per word, checkpoint every INTERVAL words and write
# the final dictionaries at the end.
for scrapeTribe in NAMES:
    # relevant code for broken api runs
    if scrapeTribe in done:
        continue

    # get wordlist: de-duplicated, sorted, and without tokens that contain
    # no letter or digit at all
    print(f"Processing {scrapeTribe}")
    print("\tGetting wordlist...")
    words = sorted(
        w for w in set(getWords(NAME_TO_IDX[scrapeTribe]))
        if any(char.isalnum() for char in w)
    )

    # setup vars to save
    word_sent_dict = {}
    fails = []
    seen = set()  # not used anywhere in this cell
    tribepath = os.path.join(base_path, scrapeTribe, scrapeTribe)

    # api request loop
    print("\tDoing API requests...")
    for idx, query in enumerate(tqdm(words)):
        # relevant code for broken api runs; kept BEFORE the request so a
        # resumed run does not re-query words that were already scraped
        # if scrapeTribe == splittribe:
        #     if idx <= last_ckpt:
        #         continue

        try:
            response = getData(scrapeTribe, query)
        except Exception:
            # Best effort: one bad request must not abort a multi-hour run,
            # but the word is recorded as failed instead of silently dropped.
            # KeyboardInterrupt is not caught, so the loop can be stopped.
            response = 'FAIL'

        # add to buckets
        if response == 'FAIL':
            fails.append(query)
        word_sent_dict[query] = response

        # checkpoint progress
        if idx % INTERVAL == 0 and idx != 0:
            try:
                with open(f"{tribepath}_ckpt_{idx}.pkl", 'wb') as f:
                    pickle.dump(word_sent_dict, f)
                if fails:
                    with open(f"{tribepath}_fails_{idx}.pkl", 'wb') as f:
                        pickle.dump(fails, f)
            except Exception as err:
                # A failed checkpoint is reported but does not stop the scrape.
                tqdm.write(f"\tCheckpoint {idx} for {scrapeTribe} could not be written: {err}")

    # write final result
    with open(tribepath + '_ckpt_END.pkl', 'wb') as f:
        pickle.dump(word_sent_dict, f)
    with open(tribepath + '_fails_END.pkl', 'wb') as f:
        pickle.dump(fails, f)

Processing Truku
	Getting wordlist...
	Doing API requests...


100%|██████████| 31348/31348 [13:22:55<00:00,  1.54s/it]   


Processing Yami
	Getting wordlist...
	Doing API requests...


100%|██████████| 6320/6320 [2:40:40<00:00,  1.53s/it]   


In [49]:
# # stitching -- code may be relevant if you had to scrape in separate runs
# # may be applicable especially to truku

# tribepath = '.PickleScrapes\\TRIBE_NAME\\'

# tpic1, tpic2 = None, None
# with open(tribepath + 'Truku_ckpt_aside-pt1.pkl', 'rb') as f: 
#     tpic1 = pickle.load(f)

# with open(tribepath + 'Truku_ckpt_aside-pt2.pkl', 'rb') as f: 
#     tpic2 = pickle.load(f)

# clone = tpic1.copy()
# clone.update(tpic2)

# with open(tribepath + "Truku_ckpt_END.pkl", 'wb') as f:
#     pickle.dump(clone, f)

# print(f"{len(clone)}\t{len(tpic1) + len(tpic2)}")


31303