# ASG wiki downloader

Downloads all grammar point articles from the AllSet Learning Chinese Grammar Wiki (https://resources.allsetlearning.com/chinese/grammar/).

* Saves each article's wiki markup to `wiki/<ID>.txt`.
* The list of article IDs and titles is saved to `wiki/index.tsv`. Each title is taken from the target of the article ID's redirect page.
* Titles are also stored in a `<!-- -->` comment on the first line of each file.

In [1]:
import pandas as pd
import requests, os, re, html, hashlib, time, urllib
import urllib.parse, urllib.request

WIKI_DIR = '../wiki'                  # output: one <ID>.txt per article plus index.tsv
CACHE_DIR = '../../data/asg/cache'    # downloaded pages, one file per URL hash
PACE = 30    # delay in seconds between issuing http requests

# Create the cache and output directories (including parents) if they are
# missing. Pure Python instead of `!mkdir -p`, so it does not depend on a shell.
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(WIKI_DIR, exist_ok=True)

def get_cached_or_download(url):
    """Return the body of `url` as text, using an on-disk cache.

    The cache file is `CACHE_DIR/<first 8 hex digits of sha256(url)>.html`.
    On a cache hit the file content is returned and no request is made.
    On a miss the page is fetched, sanity-checked, written to the cache, and
    the function sleeps `PACE` seconds before returning, to pace the requests.

    Raises:
        AssertionError: the response status is not OK, or the body contains
            an `<html` tag but no closing `</html>` (truncated download).
        UnicodeDecodeError: the response body is not valid UTF-8.
    """
    url_hash = hashlib.sha256(url.encode('utf-8')).hexdigest()[:8]
    cache_path = f'{CACHE_DIR}/{url_hash}.html'
    if os.path.exists(cache_path):
        # Explicit encoding: the file was written from UTF-8 decoded text and
        # must not depend on the platform's default locale encoding.
        with open(cache_path, encoding='utf-8') as f:
            return f.read()
    print('Downloading %s' % url)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(url, allow_redirects=True, timeout=60)
    assert resp.ok, (url, resp.status_code)
    text = resp.content.decode('utf-8')
    lowered = text.lower()
    if '<html' in lowered:
        # Guard against caching a truncated HTML document.
        assert '</html>' in lowered, url
    with open(cache_path, 'w', encoding='utf-8') as f:
        f.write(text)
    print('OK')
    time.sleep(PACE)
    return text

def get_wiki_source(page_id):
    """Return the wiki markup of page `page_id`.

    Fetches the page's `action=edit` view through get_cached_or_download()
    and returns the HTML-unescaped content of its single `<textarea>`.

    Raises:
        AssertionError: the page does not contain exactly one textarea, or
            markup is left over after the opening tag has been stripped.
    """
    # urllib.parse.quote is the documented location of quote(); the copy
    # reachable as urllib.request.quote is only an incidental re-export.
    quoted = urllib.parse.quote(page_id)
    url = f'https://resources.allsetlearning.com/gramwiki/?title={quoted}&action=edit'
    page = get_cached_or_download(url)
    # Only the start of the page goes into the message; the full page is large.
    assert page.count('<textarea ') == 1, (url, page[:500])
    assert page.count('</textarea>') == 1, url
    text = page[page.index('<textarea '):page.index('</textarea>')]
    # Drop the opening tag; what remains is the HTML-escaped wiki markup.
    text = re.sub('^<textarea.* name="wpTextbox1">', '', text)
    assert '<textarea' not in text, url
    # Escaped markup must not contain a raw '<'; one here means the opening
    # tag was not stripped as expected.
    assert '<' not in text, url
    return html.unescape(text)

In [2]:
ASG_TO_NAME = {}   # ASG ID -> article title, filled in by visit_asg_page()

def visit_asg_page(asg_id):
    """Download the article behind the grammar point ID `asg_id` and save it.

    The wiki page named `asg_id` must be a `#REDIRECT [[...]]` page; the
    redirect target is used as the article title. The target's markup is
    written to `WIKI_DIR/<asg_id>.txt`, preceded by a `<!-- title -->` comment
    line, and the ID -> title pair is recorded in ASG_TO_NAME.

    Raises:
        AssertionError: the ID page is not a redirect, the redirect target is
            not a usable title, the article lacks the expected templates, or
            the ID was already recorded with a different title.
    """
    assert asg_id.startswith('ASG')

    redirect_source = get_wiki_source(asg_id)
    m = re.match(r'#REDIRECT *\[\[(.*)\]\]', redirect_source.strip())
    assert m, (asg_id, redirect_source[:200])
    redir_id = m[1].strip()

    # Validate the redirect target before spending a request on it.
    assert redir_id != asg_id
    assert not redir_id.startswith('ASG')
    assert '\n' not in redir_id
    title = redir_id

    text = get_wiki_source(redir_id)

    # The article must refer back to its own ID in a "grammar point" template.
    assert re.search(r'grammar point\|\s*%s' % re.escape(asg_id), text) or asg_id in [
        'ASGE8810' # used to have {{Basic grammar}}, but intentionally pulled of grammar lists, still has ID and examples
    ], (asg_id, text)
    assert '{{Grammar' in text or asg_id in [
        'ASGE8810',
        'ASGBA782','ASG23903', 'ASG87B11',   #stubs
    ], (asg_id, text)

    # Explicit UTF-8: the markup holds Chinese text and must not depend on the
    # platform's default encoding.
    with open(f'{WIKI_DIR}/{asg_id}.txt', 'w', encoding='utf-8') as f:
        f.write(f'<!-- {title} -->\n{text}')

    # An ID may be visited more than once, but always with the same title.
    ASG_TO_NAME.setdefault(asg_id, title)
    assert ASG_TO_NAME[asg_id] == title

In [3]:
# Collect the ASG IDs linked from the per-level index pages (CEFR and HSK),
# then download every article found.
ids = set()
level_index_urls = [
    'https://resources.allsetlearning.com/chinese/grammar/A1_grammar_points',
    'https://resources.allsetlearning.com/chinese/grammar/A2_grammar_points',
    'https://resources.allsetlearning.com/chinese/grammar/B1_grammar_points',
    'https://resources.allsetlearning.com/chinese/grammar/B2_grammar_points',
    'https://resources.allsetlearning.com/chinese/grammar/C1_grammar_points',
    'https://resources.allsetlearning.com/chinese/grammar/HSK_1_grammar_points',
    'https://resources.allsetlearning.com/chinese/grammar/HSK_2_grammar_points',
    'https://resources.allsetlearning.com/chinese/grammar/HSK_3_grammar_points',
    'https://resources.allsetlearning.com/chinese/grammar/HSK_4_grammar_points',
    'https://resources.allsetlearning.com/chinese/grammar/HSK_5_grammar_points',
    'https://resources.allsetlearning.com/chinese/grammar/HSK_6_grammar_points',
]
for url in level_index_urls:
    text = get_cached_or_download(url)

    # Each index page states how many grammar points it lists.
    total = re.search('There are <b>([0-9]+)</b> total.{1,20}grammar points', text)
    assert total, url
    kexpect = int(total[1])

    kfound = 0
    for href in re.findall('<a [^>]*href="([^"]+)"', text):
        hit = re.match('/chinese/grammar/(ASG.*)', href)
        if hit is None:
            continue
        found_id = hit[1]
        # IDs are 'ASG' followed by five characters.
        assert len(found_id) == 8
        kfound += 1
        ids.add(found_id)
    print(f'{url} found {kfound} expected {kexpect}')

print('Total %d unique IDs' % len(ids))
for page_id in sorted(ids):
    visit_asg_page(page_id)

# Some found/expected counts in the output differ because of stubs / incomplete articles.

https://resources.allsetlearning.com/chinese/grammar/A1_grammar_points found 40 expected 40
https://resources.allsetlearning.com/chinese/grammar/A2_grammar_points found 99 expected 99
https://resources.allsetlearning.com/chinese/grammar/B1_grammar_points found 143 expected 143
https://resources.allsetlearning.com/chinese/grammar/B2_grammar_points found 152 expected 154
https://resources.allsetlearning.com/chinese/grammar/C1_grammar_points found 61 expected 69
https://resources.allsetlearning.com/chinese/grammar/HSK_1_grammar_points found 54 expected 54
https://resources.allsetlearning.com/chinese/grammar/HSK_2_grammar_points found 78 expected 79
https://resources.allsetlearning.com/chinese/grammar/HSK_3_grammar_points found 86 expected 87
https://resources.allsetlearning.com/chinese/grammar/HSK_4_grammar_points found 113 expected 115
https://resources.allsetlearning.com/chinese/grammar/HSK_5_grammar_points found 107 expected 109
https://resources.allsetlearning.com/chinese/grammar/HSK_

In [4]:
# Find the rest of articles via 'what links here' and category lists

links = set()
for url in [
        'https://resources.allsetlearning.com/gramwiki/?title=Special:WhatLinksHere/Template:Grammar_Box&limit=500',
        'https://resources.allsetlearning.com/gramwiki/?title=Special:WhatLinksHere/Template:Grammar_Box&limit=500&from=4166&back=0',
        'https://resources.allsetlearning.com/gramwiki/?title=Special:WhatLinksHere/Template:HSK&limit=500',
        'https://resources.allsetlearning.com/gramwiki/?title=Special:WhatLinksHere/Template:Structure&limit=500',
        'https://resources.allsetlearning.com/gramwiki/?title=Special:WhatLinksHere/Template:Used_for&limit=500',
        'https://resources.allsetlearning.com/chinese/grammar/Category:A1_grammar_points',
        'https://resources.allsetlearning.com/chinese/grammar/Category:A2_grammar_points',
        'https://resources.allsetlearning.com/chinese/grammar/Category:B1_grammar_points',
        'https://resources.allsetlearning.com/chinese/grammar/Category:B2_grammar_points',
        'https://resources.allsetlearning.com/chinese/grammar/Category:C1_grammar_points',
        # Redirect listings, currently disabled:
        #'https://resources.allsetlearning.com/gramwiki/?title=Special:ListRedirects&limit=500&offset=0',
        #'https://resources.allsetlearning.com/gramwiki/?title=Special:ListRedirects&limit=500&offset=500',
        #'https://resources.allsetlearning.com/gramwiki/?title=Special:ListRedirects&limit=500&offset=1000',
        #'https://resources.allsetlearning.com/gramwiki/?title=Special:ListRedirects&limit=500&offset=1500',
    ]:
    text = get_cached_or_download(url)
    for link in re.findall('<a [^>]*href="([^"]+)"', text):
        if not link.startswith('/chinese/grammar/'):
            continue
        # Skip special/project/category/template pages and .jpg files.
        if re.match('.*/(Special:.*|Chinese_Grammar_Wiki:.*|Category:.*|Template:.*|File:.*[.]jpg$)$', link):
            continue
        link = link[len('/chinese/grammar/'):]
        # urllib.parse.unquote is the documented location of unquote().
        link = urllib.parse.unquote(link)
        links.add(link)

# Pages excluded by name (these look like index/meta pages rather than
# grammar point articles -- NOTE(review): confirm the list is still complete).
links -= set('''
A1_grammar_points
A2_grammar_points
Acknowledgments
B1_grammar_points
B2_book_omissions
B2_grammar_points
C1_grammar_points
Chinese_textbook_grammar_index
Contact
Forums
Grammar_points_by_level
HSK_1_grammar_points
HSK_2_grammar_points
HSK_3_grammar_points
Interrogative_pronouns
Keywords
Learner_FAQ
Main_Page
Measure_word
Result_complement
State_complement
Tools
Translations
'''.strip().split())

print('%d links' % len(links))

for page_id in sorted(links):
    text = get_wiki_source(page_id)
    # An article declares its ID as "...|grammar point|ASGxxxxx}}".
    m = re.search(r'\|grammar point\|\s*(ASG.....)}}', text)
    if not m:
        # Redirect pages carry no ID of their own; skip them silently.
        if text.strip().startswith('#REDIRECT'):
            continue
        print('Missing ID: %s%s' % (page_id, ' {{stub}}' if '{{stub}}' in text.lower() else ''))
        continue
    asg_id = m[1]
    visit_asg_page(asg_id)

visit_asg_page('ASGE8810')  # missing main grammar templates to make discoverable

521 links
Missing ID: "Prefer...rather_than"_as_"与其......宁可" {{stub}}
Missing ID: Additional_way_to_express_"in_the_name_of" {{stub}}
Missing ID: Advanced_uses_of_"bei" {{stub}}
Missing ID: After_or_before_a_specific_time
Missing ID: Ba-test
Missing ID: Comparing_"jingli"_and_"tiyan" {{stub}}
Missing ID: Comparing_"tuiguang"_and_“xuanchuan" {{stub}}
Missing ID: Express_an_action_and_its_effect_by_using_"tongguo…_shi" {{stub}}
Missing ID: Expressing_"age_difference"_with_"da_and_xiao"
Missing ID: Expressing_"however"_with_"ran'er"
Missing ID: Expressing_"inevitably"_with_"shibi" {{stub}}
Missing ID: Expressing_uncertainty_about_"whether_or_not" {{stub}}
Missing ID: Rhetorical_questions {{stub}}
Missing ID: Taiwanese_Mandarin_use_of_"bucuo" {{stub}}
Missing ID: Use_"tangruo"_to_express_"if" {{stub}}
Missing ID: Using_the_"suo"_structure {{stub}}


In [5]:
# Write the collected ID -> title pairs to the TSV index, ordered by ID.
index_rows = pd.DataFrame(ASG_TO_NAME.items(), columns=['ID', 'Title'])
asg_df = index_rows.sort_values('ID')
index_path = f'{WIKI_DIR}/index.tsv'
asg_df.to_csv(index_path, sep='\t', index=False)
print('%d total grammar point articles downloaded' % len(asg_df))

506 total grammar point articles downloaded


In [6]:
# Confirm the index file was written: lists its size and modification time.
!ls -l {WIKI_DIR}/index.tsv

-rw-r--r-- 1 jovyan users 24526 Oct 14 15:13 ../wiki/index.tsv
