diff --git a/api_fetch/Makefile b/api_fetch/Makefile
deleted file mode 100644
index f381578..0000000
--- a/api_fetch/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-RAW_HTML=data/raw/raw_html.jsonl
-
-$(RAW_HTML): fetch_api.py
-	./fetch_api.py
-
-.PHONY: clean
-clean:
-	rm -rf data
diff --git a/api_fetch/.gitignore b/json_api_gen/.gitignore
similarity index 100%
rename from api_fetch/.gitignore
rename to json_api_gen/.gitignore
diff --git a/json_api_gen/Makefile b/json_api_gen/Makefile
new file mode 100644
index 0000000..46f9521
--- /dev/null
+++ b/json_api_gen/Makefile
@@ -0,0 +1,12 @@
+RAW_HTML=data/raw/raw_html.jsonl
+JSON_API=data/clean/rest_api.json
+
+$(JSON_API): api_pipeline.py generate_json_apis.py $(RAW_HTML)
+	./generate_json_apis.py
+
+$(RAW_HTML): fetch_html.py
+	./fetch_html.py
+
+.PHONY: clean
+clean:
+	rm -rf data
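
The new Makefile chains the two stages: `generate_json_apis.py` depends on the raw HTML, which `fetch_html.py` produces on demand. The contract between the stages is one JSON object per line of `data/raw/raw_html.jsonl`. A rough sketch of reading one record (run `make` first; the key names are inferred from the scripts below, not from a documented schema):

import json

# Read the first endpoint record emitted by fetch_html.py.
with open("data/raw/raw_html.jsonl") as fp:
    record = json.loads(next(fp))

# Inferred keys: 'method', 'reference_url', 'group', plus the page body in
# '_raw_html', which api_pipeline.prepare strips before extraction.
print(record["method"], record["reference_url"])
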
diff --git a/json_api_gen/api_pipeline.py b/json_api_gen/api_pipeline.py
new file mode 100644
index 0000000..e6d64c8
--- /dev/null
+++ b/json_api_gen/api_pipeline.py
@@ -0,0 +1,375 @@
+import re
+from vaquero import transformations as t
+from lxml.html import fromstring as _fromstring
+
+
+def prepare(src, dst):
+    # Copy non-private keys to the destination.
+    for k in src:
+        if not k.startswith('_'):
+            dst[k] = src[k]
+
+    # Convert the HTML to an lxml node.
+    src['doc'] = _fromstring(src['_raw_html'])
+    del src['_raw_html']
+
+
+MISSING_URLS = {'get-media-upload-status':
+                "https://upload.twitter.com/1.1/media/upload.json",
+                'post-media-upload-finalize':
+                "https://upload.twitter.com/1.1/media/upload.json",
+                'post-batch-accounts-account-id-line-items':
+                "https://ads-api.twitter.com/1/batch/accounts/:account_id/line_items",
+                'post-batch-accounts-account-id-campaigns':
+                "https://ads-api.twitter.com/1/batch/accounts/:account_id/campaigns"}
+
+
+def _match_first(node, *css_selectors):
+    for selector in css_selectors:
+        m = node.cssselect(selector)
+        if m:
+            return m[0]
+    return None
+
+
+def extract_and_verify_basic_info(src, dst):
+    doc = src['doc']
+    snippet = doc.cssselect("div[itemprop='articleBody'] > div.section")[0]
+    dst['service'] = snippet.attrib['id']
+
+    method, *path = snippet.cssselect('h1')[0].text.split(" ")
+    assert dst['method'] == method
+    dst['path'] = " ".join(path)
+    dst['desc'] = snippet.cssselect("p")[0].text
+
+    try:
+        url_section = snippet.cssselect("#resource-url")[0]
+        m = _match_first(url_section,
+                         'span.pre', 'code.docutils', 'p > em', 'p > cite')
+
+        url = (m if m is not None else url_section).text_content().strip()
+        dst['url'] = url
+    except IndexError:
+        patch_url = MISSING_URLS.get(dst['service'])
+        if patch_url:
+            dst['url'] = patch_url
+        else:
+            raise
+
+
+RESOURCE_INFO_RENAMES = {'response_formats': 'resp_format',
+                         'requires_authentication': 'authentication_required'}
+
+
+def extract_resource_information(src, dst):
+    doc = src['doc']
+    snippet = doc.cssselect('div#resource-information tr')
+    d = {}
+    for row in snippet:
+        k, v = [item.text_content() for item in row.cssselect('td')]
+        d[k] = v
+    t.pythonize_ks(d)
+    t.rename_ks(d, RESOURCE_INFO_RENAMES)
+    for k, v in d.items():
+        if k == 'resp_format':
+            src[k] = v
+        else:
+            dst[k] = v
+
+
+def rate_limited_to_boolean(src, dst):
+    dst['rate_limited'] = dst.get('rate_limited') == 'Yes'
+
+
+def authentication_required_to_boolean(src, dst):
+    b = dst.get('authentication_required', "No") != 'No'
+    dst['authentication_required'] = b
+
+
+FORMAT_MAPPING = {'JSON': 'JSON', "204 - No Content": None}
+
+
+def extract_resp_formats(src, dst):
+    if 'resp_format' in src:
+        dst['resp_format'] = FORMAT_MAPPING[src['resp_format']]
+    else:
+        dst['resp_format'] = None
+
+
+LIMIT_MAP = {'requests_15_min_window_app_auth': 'app',
+             'requests_15_min_window_per_app': 'app',
+             'requests_15_min_window_user_auth': 'user'}
+
+
+def extract_limits(src, dst):
+    dst['limits'] = {}
+
+    for k, v in dst.items():
+        if not k.startswith('requests'):
+            continue
+
+        v = v.strip()
+
+        # Special case.
+        if v == '15/user and 750/app':
+            dst['limits']['user'] = 15
+            dst['limits']['app'] = 750
+        elif v == 'Refer to your existing rate limit agreements.':
+            service = dst['service']
+            assert service == 'post-direct-messages-events-new-message-create'
+        else:
+            k = LIMIT_MAP[k]
+            assert k not in dst['limits']
+            dst['limits'][k] = int(v)
+
+
+def _fix_param_desc(d):
+    s = d.get('description')
+    if s is None:
+        return
+    s = s.replace(" . ", ". ").replace(" , ", ", ")
+    if s.endswith(" ."):
+        s = s[:-2] + "."
+    d['description'] = s
+
+
+def _fix_param_default(d):
+    default = d.get('default_value')
+    if default is not None and default == '':
+        del d['default_value']
+
+
+TYPE_REGEXP = re.compile(r"([\d\w\_]+)\s+Type: (.*)$")
+
+
+def _fix_param_type(d):
+    m = TYPE_REGEXP.match(d['name'])
+    if not m:
+        return
+    d['name'] = m.group(1)
+    d['type'] = m.group(2)
+
+
+VALID_PARAM_KS = {'name', 'default_value', 'example', 'required',
+                  'description', 'constraints'}
+
+
+def _extract_header_from_rows(rows):
+    top_row = rows[0]
+    items = [td.text_content() or '' for td in top_row.cssselect('td')]
+    if not items:
+        items = [td.text_content() or '' for td in top_row.cssselect('th')]
+    if not items:
+        return None
+
+    t.sstrip_all(items)
+
+    ks = [t.pythonize_identifier(k) for k in items]
+    return tuple(ks)
+
+
+def _tail(items):
+    head_consumed = False
+    for item in items:
+        if not head_consumed:
+            head_consumed = True
+        else:
+            yield item
+
+
+SPECIAL_NAMES = {'follow see note*': 'follow',
+                 'track see note*': 'track',
+                 'user_ids DEPRECATED': 'user_ids',
+                 'locations see note*': 'locations'}
+
+
+def _extract_param_rows_normal(header, rows):
+    data = []
+
+    for tr in _tail(rows):
+        items = [td.text_content() or '' for td in tr.cssselect('td')]
+        t.sstrip_all(items)
+
+        d = dict(zip(header, items))
+
+        _fix_param_desc(d)
+        _fix_param_default(d)
+        _fix_param_type(d)
+
+        # Special cases
+        d['name'] = SPECIAL_NAMES.get(d['name'], d['name'])
+
+        data.append(d)
+
+    return data
+
+
+EMBEDDED_PARAM_RE = re.compile(r"([\w\d\_]+)\s+\(([^\)]+)\)")
+
+
+def _extract_param_rows_no_header(rows):
+    data = []
+
+    for tr in rows:
+        items = [td.text_content() or '' for td in tr.cssselect('td')]
+        t.sstrip_all(items)
+        k, desc = items
+
+        d = {'description': desc}  # Keyed 'description' so _fix_param_desc cleans it.
+        _fix_param_desc(d)
+
+        name, required = EMBEDDED_PARAM_RE.match(k).groups()
+        d['name'] = name
+        d['required'] = required
+        data.append(d)
+
+    return data
+
+
+def extract_params(src, dst):
+    doc = src['doc']
+    rows = doc.cssselect('#parameters > table tr')
+
+    if not rows:
+        return
+
+    header, data = _extract_header_from_rows(rows), None
+
+    rows = doc.cssselect('#parameters > table tr')
+    if set(header).issubset(VALID_PARAM_KS):
+        data = _extract_param_rows_normal(header, rows)
+    elif len(header) == 2:
+        data = _extract_param_rows_no_header(rows)
+    else:
+        dst['_bad_ks'] = header
+
+    if data:
+        dst['params'] = data
+
+
+def fix_edge_case_params(src, dst):
+    for param in dst.get('params', []):
+        parts = param['name'].split(' ')
+        if len(parts) == 1:
+            continue
+        param['name'] = parts[0]
+        param['required'] = parts[1]
+
+
+REQUIRED_VALUES = {'True', 'requied', 'required'}
+
+
+def normalize_required_params(src, dst):
+    for param in dst.get('params', []):
+        param['required'] = param.get('required') in REQUIRED_VALUES
+
+
+def extract_example_request(src, dst):
+    doc = src['doc']
+    example_req = doc.cssselect('#example-request > p')
+    if example_req:
+        dst['example_request'] = example_req[0].text_content()
+
+
+def extract_example_response(src, dst):
+    doc = src['doc']
+    code_sections = doc.cssselect('#example-response > div.code')
+    if code_sections:
+        dst['example_response'] = code_sections[0].text_content()
+
+
+def extract_family(_, dst):
+    path = dst['path']
+
+    if path.startswith('/1/'):  # Ads
+        path = path.replace("/1/", "")
+
+    sub_family = path.split('/')[0]
+    dst['family'] = dst['group'] + ":" + sub_family
+    assert sub_family
+
+
+VALID_GROUPS = {'ads', 'rest', 'streaming', 'webhooks'}
+
+VALID_FAMILES = {'ads:accounts',
+                 'ads:batch',
+                 'ads:bidding_rules',
+                 'ads:conversion_attribution',
+                 'ads:conversion_event',
+                 'ads:iab_categories',
+                 'ads:insights',
+                 'ads:line_items',
+                 'ads:stats',
+                 'ads:tailored_audience_memberships',
+                 'ads:targeting_criteria',
+                 'rest:account',
+                 'rest:application',
+                 'rest:blocks',
+                 'rest:collections',
+                 'rest:direct_messages',
+                 'rest:favorites',
+                 'rest:followers',
+                 'rest:friends',
+                 'rest:friendships',
+                 'rest:geo',
+                 'rest:help',
+                 'rest:lists',
+                 'rest:media',
+                 'rest:mutes',
+                 'rest:saved_searches',
+                 'rest:search',
+                 'rest:statuses',
+                 'rest:trends',
+                 'rest:users',
+                 'streaming:c',
+                 'streaming:site',
+                 'streaming:statuses',
+                 'streaming:user',
+                 'webhooks:account_activity'}
+
+
+VALIDATION_SPEC = {
+    'authentication_required': ({True, False}, False),
+    'desc': (None, None),
+    'family': (VALID_FAMILES, None),
+    'group': (VALID_GROUPS, None),
+    'method': ({'GET', 'POST', 'DELETE'}, 'GET'),
+    'resp_format': ({'JSON', None}, 'JSON'),
+    'rate_limited': ({True, False}, True),
+    'path': (None, None),
+    'url': (None, None),
+    'reference_url': (None, None)
+}
+
+
+def validate(_, dst):
+    for k, (valid_set, default) in VALIDATION_SPEC.items():
+        if k not in dst:
+            if default is not None:
+                dst[k] = default
+            else:
+                assert False, "Missing and no default on {}".format(k)
+        elif valid_set is not None:
+            assert dst[k] in valid_set, dst[k]
+
+
+PARAM_VALIDATION_SPEC = {
+    'name': (None, None),
+    'required': ({True, False}, None)
+}
+
+
+def validate_params(_, dst):
+    if 'params' not in dst:
+        dst['params'] = []
+        return
+
+    for param in dst['params']:
+        for k, (valid_set, default) in PARAM_VALIDATION_SPEC.items():
+            if k not in param:
+                if default is not None:
+                    param[k] = default
+                else:
+                    assert False, "Missing and no default on {}".format(k)
+            elif valid_set is not None:
+                assert param[k] in valid_set, param[k]
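
Every stage in api_pipeline.py shares the `(src, dst)` convention: `src` carries scratch state such as the parsed lxml document, while `dst` accumulates the cleaned record, so each function stays independently testable. A minimal sketch, assuming the module is importable from the working directory (the "user_id Type: long" cell is hypothetical):

from api_pipeline import extract_resp_formats, _fix_param_type

# "204 - No Content" responses map to a null response format.
src, dst = {'resp_format': '204 - No Content'}, {}
extract_resp_formats(src, dst)
assert dst['resp_format'] is None

# A scraped name cell with an embedded type splits into name and type.
param = {'name': 'user_id Type: long'}
_fix_param_type(param)
assert param == {'name': 'user_id', 'type': 'long'}
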
diff --git a/api_fetch/fetch_api.py b/json_api_gen/fetch_html.py
similarity index 88%
rename from api_fetch/fetch_api.py
rename to json_api_gen/fetch_html.py
index 162a348..f07ace5 100755
--- a/api_fetch/fetch_api.py
+++ b/json_api_gen/fetch_html.py
@@ -16,7 +16,7 @@ def fetch_endpoints_from(root_url):
 
     doc = fromstring(requests.get(root_url).content)
 
-    doc.make_links_absolute("https://dev.twitter.com/rest/reference")
+    doc.make_links_absolute(root_url)
 
     valid_methods = {'DELETE', 'GET', 'POST'}
 
@@ -33,6 +33,7 @@ def fetch_endpoints_from(root_url):
 
 
 def fetch_and_save_all(fp, group, reference_root_url):
+    already_fetched = set()
     endpoints = fetch_endpoints_from(reference_root_url)
 
     for d in endpoints:
@@ -40,7 +41,9 @@
         resp = requests.get(d['reference_url'])
         d['_raw_html'] = resp.content.decode(resp.encoding)
         d['group'] = group
-        fp.write(json.dumps(d) + "\n")
+        if d['reference_url'] not in already_fetched:
+            fp.write(json.dumps(d) + "\n")
+        already_fetched.add(d['reference_url'])
 
 
 if __name__ == '__main__':
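
Two behavioral fixes ride along with the rename: relative links are now resolved against whichever index page they came from instead of a hard-coded root, and each reference URL is written at most once. To illustrate what `make_links_absolute` does with a relative href (the snippet and href are illustrative, not from the real pages):

from lxml.html import fromstring

doc = fromstring('<div><a href="get-search-tweets">GET search/tweets</a></div>')
doc.make_links_absolute("https://dev.twitter.com/rest/reference")
print(doc.cssselect('a')[0].get('href'))
# https://dev.twitter.com/rest/get-search-tweets
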
diff --git a/json_api_gen/generate_json_apis.py b/json_api_gen/generate_json_apis.py
new file mode 100755
index 0000000..9fbaf46
--- /dev/null
+++ b/json_api_gen/generate_json_apis.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+import json
+import os
+from collections import defaultdict
+from vaquero import Vaquero, ModulePipeline
+from vaquero.util import jsonlines_reader
+
+
+SELF_DIR = os.path.dirname(os.path.realpath(__file__))
+RAW_DIR = os.path.join(SELF_DIR, "data", "raw")
+CLEAN_DIR = os.path.join(SELF_DIR, "data", "clean")
+INPUT_PATH = os.path.join(RAW_DIR, "raw_html.jsonl")
+
+
+if __name__ == '__main__':
+    os.makedirs(CLEAN_DIR, exist_ok=True)
+    definitions = defaultdict(list)
+
+    vaq = Vaquero(max_failures=0)
+    import api_pipeline
+    pipeline = ModulePipeline(api_pipeline)
+
+    # These reset so you can do `%run -i generate_json_apis.py` in IPython.
+    vaq.reset()
+    pipeline.reload()
+
+    for src in jsonlines_reader(INPUT_PATH):
+        dst = {}
+        with vaq:
+            pipeline(src, dst)
+        definitions[dst['group']].append(dst)
+
+    if not os.environ.get('DEBUG'):
+        for group, defns in definitions.items():
+            with open(os.path.join(CLEAN_DIR, group + "_api.json"), "w") as fp:
+                json.dump(defns, fp, indent=4)
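
A quick sanity check on a generated artifact, assuming a prior `make` run produced `data/clean/rest_api.json` (every record is guaranteed a 'method', 'url', and 'params' list by validate and validate_params above):

import json

with open("data/clean/rest_api.json") as fp:
    apis = json.load(fp)

# Count endpoints that declare at least one required parameter.
with_required = [d for d in apis
                 if any(p['required'] for p in d['params'])]
print(len(apis), "endpoints,", len(with_required), "with required params")
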