Skip to content

Commit

Permalink
Add first cut of API extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
John B Nelson committed Jun 25, 2017
1 parent bbd0d38 commit 62b5d61
Show file tree
Hide file tree
Showing 6 changed files with 429 additions and 10 deletions.
8 changes: 0 additions & 8 deletions api_fetch/Makefile

This file was deleted.

File renamed without changes.
12 changes: 12 additions & 0 deletions json_api_gen/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Paths of the two artifacts this Makefile tracks.
RAW_HTML=data/raw/raw_html.jsonl
JSON_API=data/clean/rest_api.json

# Re-run the generator whenever the pipeline module, the generator script,
# or the raw HTML dump is newer than the JSON API file.
# (Presumably generate_json_apis.py writes $(JSON_API); verify in the script.)
$(JSON_API): api_pipeline.py generate_json_apis.py $(RAW_HTML)
	./generate_json_apis.py

# Re-run the fetch script whenever it is newer than the raw HTML dump.
# (Presumably fetch_html.py writes $(RAW_HTML); verify in the script.)
$(RAW_HTML): fetch_html.py
	./fetch_html.py

# Recipe lines must start with a tab character; make rejects them otherwise.
.PHONY: clean
clean:
	rm -rf data
375 changes: 375 additions & 0 deletions json_api_gen/api_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,375 @@
import re
from vaquero import transformations as t
from lxml.html import fromstring as _fromstring


def prepare(src, dst):
    """Seed ``dst`` from ``src`` and swap the raw HTML for a parsed tree.

    Every key of ``src`` that does not start with an underscore is copied
    to ``dst``.  Afterwards ``src['_raw_html']`` is parsed with lxml, the
    resulting node is stored as ``src['doc']`` and the raw string is removed
    from ``src``.
    """
    for key, value in src.items():
        if key.startswith('_'):
            # Underscore-prefixed keys are private and stay in src only.
            continue
        dst[key] = value

    # Parse first, delete second: the raw HTML stays in place if parsing
    # raises.
    parsed = _fromstring(src['_raw_html'])
    src['doc'] = parsed
    del src['_raw_html']


# Fallback resource URLs, keyed by service id, for pages where the resource
# URL cannot be pulled out of the HTML.
MISSING_URLS = {
    'get-media-upload-status':
        "https://upload.twitter.com/1.1/media/upload.json",
    'post-media-upload-finalize':
        "https://upload.twitter.com/1.1/media/upload.json",
    'post-batch-accounts-account-id-line-items':
        "https://ads-api.twitter.com/1/batch/accounts/:account_id/line_items",
    'post-batch-accounts-account-id-campaigns':
        "https://ads-api.twitter.com/1/batch/accounts/:account_id/campaigns",
}


def _match_first(node, *css_selectors):
for selector in css_selectors:
m = node.cssselect(selector)
if m:
return m[0]
return None


def extract_and_verify_basic_info(src, dst):
    """Fill ``service``, ``path``, ``desc`` and ``url`` in ``dst``.

    The values come from the first ``div.section`` directly under the
    ``articleBody`` div of ``src['doc']``:

    * ``service`` is the section's ``id`` attribute;
    * the ``h1`` text is split on single spaces into a method (which must
      equal the ``dst['method']`` already present) and the ``path``;
    * ``desc`` is the text of the first ``p``;
    * ``url`` is taken from the ``#resource-url`` section, falling back to
      ``MISSING_URLS`` when an ``IndexError`` occurs during that lookup.

    Raises:
        AssertionError: the heading's method differs from ``dst['method']``.
        IndexError: an expected element is missing and, for the URL, no
            fallback is registered for the service.
    """
    section = src['doc'].cssselect(
        "div[itemprop='articleBody'] > div.section")[0]
    dst['service'] = section.attrib['id']

    heading_words = section.cssselect('h1')[0].text.split(" ")
    method, path_words = heading_words[0], heading_words[1:]
    assert dst['method'] == method
    dst['path'] = " ".join(path_words)
    dst['desc'] = section.cssselect("p")[0].text

    try:
        url_section = section.cssselect("#resource-url")[0]
        # Prefer the most specific markup that is present; otherwise use
        # the text of the whole section.
        hit = _match_first(url_section,
                           'span.pre', 'code.docutils', 'p > em', 'p > cite')
        holder = url_section if hit is None else hit
        dst['url'] = holder.text_content().strip()
    except IndexError:
        fallback = MISSING_URLS.get(dst['service'])
        if not fallback:
            raise
        dst['url'] = fallback


# Key renames applied to the resource-information table after its labels
# have been turned into Python-style identifiers.
RESOURCE_INFO_RENAMES = {
    'response_formats': 'resp_format',
    'requires_authentication': 'authentication_required',
}


def extract_resource_information(src, dst):
    """Copy the rows of the resource-information table into ``dst``.

    Each ``tr`` under ``div#resource-information`` must contain exactly two
    ``td`` cells (label, value); anything else raises ``ValueError`` from
    the unpacking.  The labels are passed through ``t.pythonize_ks`` and
    ``t.rename_ks`` (with ``RESOURCE_INFO_RENAMES``).  The ``resp_format``
    entry is stored on ``src`` rather than ``dst``; every other entry goes
    to ``dst``.
    """
    info = {}
    for row in src['doc'].cssselect('div#resource-information tr'):
        label, value = (cell.text_content() for cell in row.cssselect('td'))
        info[label] = value

    t.pythonize_ks(info)
    t.rename_ks(info, RESOURCE_INFO_RENAMES)

    for name, value in info.items():
        # The raw response format is kept on src, not dst.
        target = src if name == 'resp_format' else dst
        target[name] = value


def rate_limited_to_boolean(src, dst):
    """Replace ``dst['rate_limited']`` with ``True`` only if it was 'Yes'.

    A missing key or any other value becomes ``False``.  ``src`` is unused.
    """
    raw_flag = dst.get('rate_limited')
    dst['rate_limited'] = (raw_flag == 'Yes')


def authentication_required_to_boolean(src, dst):
    """Replace ``dst['authentication_required']`` with a boolean.

    The result is ``False`` when the key is missing or equals 'No', and
    ``True`` for any other value.  ``src`` is unused.
    """
    raw_flag = dst.get('authentication_required', "No")
    dst['authentication_required'] = not (raw_flag == 'No')


# Raw response-format strings mapped to the value stored on the record.
FORMAT_MAPPING = {
    'JSON': 'JSON',
    "204 - No Content": None,
}


def extract_resp_formats(src, dst):
    """Set ``dst['resp_format']`` from the raw value held in ``src``.

    When ``src`` has no ``resp_format`` key the result is ``None``.
    Otherwise the raw value is looked up in ``FORMAT_MAPPING``; an unknown
    value raises ``KeyError`` and leaves ``dst`` untouched.
    """
    if 'resp_format' not in src:
        dst['resp_format'] = None
        return
    dst['resp_format'] = FORMAT_MAPPING[src['resp_format']]


# Maps each 'requests...' key of the record to the bucket it fills in
# dst['limits'].
LIMIT_MAP = {
    'requests_15_min_window_app_auth': 'app',
    'requests_15_min_window_per_app': 'app',
    'requests_15_min_window_user_auth': 'user',
}


def extract_limits(src, dst):
    """Build ``dst['limits']`` from the ``requests*`` entries of ``dst``.

    Every key of ``dst`` starting with 'requests' is inspected; its
    stripped value is either one of two special-case strings or an integer
    literal stored under the bucket named by ``LIMIT_MAP``.  ``src`` is
    unused.

    Raises:
        AssertionError: a bucket would be filled twice, or the "refer to
            your agreements" text shows up on an unexpected service.
        KeyError: a 'requests...' key is not listed in ``LIMIT_MAP``.
        ValueError: the value is not an integer literal.
    """
    limits = {}
    # Attach the (still empty) result before iterating so dst does not
    # change size during the loop.
    dst['limits'] = limits

    for key, raw in dst.items():
        if key.startswith('requests'):
            text = raw.strip()

            if text == '15/user and 750/app':
                # One cell carrying both buckets.
                limits['user'] = 15
                limits['app'] = 750
            elif text == 'Refer to your existing rate limit agreements.':
                # No numeric limit; only tolerated for this one service.
                service = dst['service']
                assert service == 'post-direct-messages-events-new-message-create'
            else:
                bucket = LIMIT_MAP[key]
                assert bucket not in limits
                limits[bucket] = int(text)


def _fix_param_desc(d):
s = d.get('description')
if s is None:
return
s = s.replace(" . ", ". ").replace(" , ", ", ")
if s.endswith(" ."):
s = s[:-2] + "."
d['description'] = s


def _fix_param_default(d):
default = d.get('default_value')
if default is not None and default == '':
del d['default_value']


TYPE_REGEXP = re.compile(r"([\d\w\_]+)\s+Type: (.*)$")


def _fix_param_type(d):
m = TYPE_REGEXP.match(d['name'])
if not m:
return
d['name'] = m.group(1)
d['type'] = m.group(2)


# Column names a parameter table header may contain for the table to be
# parsed as a regular, header-first table.
VALID_PARAM_KS = {
    'name',
    'default_value',
    'example',
    'required',
    'description',
    'constraints',
}


def _extract_header_from_rows(rows):
    """Return the first row's cell texts as a tuple of identifiers.

    ``td`` cells are used when the first row has any, otherwise ``th``
    cells.  The texts are passed through ``t.sstrip_all`` and each one
    through ``t.pythonize_identifier``.  Returns ``None`` when the first
    row has neither kind of cell.
    """
    first_row = rows[0]

    for tag in ('td', 'th'):
        labels = [cell.text_content() or ''
                  for cell in first_row.cssselect(tag)]
        if labels:
            break
    else:
        return None

    t.sstrip_all(labels)
    return tuple(t.pythonize_identifier(label) for label in labels)


def _tail(items):
head_consumed = False
for item in items:
if not head_consumed:
head_consumed = True
else:
yield item


# Parameter names that carry extra text in the table cell, mapped to the
# bare parameter name.
SPECIAL_NAMES = {
    'follow see note*': 'follow',
    'track see note*': 'track',
    'user_ids DEPRECATED': 'user_ids',
    'locations see note*': 'locations',
}


def _extract_param_rows_normal(header, rows):
    """Turn a header-first parameter table into a list of dicts.

    The first row is skipped.  For each remaining row the ``td`` texts are
    stripped with ``t.sstrip_all`` and zipped with ``header``.  The
    resulting dict is then run through the description, default and type
    fix-ups, and its name is normalised through ``SPECIAL_NAMES``.

    Raises ``KeyError`` when a row ends up without a 'name' entry.
    """
    records = []

    for row in _tail(rows):
        cells = [cell.text_content() or '' for cell in row.cssselect('td')]
        t.sstrip_all(cells)

        record = dict(zip(header, cells))
        for fixup in (_fix_param_desc, _fix_param_default, _fix_param_type):
            fixup(record)

        # Known one-off names that carry trailing notes.
        name = record['name']
        record['name'] = SPECIAL_NAMES.get(name, name)

        records.append(record)

    return records


# Matches "<identifier> (<text>)", e.g. a parameter name followed by a
# parenthesised note.  This is a raw string so that the backslashes reach
# the regex engine untouched and do not trigger Python's invalid
# escape-sequence warning.
EMBEDDED_PARAM_RE = re.compile(r"([\w\d\_]+)\s+\(([^\)]+)\)")


def _extract_param_rows_no_header(rows):
    """Turn a two-column, header-less parameter table into a list of dicts.

    Every row (including the first) must have exactly two ``td`` cells: a
    key of the form "<name> (<required text>)" and a description.  Each
    row becomes ``{'desc': ..., 'name': ..., 'required': ...}``.

    Raises:
        ValueError: a row does not have exactly two cells.
        AttributeError: a key cell does not match ``EMBEDDED_PARAM_RE``.
    """
    data = []

    for tr in rows:
        items = [td.text_content() or '' for td in tr.cssselect('td')]
        t.sstrip_all(items)
        k, desc = items

        # _fix_param_desc only looks at the 'description' key, so calling it
        # on a dict holding just 'desc' did nothing.  Run the cleanup on a
        # scratch dict and store the result under 'desc', the key these rows
        # have always been emitted with.
        scratch = {'description': desc}
        _fix_param_desc(scratch)
        d = {'desc': scratch['description']}

        name, required = EMBEDDED_PARAM_RE.match(k).groups()
        d['name'] = name
        d['required'] = required
        data.append(d)

    return data


def extract_params(src, dst):
    """Parse the parameters table of ``src['doc']`` into ``dst['params']``.

    Nothing is written when the page has no ``#parameters > table tr``
    rows.  Otherwise the first row decides how the table is read:

    * all of its labels are in ``VALID_PARAM_KS``: a regular table whose
      first row is the header;
    * it has exactly two cells: a header-less name/description table;
    * anything else: the labels are recorded under ``dst['_bad_ks']`` and
      no params are extracted.  A first row with no cells at all is
      recorded as an empty tuple.

    ``dst['params']`` is only set when at least one parameter was found.
    """
    rows = src['doc'].cssselect('#parameters > table tr')
    if not rows:
        return

    header = _extract_header_from_rows(rows)
    if header is None:
        # The first row has neither td nor th cells.  Flag the record the
        # same way as any other unrecognised layout, rather than failing on
        # set(None) below.
        dst['_bad_ks'] = ()
        return

    data = None
    if set(header).issubset(VALID_PARAM_KS):
        data = _extract_param_rows_normal(header, rows)
    elif len(header) == 2:
        data = _extract_param_rows_no_header(rows)
    else:
        dst['_bad_ks'] = header

    if data:
        dst['params'] = data


def fix_edge_case_params(src, dst):
    """Split names of the form "<name> <required>" on their spaces.

    For every entry of ``dst['params']`` whose name contains a space, the
    first piece becomes the name and the second piece overwrites
    ``required``; further pieces are dropped.  Names without a space are
    left alone.  ``src`` is unused.
    """
    for param in dst.get('params', []):
        pieces = param['name'].split(' ')
        if len(pieces) > 1:
            param['name'], param['required'] = pieces[0], pieces[1]


# Raw 'required' values that count as "yes".
REQUIRED_VALUES = {
    'True',
    'requied',
    'required',
}


def normalize_required_params(src, dst):
    """Turn each param's ``required`` entry into a boolean.

    The flag is ``True`` exactly when the current value is one of
    ``REQUIRED_VALUES``; a missing entry becomes ``False``.  ``src`` is
    unused.
    """
    for param in dst.get('params', []):
        raw_flag = param.get('required')
        param['required'] = raw_flag in REQUIRED_VALUES


def extract_example_request(src, dst):
    """Store the first example-request paragraph in ``dst``.

    ``dst['example_request']`` is set to the text content of the first
    ``#example-request > p`` element of ``src['doc']``; nothing is written
    when there is no such element.
    """
    paragraphs = src['doc'].cssselect('#example-request > p')
    if not paragraphs:
        return
    dst['example_request'] = paragraphs[0].text_content()


def extract_example_response(src, dst):
    """Store the first example-response code section in ``dst``.

    ``dst['example_response']`` is set to the text content of the first
    ``#example-response > div.code`` element of ``src['doc']``; nothing is
    written when there is no such element.
    """
    blocks = src['doc'].cssselect('#example-response > div.code')
    if not blocks:
        return
    dst['example_response'] = blocks[0].text_content()


def extract_family(_, dst):
    """Set ``dst['family']`` to "<group>:<first path segment>".

    A leading "/1/" (used by the Ads paths) is removed from ``dst['path']``
    before the first segment is taken.  Only the leading prefix is
    stripped; a "/1/" further along the path is left alone.

    Raises:
        AssertionError: the first path segment is empty (for example the
            path starts with "/").  ``dst['family']`` is not written in
            that case.
        KeyError: ``dst`` has no 'path' or 'group'.
    """
    path = dst['path']

    ads_prefix = '/1/'  # Ads
    if path.startswith(ads_prefix):
        # Slice instead of str.replace, which would also delete any later
        # "/1/" and glue the neighbouring segments together.
        path = path[len(ads_prefix):]

    sub_family = path.split('/')[0]
    # Check before writing so an invalid family never reaches dst.
    assert sub_family
    dst['family'] = dst['group'] + ":" + sub_family


# Accepted values for dst['group'].
VALID_GROUPS = {
    'ads',
    'rest',
    'streaming',
    'webhooks',
}

# Accepted values for dst['family'], written as "<group>:<sub family>".
VALID_FAMILES = {
    # ads
    'ads:accounts',
    'ads:batch',
    'ads:bidding_rules',
    'ads:conversion_attribution',
    'ads:conversion_event',
    'ads:iab_categories',
    'ads:insights',
    'ads:line_items',
    'ads:stats',
    'ads:tailored_audience_memberships',
    'ads:targeting_criteria',
    # rest
    'rest:account',
    'rest:application',
    'rest:blocks',
    'rest:collections',
    'rest:direct_messages',
    'rest:favorites',
    'rest:followers',
    'rest:friends',
    'rest:friendships',
    'rest:geo',
    'rest:help',
    'rest:lists',
    'rest:media',
    'rest:mutes',
    'rest:saved_searches',
    'rest:search',
    'rest:statuses',
    'rest:trends',
    'rest:users',
    # streaming
    'streaming:c',
    'streaming:site',
    'streaming:statuses',
    'streaming:user',
    # webhooks
    'webhooks:account_activity',
}

# key -> (allowed values or None for "anything", default or None for
# "no default; the key must be present").
VALIDATION_SPEC = {
    'authentication_required': ({True, False}, False),
    'desc': (None, None),
    'family': (VALID_FAMILES, None),
    'group': (VALID_GROUPS, None),
    'method': ({'GET', 'POST', 'DELETE'}, 'GET'),
    'resp_format': ({'JSON', None}, 'JSON'),
    'rate_limited': ({True, False}, True),
    'path': (None, None),
    'url': (None, None),
    'reference_url': (None, None),
}


def validate(_, dst):
    """Check ``dst`` against ``VALIDATION_SPEC``, filling in defaults.

    For each key in the spec: a missing key is set to its default, or
    fails the assertion when the spec has no default.  A present key must
    be one of the allowed values when the spec lists any.

    Raises:
        AssertionError: a key without a default is missing, or a value is
            outside its allowed set.
    """
    for key, (allowed, fallback) in VALIDATION_SPEC.items():
        if key in dst:
            if allowed is not None:
                assert dst[key] in allowed, dst[key]
            continue

        if fallback is None:
            assert False, "Missing and no default on {}".format(key)
        else:
            dst[key] = fallback


# key -> (allowed values or None for "anything", default or None for
# "no default; the key must be present").
PARAM_VALIDATION_SPEC = {
    'name': (None, None),
    'required': ({True, False}, None),
}


def validate_params(_, dst):
    """Check every entry of ``dst['params']`` against the param spec.

    A missing ``params`` key is replaced by an empty list.  Otherwise each
    param must contain every key of ``PARAM_VALIDATION_SPEC`` (or get its
    default, when the spec has one) and hold an allowed value where the
    spec lists any.

    Raises:
        AssertionError: a key without a default is missing, or a value is
            outside its allowed set.
    """
    if 'params' not in dst:
        dst['params'] = []
        return

    for param in dst['params']:
        for key, (allowed, fallback) in PARAM_VALIDATION_SPEC.items():
            if key in param:
                if allowed is not None:
                    assert param[key] in allowed, param[key]
            elif fallback is None:
                assert False, "Missing and no default on {}".format(key)
            else:
                param[key] = fallback

0 comments on commit 62b5d61

Please sign in to comment.