In [90]:
import re
import requests
import lxml.html
import datetime
import json

# Page whose archived copies are scraped: it is sent to the Wayback Machine CDX
# query and appended to every snapshot URL that gets fetched.
URL = "http://lanyrd.com/profile/jacobian/coverage"

In [44]:
# Get all the dates that the wayback machine has.
#
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# later as a confusing JSON decoding failure.

resp = requests.get(
    'https://web.archive.org/cdx/search/cdx',
    params={
        "url": URL,
        "showDupeCount": "true",
        "output": "json"
    },
    timeout=60,
)
resp.raise_for_status()

In [45]:
# This returns a weird data structure -- a list of lists, the first item being headers
# and the remaining being fields. Sorta like CSV encoded into JSON... so munge that
# into a better format: one dict per snapshot, keyed by the header names.

cdx = resp.json()
# A response with no rows at all has no header row either; treat that as
# "no snapshots" rather than failing with an IndexError.
fields = cdx[0] if cdx else []
snapshots = [dict(zip(fields, row)) for row in cdx[1:]]

In [60]:
def strip_archive_url(url):
    """
    Convert an archive.org URL (https://web.archive.org/web/ddddd/URL) to the original.

    Everything up to and including the FIRST ``/web/<digits>/`` segment is
    removed, so an original URL that itself contains ``/web/<digits>/`` is
    returned intact. A two-letter replay modifier after the timestamp
    (e.g. ``/web/20130101000000im_/``) is stripped as well. Host-relative
    links (``/web/ddddd/URL``) are handled the same way as absolute ones.

    Returns ``url`` unchanged when it does not look like an archive.org URL.
    """
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    # Non-greedy prefix: stop at the first archive marker, not the last one.
    return re.sub(r"^.*?/web/\d+(?:[a-z]{2}_)?/(.*)", r"\1", url, count=1)

In [57]:
MONTHS_3_REV = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
                "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}

# Regex fragments shared by the date patterns below.
_DAY = r"(\d{1,2})(?:st|nd|rd|th)"                       # "3rd", "22nd", ...
_MONTH = r"((?:" + "|".join(MONTHS_3_REV) + r")[a-z]*)"  # "Jun", "June", "Sept", ...
_YEAR = r"(\d{4})"
_DASH = r"\s*[-\u2013\u2014]\s*"                         # hyphen, en dash or em dash

# "3rd-8th June 2018"
_SAME_MONTH_RANGE_RE = re.compile(
    _DAY + _DASH + _DAY + r"\s+" + _MONTH + r"\s+" + _YEAR, re.IGNORECASE)
# "30th May-2nd June 2018" / "30th December 2018-2nd January 2019"
_CROSS_MONTH_RANGE_RE = re.compile(
    _DAY + r"\s+" + _MONTH + r"(?:\s+" + _YEAR + r")?" + _DASH
    + _DAY + r"\s+" + _MONTH + r"\s+" + _YEAR, re.IGNORECASE)
# "3rd June 2018"
_SINGLE_DATE_RE = re.compile(_DAY + r"\s+" + _MONTH + r"\s+" + _YEAR, re.IGNORECASE)


def _month_number(name):
    """Map a month name or abbreviation (any case) to its number, 1-12."""
    return MONTHS_3_REV[name.lower()[:3]]


def extract_date(s):
    """
    Extract a date from lanyrd-style date string

    Recognised forms, searched for anywhere inside ``s``:

    * a range within one month, e.g. "3rd-8th June 2018"
    * a range spanning months or years, e.g. "30th May-2nd June 2018"
      or "30th December 2018-2nd January 2019"
    * a single date, e.g. "3rd June 2018"

    All four ordinal suffixes (st, nd, rd, th) are accepted, the range
    separator may be a hyphen or an en/em dash, and any whitespace may
    separate the parts. Words that are not month names are never treated
    as a month, so they cannot produce a bogus match.

    This function always returns (start, end) as ``datetime.date`` objects,
    even if it's just a single date.

    Raises ``ValueError`` when no date can be found, or when the matched
    numbers do not form a real calendar date.
    """
    # Same-month range is tried first, then the month-spanning range, and
    # only then a lone date, so a range is never mistaken for its end date.
    m = _SAME_MONTH_RANGE_RE.search(s)
    if m:
        start_day, end_day, month, year = m.groups()
        month = _month_number(month)
        start_date = datetime.date(int(year), month, int(start_day))
        end_date = datetime.date(int(year), month, int(end_day))
        return (start_date, end_date)

    m = _CROSS_MONTH_RANGE_RE.search(s)
    if m:
        start_day, start_month, start_year, end_day, end_month, end_year = m.groups()
        start_month = _month_number(start_month)
        end_month = _month_number(end_month)
        end_year = int(end_year)
        if start_year is not None:
            start_year = int(start_year)
        elif start_month > end_month:
            # "30th December-2nd January 2019": the start is in the previous year.
            start_year = end_year - 1
        else:
            start_year = end_year
        start_date = datetime.date(start_year, start_month, int(start_day))
        end_date = datetime.date(end_year, end_month, int(end_day))
        return (start_date, end_date)

    m = _SINGLE_DATE_RE.search(s)
    if m:
        day, month, year = m.groups()
        date = datetime.date(int(year), _month_number(month), int(day))
        return (date, date)

    raise ValueError(f"can't parse date string: '{s}'")

In [80]:
def scrape_coverage(snapshots, timeout=60):
    """
    Scrape coverage (video, slides, etc) from each snapshot
    Have to do this because pagination isn't archived (it
    uses query strings), so this is the best way to go back
    and gather as much as possible

    ``snapshots`` is an iterable of dicts, each with at least a
    "timestamp" key; a "digest" key, when present, is used to skip
    snapshots whose content was already processed. ``timeout`` is the
    number of seconds passed to ``requests.get`` for each snapshot fetch.

    Returns a dict keyed by the (un-archived) coverage link. Each value
    holds the coverage type, talk title, conference title, the scraped
    conference info, and the start/end dates.

    Network failures (including timeouts) propagate as ``requests``
    exceptions; a snapshot that does not come back as a non-empty
    HTTP 200 page is skipped.
    """
    coverage = {}

    # Map conference titles to conference info, so that we don't
    # fetch it multiple times for repeat conferences
    conferences = {}

    # Digests of snapshots already scraped. Presumably an identical digest
    # means identical archived content (that is what the CDX index reports it
    # for), so re-fetching such a snapshot could only yield links seen before.
    seen_digests = set()

    for snapshot in snapshots:
        digest = snapshot.get("digest")
        if digest is not None and digest in seen_digests:
            continue

        ts = snapshot["timestamp"]
        resp = requests.get(f"https://web.archive.org/web/{ts}/{URL}", timeout=timeout)
        # An empty body makes lxml raise; anything but a 200 is not the page we want.
        if resp.status_code != 200 or not resp.text.strip():
            continue
        # Only remember the digest once the page was actually retrieved, so a
        # failed fetch does not mask a later copy of the same content.
        if digest is not None:
            seen_digests.add(digest)

        doc = lxml.html.fromstring(resp.text)
        for ci in doc.cssselect('#coverage .coverage-item'):

            # Extract link to the coverage (view, slides, etc)
            # Treat this as the unique id since as we page we might
            # see these repeatedly. Skip if we've seen it before.
            coverage_link = strip_archive_url(ci.cssselect('.title a')[0].attrib['href'])
            if coverage_link in coverage:
                continue

            coverage_type = ci.cssselect('span.type')[0].text.lower()

            # Talk and conference title, and links to the archive.org pages
            # Don't follow these now, since the same conference and talk
            # may show up multiple times.
            e_talk, e_conference, *_ = ci.cssselect('p.meta a')
            talk_title = e_talk.text

            con_title = e_conference.text
            if con_title not in conferences:
                conferences[con_title] = scrape_conference(e_conference.attrib['href'])

            start_date, end_date = extract_date(ci.cssselect('p.meta')[0].text_content())

            coverage[coverage_link] = {
                'type': coverage_type,
                'talk_title': talk_title,
                'conference_title': con_title,
                'conference': conferences[con_title],
                'start_date': start_date,
                'end_date': end_date
            }

    return coverage

In [86]:
def scrape_conference(url, timeout=60):
    """
    Fetch a conference page and pull out its title and website link.

    ``url`` is the address of the conference page to download and
    ``timeout`` is the number of seconds passed to ``requests.get``.

    Returns ``{'title': ..., 'link': ...}`` on success, where ``link`` has
    its archive.org prefix removed. Either value is ``None`` when the page
    lacks the corresponding element. On failure returns ``{'error': ...}``
    holding the HTTP status code, or the exception text when the request
    itself failed, so that one bad page does not abort a long scrape.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        return {'error': str(exc)}
    if resp.status_code != 200:
        return {'error': resp.status_code}

    doc = lxml.html.fromstring(resp.text)
    titles = doc.cssselect('h1.summary')
    websites = doc.cssselect('a.icon.url.website')

    return {
        'title': titles[0].text_content() if titles else None,
        'link': strip_archive_url(websites[0].attrib['href']) if websites else None
    }

In [88]:
# Run the scrape: this downloads archived snapshot pages and, for each conference
# title not seen before, that conference's page.
coverage = scrape_coverage(snapshots)

In [93]:
def serialize_dates(obj):
    """
    Fallback encoder for values the json module cannot serialize itself.

    Anything exposing an ``isoformat`` attribute is encoded by calling it;
    every other object is rejected with a ``TypeError`` carrying its type.
    """
    if not hasattr(obj, 'isoformat'):
        raise TypeError(type(obj))
    return obj.isoformat()
        
# Persist the scraped coverage as JSON. The date values are not JSON-encodable
# on their own, so serialize_dates is supplied as the fallback encoder.
output_path = '/tmp/lanyrd-coverage.json'
with open(output_path, mode='w') as fp:
    json.dump(coverage, fp, default=serialize_dates)