URL Crawl Status Lookup
=====================

Given a URL, this page looks it up in the crawl-time and access-time indexes and reports on its recent status.

In [32]:
%%javascript
// This is necessary to stop the output area folding up:
// replace the output area's scroll check with one that always answers false,
// whatever number of lines it is asked about.
IPython.OutputArea.prototype._should_scroll = function(lines) {return false}

<IPython.core.display.Javascript object>

In [33]:
#%pip install altair

In [1]:
from IPython.display import Javascript
from ipywidgets import interact, interactive, fixed, interact_manual, Layout, HTML
import ipywidgets as widgets
import requests
from datetime import datetime
import pandas as pd
import altair as alt
from cdx.cdx_helper import cdx_query, CRAWL_CDX

# --
# URL used to pre-fill the lookup form.
#
# See https://github.com/voila-dashboards/voila/pull/218#issuecomment-553654037
# for a hook that could be added to set the default URL.
# NOTE(review): no such hook is present here; the value below is fixed.
# --

default_url = 'https://www.bl.uk/'



def query_to_df(query):
    """Collect the results of a CDX query into a pandas DataFrame.

    Every item produced by ``query`` is converted with its ``to_dict()``
    method and becomes one row. The returned frame always carries the same
    fixed set of CDX columns, in the order listed below, even when ``query``
    yields nothing.
    """
    cdx_columns = [
        'urlkey', 'crawl_date', 'timestamp', 'original', 'mimetype', 'statuscode',
        'redirecturl', 'robotflags', 'length', 'offset', 'filename', 'digest',
    ]
    records = [hit.to_dict() for hit in query]
    return pd.DataFrame(records, columns=cdx_columns)

def get_year_profile(url, cdx_service='http://cdx.api.wa.bl.uk/data-heritrix'):
    """Count the CDX records returned for ``url``, grouped by year.

    Queries ``cdx_service`` (up to 100000 records) and tallies each record
    under the year given by the first four characters of its ``timestamp``.

    Args:
        url: The URL to look up.
        cdx_service: Base URL of the CDX service to query.

    Returns:
        A dict mapping year (int) to the number of records seen for that
        year. The years 2002 through 2019 are always present, with a count
        of 0 when there are no records; any other year appears only if at
        least one record falls in it.
    """
    # Pre-seed the expected range so years without records still show up.
    # The built-in range is used because numpy is not imported by this file.
    years = {year: 0 for year in range(2002, 2020)}
    for h in cdx_query(url, cdx_service=cdx_service, limit=100000):
        year = int(h.timestamp[0:4])
        # Records outside the pre-seeded range are counted rather than
        # aborting the whole profile with a KeyError.
        years[year] = years.get(year, 0) + 1
    return years

def chart_events(source, size):
    """Build an interactive Altair scatter chart of CDX events.

    Each record in ``source`` is drawn as a circle of the given ``size``,
    positioned by ``crawl_date`` (x) and ``statuscode`` (y) and coloured by
    its ``source`` field. Each point links to the record's QA Wayback URL
    and shows the CDX fields plus that URL as a tooltip.

    NOTE(review): ``source`` is presumably a DataFrame as produced by
    ``query_to_df`` with an added ``source`` column -- confirm with callers.
    """
    cdx_fields = ['urlkey', 'crawl_date', 'timestamp', 'original', 'mimetype', 'statuscode',
                  'redirecturl', 'robotflags', 'length', 'offset', 'filename', 'digest']

    # Add a link to QA Wayback:
    wayback_link = 'https://www.webarchive.org.uk/act/wayback/' + alt.datum.timestamp + '/' + alt.datum.original

    chart = alt.Chart(source).transform_calculate(url=wayback_link)
    chart = chart.mark_circle(size=size)
    chart = chart.encode(
        alt.X('crawl_date'),
        alt.Y('statuscode'),
        color='source',
        href='url:N',
        tooltip=cdx_fields + ['url:N'],
    )
    chart = chart.properties(width=800, height=200)
    return chart.interactive()

# ------------------------------------------------

def selected_timestamp(change):
    """Observer callback: show a Kafka log link for the chosen timestamp.

    Clears ``ts_output`` and, when a timestamp is selected, displays the
    selection followed by a link to the crawl-log viewer, filtered by the
    selected time and by the URL currently in ``url_widget``.

    Args:
        change: The change dict passed by the widget's ``observe``
            mechanism; ``change['new']`` is the newly selected value, a
            14-digit timestamp string or ``None``.
    """
    from urllib.parse import quote

    timestamp = change['new']
    ts_output.clear_output()
    # The dropdown's placeholder option has the value None; there is nothing
    # to link to in that case, and slicing None would raise a TypeError.
    if not timestamp:
        return
    with ts_output:
        display(timestamp, change)
        # Format as "YYYY-MM-DD+HH%3AMM", i.e. an already URL-encoded
        # "YYYY-MM-DD HH:MM".
        ts = "%s-%s-%s+%s%%3A%s" % (timestamp[0:4],timestamp[4:6],timestamp[6:8],timestamp[8:10],timestamp[10:12])
        # Percent-encode the URL so characters such as '&', '#' or a quote
        # cannot cut the query string short or break the href attribute.
        # ':' and '/' are left alone so ordinary URLs are unchanged.
        url_filter = quote(url_widget.value, safe=':/')
        display(HTML("<a href='http://192.168.45.91:90/intranet/logs/?topic=fc&from_date=%s&url_filter=%s*'>Kafka</a>" % (ts, url_filter)))

def lookup(url, limit):
    """Look up ``url`` in both CDX indexes and display the results.

    Queries the default CDX service and the ``CRAWL_CDX`` service for up to
    ``limit`` records each, tags the rows as 'Wayback' and 'Crawler'
    respectively, and displays a layered overview chart of both. Then shows
    a dropdown of the crawler records whose selection is handled by
    ``selected_timestamp``, followed by the shared ``ts_output`` area.
    """
    wayback_df = query_to_df(cdx_query(url, limit=limit))
    wayback_df['source'] = 'Wayback'
    crawler_df = query_to_df(cdx_query(url, cdx_service=CRAWL_CDX, limit=limit))
    crawler_df['source'] = 'Crawler'

    # Plot the overview, with the larger crawler markers in the first layer:
    overview = chart_events(crawler_df, 500) + chart_events(wayback_df, 200)
    display(overview)

    # Add the time-stamp selector; the first entry is an empty placeholder:
    options = [(None, None)] + [
        ("%s (HTTP %s) %s" % (row['crawl_date'], row['statuscode'], row.get('filename', None)), row['timestamp'])
        for _, row in crawler_df.iterrows()
    ]
    selector = widgets.Dropdown(options=options, description='Timestamp:')
    selector.observe(selected_timestamp, names='value')
    display(selector)
    display(ts_output)



# Set up the widgets: a text box pre-filled with the default URL, and a
# numeric box for the maximum number of results.
url_widget = widgets.Text(description="URL:", placeholder=default_url, value=default_url,
                          layout=Layout(width='800px'))
limit_widget = widgets.IntText(description="# results", value=50)

# Shared output area that selected_timestamp() clears and writes into.
ts_output = widgets.Output()

# Assemble the interactive bit: lookup() only runs when the button is pressed,
# and the button is relabelled via the interact_manual options.
widgets.interact_manual.opts['manual_name'] = 'Lookup this URL'
out = interact_manual(lookup, 
                      url=url_widget,
                      limit=limit_widget
)

# Auto run with the default values so you get the idea:
#lookup(url_widget.value, limit_widget.value)

interactive(children=(Text(value='https://www.bl.uk/', description='URL:', layout=Layout(width='800px'), place…

Note that the crawl-time index only holds relatively recent events, whereas the access-time index should hold all events except the most recent.