# Archived Holdings - DLS Status

Comparing holdings on HDFS with what's in DLS, based on the status information stored in the Tracking Database.


In [1]:
import json
import requests
import pandas as pd

# The facet definition below is sent as a JSON request body.
headers = {"content-type": "application/json"}

# The nested facet request is assembled from the innermost level outwards.

# Level 3: within each stream, one bucket per DLS status value, each carrying
# the summed file size. 'missing' adds a bucket for records with no
# dls_status_i value, so un-statused holdings are still counted.
status_facet = {
    "type": "terms",
    "field": "dls_status_i",
    "missing": True,
    "facet": {
        "bytes": "sum(file_size_l)",
    },
}

# Level 2: within each month, one bucket per stream (stream_s), again keeping
# a bucket for records that have no stream value.
stream_facet = {
    "type": "terms",
    "field": "stream_s",
    "missing": True,
    "facet": {
        "index_status": status_facet,
    },
}

# Level 1 (primary facet): monthly buckets over timestamp_dt, spanning from
# ten years before the start of the current year to the end of this year.
dates_facet = {
    "type": "range",
    "field": "timestamp_dt",
    "start": "NOW/YEAR-10YEAR",
    "end": "NOW/YEAR+1YEAR",
    "gap": "+1MONTH",
    "facet": {
        "stream": stream_facet,
    },
}

json_facet = {
    "facet": {
        "dates": dates_facet,
    },
}


# Select WARC and log records in the NPLD collection. rows=0 asks for no
# documents back: only the facet results are used further down.
params = {
    "q": '(kind_s:"warcs" OR kind_s:"logs") AND collection_s:"npld"',
    "rows": 0,
}

# POST the facet definition to the Tracking Database Solr select endpoint.
# A timeout is set so an unreachable or stalled server fails the cell instead
# of hanging the kernel forever; it is generous because the facet spans years.
r = requests.post(
    "http://trackdb.dapi.wa.bl.uk/solr/tracking/select",
    params=params,
    data=json.dumps(json_facet),
    headers=headers,
    timeout=300,
)

if r.status_code != 200:
    # Show the server's error body, then stop here for 4xx/5xx responses
    # rather than letting the later r.json()['facets'] lookup fail with a
    # less helpful error.
    print(r.text)
    r.raise_for_status()

# Project-local helper; presumably turns the nested facet buckets into flat
# records, one per (dates, stream, index_status) combination — see
# solr_facet_helper to confirm.
from solr_facet_helper import flatten_solr_buckets

df = pd.DataFrame(flatten_solr_buckets(r.json()['facets']))

# Filter empty rows. The explicit .copy() makes the result an independent
# frame, so adding a column below cannot trigger SettingWithCopyWarning.
df = df[df['count'] != 0].copy()

# Add compound column, e.g. "domain, status 15". Built with vectorised string
# concatenation, which yields the same text as "%s, status %s" per row but
# also works when the filter above leaves no rows at all.
df['status'] = df['stream'].astype(str) + ", status " + df['index_status'].astype(str)

df

Unnamed: 0,dates,count,stream,index_status,bytes,status
39,2013-04-01T00:00:00Z,20949,domain,15,2.149665e+13,"domain, status 15"
41,2013-04-01T00:00:00Z,1304,frequent,15,1.233495e+12,"frequent, status 15"
44,2013-05-01T00:00:00Z,10448,domain,15,1.064658e+13,"domain, status 15"
46,2013-05-01T00:00:00Z,549,frequent,15,5.036877e+11,"frequent, status 15"
49,2013-06-01T00:00:00Z,1715,domain,15,1.763993e+12,"domain, status 15"
...,...,...,...,...,...,...
319,2020-02-01T00:00:00Z,1,webrecorder,missing,1.653237e+09,"webrecorder, status missing"
321,2020-03-01T00:00:00Z,4431,frequent,missing,4.517895e+12,"frequent, status missing"
323,2020-04-01T00:00:00Z,4670,frequent,missing,4.721108e+12,"frequent, status missing"
325,2020-05-01T00:00:00Z,4620,frequent,missing,4.702210e+12,"frequent, status missing"


In [6]:
import altair as alt

# Stacked bars of the summed file size per monthly bucket, coloured by the
# combined stream / DLS status label.
alt.Chart(df).mark_bar().encode(
    x=alt.X('dates:T', axis = alt.Axis(title = 'Date', format = ("%b %Y"))),
    y=alt.Y('bytes', axis = alt.Axis(title = 'Total size (bytes)')),
    color='status:N',
    # Each bar covers a whole month (facet gap is +1MONTH), so the tooltip
    # shows month and year rather than implying a specific day.
    tooltip=[alt.Tooltip('dates:T', format='%B %Y'),'status:N', 'count', 'bytes']
).properties(width=600, title='Archived holdings by DLS status: bytes per month')

In [7]:
# Normalised stacked bars: for each monthly bucket, the share of the record
# count that falls under each stream / DLS status label.
alt.Chart(df).mark_bar().encode(
    x=alt.X('dates:T', axis = alt.Axis(title = 'Date', format = ("%b %Y"))),
    y=alt.Y('count', stack="normalize", axis=alt.Axis(format='%', title='Share of records')),
    color='status:N',
    # Each bar covers a whole month (facet gap is +1MONTH), so the tooltip
    # shows month and year rather than implying a specific day.
    tooltip=[alt.Tooltip('dates:T', format='%B %Y'),'status:N', 'count', 'bytes']
).properties(width=600, title='Archived holdings by DLS status: share of records per month')