# Archived Holdings - HDFS Status

A breakdown of what is stored on HDFS: data volume and file counts per month, split by collection and stream.

In [79]:
import json
import requests
import pandas as pd

# The facet definition is sent as a JSON request body, so declare its content type.
headers = {'content-type': "application/json"}

# The nested facet is assembled from the innermost level outwards.

# Innermost metric: summed file size (bytes) of the items in each bucket.
size_metrics = {
    'bytes': 'sum(file_size_l)'
}

# Within each collection, split by stream; 'missing' makes sure items with
# no stream value still get recorded.
stream_terms = {
    'type': 'terms',
    "field": "stream_s",
    'missing': True,
    'facet': size_metrics,
}

# Within each month, split by collection; again items with no value are kept.
collection_terms = {
    'type': 'terms',
    "field": "collection_s",
    'missing': True,
    'facet': {'stream': stream_terms},
}

# Primary facet is by date: one bucket per month ('gap'), running from ten
# years before the start of the current year up to the start of next year.
date_ranges = {
    'type': 'range',
    'field': 'timestamp_dt',
    'start': "NOW/YEAR-10YEAR",
    'end': "NOW/YEAR+1YEAR",
    'gap': "+1MONTH",
    'facet': {'collection': collection_terms},
}

json_facet = {
    'facet': {'dates': date_ranges}
}


# Restrict the query to WARC and log records. rows=0 because only the facet
# aggregations are used below, not the matching documents themselves.
params = {
  'q': '(kind_s:"warcs" OR kind_s:"logs")',
  'rows': 0
}

# The query goes in the URL parameters; the JSON facet definition is the POST body.
r = requests.post(
    "http://solr8.api.wa.bl.uk/solr/tracking/select",
    params=params,
    data=json.dumps(json_facet),
    headers=headers,
)

# Stop on any non-200 reply and surface the response body. Carrying on would
# only fail later, when the facets are read from the response, with an error
# that hides the real cause.
if r.status_code != 200:
    raise RuntimeError(
        "Solr facet query failed with HTTP %s:\n%s" % (r.status_code, r.text)
    )

from solr_facet_helper import flatten_solr_buckets

# Turn the nested facet buckets from the Solr response into a flat table.
df = pd.DataFrame(flatten_solr_buckets(r.json()['facets']))
# Filter empty rows. The explicit copy makes the result an independent frame,
# so adding columns below does not trigger pandas' SettingWithCopyWarning.
df = df[df['count'] != 0].copy()

# Add compound column: "<collection>, <stream>", used to colour the charts.
df['status'] = df['collection'].astype(str) + ", " + df['stream'].astype(str)
# Convert bytes to (decimal) terabytes.
df['terabytes'] = df['bytes'] / (1000*1000*1000*1000)

df

Unnamed: 0,dates,count,collection,stream,bytes,status,terabytes
11,2010-12-01T00:00:00Z,5120,selective,selective,8.896927e+11,"selective, selective",0.889693
14,2011-01-01T00:00:00Z,18394,selective,selective,1.682772e+12,"selective, selective",1.682772
17,2011-02-01T00:00:00Z,35234,selective,selective,2.063042e+12,"selective, selective",2.063042
20,2011-03-01T00:00:00Z,54621,selective,selective,3.134785e+12,"selective, selective",3.134785
23,2011-04-01T00:00:00Z,2633,selective,selective,2.691898e+11,"selective, selective",0.269190
...,...,...,...,...,...,...,...
456,2020-03-01T00:00:00Z,4431,npld,frequent,4.517895e+12,"npld, frequent",4.517895
459,2020-04-01T00:00:00Z,4670,npld,frequent,4.721108e+12,"npld, frequent",4.721108
462,2020-05-01T00:00:00Z,4620,npld,frequent,4.702210e+12,"npld, frequent",4.702210
465,2020-06-01T00:00:00Z,3510,npld,frequent,3.126129e+12,"npld, frequent",3.126129


In [96]:
import altair as alt

# Stacked bars of stored data volume per date bucket, coloured by status.
month_axis = alt.Axis(title='Date', format="%b %Y")
volume_axis = alt.Axis(title='Data volume (TB)')
hover_fields = [
    alt.Tooltip('dates:T', format='%A, %e %B %Y'),
    'status:N',
    'count',
    'terabytes',
]

volume_bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('dates:T', axis=month_axis),
    y=alt.Y('terabytes', axis=volume_axis),
    color='status:N',
    tooltip=hover_fields,
)
# Last expression of the cell, so the chart is rendered.
volume_bars.properties(width=600)

In [81]:
# Total data volume (TB) per status across all dates.
# One sum per status is the whole computation: after the sum each status has
# exactly one row, so a further per-group cumulative sum would change nothing.
df2 = df.groupby(['status'])['terabytes'].sum().reset_index()
df2

Unnamed: 0,status,terabytes
0,"npld, domain",476.085521
1,"npld, frequent",278.396912
2,"npld, webrecorder",0.003283
3,"selective, selective",26.44858


In [95]:
import altair as alt

# Cumulative total data volume over time, across all statuses.
alt.Chart(df).transform_aggregate(
    # Collapse the per-status rows so each date contributes a single total;
    # otherwise the running sum yields several intermediate values per date.
    terabytes='sum(terabytes)',
    groupby=['dates'],
).transform_window(
    # Running total in explicit date order, rather than relying on the order
    # in which the rows happen to appear in the DataFrame.
    cumulative_terabytes="sum(terabytes)",
    sort=[{'field': 'dates'}],
).mark_area().encode(
    x=alt.X('dates:T', axis=alt.Axis(title='Date', format=("%b %Y"))),
    y=alt.Y('cumulative_terabytes:Q', axis=alt.Axis(title='Cumulative total data volume (TB)')),
    tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'), 'cumulative_terabytes:Q']
).properties(width=600)

In [94]:
# Share of files per status in each date bucket: the bars are normalised so
# every date sums to 100%.
share_x = alt.X('dates:T', axis=alt.Axis(title='Date', format="%b %Y"))
share_y = alt.Y(
    'count',
    stack="normalize",
    axis=alt.Axis(title='Percentage of files', format='%'),
)
share_tooltip = [
    alt.Tooltip('dates:T', format='%A, %e %B %Y'),
    'status:N',
    'count',
    'bytes',
]

file_share_bars = alt.Chart(df).mark_bar().encode(
    x=share_x,
    y=share_y,
    color='status:N',
    tooltip=share_tooltip,
)
# Last expression of the cell, so the chart is rendered.
file_share_bars.properties(width=600)