In [1]:
import pandas as pd
import os
import sys
import bagit
import ntpath
from collections import Counter
from ipywidgets import interact

This notebook gives us some rough whole-of-collection statistics, such as format distribution, duplication and last-modified dates. It does this by walking through a directory and gathering up the DROID CSV reports it finds (created either with DROID or with Siegfried using the -droid switch).

Assumptions:
Each package is in a bagit structure.
Each package has a DROID CSV report, with a name ending in 'droid.csv'.

In [2]:
def droid_sniff(file):
    """Report whether ``file`` looks like a DROID-style CSV report.

    A file qualifies when its name ends with ``csv`` and the text
    ``PARENT_ID`` occurs within the first 16 characters of its content.

    Parameters
    ----------
    file : str
        Path of the file to inspect.

    Returns
    -------
    bool
        True when both checks pass, False otherwise.
    """
    if not file.endswith('csv'):
        return False
    # errors='replace' stops a CSV that cannot be decoded as text from aborting
    # the whole directory walk with UnicodeDecodeError; such a file simply
    # fails the header check below.
    with open(file, errors='replace') as handle:
        sample = handle.read(16)
    return 'PARENT_ID' in sample

This reads each DROID CSV report into a pandas DataFrame. It also adds a checksum column taken from the bag manifest, as we don't usually use the checksum option in DROID.

In [3]:
def norm_path(path):
    """Reduce a file path to its bag-relative, backslash-separated form.

    The path is normalised with ``ntpath`` and everything before the first
    path component named exactly ``data`` is dropped, so that
    ``C:\\corpus\\bag\\data\\a\\b.txt`` and ``/corpus/bag/data/a/b.txt`` both
    become ``data\\a\\b.txt``.

    Matching whole components (rather than the substring "data") keeps names
    such as ``metadata`` or ``mydata`` from being mistaken for the payload
    directory or from truncating the result.

    Parameters
    ----------
    path : str
        A file path; any other type (e.g. a NaN from an empty cell) is
        treated as "no path".

    Returns
    -------
    str or None
        The normalised path starting at ``data``, or None when ``path`` is not
        a string or has no ``data`` component.
    """
    if not isinstance(path, str):
        return None
    parts = ntpath.normpath(path).split(ntpath.sep)
    try:
        start = parts.index('data')
    except ValueError:
        return None
    return ntpath.sep.join(parts[start:])

def _find_bag_root(folder, top):
    """Return the nearest directory at or above ``folder`` that holds ``bagit.txt``.

    The search stops at ``top`` (or the filesystem root). When nothing is
    found ``folder`` itself is returned, leaving ``bagit.Bag`` to deal with it.
    Looking for the tag file avoids guessing the bag root from the substring
    "data", which goes wrong for folder names such as ``metadata``.
    """
    top = os.path.abspath(top)
    current = os.path.abspath(folder)
    while True:
        if os.path.isfile(os.path.join(current, 'bagit.txt')):
            return current
        parent = os.path.dirname(current)
        if current == top or parent == current:
            return folder
        current = parent


# Root folder that is walked for bags and their DROID reports
directory = 'corpus'
frames = []      # one DataFrame per non-empty DROID report
bag_names = []   # bag folder name for each entry in `frames`, in the same order
for root, _, files in os.walk(directory):
    for file in files:
        f_path = os.path.join(root, file)
        if not droid_sniff(f_path):
            continue
        droid = pd.read_csv(f_path, parse_dates=['LAST_MODIFIED'], index_col=False)
        # I had some empty droid csvs in the corpus for some reason, so:
        if len(droid) == 0:
            continue
        # Only open the bag once we know the report has rows worth keeping
        bag = bagit.Bag(_find_bag_root(root, directory))
        # Here, we're going to append the hashes from the bag to the dataframe.
        # Unnecessary if you use the hash function in droid
        droid['NORM_PATH'] = droid['FILE_PATH'].apply(norm_path)
        # Manifest paths are normalised with ntpath so they share the
        # separator style of NORM_PATH before joining
        hashes = pd.Series(
            {ntpath.normpath(k): v.get('sha256') for k, v in bag.payload_entries().items()},
            name='SHA256')
        frame = droid.join(hashes, on='NORM_PATH')
        frames.append(frame)
        bag_names.append(os.path.split(bag.path)[1])

We then concatenate the dataframes. We'll also create a subset for unique files by dropping duplicates on the checksum column.

In [4]:
def fstring(x):
    """Build one display label from a (PUID, FORMAT_NAME, FORMAT_VERSION) triple.

    Missing values are left out and the remaining ones are joined with single
    spaces. Testing for missing values directly (instead of deleting the text
    "nan" from the formatted string) keeps names that merely contain those
    letters, e.g. "Financial", intact.

    Parameters
    ----------
    x : sequence
        The three values to combine; only the first three items are used.

    Returns
    -------
    str
        The combined label, or ``'Unidentified'`` when every value is missing.
    """
    parts = ['{0}'.format(value) for value in list(x)[:3] if not pd.isna(value)]
    if not parts:
        return 'Unidentified'
    return ' '.join(parts).strip()
    

# Stack every report into one frame, indexed by (bag name, row number in report)
result = pd.concat(frames, keys=bag_names, names=['bag', 'index'], sort=False)
# We're going to concatenate a couple of columns to create a distinct string
# for each format and version
result['FORMAT_STRING'] = result[['PUID', 'FORMAT_NAME', 'FORMAT_VERSION']].apply(fstring, axis=1)

# Where droid isn't sure about a format, it creates lines for each possible
# match, so several rows share one URI. We collapse each such group to a single
# row: its PUID becomes all candidate PUIDs joined with '; ' and its format
# columns are set to 'Multiple matches'.
multi_match = result.duplicated(subset='URI', keep=False)
if multi_match.any():
    candidates = result.loc[multi_match, ['URI', 'PUID']]
    # Grouping by URI handles every group, including the last one, and does
    # not depend on the duplicate rows sitting next to each other
    joined_puids = candidates.groupby('URI', sort=False)['PUID'].agg(
        lambda puids: '; '.join(puids.dropna().astype(str)))
    # .values sidesteps index alignment; the rows are already in matching order
    result.loc[multi_match, 'PUID'] = candidates['URI'].map(joined_puids).values
result.loc[multi_match, ['FORMAT_NAME', 'FORMAT_STRING', 'FORMAT_VERSION', 'MIME_TYPE']] = 'Multiple matches'
result = result.drop_duplicates(subset='URI', keep='first')

files = result[result['TYPE'] == 'File']
# NOTE(review): rows with a missing SHA256 compare equal to each other here, so
# at most one of them survives this step - confirm that is the intended
# treatment for files that have no manifest checksum.
unique_files = files.drop_duplicates(subset='SHA256')


def f(item):
    """Return the rows of ``files`` that belong to the bag named ``item``.

    The ``bag`` level is removed from the index of the returned frame.
    """
    bag_rows = files.xs(item, level="bag")
    return bag_rows

# The first element of every index entry is the bag name; offer each bag once,
# in sorted order, as the dropdown choices
itemlist = sorted({entry[0] for entry in files.index.tolist()})
interact(f, item=itemlist)

interactive(children=(Dropdown(description='item', options=('sip_2014.0037.00599', 'sip_2014.0037.00600', 'sip…

<function __main__.f(item)>

In [5]:
def format_filter(hash):
    """Return the rows of ``files`` whose FORMAT_STRING equals ``hash``."""
    matches = files['FORMAT_STRING'] == hash
    return files[matches]
# Every distinct format label, sorted, becomes a choice in the dropdown
format_choices = files.FORMAT_STRING.drop_duplicates().sort_values()
interact(format_filter, hash=format_choices)

interactive(children=(Dropdown(description='hash', options=('Multiple matches', 'Unidentified', 'fmt/1070 Pref…

<function __main__.format_filter(hash)>

Now we're going to graph some data. Our semi-obvious first step is to make some pie charts of the format distribution. We'll do this by number of files and total file sizes.

In [6]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Fill a missing label once, on the label column only, so both charts group on
# exactly the same keys and no other column is touched
format_labels = unique_files['FORMAT_STRING'].fillna('unidentified')
# Number of unique files per format (largest first)
f_numbers = format_labels.value_counts()
# Total size of the unique files per format (largest first)
sizes = unique_files['SIZE'].groupby(format_labels).sum().sort_values(ascending=False)

# Labels and values are read straight from each Series' index and values, so
# the charts do not depend on the column names reset_index() would produce
# (those differ between pandas versions).
fig = {
    "data": [
        {"type": "pie", "name": "Formats by size",
         "values": sizes.tolist(), "labels": sizes.index.tolist(),
         "domain": {"column": 1}, "hole": .4,
         "hoverinfo": 'label+percent', "textinfo": 'value', "showlegend": False},
        {"type": "pie", "name": "Formats by count",
         "values": f_numbers.tolist(), "labels": f_numbers.index.tolist(),
         "domain": {"column": 0}, "hole": .4,
         "hoverinfo": 'label+percent', "textinfo": 'value', "showlegend": False}],
    "layout": {"legend": {"orientation": "h"},
               "title": "File formats",
               "grid": {"rows": 1, "columns": 2},
               # One caption in the hole of each pie: count on the left
               # (column 0), size on the right (column 1)
               "annotations": [
                   {"text": "By count", "x": 0.18, "y": 0.5, "font": {"size": 20}, "showarrow": False},
                   {"text": "By size", "x": 0.81, "y": 0.5, "font": {"size": 20}, "showarrow": False}]}}

iplot(fig, filename="formats")

In [7]:
# Column whose values define one scatter trace each. Kept under its own name so
# the function `f` used by the bag-browser widget is not overwritten.
FORMAT_COLUMN = 'FORMAT_NAME'
# Months from this one onwards are left off the chart.
# NOTE(review): presumably the date the material was transferred - confirm.
CUTOFF_MONTH = '2014-01'

data = []
# dropna(): a missing format name can never equal anything below, so it would
# only ever add an empty trace to the legend
for format_name in files[FORMAT_COLUMN].dropna().unique():
    modified = unique_files.loc[unique_files[FORMAT_COLUMN] == format_name, 'LAST_MODIFIED']
    # Unique files per 'YYYY-MM' month, in chronological order
    per_month = modified.dt.strftime('%Y-%m').value_counts().sort_index()
    per_month = per_month[per_month.index < CUTOFF_MONTH]
    # Months and counts come from the Series' index and values, so nothing
    # depends on the column names reset_index() would produce
    data.append(go.Scatter(
        x=per_month.index.tolist(), y=per_month.tolist(), name=format_name, mode='markers'))
layout = go.Layout(
    legend={'orientation': 'v'}, title="Format counts by modification date")
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='pandas-time-series')

In [18]:
# Only the earliest MAX_YEARS distinct years of each series are plotted
MAX_YEARS = 25

# Files per modification year, in chronological order, for all files and for
# the de-duplicated subset
years = files['LAST_MODIFIED'].dt.strftime('%Y').value_counts().sort_index().iloc[:MAX_YEARS]
uyears = unique_files['LAST_MODIFIED'].dt.strftime('%Y').value_counts().sort_index().iloc[:MAX_YEARS]

# Years and counts are taken from each Series' index and values, so nothing
# depends on the column names reset_index() would produce
year_fig = go.Figure(data=[
    go.Scatter(x=uyears.index.tolist(), y=uyears.tolist(), name='Unique files'),
    go.Scatter(x=years.index.tolist(), y=years.tolist(), name='Total files')],
    layout_title_text="File counts by modification date (year)")
iplot(year_fig, filename='pandas-time-series')

In [22]:
# Months from this one onwards are left off the chart.
# NOTE(review): presumably the date the material was transferred - confirm.
CUTOFF_MONTH = '2014-01'

# Files per 'YYYY-MM' modification month, in chronological order, for all files
# and for the de-duplicated subset
months = files['LAST_MODIFIED'].dt.strftime('%Y-%m').value_counts().sort_index()
umonths = unique_files['LAST_MODIFIED'].dt.strftime('%Y-%m').value_counts().sort_index()
# 'YYYY-MM' labels sort chronologically as plain strings, so a string
# comparison is enough to apply the cutoff
months = months[months.index < CUTOFF_MONTH]
umonths = umonths[umonths.index < CUTOFF_MONTH]

# Months and counts are taken from each Series' index and values, so nothing
# depends on the column names reset_index() would produce
month_fig = go.Figure(data=[
    go.Scatter(x=umonths.index.tolist(), y=umonths.tolist(), name='Unique files'),
    go.Scatter(x=months.index.tolist(), y=months.tolist(), name='Total files')],
    layout_title_text="File counts by modification date (month)")

iplot(month_fig, filename='pandas-time-series')