In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from glob import glob
from scipy.stats import norm
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.palettes import Bright6, Category20
from bokeh.transform import factor_cmap

output_notebook()

Data from the IA download is stored in TSV files that list which item was downloaded when, how many seconds that took, and whether an error occurred. Each log file corresponds to a separate run, each with a slightly different configuration: e.g. whether the file on disk is checked against the correct md5 checksum (slow) or not, the number of parallel downloads, and whether we're downloading WIDE15 or WIDE6.

In [33]:
def read_table(files):
    """Load one or more download logs into a single DataFrame.

    Each log is a tab-separated file without a header row. One specific log
    (``ia-download.wide16.20221216172143.log``) uses an older seven-column
    layout without a ``download_sec`` field; all other logs use the
    eight-column layout.

    Parameters
    ----------
    files : iterable of str
        Paths of the log files to read. Directory components are allowed.

    Returns
    -------
    pandas.DataFrame
        One row per log line, with a fresh 0..n-1 index and the columns
        ``ts``, ``item``, ``file``, ``path``, ``size``, ``download_sec``,
        ``md5``, ``error`` and ``logfile``. ``logfile`` is the second
        dot-separated component of the file's base name (e.g. ``wide16``).
        ``download_sec`` is NaN for rows read from the older layout.

    Raises
    ------
    ValueError
        If ``files`` yields no paths.
    """
    import os

    legacy_log = 'ia-download.wide16.20221216172143.log'
    legacy_names = ['item', 'file', 'path', 'size', 'md5', 'ts', 'error']
    names = ['ts', 'item', 'file', 'path', 'size', 'download_sec', 'md5', 'error']

    dfs = []
    for filename in files:
        # Compare and label on the base name so that paths with directory
        # components (or dots in directory names) behave like bare file names.
        basename = os.path.basename(filename)
        if basename == legacy_log:
            # Keep 'ts' as a regular column (not the index) so that it lines
            # up with the other logs when the frames are concatenated.
            df = pd.read_table(filename, names=legacy_names, parse_dates=['ts'])
            df['download_sec'] = np.nan
            df = df[names]
        else:
            df = pd.read_table(filename, names=names, parse_dates=['ts'])
        parts = basename.split('.')
        df['logfile'] = parts[1] if len(parts) > 1 else basename
        dfs.append(df)

    if not dfs:
        raise ValueError('read_table() was given no log files to read')

    # Every file starts counting rows at 0; renumber so index labels are unique.
    return pd.concat(dfs, ignore_index=True)

# data = read_table(
#     file_name
#     for file_name in glob('ia-download.log.*')
#     if datetime.strptime(file_name.split('.')[-1], '%Y%m%d%H%M%S') > datetime(2023,2,1)
# )

# data = pd.concat([
#     data,
#     read_table(["~/Downloads/sampl17_200_parallel540.log"])
# ])

# glob() returns matches in arbitrary order; sort them so the row order of
# `data` is the same on every run.
data = read_table(sorted(glob('ia-download.wide*.*.log')))


In [34]:
# Inspect column dtypes, row count and memory footprint of the combined log table.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1800162 entries, 0 to 100
Data columns (total 9 columns):
 #   Column        Dtype         
---  ------        -----         
 0   ts            datetime64[ns]
 1   item          object        
 2   file          object        
 3   path          object        
 4   size          float64       
 5   download_sec  float64       
 6   md5           object        
 7   error         object        
 8   logfile       object        
dtypes: datetime64[ns](1), float64(2), object(6)
memory usage: 137.3+ MB


In [35]:
# Group log rows by log file and by the hour their timestamp falls in.
# The lower-case 'h' alias is used because newer pandas releases deprecate
# the upper-case 'H' spelling.
data_per_hour = data.groupby(['logfile', data['ts'].dt.floor('h')])

# Download time

In [37]:
# Density histogram of `download_sec` over 199 equal-width bins from 0 to 3600.
# To look at successful downloads of a single run only, filter instead:
# downloads = data[data['error'].isna() & (data['logfile'] == 'ia-download.log.20221231114150')]
downloads = data
bins = np.linspace(0, 3600, 200)
hist, edges = np.histogram(downloads['download_sec'], bins=bins, density=True)

p = figure(
    title="Download time (binned)",
    x_axis_label="Seconds",
    y_axis_label="Density",
)
# One rectangle per bin, spanning its left/right edge and rising to its density.
p.quad(left=edges[:-1], right=edges[1:], bottom=0, top=hist)
show(p)


Seconds it takes to download a file. Looks like there are multiple humps hidden in this graph. I expect that if you graphed them per log file, you'd see slightly different means.

In [39]:
# Palette lookup keyed by the number of groups to draw: every Category20 entry,
# plus explicit entries for one and two groups.
Colors = dict(Category20)
Colors[1] = ('#1f77b4',)
Colors[2] = ('#1f77b4', '#aec7e8')

In [40]:
def lines(p, x, y, color_field, data, *, palette=Colors, legend=False):
    """Helper function to plot groups as separate lines.

    Draws one line on ``p`` for every distinct value of ``data[color_field]``.

    Parameters
    ----------
    p : figure
        Plot to draw on; its ``line`` method is called once per group.
    x, y : str
        Names of the columns of ``data`` used for the x and y coordinates.
    color_field : str
        Name of the column whose distinct values define the groups.
    data : pandas.DataFrame
        Table holding the ``x``, ``y`` and ``color_field`` columns.
    palette : mapping of int to sequence of colors, optional
        Maps a number of groups to the colors to use for them. When it has no
        entry for the actual number of groups, the colors of its largest entry
        are reused cyclically.
    legend : bool, optional
        When true, each line is labelled with its group value in the legend.
    """
    groups = data[color_field].unique()
    n_groups = len(groups)
    if n_groups == 0:
        # Nothing to draw; avoids a failing palette lookup for zero groups.
        return

    try:
        colors = palette[n_groups]
    except KeyError:
        # More (or fewer) groups than the palette knows about: fall back to
        # cycling through the largest available set of colors.
        largest = palette[max(palette)]
        colors = [largest[i % len(largest)] for i in range(n_groups)]

    for color, group in zip(colors, groups):
        # Legend labels have to be strings, while group values may not be.
        extra = {'legend_label': str(group)} if legend else {}
        p.line(x, y, color=color,
               source=ColumnDataSource(data[data[color_field] == group]),
               **extra)

In [41]:
# Per (logfile, hour): total number of log rows, and rows with an error recorded.
requests_per_hour = data_per_hour.size().reset_index(name='requests')
errors_per_hour = data_per_hour['error'].count().reset_index(name='errors')

# Both tables come from the same grouping, so their rows pair up one-to-one.
errors_per_hour['error_rate'] = errors_per_hour['errors'].div(requests_per_hour['requests'])

p = figure(
    title="Fraction of downloads that fail (over time, per hour)",
    x_axis_type="datetime",
    x_axis_label="Hour",
    y_axis_label="Error rate",
    sizing_mode="stretch_width",
    height=250,
)
lines(p, 'ts', 'error_rate', 'logfile', errors_per_hour)
show(p)

Error rate seems to be about 10% of all downloads when downloading WIDE15 with 128 or 256 threads. The yellow bit at the end is WIDE6 with 64 threads, eventually failing on a certificate issue. Graph below splits errors out per category.

In [42]:
def classify_error(error):
    """Collapse a raw error message into a coarse error category.

    The checks are applied in order and the first match wins:

    * ``'None: None'`` or any message containing ``'Max retries exceeded'``
      becomes ``'timeout'``;
    * ``'md5 mismatch'`` stays ``'md5 mismatch'``;
    * messages starting with ``'[Errno None: None] '`` become ``'other'``;
    * messages containing ``"'Connection aborted.'"`` become ``'connection'``;
    * anything else is returned unchanged.

    ``error`` is expected to be a string.
    """
    if error == 'None: None':
        return 'timeout'
    if 'Max retries exceeded' in error:
        return 'timeout'
    if error == 'md5 mismatch':
        return 'md5 mismatch'
    if error.startswith('[Errno None: None] '):
        return 'other'
    if "'Connection aborted.'" in error:
        return 'connection'
    return error

# Attach a coarse error category to every row; rows without an error stay NA.
data['error_norm'] = data['error'].map(classify_error, na_action='ignore')

# Count log rows per (error category, hour). This rebinds `errors_per_hour`,
# which the error-rate cell used for a different table.
# The lower-case 'h' alias is used because newer pandas releases deprecate
# the upper-case 'H' spelling.
errors_per_hour = (
    data.groupby(['error_norm', data['ts'].dt.floor('h')])['ts']
    .count()
    .reset_index(name='error_count')
)
p = figure(title="Errors per category (per hour)",
           x_axis_type="datetime",
           x_axis_label="Hour",
           y_axis_label="Error count",
           sizing_mode="stretch_width",
           height=250,)
lines(p, 'ts', 'error_count', 'error_norm', errors_per_hour, legend=True)
p.legend.location = 'top_left'
show(p)

Initially the most frequent problem was timeouts, but as more files finished downloading correctly and failed downloads were re-attempted, md5 errors became more prevalent. I should check whether the error rate is consistent per file or per item. Error count is much lower at the end, where I was doing md5 checking of existing files (so less actual downloading) and downloading WIDE6 (with 64 in parallel as opposed to 256, so again less actual downloading).

In [43]:
# Total number of errors per category, summed over all hours.
errors_per_hour.groupby('error_norm')[['error_count']].sum()

Unnamed: 0_level_0,error_count
error_norm,Unnamed: 1_level_1
Bad Gateway,104
Internal Server Error,4
Not Found,162
connection,7
md5 mismatch,70104
other,1422
timeout,24636


In [44]:
# Sum of `size` per (logfile, hour). Size is NA on error, so files that fail
# the md5 check are not counted.
bytes_per_hour = data_per_hour['size'].sum().reset_index(name='amount')

p = figure(
    title="Amount of data downloaded (over time, per hour)",
    x_axis_type="datetime",
    x_axis_label="Hour",
    y_axis_label="Downloaded data",
    sizing_mode="stretch_width",
    height=250,
)
lines(p, 'ts', 'amount', 'logfile', bytes_per_hour)
show(p)

Going by eye, the average seems to be about 1TB/hour with the 256 parallel downloads I was using around Christmas. The new year started with checking existing files (so fewer downloads) and then completing WIDE6 (again fewer downloads, as I was checking md5 of existing files and only using 64 downloads in parallel).

In [45]:
# Distinct `item` values among rows without an error.
# NOTE(review): this counts the 'item' column while the label says "files" - confirm which is intended.
f"{data.loc[data['error'].isna(), 'item'].nunique(dropna=False)} files downloaded"

'186199 files downloaded'

In [46]:
# Number of distinct clock hours that have at least one log row, expressed in days.
# The lower-case 'h' alias is used because newer pandas releases deprecate
# the upper-case 'H' spelling.
f"{data['ts'].dt.floor('h').unique().size / 24:.1f} days of downloading"

'82.8 days of downloading'

In [47]:
# Mean over all (logfile, hour) groups of the summed `size`, divided by 10^9.
# Presumably `size` is in bytes, which would make this decimal gigabytes - confirm.
f"{data_per_hour['size'].sum().mean() / 1_000_000_000:.0f}GB per hour on average"

'729GB per hour on average'

In [89]:
# Per log file: mean hourly `size` sum over the hours with a positive sum,
# divided by 10^12 and multiplied by 24 (hours per day).
(
    data_per_hour['size']
    .sum()
    .groupby('logfile')
    .agg(lambda a: a[a > 0.0].mean() / 1_000_000_000_000 * 24)
)

logfile
wide11    15.573869
wide12    34.604055
wide16    17.557098
wide5     12.049658
wide6     16.402942
Name: size, dtype: float64

In [87]:
# Time between the first and last log row of each log file.
# Series.max()/min() are vectorised and skip missing timestamps, unlike the
# Python builtins, which iterate row by row and compare against NaT.
data.groupby(['logfile'])['ts'].agg(lambda a: a.max() - a.min())

logfile
wide11   41 days 18:23:50.606526
wide12   18 days 19:30:10.137653
wide16   26 days 22:53:31.681237
wide5    39 days 04:30:52.387444
wide6     6 days 23:08:08.973246
Name: ts, dtype: timedelta64[ns]

The average above is a bad estimate of top download speed. It includes periods in which I was mostly md5-checking existing files and only re-downloading files that did not pass this check.

In [48]:
# Number of rows with an md5 value per (logfile, hour); md5 is NA on errors.
files_per_hour = data_per_hour['md5'].count().reset_index(name='file_count')

p = figure(
    title="Number of files downloaded (over time, per hour)",
    x_axis_type="datetime",
    x_axis_label="Hour",
    y_axis_label="Downloaded files",
    sizing_mode="stretch_width",
    height=250,
)
lines(p, 'ts', 'file_count', 'logfile', files_per_hour)
show(p)

Surprise, surprise: the graph above closely matches the amount-of-data-downloaded graph.