In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib.pyplot as plt

import mmap
from mmap import mmap as fileview, ACCESS_READ

from collections import defaultdict
import hashlib

import os
from os.path import basename

In [2]:
import comparedirs
comparison = comparedirs.Comparison()

In [3]:
comparison.scan(paths=("G:\\",), excludes=(".svn", ".git", "G:\\SEM", "G:\\$RECYCLE.BIN", ".idea", ".venv"))

[WinError 5] Access is denied: 'G:\\System Volume Information'


In [4]:
file_counts = dict((sz, len(f)) for sz, f in comparison.sizes.items())
total_files = sum(file_counts.values())
total_data  = sum(sz * count for sz, count in file_counts.items())

print("Files:", total_files, "Data:", total_data)
min_size = min(comparison.sizes.keys())
max_size = max(comparison.sizes.keys())

if total_files > 0:
    print("Min size:", min_size, "Max size:", max_size)
    print("Min files:", min(file_counts.values()), "Max files:", max(file_counts.values()))

sizes = {}
for size in sorted(comparison.sizes.keys()):
    sizes[size] = len(sizes)

Files: 65673 Data: 266908891008
Min size: 1 Max size: 34159919104
Min files: 2 Max files: 2258


In [5]:
import ipywidgets
from ipywidgets import IntProgress, FloatProgress, HBox, Label, Layout, IntSlider, FloatSlider
from IPython.display import display
from typing import Union

class Presenter(object):
    left_layout  = Layout(width='65%')
    right_layout = Layout(width='35%')
    box_layout   = Layout(width='100%')

    
    def __init__(self, title: str, max: Union[int, float], min: Union[int, float] = 0, **klsargs) -> None:
        self.data  = self.CLASS(min=min, max=max, layout=self.left_layout, description=title, **klsargs)
        self.label = Label(value='Starting', layout=self.right_layout)
        self.hbox  = HBox([self.data, self.label], layout=self.box_layout)

        
    def display(self) -> None:
        display(self.hbox)

        
    def increment(self, inc: Union[int, float], suffix: str = "") -> None:
        data = self.data
        data.value += inc
        self.label.value = "{:6.2f}% :: {:,}/{:,} {}".format(
            (data.value * 100) / data.max,
            int(data.value / self.units),
            self.umax,
            suffix,
        )

        
class ProgressBar(Presenter):
    CLASS = IntProgress
    def __init__(self, title: str, max: Union[int, float], units: int = 1, **kwargs) -> None:
        super().__init__(title, max, bar_style='info')
        self.units = units or 1
        self.umax  = int(self.data.max / self.units)

        
class ProgressBarF(ProgressBar):
    CLASS = FloatProgress

    
class Slider(Presenter):
    CLASS = IntSlider
    def __init__(self, title: str, max: Union[int, float], min: Union[int, float] = 0, default: Union[int, float] = min, **kwargs) -> None:
        super().__init__(title, max, min=min, value=default, **kwargs)
        self.data.value = default

        
class SliderF(Slider):
    CLASS = FloatSlider
    

# Data bar
db = ProgressBarF("Data", total_data, 1024)
fb = ProgressBarF("Files", total_files)
sz = Slider("Size", len(sizes), min=0, default=0, readout=False)
ch = SliderF("Cache", 1.0, default=1.0, readout=False)

db.display()
fb.display()
sz.display()
ch.display()

loaded, nfiles = 0, 0
update_size = 64 * 1024 * 1024

def inc_pb(bar, inc, div, sfx=""):
    bar.data.value += inc
    bar.label.value = "{:6.2f}% -> {:,}/{:,} {}".format((bar.data.value * 100) / bar.data.max, int(bar.data.value / div), int(bar.data.max / div), sfx)

def update_bars(rem, size, force=False):
    global loaded, nfiles
    if force or nfiles > 50 or loaded >= update_size:
        sfx = "KB ({:,}/{:,})".format(int(rem / 1024), int(size / 1024))
        db.increment(loaded, sfx)
        fb.increment(nfiles)
        loaded, nfiles = 0, 0
    if size:
        size_num = sizes[size]
        if size_num != sz.data.value:
            sz.data.value = sizes[size]
            sz.label.value = "{:,} byte{:s}" . format(size, "" if size == 1 else "s")
    
def update_data(path, size, chunk, rem):
    global loaded
    loaded += chunk
    update_bars(rem, size)

def update_cache():
    ratio = comparison.cache_hit / (comparison.cache_hit + comparison.cache_miss)
    ch.data.value = ratio
    ch.label.value = "{:,} hits : {:,} miss".format(comparison.cache_hit, comparison.cache_miss)
    
def update_files(file_info):
    global nfiles
    nfiles += 1
    update_bars(0, file_info.size)
    update_cache()

comparison.hash(readcallback=update_data, filecallback=update_files)
update_bars(0, 0, True)
update_cache()

HBox(children=(FloatProgress(value=0.0, bar_style='info', description='Data', layout=Layout(width='65%'), max=…

HBox(children=(FloatProgress(value=0.0, bar_style='info', description='Files', layout=Layout(width='65%'), max…

HBox(children=(IntSlider(value=0, description='Size', layout=Layout(width='65%'), max=10444, readout=False), L…

HBox(children=(FloatSlider(value=1.0, description='Cache', layout=Layout(width='65%'), max=1.0, readout=False)…