In [25]:
import os
import io
import hashlib
import collections
import time
import pickle

from tqdm import tqdm

In [3]:
def get_all_file_path(inpath: str, outlog: list):
    """Recursively collect the paths of all regular files under ``inpath``.

    Every visited path is echoed to stdout followed by a carriage return, so
    the console shows a single, continuously overwritten progress line.

    Args:
        inpath: A file or directory path to visit.
        outlog: List that receives the path of every file found (mutated
            in place).

    Returns:
        None. Results are delivered through ``outlog``.
    """
    # Progress indicator: '\r' moves the cursor back to the line start.
    print(inpath, end='\r')

    # A path named 'stop' in the working directory aborts this visit.
    if os.path.exists('stop'):
        return

    # Plain file: record it and finish.
    if os.path.isfile(inpath):
        outlog.append(inpath)
        return

    # Anything that is neither a file nor a directory is ignored.
    if not os.path.isdir(inpath):
        return

    # Directory: descend into each entry in the order os.listdir() yields them.
    for entry_name in os.listdir(inpath):
        get_all_file_path(os.path.join(inpath, entry_name), outlog)

In [4]:
# Accumulator that get_all_file_path() appends discovered file paths to.
all_file_path_list = []

In [5]:
# Walk the '.requests_cache' directory (relative to the working directory)
# and collect every file path found beneath it into all_file_path_list.
get_all_file_path('.requests_cache', all_file_path_list)

.requests_cache\ff\ffff3b55502c0762b7d7f8dca3db02e3

In [6]:
# Number of file paths collected by the walk.
len(all_file_path_list)

195094

In [8]:
# Build the log file name with the current time.time_ns() value embedded,
# then display it.
file_path_list_log_filename = f'all_file_path_log-{time.time_ns()}.pickle'
file_path_list_log_filename

'all_file_path_log-1650108252097001700.pickle'

In [9]:
# Persist the collected path list to disk as a pickle file.
with open(file_path_list_log_filename, mode='wb') as outfile:
    pickle.dump(all_file_path_list, outfile)

In [13]:
def get_file_content_md5_hash(inpath: str, chunk_size: int = 1024 * 1024):
    """Return the path, byte size and MD5 digest of a single file.

    The file is opened in binary mode and consumed in chunks of
    ``chunk_size`` bytes, so memory use stays bounded for large files. The
    ``with`` block closes the handle even when reading fails.

    Args:
        inpath: Path of the file to hash.
        chunk_size: Number of bytes read per iteration; must be positive.

    Returns:
        A dict with the keys ``'path'`` (``inpath`` unchanged), ``'size'``
        (total number of bytes read) and ``'md5hash'`` (the raw 16-byte
        digest as ``bytes``).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b'' immediately and read(-1) reads everything at once;
    # neither matches the chunked contract, so reject them up front.
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')

    md5 = hashlib.md5()
    file_size = 0
    with open(inpath, mode='rb') as infile:
        # iter(callable, sentinel) keeps calling read() until it returns b''.
        for chunk in iter(lambda: infile.read(chunk_size), b''):
            file_size += len(chunk)
            md5.update(chunk)

    return {
        'path': inpath,
        'size': file_size,
        'md5hash': md5.digest(),
    }

In [14]:
# Try the hashing helper on the pickle file that was just written.
get_file_content_md5_hash(file_path_list_log_filename)

{'path': 'all_file_path_log-1650108252097001700.pickle',
 'size': 11900367,
 'md5hash': b'\xb9\x14\xc0\t\xdd\xdb\x8a\xdc\xdaI\x17\xcc\xc9\xd9\xf3\xb6'}

In [16]:
# Accumulator for the per-file hash records; displayed to confirm it is empty.
hash_log_list = []
hash_log_list

[]

In [17]:
# Hash every collected file. The list is rebuilt from scratch here because the
# loop always starts at the first path: appending to records left over from an
# earlier (possibly aborted) run would store files twice and make each of them
# look like a duplicate in the analysis below.
hash_log_list = []
for filepath in tqdm(all_file_path_list):
    # A path named 'stop' in the working directory aborts the loop early.
    if os.path.exists('stop'):
        break
    hash_log_list.append(get_file_content_md5_hash(filepath))

100%|█████████████████████████████████████████████████████████████████████████| 195094/195094 [13:47<00:00, 235.64it/s]


In [18]:
# Build the hash-log file name with the current time.time_ns() value embedded,
# then display it.
hash_log_filename = f'hash_log-{time.time_ns()}.pickle'
hash_log_filename

'hash_log-1650109654532523300.pickle'

In [19]:
# Persist the hash records to disk as a pickle file.
with open(hash_log_filename, mode='wb') as outfile:
    # pickle.dump() has no return value, so retval is None and the print
    # below shows "None".
    retval = pickle.dump(hash_log_list, outfile)
    print(retval)

None


In [20]:
# On-disk size, in bytes, of the pickled hash log.
os.path.getsize(hash_log_filename)

20001503

In [26]:
# Maps an MD5 digest to the list of files that produced it; a missing key
# starts out as an empty list.
duplicate_hash_log = collections.defaultdict(list)
duplicate_hash_log

defaultdict(list, {})

In [27]:
# Group the hash records by digest. The mapping is recreated here so that
# re-running this cell cannot append every file a second time, which would
# make every digest appear to have duplicates.
duplicate_hash_log = collections.defaultdict(list)
for info_obj in tqdm(hash_log_list):
    # Only path and size are kept per file; the digest is the dict key.
    duplicate_hash_log[info_obj['md5hash']].append({
        'path': info_obj['path'],
        'size': info_obj['size'],
    })

100%|██████████████████████████████████████████████████████████████████████| 195094/195094 [00:00<00:00, 380517.18it/s]


In [28]:
# Will hold one entry per digest that is shared by more than one file.
duplicated_hash_list = []
duplicated_hash_list

[]

In [29]:
# Keep only the digests shared by more than one file, in the mapping's
# iteration order. Rebinding the name (instead of appending to a list created
# in another cell) keeps this cell idempotent: a re-run cannot add each group
# a second time and inflate the totals computed from this list.
duplicated_hash_list = [
    {
        'md5hash': md5hash,
        'item_list': item_list,
    }
    for md5hash, item_list in duplicate_hash_log.items()
    if len(item_list) > 1
]

In [30]:
# Number of digests that occur for more than one file.
len(duplicated_hash_list)

38459

In [31]:
# Twice the number of duplicated digests. Written out explicitly instead of
# using IPython's `_` (the most recent cell output), whose value depends on
# which cell happened to run last.
len(duplicated_hash_list) * 2

76918

In [None]:
# Inspect one duplicate group by index. The whole entry is displayed,
# including its complete item_list.
duplicated_hash_list[2]

In [36]:
# Number of files that share the digest of that group.
len(duplicated_hash_list[2]['item_list'])

9595

In [37]:
# Will collect the duplicate groups whose members do not all have the same size.
duplicate_with_different_size_log = []
duplicate_with_different_size_log

[]

In [38]:
# Sanity check: files with the same MD5 digest but different sizes cannot have
# identical content, so any group listed here would be a hash collision.
# A group is kept when its members report more than one distinct size.
# Rebinding the name keeps the cell idempotent; appending to a list created in
# another cell would record each group again on every re-run.
duplicate_with_different_size_log = [
    entry
    for entry in duplicated_hash_list
    if len({item['size'] for item in entry['item_list']}) > 1
]

In [39]:
# Number of duplicate groups with mismatched file sizes.
len(duplicate_with_different_size_log)

0

In [40]:
# Starting value for the number of bytes occupied by redundant copies.
possible_size_reduction = 0
possible_size_reduction

0

In [41]:
# Bytes that would be freed by keeping a single copy per digest: every copy
# beyond the first counts once, at the size recorded for the group's first
# item. Computed in one expression so that re-running this cell cannot add
# onto a total left over from a previous run.
possible_size_reduction = sum(
    entry['item_list'][0]['size'] * (len(entry['item_list']) - 1)
    for entry in duplicated_hash_list
)
possible_size_reduction

26957498768

In [42]:
# The same figure expressed in GiB (1024**3 bytes).
possible_size_reduction / (1024**3)

25.10612715780735

In [43]:
# Starting value for the combined size of all hashed files.
cache_content_size = 0
cache_content_size

0

In [44]:
# Combined size in bytes of every hashed file. Computed in one expression so
# that re-running this cell cannot add onto a total left over from a previous
# run.
cache_content_size = sum(info_obj['size'] for info_obj in hash_log_list)
cache_content_size

77063600815

In [45]:
# Bytes that would remain after removing the redundant copies.
cache_content_size - possible_size_reduction

50106102047

In [46]:
# Remaining size after deduplication, in GiB (1024**3 bytes). Written out
# explicitly instead of using IPython's `_` (the most recent cell output),
# whose value depends on which cell happened to run last.
(cache_content_size - possible_size_reduction) / (1024**3)

46.66494396235794