In [18]:
import os
import io
import time
import hashlib
import json
import pickle
import traceback
import urllib
import urllib.parse
import stat

import tqdm

In [3]:
# Snapshot of the old cache index, produced by an earlier run.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load snapshots you created yourself.
url_info_pickle_path = 'tmp_pickle_files/url_info_list-1650285069904349100'
with open(url_info_pickle_path, mode='rb') as pickle_file:
    url_info_list = pickle.load(pickle_file)
type(url_info_list)

list

In [4]:
# Peek at the first record to see which keys an entry carries.
url_info_list[0]

{'url': 'http://images.7723.cn/admin/zzxpic/200511151517531850533E-02.gif',
 'url_hash': '117e9d406e5e67ba5d0b275e8b347c26',
 'cache_path': '.requests_cache\\11\\117e9d406e5e67ba5d0b275e8b347c26',
 'response_size': 0,
 'response_md5hash': b'\xd4\x1d\x8c\xd9\x8f\x00\xb2\x04\xe9\x80\t\x98\xec\xf8B~'}

In [20]:
# Split the old index into three buckets:
#   new_cache_format_list - entries converted to {'url', 'body_content_md5_size_key'}
#   zero_size_entry_list  - entries whose recorded response size is 0 (not converted)
#   error_entry_list      - entries that raised while being converted
zero_size_entry_list = []
error_entry_list = []
new_cache_format_list = []

for old_entry in tqdm.tqdm(url_info_list):
    try:
        size_in_bytes = old_entry['response_size']
        if size_in_bytes == 0:
            zero_size_entry_list.append(old_entry)
        else:
            # Keep the lookup order (url, then hash) so a record missing
            # several keys reports the same KeyError as before.
            entry_url = old_entry['url']
            md5_hex = old_entry['response_md5hash'].hex()
            new_cache_format_list.append({
                'url': entry_url,
                # body key is "<md5 hex digest>-<size>"
                'body_content_md5_size_key': f'{md5_hex}-{size_in_bytes}',
            })
    except Exception as ex:
        # Collect the failure instead of aborting the whole conversion.
        error_entry_list.append({
            'url_info': old_entry,
            'exception': ex,
            'stacktrace': traceback.format_exc(),
        })

100%|██████████████████████████████████████████████████████████████████████| 182366/182366 [00:00<00:00, 744781.16it/s]


In [21]:
# Number of entries that were converted to the new cache format.
len(new_cache_format_list)

172770

In [23]:
# Number of entries skipped because their recorded response size was 0.
len(zero_size_entry_list)

9595

In [22]:
# Number of entries that raised an exception during conversion.
len(error_entry_list)

1

In [11]:
# Show the failed records together with the captured exception and stack trace.
error_entry_list

[{'url_info': {'url': 'http://www.7723.cn/downb.asp?idd=6&id=22&ksp=6',
   'url_hash': 'fc2cabf6e440dcc364991de186d76067',
   'cache_path': '.requests_cache\\fc\\fc2cabf6e440dcc364991de186d76067'},
  'exception': KeyError('response_md5hash'),
  'stacktrace': 'Traceback (most recent call last):\n  File "<ipython-input-8-e08d6e3996f8>", line 7, in <module>\n    body_content_md5_hash_bs = url_info[\'response_md5hash\']\nKeyError: \'response_md5hash\'\n'}]

In [12]:
# Name of the environment variable that may point at the cache root directory.
ENV_VAR_NAME_CACHE_DIR = 'cacherequests_cache_dir'

# Resolve the cache root: the environment variable wins; otherwise fall back
# to a 'cache' folder next to this file.
if ENV_VAR_NAME_CACHE_DIR in os.environ:
    print(f'Using cache directory from environment variable {ENV_VAR_NAME_CACHE_DIR}')
    CACHE_ROOT_DIR = os.environ[ENV_VAR_NAME_CACHE_DIR]
else:
    print('warning: Using cache directory from default value')
    try:
        ROOT = os.path.dirname(os.path.realpath(__file__))
    except NameError:
        # __file__ is not defined inside a notebook / interactive kernel,
        # so use the current working directory instead of crashing.
        ROOT = os.getcwd()
    CACHE_ROOT_DIR = os.path.join(ROOT, 'cache')

# Create the root on first use; refuse to continue if the path exists but is
# something other than a directory.
if not os.path.exists(CACHE_ROOT_DIR):
    os.makedirs(CACHE_ROOT_DIR)
if not os.path.isdir(CACHE_ROOT_DIR):
    raise Exception(f'{CACHE_ROOT_DIR} is not a directory!')

# Sub-directories of the cache root (only the paths are computed here).
HEADER_CONTENT_CACHE_DIR = os.path.join(CACHE_ROOT_DIR, 'headers')
BODY_CONTENT_CACHE_DIR = os.path.join(CACHE_ROOT_DIR, 'bodies')
REQUEST_CACHE_DIR = os.path.join(CACHE_ROOT_DIR, 'requests')

MAIN_DATABASE_CACHE_DIR = os.path.join(CACHE_ROOT_DIR, 'main_database')
# Size threshold for a single log file inside MAIN_DATABASE_CACHE_DIR.
MAX_CACHE_SIZE_BYTES = 16777216  # 16MB

Using cache directory from environment variable cacherequests_cache_dir


In [13]:
# Show the resolved HEADER_CONTENT_CACHE_DIR (the 'headers' folder under the cache root).
HEADER_CONTENT_CACHE_DIR

'D:\\cacherequests_cache_dir\\headers'

In [14]:
# Show the resolved BODY_CONTENT_CACHE_DIR (the 'bodies' folder under the cache root).
BODY_CONTENT_CACHE_DIR

'D:\\cacherequests_cache_dir\\bodies'

In [15]:
# Show the resolved REQUEST_CACHE_DIR (the 'requests' folder under the cache root).
REQUEST_CACHE_DIR

'D:\\cacherequests_cache_dir\\requests'

In [16]:
# Show the resolved MAIN_DATABASE_CACHE_DIR (the 'main_database' folder under the cache root).
MAIN_DATABASE_CACHE_DIR

'D:\\cacherequests_cache_dir\\main_database'

In [27]:
def give_me_a_new_cache_filepath(max_count=65536):
    """Return the first unused '<index>.tsv' path inside MAIN_DATABASE_CACHE_DIR.

    Indices are probed in ascending order starting at 0, so a gap left by a
    removed file is reused before a higher index is handed out. The file
    itself is not created here.

    Args:
        max_count: How many indices (0 .. max_count - 1) to probe.

    Returns:
        The path of the first candidate that does not exist on disk.

    Raises:
        Exception: If every probed candidate already exists.
    """
    # Lazy generator: candidates are only built and checked until one is free.
    candidate_paths = (
        os.path.join(MAIN_DATABASE_CACHE_DIR, f'{index}.tsv')
        for index in range(max_count)
    )
    free_path = next(
        (path for path in candidate_paths if not os.path.exists(path)),
        None,
    )
    if free_path is None:
        raise Exception(f'The number of existing cache log files is {max_count}!')
    return free_path

In [28]:
def _next_free_log_path(cache_dir, max_count=65536):
    """Return the first '<index>.tsv' path in cache_dir that does not exist yet.

    Same naming scheme as give_me_a_new_cache_filepath, but for an explicit
    directory.
    """
    for index in range(max_count):
        candidate = os.path.join(cache_dir, f'{index}.tsv')
        if not os.path.exists(candidate):
            return candidate

    raise Exception(f'The number of existing cache log files is {max_count}!')


def _latest_regular_file(cache_dir):
    """Return the most recently modified regular file in cache_dir, or None."""
    latest_path = None
    latest_mtime_ns = None
    for child_filename in os.listdir(cache_dir):
        child_filepath = os.path.join(cache_dir, child_filename)
        file_stat = os.stat(child_filepath)
        if not stat.S_ISREG(file_stat.st_mode):
            continue
        # Strict '>' keeps the first file seen when modification times tie.
        if latest_mtime_ns is None or file_stat.st_mtime_ns > latest_mtime_ns:
            latest_path = child_filepath
            latest_mtime_ns = file_stat.st_mtime_ns
    return latest_path


def _separator_before_append(log_path, log_size):
    """Return the bytes to write before a new line is appended to log_path."""
    if log_size == 0:
        # Nothing to separate from; seeking to -1 in an empty file would
        # raise OSError.
        return b''
    with open(log_path, 'rb') as infile:
        infile.seek(-1, os.SEEK_END)
        last_character = infile.read(1)
    return b'' if last_character == b'\n' else b'\n'


def store_response(
    url: str,
    body_content_md5_size_key: str,
    cache_dir=None,
    max_cache_size_bytes=None,
):
    """Append one cache-log line for `url` to the main database directory.

    The line consists of six tab-separated fields: the percent-quoted URL,
    the literal method 'GET', three empty fields, and the percent-quoted body
    cache key. Lines inside a log file are separated by a single newline; no
    trailing newline is written.

    The line goes to the most recently modified regular file of the
    directory, unless that would push the file above the size limit (or the
    directory holds no regular file yet); in that case a new '<index>.tsv'
    file is started.

    Args:
        url: URL the cached body belongs to.
        body_content_md5_size_key: Key identifying the cached body.
        cache_dir: Directory holding the log files. Defaults to
            MAIN_DATABASE_CACHE_DIR.
        max_cache_size_bytes: Size limit for a single log file. Defaults to
            MAX_CACHE_SIZE_BYTES.

    Returns:
        True once the line has been written.

    Raises:
        Exception: If the cache directory path exists but is not a
            directory, or if no free log file name is left.
    """
    if cache_dir is None:
        cache_dir = MAIN_DATABASE_CACHE_DIR
    if max_cache_size_bytes is None:
        max_cache_size_bytes = MAX_CACHE_SIZE_BYTES

    quoted_url = urllib.parse.quote(url)
    body_cache_key_quoted = urllib.parse.quote(body_content_md5_size_key)

    cache_log_line_content_bs = '\t'.join([
        quoted_url,
        'GET',  # quoted_method
        '',  # quoted_status_code
        '',  # quoted_request_time_ns
        '',
        body_cache_key_quoted,
    ]).encode('utf-8')
    base_log_content_size = len(cache_log_line_content_bs)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    if not os.path.isdir(cache_dir):
        raise Exception(f'{cache_dir} is not a directory')

    latest_log_filepath = _latest_regular_file(cache_dir)
    if latest_log_filepath is not None:
        latest_log_filesize = os.path.getsize(latest_log_filepath)
        # Cheap check first: skip reading the file if the line cannot fit anyway.
        if (latest_log_filesize + base_log_content_size) <= max_cache_size_bytes:
            separator = _separator_before_append(
                latest_log_filepath, latest_log_filesize
            )
            # Re-check with the separator byte included.
            needed_size = latest_log_filesize + len(separator) + base_log_content_size
            if needed_size <= max_cache_size_bytes:
                with open(latest_log_filepath, 'ab') as outfile:
                    outfile.write(separator)
                    outfile.write(cache_log_line_content_bs)
                return True

    # No log file yet, or the latest one is full: start a new one.
    cache_filepath = _next_free_log_path(cache_dir)
    with open(cache_filepath, 'wb') as outfile:
        outfile.write(cache_log_line_content_bs)
    return True

In [29]:
# Write every converted entry into the new cache log.
#   import_error_list           - entries for which store_response raised
#   not_return_true_import_list - entries for which it returned something other than True
not_return_true_import_list = []
import_error_list = []

for converted_entry in tqdm.tqdm(new_cache_format_list):
    try:
        stored_ok = store_response(
            url=converted_entry['url'],
            body_content_md5_size_key=converted_entry['body_content_md5_size_key'],
        )
    except Exception as ex:
        # Record the failure and keep importing the remaining entries.
        import_error_list.append({
            'url_info': converted_entry,
            'exception': ex,
            'stacktrace': traceback.format_exc(),
        })
    else:
        if stored_ok is not True:
            not_return_true_import_list.append(converted_entry)

100%|████████████████████████████████████████████████████████████████████████| 172770/172770 [01:13<00:00, 2355.45it/s]


In [30]:
# Number of entries for which store_response returned something other than True.
len(not_return_true_import_list)

0

In [31]:
# Number of entries for which store_response raised an exception.
len(import_error_list)

0