In [11]:
import os
import pickle
import posixpath
import argparse
import time
import io
import threading
import json
import re
import stat
import math
import traceback
import gzip
import sys

import urllib
import urllib.parse
import wsgiref.handlers

import tqdm
import requests

In [2]:
json_config_file = 'webconfig.json'
content_bs = open(json_config_file, 'rb').read()
content_str = content_bs.decode('utf-8')
config_obj = json.loads(content_str)

game_info_list_filepath = config_obj['game_info_list_filepath']
game_binary_url_list_filepath = config_obj['game_binary_url_list_filepath']
image_url_info_dict_filepath = config_obj['image_url_info_dict_filepath']

with open(game_info_list_filepath, 'rb') as infile:
    game_info_list = pickle.load(infile)
with open(game_binary_url_list_filepath, 'rb') as infile:
    game_binary_url_list = pickle.load(infile)
with open(image_url_info_dict_filepath, 'rb') as infile:
    image_url_info_dict = pickle.load(infile)

In [3]:
len(game_binary_url_list)

153953

# prepare cache file

In [5]:
import cacherequests

Using cache directory from environment variable cacherequests_cache_dir


In [6]:
child_filename_list = os.listdir(cacherequests.MAIN_DATABASE_CACHE_DIR)

cache_filepath_list = []
for child_filename in child_filename_list:
    child_filepath = os.path.join(cacherequests.MAIN_DATABASE_CACHE_DIR, child_filename)
    file_stat = os.stat(child_filepath)
    if not stat.S_ISREG(file_stat.st_mode):
        continue

    cache_filepath_list.append(child_filepath)
cache_filepath_list

['D:\\cacherequests_cache_dir\\main_database\\0.tsv',
 'D:\\cacherequests_cache_dir\\main_database\\1.tsv',
 'D:\\cacherequests_cache_dir\\main_database\\2.tsv']

# check all game binary url cache status

In [8]:
print(len(game_binary_url_list))
game_binary_url_list = list(set(game_binary_url_list))
print(len(game_binary_url_list))

153953
153953


In [4]:
remaining_url_list = game_binary_url_list.copy()
print('remaining_url_list', len(remaining_url_list))

remaining_url_list 153953


In [7]:
in_cache_info_list = []
error_cache_processing_list = []
url_list_without_content_in_cache = []

for cache_filepath in cache_filepath_list:
    print('cache_filepath', cache_filepath)
    remaining_url_list_after_processing_cache_file = remaining_url_list.copy()
    print('remaining_url_list_after_processing_cache_file', len(remaining_url_list_after_processing_cache_file))

    try:
        print('processing cache file')
        body_content_dict = {}

        content_bs = open(cache_filepath, 'rb').read()
        content_str = content_bs.decode('utf-8')
        lines = content_str.split('\n')
        # filter empty lines
        lines = [line for line in lines if line]

        for line in tqdm.tqdm(lines):
            cell_list = line.split('\t')
            # url, method, status_code, request_time_ns, header_content_md5-size, body_content_md5-size
            if len(cell_list) < 6:
                continue
            ########################################################
            quoted_url = cell_list[0]
            unquoted_url = urllib.parse.unquote(quoted_url)
            ########################################################
            quoted_key = cell_list[5]
            unquoted_key = None
            if len(quoted_key) > 0:
                unquoted_key = urllib.parse.unquote(quoted_key)
            ########################################################
            body_content_dict[unquoted_url] = unquoted_key
        print('\nprocessing image url list', flush=True)

        for url in tqdm.tqdm(remaining_url_list):
            if url in body_content_dict:
                body_content_key = body_content_dict[url]
                if body_content_key is None:
                    url_list_without_content_in_cache.append(url)
                else:
                    in_cache_info_list.append((url, body_content_key))
                remaining_url_list_after_processing_cache_file.remove(url)
        print('\n', flush=True)
    except Exception as ex:
        stacktrace = traceback.format_exc()
        print(ex)
        print(stacktrace)
        error_cache_processing_list.append({
            'filepath': cache_filepath,
            'exception': ex,
            'stacktrace': stacktrace,
        })

    remaining_url_list = remaining_url_list_after_processing_cache_file

cache_filepath D:\cacherequests_cache_dir\main_database\0.tsv
remaining_url_list_after_processing_cache_file 153953
processing cache file


100%|██████████████████████████████████████████████████████████████████████| 152821/152821 [00:01<00:00, 131444.54it/s]


processing image url list



100%|████████████████████████████████████████████████████████████████████████| 153953/153953 [01:37<00:00, 1578.12it/s]






100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]

cache_filepath D:\cacherequests_cache_dir\main_database\1.tsv
remaining_url_list_after_processing_cache_file 17969
processing cache file

processing image url list



100%|███████████████████████████████████████████████████████████████████████| 17969/17969 [00:00<00:00, 1997758.80it/s]






 63%|█████████████████████████████████████████████▌                          | 12618/19948 [00:00<00:00, 125005.56it/s]

cache_filepath D:\cacherequests_cache_dir\main_database\2.tsv
remaining_url_list_after_processing_cache_file 17968
processing cache file


100%|████████████████████████████████████████████████████████████████████████| 19948/19948 [00:00<00:00, 130446.27it/s]


processing image url list



100%|████████████████████████████████████████████████████████████████████████| 17968/17968 [00:00<00:00, 160523.93it/s]








In [9]:
print('len(in_cache_info_list)', len(in_cache_info_list))
print('len(error_cache_processing_list)', len(error_cache_processing_list))
print('len(url_list_without_content_in_cache)', len(url_list_without_content_in_cache))

len(in_cache_info_list) 153653
len(error_cache_processing_list) 0
len(url_list_without_content_in_cache) 0


In [10]:
in_cache_info_list[:5]

[('http://www.7723.cn/downb.asp?idd=13&id=10819&ksp=1',
  '1f78ed493932d681f0c1bffbb1333cfa-322778'),
 ('http://www.7723.cn/downb.asp?idd=13&id=10819&ksp=3',
  'f5fd86b724954e24761025e5891a01c7-244674'),
 ('http://www.7723.cn/downb.asp?idd=13&id=10819&ksp=5',
  '761185a194d1b3a1cb0dca98b7524fea-270962'),
 ('http://www.7723.cn/downb.asp?idd=13&id=10819&ksp=6',
  '1f78ed493932d681f0c1bffbb1333cfa-322778'),
 ('http://www.7723.cn/downb.asp?idd=13&id=10819&ksp=7',
  '1f78ed493932d681f0c1bffbb1333cfa-322778')]

In [12]:
response = requests.get('http://www.7723.cn/downb.asp?idd=13&id=10819&ksp=7')
response

<Response [200]>

In [13]:
len(response.content)

1215

In [14]:
response.content

b'<!DOCTYPE HTML><html><head><title>\xe6\x8f\x90\xe7\xa4\xba\xe4\xbf\xa1\xe6\x81\xaf</title><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><style type="text/css">body{color:#000;font:12px verdana, arial, tahoma;background:#F5F7F6 url(http://www.7723.cn/static/images/bg.gif) repeat-x left top}#box{width:520px;border:1px solid #CCC;background:#FFF url(http://www.7723.cn/static/images/success.png) no-repeat 25px 30px;margin:180px auto;padding:20px 35px 20px 100px;border-radius:5px;box-shadow:0 0 10px #C0C0C0}#box h1{font-size:20px;font-weight:normal}#box a{color:#1A7613;text-decoration:none}</style><script type="text/javascript">var _time=3; \r\n\tfunction time_m(time){ _time=time-1;document.getElementById("span_time").innerHTML=_time;if(_time==0){url();}setTimeout("time_m(" + _time + ")", 1000);} function url() { location.href="http://www.7723.cn" }setTimeout("time_m("+_time+")", 1000)</script></head><body><div id="box"><h1>\xe6\x82\xa8\xe4\xb8\x8b\xe8\xbd\xbd\xe7\x

In [15]:
response.content.decode('utf-8')

'<!DOCTYPE HTML><html><head><title>提示信息</title><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><style type="text/css">body{color:#000;font:12px verdana, arial, tahoma;background:#F5F7F6 url(http://www.7723.cn/static/images/bg.gif) repeat-x left top}#box{width:520px;border:1px solid #CCC;background:#FFF url(http://www.7723.cn/static/images/success.png) no-repeat 25px 30px;margin:180px auto;padding:20px 35px 20px 100px;border-radius:5px;box-shadow:0 0 10px #C0C0C0}#box h1{font-size:20px;font-weight:normal}#box a{color:#1A7613;text-decoration:none}</style><script type="text/javascript">var _time=3; \r\n\tfunction time_m(time){ _time=time-1;document.getElementById("span_time").innerHTML=_time;if(_time==0){url();}setTimeout("time_m(" + _time + ")", 1000);} function url() { location.href="http://www.7723.cn" }setTimeout("time_m("+_time+")", 1000)</script></head><body><div id="box"><h1>您下载的文件不存在,请关闭,关闭后刷新下载页面,为此给您带来不便,请见谅！</h1><p>如果浏览器没有自动跳转，请 <a href="javascript:;" onCli

In [16]:
log_filepath = f'game_binary_url_info_list-{time.time_ns()}.pickle'
print(log_filepath)
with open(log_filepath, 'wb') as outfile:
    pickle.dump(in_cache_info_list, outfile)
os.path.getsize(log_filepath)

game_binary_url_info_list-1654922354277758800.pickle


17531745

In [17]:
len(in_cache_info_list)

153653

In [18]:
game_binary_url_info_dict = {}
cache_key_list = []

for (url, cache_key) in in_cache_info_list:
    game_binary_url_info_dict[url] = cache_key
    cache_key_list.append(cache_key)
len(cache_key_list)

153653

In [21]:
log_filepath = f'game_binary_url_info_dict-{time.time_ns()}.pickle'
print(log_filepath)
with open(log_filepath, 'wb') as outfile:
    pickle.dump(game_binary_url_info_dict, outfile)
os.path.getsize(log_filepath)

game_binary_url_info_dict-1654930510742813900.pickle


16609827

In [19]:
print(len(cache_key_list))
cache_key_list = list(set(cache_key_list))
print(len(cache_key_list))

153653
102188


In [20]:
total_raw_cache_size = 0
for cache_key in cache_key_list:
    cache_size = int(cache_key.split('-')[1])
    total_raw_cache_size += cache_size
total_raw_cache_size

49414384453

In [22]:
total_raw_cache_size / (1024**3)

46.020731751807034

In [23]:
game_info = game_info_list[0]
list(game_info.keys())

['url', 'name', 'banner_image', 'gameplay_image_list', 'binary_info_list']

In [24]:
binary_info_list = game_info['binary_info_list']
binary_info_list

[{'url': 'http://www.7723.cn/downb.asp?idd=13&id=10819&ksp=1',
  'description': '诺基亚 N73系列(240×320)\nN73 5320 5320XM 5320di_XM 5630XM 5700 5700XM 5710XM 5730XM 6110 6110N 6120 6120C 6120ci 6121 6122C 6124C 6210S 6210ci 6220C 6290 6650F 6700S 6702S 6710N 6720C 6730c 6788 6788I 6790 C5 C5-01 E101 E50 E51 E52 E55 E65 E66 E75 X5-00 X5-01 N71 N73ie N75 N76 N76-1 N77 N78 N79 N79 Eco N81 N81 8GB N82 N85 N86 N92 N93 N93I N95 N958G N95 8GB N95-3 NAM N96',
  'url_hash': 'e41f896a073c7e2be8e6b640b614e202'},
 {'url': 'http://www.7723.cn/downb.asp?idd=13&id=10819&ksp=2',
  'description': '诺基亚 N7370系列(240×320)\n7370 2700C 2710C 2710N 2730C 3120C 3208C 3600S 3602S 3610A 3610F 3620 3710F 3711 3720C 3806 5000 5000D 5130 5130C 5132 5132XM 5220 5220XM 5300 5310 5310XM 5330XM 5610 5610D 5610XM 5611 5611XM 6126 6131 6131I 6131NF 6133 6202C 6208C 6212 6233 6234 6263 6265 6265I 6267 6268 6270 6275 6275I 6280 6282 6288 6300 6300I 6301 6303C 6303CI 6303CLASSIC 6303I 6316S 6350 6500 6500C 6500S 6555 6600F 6600I

In [25]:
binary_info = binary_info_list[0]
list(binary_info.keys())

['url', 'description', 'url_hash']

In [26]:
binary_info

{'url': 'http://www.7723.cn/downb.asp?idd=13&id=10819&ksp=1',
 'description': '诺基亚 N73系列(240×320)\nN73 5320 5320XM 5320di_XM 5630XM 5700 5700XM 5710XM 5730XM 6110 6110N 6120 6120C 6120ci 6121 6122C 6124C 6210S 6210ci 6220C 6290 6650F 6700S 6702S 6710N 6720C 6730c 6788 6788I 6790 C5 C5-01 E101 E50 E51 E52 E55 E65 E66 E75 X5-00 X5-01 N71 N73ie N75 N76 N76-1 N77 N78 N79 N79 Eco N81 N81 8GB N82 N85 N86 N92 N93 N93I N95 N958G N95 8GB N95-3 NAM N96',
 'url_hash': 'e41f896a073c7e2be8e6b640b614e202'}