In [6]:
import os
import urllib
import urllib.parse
import time
import re
import hashlib
import traceback
import typing
import pickle

import bs4
from tqdm import tqdm

In [17]:
class TermColor:
    PURPLE = '\033[95m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'


class Encoding:
    UTF8 = 'utf-8'
    UTF8_WITH_BOM = 'utf-8-sig'
    UTF16 = 'utf-16'
    GB2312 = 'gb2312'
    SHIFT_JIS = 'shift-jis'

    @classmethod
    def decode(cls, bs: bytes):
        try:
            encoding = cls.UTF8_WITH_BOM
            decoded_content = bs.decode(encoding)
            return encoding, decoded_content
        except Exception as ex:
            # traceback.print_exc()
            pass

        try:
            encoding = cls.UTF8
            decoded_content = bs.decode(encoding)
            return encoding, decoded_content
        except Exception as ex:
            # traceback.print_exc()
            pass

        try:
            encoding = cls.UTF16
            decoded_content = bs.decode(encoding)
            return encoding, decoded_content
        except Exception as ex:
            # traceback.print_exc()
            pass

        try:
            encoding = cls.GB2312
            decoded_content = bs.decode(encoding)
            return encoding, decoded_content
        except Exception as ex:
            # traceback.print_exc()
            pass

        try:
            encoding = cls.SHIFT_JIS
            decoded_content = bs.decode(encoding)
            return encoding, decoded_content
        except Exception as ex:
            # traceback.print_exc()
            pass

        return None, bs


cache_dir = '.requests_cache'
hash_prefix_length = 2


def hash_url(url: str):
    hash_str = hashlib.md5(url.encode('utf-8')).hexdigest()
    hash_str = hash_str.lower()
    return hash_str


def get_hash_file_location(hash_str: str):
    hash_prefix = hash_str[:hash_prefix_length]

    sub_cache_dir = os.path.join(cache_dir, hash_prefix)
    cache_file = os.path.join(sub_cache_dir, hash_str)

    return cache_file, sub_cache_dir


def is_game_page_url(url: str):
    parse_result = urllib.parse.urlparse(url)
    path_parts = parse_result.path.split('/')
    # filter out empty strings
    path_parts = [part for part in path_parts if part]
    if len(path_parts) != 2:
        return False

    if path_parts[0] != 'download':
        return False

    if not path_parts[1].endswith('.htm'):
        return False

    return True


def get_response_from_cache(url: str):
    url_hash = hash_url(url)

    cache_file, sub_cache_dir = get_hash_file_location(url_hash)

    if os.path.exists(cache_file):
        content = open(cache_file, mode='rb').read()
        return content


def parse_game_page_url(url: str):
    if not is_game_page_url(url):
        raise Exception(f'Not a game page url {url}')

    content_bs = get_response_from_cache(url)
    if (content_bs is None) or (len(content_bs) == 0):
        raise Exception(f'Could not get response from cache for {url}')

    _, html_str = Encoding.decode(content_bs)

    game_page_obj = {
        'url': url,
    }

    soup = bs4.BeautifulSoup(html_str)
    ####################################################################
    title_element_list = soup.select('#content .title h3')
    if len(title_element_list) > 1:
        game_name = title_element_list[0].text
        game_page_obj['name'] = game_name
    ####################################################################
    img_element_list = soup.select('#content ul.container img')
    img_url_list = []
    for img_element in img_element_list:
        if 'src' in img_element.attrs:
            img_url_list.append(img_element.attrs['src'])

    if len(img_url_list) > 0:
        game_page_obj['banner_image'] = {
            'url': img_url_list[0],
        }

        gameplay_image_info_list = []
        for img_url in img_url_list:
            gameplay_image_info_list.append({
                'url': img_url,
            })

        game_page_obj['gameplay_image_list'] = gameplay_image_info_list
    ####################################################################
    container_element_list = soup.select('#content ul.container')
    if len(container_element_list) > 2:
        li_element_list = container_element_list[2].select('li')

        game_binary_info_list = []
        for li_element in li_element_list:
            anchor_element = li_element.select_one('a')
            if anchor_element is None:
                continue

            if 'href' not in anchor_element.attrs:
                continue

            download_url = anchor_element.attrs['href']
            game_binary_info = {
                'url': download_url,
            }
            description_element = li_element.select_one('p')
            if description_element is not None:
                description_text = description_element.get_text('\n')
                game_binary_info['description'] = description_text

            game_binary_info_list.append(game_binary_info)

        game_page_obj['binary_info_list'] = game_binary_info_list
    ####################################################################
    return game_page_obj


def get_all_genre_urls(url: str):
    content_bs = get_response_from_cache(url)

    if content_bs is None:
        raise Exception(f'Could not get response from cache for {url}')

    _, content = Encoding.decode(content_bs)
    soup = bs4.BeautifulSoup(content)
    selector = '.pagenation'
    page_container = soup.select_one(selector)

    if page_container is None:
        raise Exception(f'Failed to select {selector} for getting pages container!')

    selector = 'a'
    els = page_container.select(selector)

    if len(els) == 0:
        raise Exception(f'Failed to select {selector} for page navigation anchors!')

    last_page_anchor = els[-1]
    if not 'href' in last_page_anchor.attrs:
        raise Exception(f'The last anchor element does not have href attribute!')

    last_page_url = last_page_anchor.attrs['href']
    genre_base, last_page_doc_name = os.path.split(last_page_url)

    num_sr = re.search(r'\d+', last_page_doc_name)
    if num_sr is None:
        raise Exception(f'Failed to find number of pages!')

    num_text = last_page_doc_name[num_sr.start():num_sr.end()]
    num_pages = int(num_text)

    genre_page_urls = []

    for i in range(num_pages):
        page_url = f'{genre_base}/{last_page_doc_name[:num_sr.start()]}{i+1}{last_page_doc_name[num_sr.end():]}'
        genre_page_urls.append(page_url)

    return genre_page_urls


def parse_game_listing_page(url: str):
    content_bs = get_response_from_cache(url)
    if content_bs is None:
        raise Exception(f'Could not get response from cache for {url}')

    _, decoded_content = Encoding.decode(content_bs)
    soup = bs4.BeautifulSoup(decoded_content)

    selector = '#content'
    content_div = soup.select_one(selector)
    if content_div is None:
        raise Exception(f'Failed to get content container with selector {selector}')

    selector = 'ul.container'
    ul_container = content_div.select_one(selector)
    if ul_container is None:
        raise Exception(f'Failed to get game list container with selector {selector}')

    game_pages = []

    els = ul_container.find_all('li', recursive=False)
    els.extend(ul_container.select('dd>li'))

    for el in els:
        selector = 'a'
        anchor = el.select_one(selector)
        if anchor is None:
            print(f'This game entry does not have an a element!')
            print('->', el)
            print('->', url)
            continue

        if 'href' in anchor.attrs:
            game_page_url = anchor.attrs['href']
            game_pages.append(game_page_url)

    return game_pages

In [2]:
genre_first_page_urls = [
    'http://www.7723.cn/zuixin/jiaose_1.htm',  # 角色扮演 # RPG
    'http://www.7723.cn/zuixin/yizhi_1.htm',  # 益智游戏 # Puzzle games
    'http://www.7723.cn/zuixin/dongzuo_1.htm',  # 动作游戏 # Action games
    'http://www.7723.cn/zuixin/saiche_1.htm',  # 赛车游戏 # Racing games
    'http://www.7723.cn/zuixin/maoxian_1.htm',  # 冒险游戏 # Adventure games
    'http://www.7723.cn/zuixin/yangcheng_1.htm',  # 养成游戏 # Dating sim?
    'http://www.7723.cn/zuixin/tiyu_1.htm',  # 体育游戏 # Sports games
    'http://www.7723.cn/zuixin/gedou_1.htm',  # 格斗游戏 # Fighting games
    'http://www.7723.cn/zuixin/qipai_1.htm',  # 棋牌游戏 # Board games
    'http://www.7723.cn/zuixin/celue_1.htm',  # 策略游戏 # Strategy games
    'http://www.7723.cn/zuixin/sheji_1.htm',  # 射击游戏 # Shooting games
    'http://www.7723.cn/zuixin/moni_1.htm',  # 模拟经营 # Simulation (city building, shop management, etc.)
    'http://www.7723.cn/zuixin/feixing_1.htm',  # 飞行游戏 # Flying (e.g. space ship) games
    'http://www.7723.cn/zuixin/wangyou_1.htm',  # 手机网游 # online games
]

game_listing_pages = []

for first_page_url in tqdm(genre_first_page_urls):
    try:
        genre_page_urls = get_all_genre_urls(first_page_url)
        game_listing_pages.extend(genre_page_urls)
    except Exception as ex:
        print(f'Error when getting genre page url {first_page_url}')
        print(ex)

game_page_url_list = []

for game_listing_page_url in tqdm(game_listing_pages):
    try:
        game_page_urls = parse_game_listing_page(game_listing_page_url)
        game_page_url_list.extend(game_page_urls)
    except Exception as ex:
        print(f'Error when parsing game listing page {game_listing_page_url}')
        print(ex)
len(game_page_url_list)

11732

In [4]:
game_page_url_list = list(set(game_page_url_list))
len(game_page_url_list)

11732

In [7]:
pickle_log_filename = f'game_listing_pages-{time.time_ns()}.pickle'
print(pickle_log_filename)
with open(pickle_log_filename, 'wb') as outfile:
    pickle.dump(game_listing_pages, outfile)

game_listing_pages-1650279171613542300.pickle


In [8]:
pickle_log_filename = f'game_page_url_list-{time.time_ns()}.pickle'
print(pickle_log_filename)
with open(pickle_log_filename, 'wb') as outfile:
    pickle.dump(game_page_url_list, outfile)

game_page_url_list-1650279173367714100.pickle


In [18]:
game_obj_list = []
for game_page_url in tqdm(game_page_url_list):
    try:
        obj = parse_game_page_url(game_page_url)
        game_obj_list.append(obj)
    except Exception as ex:
        print(f'Error when parsing game page {game_page_url}')
        print(ex)
len(game_obj_list)

100%|████████████████████████████████████████████████████████████████████████████| 11732/11732 [06:19<00:00, 30.95it/s]


11732

In [19]:
pickle_log_filename = f'game_obj_list-{time.time_ns()}.pickle'
print(pickle_log_filename)
with open(pickle_log_filename, 'wb') as outfile:
    pickle.dump(game_obj_list, outfile)

game_obj_list-1650280733912302100.pickle


In [20]:
os.path.getsize(pickle_log_filename)

48357112

In [21]:
image_url_list = []
game_binary_url_list = []

for game_obj in tqdm(game_obj_list):
    if 'banner_image' in game_obj:
        image_url_list.append(game_obj['banner_image']['url'])

    if 'gameplay_image_list' in game_obj:
        for gameplay_image in game_obj['gameplay_image_list']:
            image_url_list.append(gameplay_image['url'])

    if 'binary_info_list' in game_obj:
        for binary_info in game_obj['binary_info_list']:
            game_binary_url_list.append(binary_info['url'])

pickle_log_filename = f'image_url_list-{time.time_ns()}.pickle'
print(pickle_log_filename)
with open(pickle_log_filename, 'wb') as outfile:
    pickle.dump(image_url_list, outfile)

pickle_log_filename = f'game_binary_url_list-{time.time_ns()}.pickle'
print(pickle_log_filename)
with open(pickle_log_filename, 'wb') as outfile:
    pickle.dump(game_binary_url_list, outfile)

100%|████████████████████████████████████████████████████████████████████████| 11732/11732 [00:00<00:00, 144906.31it/s]

image_url_list-1650280741968099500.pickle
game_binary_url_list-1650280741999342000.pickle





In [22]:
len(image_url_list)

40337

In [25]:
image_url_list = list(set(image_url_list))
len(image_url_list)

28413

In [26]:
40337 - 28413

11924

In [23]:
len(game_binary_url_list)

153953

In [24]:
game_binary_url_list = list(set(game_binary_url_list))
len(game_binary_url_list)

153953

In [27]:
game_obj

{'url': 'http://www.7723.cn/download/5319.htm',
 'name': '口袋小精灵200合一',
 'banner_image': {'url': 'http://image.7723.cn/wuza/pic/201013O5423810.gif'},
 'gameplay_image_list': [{'url': 'http://image.7723.cn/wuza/pic/201013O5423810.gif'},
  {'url': 'http://images.7723.cn/wuza/pick/2011916O4793084.gif'},
  {'url': 'http://images.7723.cn/wuza/pick/2011916O479301.gif'}],
 'binary_info_list': [{'url': 'http://www.7723.cn/downb.asp?idd=5&id=5319&ksp=1',
   'description': '诺基亚 N70系列(176×208)\n7610 3230 6600 6260 6620 6630 6670 6680 6681 6682 N70 N72 ;松下: X700 X800 ;联想: P930'},
  {'url': 'http://www.7723.cn/downb.asp?idd=5&id=5319&ksp=2',
   'description': '诺基亚 N73系列(240×320)\nN73 5320 5320XM 5320di_XM 5630XM 5700 5700XM 5710XM 5730XM 6110 6110N 6120 6120C 6120ci 6121 6122C 6124C 6210S 6210ci 6220C 6290 6650F 6700S 6702S 6710N 6720C 6730c 6788 6788I 6790 C5 C5-01 E101 E50 E51 E52 E55 E65 E66 E75 X5-00 X5-01 N71 N73ie N75 N76 N76-1 N77 N78 N79 N79 Eco N81 N81 8GB N82 N85 N86 N92 N93 N93I N95 N958G

In [28]:
with open('hash_log-1650109654532523300.pickle', mode='rb') as infile:
    cache_file_info_list = pickle.load(infile)
len(cache_file_info_list)

195094

In [29]:
cache_file_info_list[:5]

[{'path': '.requests_cache\\00\\000000a86ac9fd0cc77dc4ecf80f0b89',
  'size': 132302,
  'md5hash': b"\t\xf9V!W\x82\xbcIt/\xcc\x8c'k\x1f\xe1"},
 {'path': '.requests_cache\\00\\00000d3a62a9fbb2f04edc87d8f0382d',
  'size': 871586,
  'md5hash': b'Gn|\x08r\xea\r\x14/s\xc7-\xdc\xff\xe6\x9b'},
 {'path': '.requests_cache\\00\\000015a80264e66f976d156552abf764',
  'size': 292275,
  'md5hash': b'Uy{\xdb$^\x1eP\x9f\x92\x975\xf3\xd5\xf4\x16'},
 {'path': '.requests_cache\\00\\00004bfa16254c038b950dac6c489e79',
  'size': 727491,
  'md5hash': b'\xa26\xf7\x16\xbc\x97\xe8\x14\xde\x81\xb618hpF'},
 {'path': '.requests_cache\\00\\00016a634884e0f305374acd8d0a2db7',
  'size': 496616,
  'md5hash': b'\xfc=2\xc0\xd3\xb8\x0b\xbc\xf1#\xed\x84\x1f\x08\xea\xad'}]

In [30]:
all_asset_url_list = []
all_asset_url_list.extend(image_url_list)
all_asset_url_list.extend(game_binary_url_list)
len(all_asset_url_list)

182366

In [31]:
all_asset_url_list = list(set(all_asset_url_list))
len(all_asset_url_list)

182366

In [33]:
zero_size_response_cache_file_list = []
for cache_file_info in tqdm(cache_file_info_list):
    if cache_file_info['size'] == 0:
        zero_size_response_cache_file_list.append(cache_file_info['path'])

100%|█████████████████████████████████████████████████████████████████████| 195094/195094 [00:00<00:00, 2440088.10it/s]


In [34]:
len(zero_size_response_cache_file_list)

9595

In [36]:
pickle_log_filename = f'zero_size_response_cache_file_list-{time.time_ns()}'
print(pickle_log_filename)

with open(pickle_log_filename, mode='wb') as outfile:
    pickle.dump(zero_size_response_cache_file_list, outfile)
print(os.path.getsize(pickle_log_filename))

zero_size_response_cache_file_list-1650281805430150900
584556


In [37]:
url_info_list = []
url_info_list

[]

In [38]:
for url in tqdm(all_asset_url_list):
    url_hash = hash_url(url)
    cache_path, sub_cache_dir = get_hash_file_location(url_hash)
    url_info_list.append({
        'url': url,
        'url_hash': url_hash,
        'cache_path': cache_path,
    })
len(url_info_list)

100%|██████████████████████████████████████████████████████████████████████| 182366/182366 [00:01<00:00, 113961.06it/s]


182366

In [39]:
for url_info in tqdm(url_info_list):
    cache_path = url_info['cache_path']
    for cache_file_info in cache_file_info_list:
        if cache_file_info['path'] == cache_path:
            url_info['response_size'] = cache_file_info['size']
            url_info['response_md5hash'] = cache_file_info['md5hash']
            break

100%|██████████████████████████████████████████████████████████████████████████| 182366/182366 [36:24<00:00, 83.50it/s]


In [40]:
url_info_list[0]

{'url': 'http://images.7723.cn/admin/zzxpic/200511151517531850533E-02.gif',
 'url_hash': '117e9d406e5e67ba5d0b275e8b347c26',
 'cache_path': '.requests_cache\\11\\117e9d406e5e67ba5d0b275e8b347c26',
 'response_size': 0,
 'response_md5hash': b'\xd4\x1d\x8c\xd9\x8f\x00\xb2\x04\xe9\x80\t\x98\xec\xf8B~'}

In [41]:
pickle_log_filename = f'url_info_list-{time.time_ns()}'
print(pickle_log_filename)

with open(pickle_log_filename, mode='wb') as outfile:
    pickle.dump(url_info_list, outfile)
print(os.path.getsize(pickle_log_filename))

url_info_list-1650285069904349100
37953718
