In [28]:
import time
import re
import os
import json
from urllib.parse import urlparse, urlunparse, urljoin
from pprint import pprint
import hashlib

# external modules
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

In [2]:
class Encoding:
    """Best-effort charset detection for raw byte payloads.

    The public attributes are codec names accepted by ``bytes.decode``.
    """
    UTF8 = 'utf-8'
    UTF8_WITH_BOM = 'utf-8-sig'
    UTF16 = 'utf-16'
    GB2312 = 'gb2312' # chinese encoding

    # Byte-order marks used to recognise BOM-prefixed payloads up front.
    _UTF8_BOM = b'\xef\xbb\xbf'
    _UTF16_BOMS = (b'\xff\xfe', b'\xfe\xff')

    @classmethod
    def decode(cls, bs: bytes):
        """Decode ``bs`` with the first candidate codec that accepts it.

        Parameters
        ----------
        bs : bytes
            Raw payload to decode.

        Returns
        -------
        tuple
            ``(encoding_name, text)`` on success, or ``(None, bs)`` with the
            input returned untouched when no candidate codec can decode it
            (or when ``bs`` is not a bytes-like object with ``decode``).
        """
        if not isinstance(bs, (bytes, bytearray)):
            # Nothing to decode; hand the value back unchanged.
            return None, bs

        # A BOM is the only reliable signal, so it decides which codec goes first.
        if bs.startswith(cls._UTF8_BOM):
            candidates = (cls.UTF8_WITH_BOM, cls.GB2312, cls.UTF16)
        elif bs.startswith(cls._UTF16_BOMS):
            candidates = (cls.UTF16, cls.UTF8, cls.GB2312)
        else:
            # 'utf-8-sig' would also accept BOM-less UTF-8 and mislabel it, so
            # plain UTF-8 is tried here instead. BOM-less UTF-16 comes last
            # because it accepts almost any even-length byte string and would
            # otherwise swallow GB2312 payloads as garbage.
            candidates = (cls.UTF8, cls.GB2312, cls.UTF16)

        for encoding in candidates:
            try:
                return encoding, bs.decode(encoding)
            except UnicodeDecodeError:
                # Not this codec; fall through to the next candidate.
                continue

        return None, bs

In [8]:
class GameEntry:
    """Plain record holding the fields extracted for one game page."""

    def __init__(
        self,
        url: str,
        title: str,
        banner_url: str,
        sample_gameplay_image_urls: list,
        versions: list,
    ):
        # Assignment order is significant: __repr__ prints the instance dict,
        # which preserves the order in which attributes were set.
        self.url, self.title, self.banner_url = url, title, banner_url
        self.sample_gameplay_image_urls, self.versions = (
            sample_gameplay_image_urls,
            versions,
        )

    def __repr__(self):
        """Return the repr of the attribute dictionary."""
        return repr(vars(self))

In [30]:
# Scratch check: str.lower() / str.upper() return case-converted copies of the string.
x = 'abc'
x.lower(), x.upper()

('abc', 'ABC')

In [31]:
def hash_url(url: str):
    """Return the lowercase MD5 hex digest of ``url`` encoded as UTF-8."""
    digest = hashlib.md5(url.encode('utf-8'))
    return digest.hexdigest().lower()

# Dump the body of every successful cached response to disk, one file per
# cache key, named by the MD5 hash of that key.
cache_dir = '.requests_cache'
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(cache_dir, exist_ok=True)

# `requests_cache` is created by a cell further down the notebook. On a fresh
# kernel run top-to-bottom it does not exist yet, so fall back to an empty
# mapping (nothing to dump) instead of raising NameError.
try:
    cached_responses = requests_cache
except NameError:
    cached_responses = {}

for url, res in cached_responses.items():
    url_hash = hash_url(url)

    # Only successful responses are persisted; failed ones stay in memory only.
    if res.ok:
        cache_file = os.path.join(cache_dir, url_hash)
        with open(cache_file, mode='wb') as stream:
            stream.write(res.content)

In [9]:
# Create the in-memory response cache only if it does not exist yet, so that
# re-running this cell keeps the responses already collected in this session.
if 'requests_cache' not in globals():
    requests_cache = {}


def GET(url: str, timeout: float = 30):
    """Fetch ``url`` with an HTTP GET, memoising the response in ``requests_cache``.

    Parameters
    ----------
    url : str
        Address to request; also used as the cache key.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without
        a timeout a stalled server would block the notebook indefinitely.
        Pass ``None`` to wait forever.

    Returns
    -------
    The ``requests`` response object, either freshly fetched or taken from
    the cache. Every response is cached regardless of its status code.

    Raises
    ------
    Whatever ``requests.get`` raises (connection errors, timeouts); nothing
    is cached in that case.
    """
    global requests_cache

    if url in requests_cache:
        return requests_cache[url]

    res = requests.get(url, timeout=timeout)
    requests_cache[url] = res
    return res


def POST(url: str, timeout: float = 30):
    """Send an HTTP POST to ``url``, memoising the response in ``requests_cache``.

    Parameters
    ----------
    url : str
        Address to post to.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without
        a timeout a stalled server would block the notebook indefinitely.
        Pass ``None`` to wait forever.

    Returns
    -------
    The ``requests`` response object, either freshly fetched or taken from
    the cache. Every response is cached regardless of its status code.

    Raises
    ------
    Whatever ``requests.post`` raises (connection errors, timeouts); nothing
    is cached in that case.
    """
    global requests_cache

    # Keep POST entries apart from plain-URL entries in the shared cache, so a
    # response cached for a GET of the same URL is never returned for a POST.
    # The key stays a plain string so code that hashes cache keys still works.
    cache_key = f'POST {url}'

    if cache_key in requests_cache:
        return requests_cache[cache_key]

    res = requests.post(url, timeout=timeout)
    requests_cache[cache_key] = res
    return res



In [25]:
def parse_game_entry_url(url: str):
    """Download a game page and extract its metadata into a ``GameEntry``.

    The page is expected to contain a ``#content`` element holding a
    ``.title h3`` heading and exactly three ``ul.container`` lists, read in
    order as: banner image, sample gameplay images, downloadable versions.

    Parameters
    ----------
    url : str
        Address of the game page.

    Returns
    -------
    GameEntry or None
        ``None`` when the request fails, the response is not OK, or the page
        layout is not the expected one. Otherwise a ``GameEntry`` whose
        ``title`` / ``banner_url`` may be ``None`` if they could not be found,
        and whose ``versions`` is a list of dicts with the keys ``url``,
        ``resolution``, ``model`` and ``description`` (the last three may be
        ``None``).

    Every problem is reported with ``print`` together with the page URL.
    """
    try:
        res = GET(url)
    except Exception as ex:
        print('Failed to request content!')
        print('->', url)
        print('->', ex)
        return None
    if not res.ok:
        # Report the failure like every other error path instead of returning silently.
        print(f'Request failed with status code {res.status_code}!')
        print('->', url)
        return None

    _, content = Encoding.decode(res.content)
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed
    # (and warns about it); consider pinning one once the choice is confirmed.
    soup = BeautifulSoup(content)

    selector = '#content'
    content_div = soup.select_one(selector)
    if content_div is None:
        print(f'Format for this page is not compatible! There is no element matches {selector}!')
        print('->', url)
        return None

    # retrieve game title
    title = None
    selector = '.title'
    title_div = content_div.select_one(selector)
    if title_div is None:
        print(f'Cannot find title! There is no element matches {selector}!')
        print('->', url)
    else:
        selector = 'h3'
        title_heading = title_div.select_one(selector)
        if title_heading is None:
            print(f'Cannot find title! There is no element matches {selector}!')
            print('->', url)
        else:
            title = title_heading.text

    selector = 'ul.container'
    ul_containers = content_div.select(selector)
    if len(ul_containers) != 3:
        print(f'Format for this page is not compatible! Number of elements match {selector} is not supported!')
        print('->', url)
        return None

    # retrieve game banner
    banner_url = None
    selector = 'img'
    banner_img = ul_containers[0].select_one(selector)
    if banner_img is None:
        print(f'Failed to get game banner with selector {selector}!')
        print('->', url)
    elif 'src' in banner_img.attrs:
        banner_url = banner_img.attrs['src']
    else:
        print('Failed to get game banner. The img element does not contain src attribute!')
        print('->', url)

    # retrieve sample gameplay images; images without a src attribute are
    # skipped instead of aborting the whole page with a KeyError
    sample_gameplay_image_urls = [
        img_element.attrs['src']
        for img_element in ul_containers[1].select('img')
        if 'src' in img_element.attrs
    ]

    # retrieve game binaries for multiple phone models
    versions = []
    for li_container in ul_containers[2].select('li'):
        selector = 'a'
        anchor_element = li_container.select_one(selector)
        if anchor_element is None:
            print(f'Failed to select anchor with {selector}!')
            print('->', url)
            print('->', li_container)
            continue

        if 'href' not in anchor_element.attrs:
            print('Failed to retrieve download url. Element does not have href attribute.')
            print('->', url)
            print(anchor_element)
            continue

        # download link of this version (original author's note: the game is
        # downloaded by sending a POST request to it)
        version_url = anchor_element.attrs['href']

        version_resolution = None
        version_model = None
        version_desc = None

        selector = 'p'
        version_desc_el = li_container.select_one(selector)
        if version_desc_el is None:
            print(f'Failed to retrieve game version description with {selector} selector!')
            print('->', url)
        else:
            version_desc = version_desc_el.text
            # the resolution is written as "(<width>×<height>)" in the description
            resolution_sr = re.search(r'\((\d+)×(\d+)\)', version_desc)

            if resolution_sr is None:
                print('Failed to search for this game version screen resolution in description!')
                print('->', url)
                print('->', version_desc)
            else:
                # normalise "(176×208)" to "176x208"
                version_resolution = f'{resolution_sr.group(1)}x{resolution_sr.group(2)}'

                # the model name is the first run of ASCII characters that
                # appears before the resolution
                model_sr = re.search(r'[\x00-\x7F]+', version_desc[:resolution_sr.start()])

                if model_sr is None:
                    if '触摸屏通用版' in version_desc:
                        # universal touch screen phone models
                        version_model = 'touch'
                    elif '屏通用版' in version_desc:
                        # universal phone models
                        version_model = 'universal'
                    else:
                        print('Failed to search for supported model in description!')
                        print('->', url)
                        print('->', version_desc)
                else:
                    version_model = model_sr.group(0).replace(' ', '')

        versions.append({
            'url': version_url,
            'resolution': version_resolution,
            'model': version_model,
            'description': version_desc,
        })

    return GameEntry(
        url=url,
        title=title,
        banner_url=banner_url,
        sample_gameplay_image_urls=sample_gameplay_image_urls,
        versions=versions,
    )

In [None]:
# Game pages of interest on www.7723.cn; the trailing comment on each line is
# the game's (Chinese) title.
urls = [
    'http://www.7723.cn/download/8077.htm', # 苍穹默示录完美运行版
    'http://www.7723.cn/download/10420.htm', # 苍弓默示录－吞噬时空
]

In [21]:
# Inspect the in-memory response cache (requested URL -> response object).
requests_cache

{'http://www.7723.cn/download/8077.htm': <Response [200]>}

In [26]:
# Smoke test: parse a single game page and display the resulting GameEntry.
entry_url = 'http://www.7723.cn/download/8077.htm'
entry = parse_game_entry_url(entry_url)
entry

{'url': 'http://www.7723.cn/download/8077.htm', 'title': '苍穹默示录完美运行版', 'banner_url': 'http://image.7723.cn/wuza/pice/2011115O7315197.jpg', 'sample_gameplay_image_urls': ['http://images.7723.cn/wuza/pice/2011115O7315114.gif'], 'versions': [{'url': 'http://www.7723.cn/downb.asp?idd=5&id=8077&ksp=1', 'resolution': '176x208', 'model': 'N70', 'description': '诺基亚 N70系列(176×208)7610 3230 6600 6260 6620 6630 6670 6680 6681 6682 N70 N72 ;松下: X700 X800 ;联想: P930'}, {'url': 'http://www.7723.cn/downb.asp?idd=5&id=8077&ksp=2', 'resolution': '240x320', 'model': 'N73', 'description': '诺基亚 N73系列(240×320)N73 5320 5320XM 5320di_XM 5630XM 5700 5700XM 5710XM 5730XM 6110 6110N 6120 6120C 6120ci 6121 6122C 6124C 6210S 6210ci 6220C 6290 6650F 6700S 6702S 6710N 6720C 6730c 6788 6788I 6790 C5 C5-01 E101 E50 E51 E52 E55 E65 E66 E75 X5-00 X5-01 N71 N73ie N75 N76 N76-1 N77 N78 N79 N79 Eco N81 N81 8GB N82 N85 N86 N92 N93 N93I N95 N958G N95 8GB N95-3 NAM N96'}, {'url': 'http://www.7723.cn/downb.asp?idd=5&id=8077&ks

In [27]:
# Pretty-print the entry's attributes; easier to read than the one-line repr.
pprint(entry.__dict__)

{'banner_url': 'http://image.7723.cn/wuza/pice/2011115O7315197.jpg',
 'sample_gameplay_image_urls': ['http://images.7723.cn/wuza/pice/2011115O7315114.gif'],
 'title': '苍穹默示录完美运行版',
 'url': 'http://www.7723.cn/download/8077.htm',
 'versions': [{'description': '诺基亚 N70系列(176×208)7610 3230 6600 6260 6620 6630 '
                              '6670 6680 6681 6682 N70 N72 ;松下: X700 X800 ;联想: '
                              'P930',
               'model': 'N70',
               'resolution': '176x208',
               'url': 'http://www.7723.cn/downb.asp?idd=5&id=8077&ksp=1'},
              {'description': '诺基亚 N73系列(240×320)N73 5320 5320XM 5320di_XM '
                              '5630XM 5700 5700XM 5710XM 5730XM 6110 6110N '
                              '6120 6120C 6120ci 6121 6122C 6124C 6210S 6210ci '
                              '6220C 6290 6650F 6700S 6702S 6710N 6720C 6730c '
                              '6788 6788I 6790 C5 C5-01 E101 E50 E51 E52 E55 '
                              '