In [1]:
import time
import re
import os
import json
from urllib.parse import urlparse, urlunparse, urljoin
from pprint import pprint
import hashlib

# external modules
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

In [2]:
class Encoding:
    UTF8 = 'utf-8'
    UTF8_WITH_BOM = 'utf-8-sig'
    UTF16 = 'utf-16'
    GB2312 = 'gb2312' # chinese encoding

    @classmethod
    def decode(cls, bs: bytes):
        try:
            return cls.UTF8_WITH_BOM, bs.decode(cls.UTF8_WITH_BOM)
        except Exception as ex:
            # traceback.print_exc()
            pass

        try:
            return cls.UTF8, bs.decode(cls.UTF8)
        except Exception as ex:
            # traceback.print_exc()
            pass

        try:
            return cls.UTF16, bs.decode(cls.UTF16)
        except Exception as ex:
            # traceback.print_exc()
            pass

        try:
            return cls.GB2312, bs.decode(cls.GB2312)
        except Exception as ex:
            # traceback.print_exc()
            pass

        return None, bs

In [3]:
class GameEntry:
    def __init__(
        self,
        url: str,
        title: str,
        banner_url: str,
        sample_gameplay_image_urls: list,
        versions: list,
    ):
        self.url = url
        self.title = title
        self.banner_url = banner_url
        self.sample_gameplay_image_urls = sample_gameplay_image_urls
        self.versions = versions
        
    def __repr__(self):
        return repr(self.__dict__)

In [4]:
cache_dir = '.requests_cache'
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

last_response = None
last_exception = None


def hash_url(url: str):
    hash_str = hashlib.md5(url.encode('utf-8')).hexdigest()
    hash_str = hash_str.lower()
    return hash_str


def GET(url: str, verbose=True):
    global last_response, last_exception

    url_hash = hash_url(url)
    cache_file = os.path.join(cache_dir, url_hash)
    
    if os.path.exists(cache_file):
        if verbose:
            print('Pulling request content from cache!')
            print(url)
        content = open(cache_file, mode='rb').read()
        return content
    else:
        try:
            res = requests.get(url)
            if res.ok:
                with open(cache_file, mode='wb') as stream:
                    stream.write(res.content)

                return res.content
            else:
                last_response = res
                print('The response is not usable! Please check last_response!')
                print('->', url)
                return None
        except Exception as ex:
            last_exception = ex
            print('Failed to request the content!')
            print('->', url)
            print(ex)
    
    return None


def POST(url: str):
    global last_response, last_exception

    url_hash = hash_url(url)
    cache_file = os.path.join(cache_dir, url_hash)
    
    if os.path.exists(cache_file):
        content = open(cache_file, mode='rb').read()
        return content
    else:
        try:
            res = requests.post(url)
            if res.ok:
                with open(cache_file, mode='wb') as stream:
                    stream.write(res.content)

                return res.content
            else:
                last_response = res
                print('The response is not usable! Please check last_response!')
                print('->', url)
                return None
        except Exception as ex:
            last_exception = ex
            print('Failed to request the content!')
            print('->', url)
            print(ex)
    
    return None

In [5]:
def parse_game_entry_url(url: str):
    content = GET(url)
    
    if content is None:
        print('Please check error from global variables!')
        return None
    
    _, content = Encoding.decode(content)
    soup = BeautifulSoup(content)
    
    selector = '#content'
    content_div = soup.select_one(selector)
    if content_div is None:
        print(f'Format for this page is not compatible! There is no element matches {selector}!')
        print('->', url)
        return None
    
    # retrieve game title
    title = None
    selector = '.title'
    title_div = content_div.select_one(selector)
    if title_div is None:
        print(f'Cannot find title! There is no element matches {selector}!')
        print('->', url)
    else:
        selector = 'h3'
        title_heading = title_div.select_one(selector)
        if title_heading is None:
            print(f'Cannot find title! There is no element matches {selector}!')
            print('->', url)
        else:
            title = title_heading.text
    
    selector = 'ul.container'
    ul_containers = content_div.select('ul.container')
    if not (len(ul_containers) == 3):
        print(f'Format for this page is not compatible! Number of elements match {selector} is not supported!')
        print('->', url)
        return None

    # retrieve game banner
    banner_url = None
    selector = 'img'
    banner_img = ul_containers[0].select_one(selector)
    if banner_img is None:
        print(f'Failed to get game banner with selector {selector}!')
        print('->', url)
    else:
        if 'src' in banner_img.attrs:
            banner_url = banner_img.attrs['src']
        else:
            print(f'Failed to get game banner. The img element does not contain src attribute!')
            print('->', url)
    
    # retrieve sample gameplay images
    sample_gameplay_image_urls = [imgE.attrs['src'] for imgE in ul_containers[1].select('img')]
    
    # retrieve game binaries for multiple phone models
    versions = []
    li_containers = ul_containers[2].select('li')
    versions = []
    for li_container in li_containers:
        selector = 'a'
        anchor_element = li_container.select_one(selector)
        if anchor_element is None:
            print(f'Failed to select anchor with {selector}!')
            print('->', url)
            print('->', li_container)
            continue
        
        if not 'href' in anchor_element.attrs:
            print(f'Failed to retrieve download url. Element does not have href attribute.')
            print('->', url)
            print(anchor_element)
            continue
            
        # send a post request to download the game
        version_url  = anchor_element.attrs['href']
        
        version_resolution = None
        version_model = None
        version_desc = None
        
        selector = 'p'
        version_desc_el = li_container.select_one(selector)
        if version_desc_el is None:
            print(f'Failed to retrieve game version description with {selector} selector!')
            print('->', url)
        else:
            version_desc = version_desc_el.text
            resolution_sr = re.search(r'\((\d+)×(\d+)\)', version_desc)
            
            if resolution_sr is None:
                print(f'Failed to search for this game version screen resolution in description!')
                print('->', url)
                print('->', version_desc)
            else:
                version_resolution = version_desc[resolution_sr.start():resolution_sr.end()]
                version_resolution = version_resolution.replace('×', 'x')
                version_resolution = re.sub(r'[\(\)]+', '', version_resolution)
                
                model_sr = re.search(r'[\x00-\x7F]+', version_desc[:resolution_sr.start()])
                
                if model_sr is None:
                    if '触摸屏通用版' in version_desc:
                        # universal touch screen phone models
                        version_model = 'touch'
                    elif '屏通用版' in version_desc:
                        # universal phone models
                        version_model = 'universal'
                    else:
                        print(f'Failed to search for supported model in description!')
                        print('->', url)
                        print('->', version_desc)
                else:
                    version_model = version_desc[model_sr.start():model_sr.end()]
                    version_model = version_model.replace(' ', '')

        versions.append({
            'url': version_url,
            'resolution': version_resolution,
            'model': version_model,
            'description': version_desc,
        })
    
    return GameEntry(
        url=url,
        title=title,
        banner_url=banner_url,
        sample_gameplay_image_urls=sample_gameplay_image_urls,
        versions=versions,
    )

In [6]:
urls = [
    'http://www.7723.cn/download/10172.htm', # 战姬无双-花缭乱
    'http://www.7723.cn/download/8077.htm', # 苍穹默示录完美运行版
    'http://www.7723.cn/download/10420.htm', # 苍弓默示录－吞噬时空    
]

In [7]:
entry_url = 'http://www.7723.cn/download/8077.htm'
entry = parse_game_entry_url(entry_url)
entry

Pulling request content from cache!
http://www.7723.cn/download/8077.htm


{'url': 'http://www.7723.cn/download/8077.htm', 'title': '苍穹默示录完美运行版', 'banner_url': 'http://image.7723.cn/wuza/pice/2011115O7315197.jpg', 'sample_gameplay_image_urls': ['http://images.7723.cn/wuza/pice/2011115O7315114.gif'], 'versions': [{'url': 'http://www.7723.cn/downb.asp?idd=5&id=8077&ksp=1', 'resolution': '176x208', 'model': 'N70', 'description': '诺基亚 N70系列(176×208)7610 3230 6600 6260 6620 6630 6670 6680 6681 6682 N70 N72 ;松下: X700 X800 ;联想: P930'}, {'url': 'http://www.7723.cn/downb.asp?idd=5&id=8077&ksp=2', 'resolution': '240x320', 'model': 'N73', 'description': '诺基亚 N73系列(240×320)N73 5320 5320XM 5320di_XM 5630XM 5700 5700XM 5710XM 5730XM 6110 6110N 6120 6120C 6120ci 6121 6122C 6124C 6210S 6210ci 6220C 6290 6650F 6700S 6702S 6710N 6720C 6730c 6788 6788I 6790 C5 C5-01 E101 E50 E51 E52 E55 E65 E66 E75 X5-00 X5-01 N71 N73ie N75 N76 N76-1 N77 N78 N79 N79 Eco N81 N81 8GB N82 N85 N86 N92 N93 N93I N95 N958G N95 8GB N95-3 NAM N96'}, {'url': 'http://www.7723.cn/downb.asp?idd=5&id=8077&ks

In [8]:
pprint(entry.__dict__)

{'banner_url': 'http://image.7723.cn/wuza/pice/2011115O7315197.jpg',
 'sample_gameplay_image_urls': ['http://images.7723.cn/wuza/pice/2011115O7315114.gif'],
 'title': '苍穹默示录完美运行版',
 'url': 'http://www.7723.cn/download/8077.htm',
 'versions': [{'description': '诺基亚 N70系列(176×208)7610 3230 6600 6260 6620 6630 '
                              '6670 6680 6681 6682 N70 N72 ;松下: X700 X800 ;联想: '
                              'P930',
               'model': 'N70',
               'resolution': '176x208',
               'url': 'http://www.7723.cn/downb.asp?idd=5&id=8077&ksp=1'},
              {'description': '诺基亚 N73系列(240×320)N73 5320 5320XM 5320di_XM '
                              '5630XM 5700 5700XM 5710XM 5730XM 6110 6110N '
                              '6120 6120C 6120ci 6121 6122C 6124C 6210S 6210ci '
                              '6220C 6290 6650F 6700S 6702S 6710N 6720C 6730c '
                              '6788 6788I 6790 C5 C5-01 E101 E50 E51 E52 E55 '
                              '

# Crawl all the entry urls from multiple pages from a game genre page.

In [9]:
def get_all_genre_urls(url: str):
    content = GET(url)
    
    if content is None:
        return None
    
    _, content = Encoding.decode(content)
    soup = BeautifulSoup(content)
    selector = '.pagenation'
    page_container = soup.select_one(selector)
    
    if page_container is None:
        print(f'Failed to select {selector} for getting pages container!')
        print('->', url)
        return None
    
    selector = 'a'
    els = page_container.select(selector)
    
    if len(els) == 0:
        print(f'Failed to select {selector} for page navigation anchors!')
        print('->', url)
        return None
    
    last_page_anchor = els[-1]
    if not 'href' in last_page_anchor.attrs:
        print(f'The last anchor element does not have href attribute!')
        print('->', url)
        return None

    last_page_url = last_page_anchor.attrs['href']
    genre_base, last_page_doc_name = os.path.split(last_page_url)
    
    num_sr = re.search(r'\d+', last_page_doc_name)
    if num_sr is None:
        print(f'Failed to find number of pages!')
        print('->', url)
        return None
    
    num_text = last_page_doc_name[num_sr.start():num_sr.end()]
    num_pages = int(num_text)
    
    genre_page_urls = []

    for i in range(num_pages):
        page_url = f'{genre_base}/{last_page_doc_name[:num_sr.start()]}{i+1}{last_page_doc_name[num_sr.end():]}'
        genre_page_urls.append(page_url)

    return genre_page_urls

In [10]:
genre_first_page_urls = [
    'http://www.7723.cn/zuixin/jiaose_1.htm', # 角色扮演 # RPG
    'http://www.7723.cn/zuixin/yizhi_1.htm', # 益智游戏 # Puzzle games
    'http://www.7723.cn/zuixin/dongzuo_1.htm', # 动作游戏 # Action games
    'http://www.7723.cn/zuixin/saiche_1.htm', # 赛车游戏 # Racing games
    'http://www.7723.cn/zuixin/maoxian_1.htm', # 冒险游戏 # Adventure games
    'http://www.7723.cn/zuixin/yangcheng_1.htm', # 养成游戏 # Dating sim?
    'http://www.7723.cn/zuixin/tiyu_1.htm', # 体育游戏 # Sports games
    'http://www.7723.cn/zuixin/gedou_1.htm', # 格斗游戏 # Fighting games
    'http://www.7723.cn/zuixin/qipai_1.htm', # 棋牌游戏 # Board games
    'http://www.7723.cn/zuixin/celue_1.htm', # 策略游戏 # Strategy games
    'http://www.7723.cn/zuixin/sheji_1.htm', # 射击游戏 # Shooting games
    'http://www.7723.cn/zuixin/moni_1.htm', # 模拟经营 # Simulation (city building, shop management, etc.)
    'http://www.7723.cn/zuixin/feixing_1.htm', # 飞行游戏 # Flying (e.g. space ship) games
    'http://www.7723.cn/zuixin/wangyou_1.htm', # 手机网游 # online games
]

In [11]:
url_list = []

for first_page_url in genre_first_page_urls:
    genre_page_urls = get_all_genre_urls(first_page_url)
    
    if genre_page_urls is None:
        continue

    print(len(genre_page_urls), first_page_url)
    
    url_list.extend(genre_page_urls)

pbar = tqdm(url_list)
for page_url in pbar:
    pbar.set_description(page_url)

    GET(page_url, verbose=False)

Pulling request content from cache!
http://www.7723.cn/zuixin/jiaose_1.htm
181 http://www.7723.cn/zuixin/jiaose_1.htm
Pulling request content from cache!
http://www.7723.cn/zuixin/yizhi_1.htm
206 http://www.7723.cn/zuixin/yizhi_1.htm
Pulling request content from cache!
http://www.7723.cn/zuixin/dongzuo_1.htm
169 http://www.7723.cn/zuixin/dongzuo_1.htm
Pulling request content from cache!
http://www.7723.cn/zuixin/saiche_1.htm
42 http://www.7723.cn/zuixin/saiche_1.htm
Pulling request content from cache!
http://www.7723.cn/zuixin/maoxian_1.htm
58 http://www.7723.cn/zuixin/maoxian_1.htm
Pulling request content from cache!
http://www.7723.cn/zuixin/yangcheng_1.htm
21 http://www.7723.cn/zuixin/yangcheng_1.htm
Pulling request content from cache!
http://www.7723.cn/zuixin/tiyu_1.htm
52 http://www.7723.cn/zuixin/tiyu_1.htm
Pulling request content from cache!
http://www.7723.cn/zuixin/gedou_1.htm
27 http://www.7723.cn/zuixin/gedou_1.htm
Pulling request content from cache!
http://www.7723.cn/zuix

http://www.7723.cn/zuixin/wangyou_2.htm: 100%|████████████████████████████████████| 997/997 [00:39<00:00, 25.20it/s]


In [15]:
len(url_list)

997

In [16]:
url_set = set(url_list)
len(url_set)

997

In [14]:
url_list

['http://www.7723.cn/zuixin/jiaose_1.htm',
 'http://www.7723.cn/zuixin/jiaose_2.htm',
 'http://www.7723.cn/zuixin/jiaose_3.htm',
 'http://www.7723.cn/zuixin/jiaose_4.htm',
 'http://www.7723.cn/zuixin/jiaose_5.htm',
 'http://www.7723.cn/zuixin/jiaose_6.htm',
 'http://www.7723.cn/zuixin/jiaose_7.htm',
 'http://www.7723.cn/zuixin/jiaose_8.htm',
 'http://www.7723.cn/zuixin/jiaose_9.htm',
 'http://www.7723.cn/zuixin/jiaose_10.htm',
 'http://www.7723.cn/zuixin/jiaose_11.htm',
 'http://www.7723.cn/zuixin/jiaose_12.htm',
 'http://www.7723.cn/zuixin/jiaose_13.htm',
 'http://www.7723.cn/zuixin/jiaose_14.htm',
 'http://www.7723.cn/zuixin/jiaose_15.htm',
 'http://www.7723.cn/zuixin/jiaose_16.htm',
 'http://www.7723.cn/zuixin/jiaose_17.htm',
 'http://www.7723.cn/zuixin/jiaose_18.htm',
 'http://www.7723.cn/zuixin/jiaose_19.htm',
 'http://www.7723.cn/zuixin/jiaose_20.htm',
 'http://www.7723.cn/zuixin/jiaose_21.htm',
 'http://www.7723.cn/zuixin/jiaose_22.htm',
 'http://www.7723.cn/zuixin/jiaose_23.htm

In [17]:
url = url_list[0]
url

'http://www.7723.cn/zuixin/jiaose_1.htm'

In [18]:
content = GET(url)
len(content)

Pulling request content from cache!
http://www.7723.cn/zuixin/jiaose_1.htm


24726

In [19]:
_, content = Encoding.decode(content)
content

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n<title>角色扮演_1_手机游戏JAVA破解版下载_7723手机游戏[www.7723.cn]</title>\n<meta content="角色扮演下载" name="keywords">\n<meta content="" name="description">\n<link type="text/css" rel="stylesheet" href="/static/css/androidb.css">\n<script type="text/javascript" src="http://www.7723.cn/static/common/jquery.min.js"></script>\n<script type="text/javascript" src="http://cbjs.baidu.com/js/m.js"></script>\n<link type="image/x-icon" rel="shortcut icon" href="http://www.7723.cn/favicon.ico">\n</head>\n<body id="clickmain" class="exchange">\n<div id="contanier">\n<div id="top">\n<div class="main">\n<p>\n7723手机游戏下载中心&nbsp;\n<a target="_blank" href="http://www.7723.cn/jx/sheding.htm" title="设定机型快速下载游戏">设定机型快速下载游戏通道</a>\n&nbsp;\n把7723分享给更多的朋友：\n</p>\n<div id="bdshare" class

In [20]:
soup = BeautifulSoup(content)

In [23]:
selector = '#content'
content_div = soup.select_one(selector)
content_div

<div id="content">
<div class="frame" id="new">
<div class="title">
<ul class="primaryNav">
<li class="navon"><a href="http://www.7723.cn/zuixin/jiaose_1.htm" title="精品角色">最新角色</a></li>
<li><a href="http://www.7723.cn/jingpin/jpjiaose_1.htm" title="精品角色">精品角色</a></li>
<li><a href="http://www.7723.cn/zhongwen/zwjiaose_1.htm" title="中文角色">中文角色</a></li>
<li><a href="http://www.7723.cn/paiheng/phjiaose_1.htm" title="角色排行">角色排行</a></li>
</ul>
</div>
<ul class="container">
<li><dl><dt class="img"><a href="http://www.7723.cn/download/12095.htm" target="_blank" title="神兽传说3－救赎大陆"><img alt="神兽传说3－救赎大陆" height="80" src="http://image.7723.cn/wuza/pics/20121218O196817.jpg" width="120"/></a></dt>
<dd>
<a href="http://www.7723.cn/download/12095.htm" target="_blank" title="神兽传说3－救赎大陆">神兽传说3－救赎大陆</a></dd>
<dd>
神兽传说三部曲最终篇火爆上线！清新亮丽的游戏画面，流畅细腻的人物动作，结合简便的操作和极度火爆的战斗场景——...
<a href="http://www.7723.cn/download/12095.htm" target="_blank" title="查看《神兽传说3－救赎大陆》更多简介">查看更多&gt;&gt;</a></dd><dd class="load"><a href

In [25]:
selector = 'ul.container'
ul_container = content_div.select_one(selector)
ul_container

<ul class="container">
<li><dl><dt class="img"><a href="http://www.7723.cn/download/12095.htm" target="_blank" title="神兽传说3－救赎大陆"><img alt="神兽传说3－救赎大陆" height="80" src="http://image.7723.cn/wuza/pics/20121218O196817.jpg" width="120"/></a></dt>
<dd>
<a href="http://www.7723.cn/download/12095.htm" target="_blank" title="神兽传说3－救赎大陆">神兽传说3－救赎大陆</a></dd>
<dd>
神兽传说三部曲最终篇火爆上线！清新亮丽的游戏画面，流畅细腻的人物动作，结合简便的操作和极度火爆的战斗场景——...
<a href="http://www.7723.cn/download/12095.htm" target="_blank" title="查看《神兽传说3－救赎大陆》更多简介">查看更多&gt;&gt;</a></dd><dd class="load"><a href="http://www.7723.cn/download/12095.htm" target="_blank" title="进入下载《神兽传说3－救赎大陆》">下载</a></dd></dl>
<ul class="list"><li>类型：角色扮演</li><li>语言：中文</li><li>发布日期：2012-12-18</li></ul>
</li>
<dd> <li><dl><dt class="img"><a href="http://www.7723.cn/download/12094.htm" target="_blank" title="血雨深情－颠覆风云"><img alt="血雨深情－颠覆风云" height="80" src="http://image.7723.cn/wuza/pics/20121217O793107.jpg" width="120"/></a></dt>
<dd>
<a href="http://www.7723.cn/download/1

# the first game is wrapped in a `li` element while the rest are wrapped in `dd>li`

In [44]:
els = ul_container.find_all('li', recursive=False)
len(els)

1

In [45]:
els

[<li><dl><dt class="img"><a href="http://www.7723.cn/download/12095.htm" target="_blank" title="神兽传说3－救赎大陆"><img alt="神兽传说3－救赎大陆" height="80" src="http://image.7723.cn/wuza/pics/20121218O196817.jpg" width="120"/></a></dt>
 <dd>
 <a href="http://www.7723.cn/download/12095.htm" target="_blank" title="神兽传说3－救赎大陆">神兽传说3－救赎大陆</a></dd>
 <dd>
 神兽传说三部曲最终篇火爆上线！清新亮丽的游戏画面，流畅细腻的人物动作，结合简便的操作和极度火爆的战斗场景——...
 <a href="http://www.7723.cn/download/12095.htm" target="_blank" title="查看《神兽传说3－救赎大陆》更多简介">查看更多&gt;&gt;</a></dd><dd class="load"><a href="http://www.7723.cn/download/12095.htm" target="_blank" title="进入下载《神兽传说3－救赎大陆》">下载</a></dd></dl>
 <ul class="list"><li>类型：角色扮演</li><li>语言：中文</li><li>发布日期：2012-12-18</li></ul>
 </li>]

In [46]:
el = els[0]
el

<li><dl><dt class="img"><a href="http://www.7723.cn/download/12095.htm" target="_blank" title="神兽传说3－救赎大陆"><img alt="神兽传说3－救赎大陆" height="80" src="http://image.7723.cn/wuza/pics/20121218O196817.jpg" width="120"/></a></dt>
<dd>
<a href="http://www.7723.cn/download/12095.htm" target="_blank" title="神兽传说3－救赎大陆">神兽传说3－救赎大陆</a></dd>
<dd>
神兽传说三部曲最终篇火爆上线！清新亮丽的游戏画面，流畅细腻的人物动作，结合简便的操作和极度火爆的战斗场景——...
<a href="http://www.7723.cn/download/12095.htm" target="_blank" title="查看《神兽传说3－救赎大陆》更多简介">查看更多&gt;&gt;</a></dd><dd class="load"><a href="http://www.7723.cn/download/12095.htm" target="_blank" title="进入下载《神兽传说3－救赎大陆》">下载</a></dd></dl>
<ul class="list"><li>类型：角色扮演</li><li>语言：中文</li><li>发布日期：2012-12-18</li></ul>
</li>

In [50]:
el.select_one('a')

<a href="http://www.7723.cn/download/12095.htm" target="_blank" title="神兽传说3－救赎大陆"><img alt="神兽传说3－救赎大陆" height="80" src="http://image.7723.cn/wuza/pics/20121218O196817.jpg" width="120"/></a>

# select the remaining games

In [56]:
els = ul_container.select('dd>li')
len(els)

11

In [57]:
for el in els:
    print('='*32)
    print(el)

<li><dl><dt class="img"><a href="http://www.7723.cn/download/12094.htm" target="_blank" title="血雨深情－颠覆风云"><img alt="血雨深情－颠覆风云" height="80" src="http://image.7723.cn/wuza/pics/20121217O793107.jpg" width="120"/></a></dt>
<dd>
<a href="http://www.7723.cn/download/12094.htm" target="_blank" title="血雨深情－颠覆风云">血雨深情－颠覆风云</a></dd>
<dd>
独孤星做为剑龙院大弟子，与师妹若沛菡和璃凡烟受命支援被异族入侵的洪源城前线。在前线中三人与异族从极北之地召唤...
<a href="http://www.7723.cn/download/12094.htm" target="_blank" title="查看《血雨深情－颠覆风云》更多简介">查看更多&gt;&gt;</a></dd><dd class="load"><a href="http://www.7723.cn/download/12094.htm" target="_blank" title="进入下载《血雨深情－颠覆风云》">下载</a></dd></dl>
<ul class="list"><li>类型：角色扮演</li><li>语言：中文</li><li>发布日期：2012-12-17</li></ul>
</li>
<li><dl><dt class="img"><a href="http://www.7723.cn/download/12079.htm" target="_blank" title="无情杀－风云初起"><img alt="无情杀－风云初起" height="80" src="http://image.7723.cn/wuza/pics/20121211O708509.jpg" width="120"/></a></dt>
<dd>
<a href="http://www.7723.cn/download/12079.htm" target="_blank" title="无情杀

In [58]:
el

<li><dl><dt class="img"><a href="http://www.7723.cn/download/12034.htm" target="_blank" title="守护者之刃II－天使之光"><img alt="守护者之刃II－天使之光" height="80" src="http://image.7723.cn/wuza/pics/20121120O619654.jpg" width="120"/></a></dt>
<dd>
<a href="http://www.7723.cn/download/12034.htm" target="_blank" title="守护者之刃II－天使之光">守护者之刃II－天使之光</a></dd>
<dd>
亚洲年度下载量创纪录巅峰之作震撼回归。享受传统RPG的乐趣感受最爽快的战斗。超炫的技能设计，以及自由风格的天赋加...
<a href="http://www.7723.cn/download/12034.htm" target="_blank" title="查看《守护者之刃II－天使之光》更多简介">查看更多&gt;&gt;</a></dd><dd class="load"><a href="http://www.7723.cn/download/12034.htm" target="_blank" title="进入下载《守护者之刃II－天使之光》">下载</a></dd></dl>
<ul class="list"><li>类型：角色扮演</li><li>语言：中文</li><li>发布日期：2012-11-20</li></ul>
</li>

In [59]:
el.select_one('a')

<a href="http://www.7723.cn/download/12034.htm" target="_blank" title="守护者之刃II－天使之光"><img alt="守护者之刃II－天使之光" height="80" src="http://image.7723.cn/wuza/pics/20121120O619654.jpg" width="120"/></a>

In [64]:
def parse_game_listing_page(url: str):
    content = GET(url)
    if content is None:
        return None
    
    _, content = Encoding.decode(content)
    soup = BeautifulSoup(content)
    
    selector = '#content'
    content_div = soup.select_one(selector)
    if content_div is None:
        print(f'Failed to get content container with selector {selector}')
        print('->', url)
        return None
    
    selector = 'ul.container'
    ul_container = content_div.select_one(selector)
    if ul_container is None:
        print(f'Failed to get game list container with selector {selector}')
        print('->', url)
        return None
    
    game_pages = []

    els = ul_container.find_all('li', recursive=False)
    els.extend(ul_container.select('dd>li'))

    for el in els:
        selector = 'a'
        anchor = el.select_one(selector)
        if anchor is None:
            print(f'This game entry does not have an a element!')
            print('->', el)
            print('->', url)
            continue
            
        if 'href' in anchor.attrs:
            game_page_url = anchor.attrs['href']
            game_pages.append(game_page_url)
    
    return game_pages

In [65]:
game_pages = parse_game_listing_page('http://www.7723.cn/zuixin/jiaose_1.htm')
game_pages

Pulling request content from cache!
http://www.7723.cn/zuixin/jiaose_1.htm


['http://www.7723.cn/download/12095.htm',
 'http://www.7723.cn/download/12094.htm',
 'http://www.7723.cn/download/12079.htm',
 'http://www.7723.cn/download/12078.htm',
 'http://www.7723.cn/download/12077.htm',
 'http://www.7723.cn/download/12076.htm',
 'http://www.7723.cn/download/12075.htm',
 'http://www.7723.cn/download/11970.htm',
 'http://www.7723.cn/download/12069.htm',
 'http://www.7723.cn/download/12049.htm',
 'http://www.7723.cn/download/4377.htm',
 'http://www.7723.cn/download/12034.htm']

In [66]:
game_pages = parse_game_listing_page('http://www.7723.cn/zuixin/jiaose_181.htm')
game_pages

Pulling request content from cache!
http://www.7723.cn/zuixin/jiaose_181.htm


[]