In [None]:
import time
import re
import os
import json
from urllib.parse import urlparse, urlunparse, urljoin
from pprint import pprint
import hashlib

# external modules
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

In [None]:
class Encoding:
    """Best-effort decoder for raw response bytes.

    ``decode`` tries a short list of encodings and returns the first one that
    decodes the input without error, together with that encoding's name.
    """

    UTF8 = 'utf-8'
    UTF8_WITH_BOM = 'utf-8-sig'
    UTF16 = 'utf-16'
    GB2312 = 'gb2312' # chinese encoding

    # Byte-order marks that identify UTF-16 input unambiguously.
    _UTF16_BOMS = (b'\xff\xfe', b'\xfe\xff')

    @classmethod
    def decode(cls, bs: bytes):
        """Decode ``bs`` with the first encoding that accepts it.

        Args:
            bs: Raw bytes to decode.

        Returns:
            A ``(encoding_name, text)`` tuple on success. When no candidate
            encoding fits, or ``bs`` is not a bytes-like object with a
            ``decode`` method, ``(None, bs)`` is returned with the input
            unchanged.
        """
        # Anything that is not bytes (e.g. an already-decoded str) is handed
        # back untouched instead of raising.
        if not isinstance(bs, (bytes, bytearray)):
            return None, bs

        # 'utf-8-sig' also accepts UTF-8 without a BOM, so a separate plain
        # UTF-8 attempt could never be reached and is omitted.
        #
        # BOM-less UTF-16 accepts almost any even-length byte string, so it
        # must not be tried before GB2312 or GB2312 pages come back as
        # garbage. UTF-16 is only preferred when a BOM proves it.
        if bs.startswith(cls._UTF16_BOMS):
            candidates = (cls.UTF8_WITH_BOM, cls.UTF16, cls.GB2312)
        else:
            candidates = (cls.UTF8_WITH_BOM, cls.GB2312, cls.UTF16)

        for encoding in candidates:
            try:
                return encoding, bs.decode(encoding)
            except UnicodeDecodeError:
                continue

        return None, bs

In [None]:
class GameEntry:
    """Plain record holding the data scraped from one game page."""

    def __init__(
        self,
        url: str,
        title: str,
        banner_url: str,
        sample_gameplay_image_urls: list,
        versions: list,
    ):
        """Store the given values as attributes of the same names.

        Args:
            url: Address of the game page.
            title: Game title.
            banner_url: Address of the banner image.
            sample_gameplay_image_urls: Addresses of the sample screenshots.
            versions: One item per downloadable version of the game.
        """
        # The assignment order defines the key order shown by __repr__.
        self.url = url
        self.title = title
        self.banner_url = banner_url
        self.sample_gameplay_image_urls = sample_gameplay_image_urls
        self.versions = versions

    def __repr__(self):
        """Return the repr of the instance's attribute dictionary."""
        return repr(vars(self))

In [None]:
# Directory holding cached response bodies, one file per hashed URL.
cache_dir = '.requests_cache'
# exist_ok=True avoids the race between a separate existence check and the
# directory creation.
os.makedirs(cache_dir, exist_ok=True)

# Set by GET/POST when a request fails so the failure can be inspected
# from the notebook afterwards.
last_response = None
last_exception = None


def hash_url(url: str):
    """Return the lowercase hexadecimal MD5 digest of ``url`` (UTF-8 encoded)."""
    digest = hashlib.md5(url.encode('utf-8'))
    return digest.hexdigest().lower()


def GET(url: str, verbose=True, timeout=30):
    """Fetch ``url`` with an HTTP GET, using an on-disk cache.

    The response body is stored in ``cache_dir`` under the hash of the URL;
    later calls for the same URL return the cached bytes without touching
    the network.

    Args:
        url: Address to fetch.
        verbose: When true, report cache hits on stdout.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the notebook forever.

    Returns:
        The response body as bytes, or ``None`` when the request failed.
        On a non-OK response the response object is stored in the global
        ``last_response``; on an exception it is stored in ``last_exception``.
    """
    global last_response, last_exception

    url_hash = hash_url(url)
    cache_file = os.path.join(cache_dir, url_hash)

    if os.path.exists(cache_file):
        if verbose:
            print('Pulling request content from cache!')
            print(url)
        # Context manager so the file handle is closed deterministically.
        with open(cache_file, mode='rb') as stream:
            return stream.read()

    try:
        res = requests.get(url, timeout=timeout)
        if not res.ok:
            last_response = res
            print('The response is not usable! Please check last_response!')
            print('->', url)
            return None

        # Write to a temporary file and rename it into place, so an
        # interrupted write never leaves a truncated file that later calls
        # would serve as a valid cache entry.
        tmp_file = cache_file + '.tmp'
        with open(tmp_file, mode='wb') as stream:
            stream.write(res.content)
        os.replace(tmp_file, cache_file)

        return res.content
    except Exception as ex:
        # Best effort by design: record the error for inspection and carry on.
        last_exception = ex
        print('Failed to request the content!')
        print('->', url)
        print(ex)

    return None


def POST(url: str, timeout=30):
    """Send an HTTP POST to ``url``, using an on-disk cache.

    The response body is stored in ``cache_dir`` under the hash of the URL;
    later calls for the same URL return the cached bytes without touching
    the network.

    Args:
        url: Address to post to.
        timeout: Seconds passed to ``requests.post`` so that an unresponsive
            server cannot block the notebook forever.

    Returns:
        The response body as bytes, or ``None`` when the request failed.
        On a non-OK response the response object is stored in the global
        ``last_response``; on an exception it is stored in ``last_exception``.
    """
    global last_response, last_exception

    # NOTE(review): the cache key is derived from the URL only, so a file
    # cached for this URL by any other caller using the same key is returned
    # here as well. Confirm that this sharing is intended.
    url_hash = hash_url(url)
    cache_file = os.path.join(cache_dir, url_hash)

    if os.path.exists(cache_file):
        # Context manager so the file handle is closed deterministically.
        with open(cache_file, mode='rb') as stream:
            return stream.read()

    try:
        res = requests.post(url, timeout=timeout)
        if not res.ok:
            last_response = res
            print('The response is not usable! Please check last_response!')
            print('->', url)
            return None

        # Write to a temporary file and rename it into place, so an
        # interrupted write never leaves a truncated file that later calls
        # would serve as a valid cache entry.
        tmp_file = cache_file + '.tmp'
        with open(tmp_file, mode='wb') as stream:
            stream.write(res.content)
        os.replace(tmp_file, cache_file)

        return res.content
    except Exception as ex:
        # Best effort by design: record the error for inspection and carry on.
        last_exception = ex
        print('Failed to request the content!')
        print('->', url)
        print(ex)

    return None

In [None]:
def _parse_version_description(version_desc: str, url: str):
    """Extract ``(resolution, model)`` from one version's description text.

    Args:
        version_desc: Text of the element describing the version.
        url: Game page address, used only in diagnostic messages.

    Returns:
        A ``(resolution, model)`` tuple. ``resolution`` is ``'<w>x<h>'`` built
        from the first ``(<w>×<h>)`` group in the text. ``model`` is the first
        run of ASCII characters before that group with spaces removed, or
        ``'touch'`` / ``'universal'`` when only the generic labels are found.
        Either item is ``None`` when it cannot be determined.
    """
    resolution_sr = re.search(r'\((\d+)×(\d+)\)', version_desc)
    if resolution_sr is None:
        print('Failed to search for this game version screen resolution in description!')
        print('->', url)
        print('->', version_desc)
        return None, None

    width, height = resolution_sr.groups()
    version_resolution = f'{width}x{height}'

    # Only the text in front of the resolution is searched for a model name.
    model_sr = re.search(r'[\x00-\x7F]+', version_desc[:resolution_sr.start()])
    if model_sr is not None:
        return version_resolution, model_sr.group().replace(' ', '')

    if '触摸屏通用版' in version_desc:
        # universal touch screen phone models
        return version_resolution, 'touch'
    if '屏通用版' in version_desc:
        # universal phone models
        return version_resolution, 'universal'

    print('Failed to search for supported model in description!')
    print('->', url)
    print('->', version_desc)
    return version_resolution, None


def parse_game_entry_url(url: str):
    """Download a game page and extract its details.

    Args:
        url: Address of the game page.

    Returns:
        A ``GameEntry``, or ``None`` when the page cannot be fetched or does
        not have the expected layout (a ``#content`` element containing
        exactly three ``ul.container`` lists). Missing optional parts (title,
        banner, per-version details) are reported on stdout and left as
        ``None`` in the result.
    """
    content = GET(url)

    if content is None:
        print('Please check error from global variables!')
        return None

    _, content = Encoding.decode(content)
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(content, 'html.parser')

    selector = '#content'
    content_div = soup.select_one(selector)
    if content_div is None:
        print(f'Format for this page is not compatible! There is no element matches {selector}!')
        print('->', url)
        return None

    # retrieve game title
    title = None
    selector = '.title'
    title_div = content_div.select_one(selector)
    if title_div is None:
        print(f'Cannot find title! There is no element matches {selector}!')
        print('->', url)
    else:
        selector = 'h3'
        title_heading = title_div.select_one(selector)
        if title_heading is None:
            print(f'Cannot find title! There is no element matches {selector}!')
            print('->', url)
        else:
            title = title_heading.text

    # The three lists are read by position: banner, screenshots, downloads.
    selector = 'ul.container'
    ul_containers = content_div.select(selector)
    if len(ul_containers) != 3:
        print(f'Format for this page is not compatible! Number of elements match {selector} is not supported!')
        print('->', url)
        return None

    # retrieve game banner
    banner_url = None
    selector = 'img'
    banner_img = ul_containers[0].select_one(selector)
    if banner_img is None:
        print(f'Failed to get game banner with selector {selector}!')
        print('->', url)
    elif 'src' in banner_img.attrs:
        banner_url = banner_img.attrs['src']
    else:
        print('Failed to get game banner. The img element does not contain src attribute!')
        print('->', url)

    # retrieve sample gameplay images; images without a src attribute are
    # skipped instead of aborting the whole page with a KeyError
    sample_gameplay_image_urls = [
        imgE.attrs['src']
        for imgE in ul_containers[1].select('img')
        if 'src' in imgE.attrs
    ]

    # retrieve game binaries for multiple phone models
    versions = []
    for li_container in ul_containers[2].select('li'):
        selector = 'a'
        anchor_element = li_container.select_one(selector)
        if anchor_element is None:
            print(f'Failed to select anchor with {selector}!')
            print('->', url)
            print('->', li_container)
            continue

        if 'href' not in anchor_element.attrs:
            print('Failed to retrieve download url. Element does not have href attribute.')
            print('->', url)
            print(anchor_element)
            continue

        # send a post request to download the game
        version_url = anchor_element.attrs['href']

        version_resolution = None
        version_model = None
        version_desc = None

        selector = 'p'
        version_desc_el = li_container.select_one(selector)
        if version_desc_el is None:
            print(f'Failed to retrieve game version description with {selector} selector!')
            print('->', url)
        else:
            version_desc = version_desc_el.text
            version_resolution, version_model = _parse_version_description(version_desc, url)

        versions.append({
            'url': version_url,
            'resolution': version_resolution,
            'model': version_model,
            'description': version_desc,
        })

    return GameEntry(
        url=url,
        title=title,
        banner_url=banner_url,
        sample_gameplay_image_urls=sample_gameplay_image_urls,
        versions=versions,
    )

In [None]:
# Example game pages; each trailing comment is the game's title as written
# on the site.
# NOTE(review): `urls` is not referenced by any cell visible in this notebook.
urls = [
    'http://www.7723.cn/download/10172.htm', # game title: 战姬无双-花缭乱
    'http://www.7723.cn/download/8077.htm', # game title: 苍穹默示录完美运行版
    'http://www.7723.cn/download/10420.htm', # game title: 苍弓默示录－吞噬时空
]

In [None]:
# Try the parser on a single game page and display the result.
# parse_game_entry_url returns None when the page cannot be fetched or parsed.
entry_url = 'http://www.7723.cn/download/8077.htm'
entry = parse_game_entry_url(entry_url)
entry

In [None]:
# parse_game_entry_url returns None on failure, so guard before inspecting
# the attributes; otherwise this cell would raise AttributeError.
if entry is None:
    print('No entry was parsed; see the messages printed by parse_game_entry_url.')
else:
    pprint(vars(entry))

# Crawl all the entry URLs from every page of each game genre.

In [None]:
# First listing page of every genre. Each trailing comment gives the genre
# name as written on the site, followed by an English gloss.
genre_first_page_urls = [
    'http://www.7723.cn/zuixin/jiaose_1.htm', # 角色扮演 # RPG
    'http://www.7723.cn/zuixin/yizhi_1.htm', # 益智游戏 # Puzzle games
    'http://www.7723.cn/zuixin/dongzuo_1.htm', # 动作游戏 # Action games
    'http://www.7723.cn/zuixin/saiche_1.htm', # 赛车游戏 # Racing games
    'http://www.7723.cn/zuixin/maoxian_1.htm', # 冒险游戏 # Adventure games
    'http://www.7723.cn/zuixin/yangcheng_1.htm', # 养成游戏 # Dating sim?
    'http://www.7723.cn/zuixin/tiyu_1.htm', # 体育游戏 # Sports games
    'http://www.7723.cn/zuixin/gedou_1.htm', # 格斗游戏 # Fighting games
    'http://www.7723.cn/zuixin/qipai_1.htm', # 棋牌游戏 # Board games
    'http://www.7723.cn/zuixin/celue_1.htm', # 策略游戏 # Strategy games
    'http://www.7723.cn/zuixin/sheji_1.htm', # 射击游戏 # Shooting games
    'http://www.7723.cn/zuixin/moni_1.htm', # 模拟经营 # Simulation (city building, shop management, etc.)
    'http://www.7723.cn/zuixin/feixing_1.htm', # 飞行游戏 # Flying (e.g. space ship) games
    'http://www.7723.cn/zuixin/wangyou_1.htm', # 手机网游 # online games
]

In [None]:
# Fetch every genre's first page once so it lands in the on-disk cache.
# The returned content is discarded here; GET prints a message on cache hits
# and on failures.
for url in genre_first_page_urls:
    print(url)
    GET(url)

# Note: exploring how to find a genre's page count from its pagination links

In [None]:
# Fetch the first genre's first page. GET returns the raw bytes, or None when
# the request failed (in which case len() below raises).
content = GET(genre_first_page_urls[0])
len(content)

In [None]:
# Decode the bytes to text; the detected encoding name is discarded.
# NOTE(review): this rebinds `content`, so re-running this cell feeds the
# already-decoded value back into Encoding.decode.
_, content = Encoding.decode(content)
len(content)

In [None]:
# Display the whole decoded page.
content

In [None]:
# Parse the page; no parser is named, so BeautifulSoup picks one itself.
soup = BeautifulSoup(content)

In [None]:
# Element holding the page navigation links. The class name is spelled
# 'pagenation' here; presumably that matches the site's markup - verify.
selector = '.pagenation'
page_container = soup.select_one(selector)
page_container

In [None]:
# All anchors inside the navigation container.
selector = 'a'
els = page_container.select(selector)
len(els)

In [None]:
# Presumably the final anchor links to the genre's last page - confirm
# against the displayed element.
last_page_anchor = els[-1]
last_page_anchor

In [None]:
last_page_url = last_page_anchor.attrs['href']
last_page_url

In [None]:
# Split the link at its last path separator into (base, document name).
genre_base, last_page_doc_name = os.path.split(last_page_url)
genre_base, last_page_doc_name

In [None]:
# First run of digits in the document name; taken as the last page's number.
num_sr = re.search(r'\d+', last_page_doc_name)
num_sr

In [None]:
num_text = last_page_doc_name[num_sr.start():num_sr.end()]
num_text

In [None]:
# The last page's number is used as the total number of pages.
num_pages = int(num_text)
num_pages

# Generate the genre's page URLs

In [None]:
# Part of the document name in front of the page number.
last_page_doc_name[:num_sr.start()]

In [None]:
# Part of the document name after the page number.
last_page_doc_name[num_sr.end():]

In [None]:
# Rebuild the address of every page from 1 to num_pages by swapping the page
# number inside the last page's document name.
page_prefix = last_page_doc_name[:num_sr.start()]
page_suffix = last_page_doc_name[num_sr.end():]

genre_page_urls = []
for i in range(num_pages):
    page_url = f'{genre_base}/{page_prefix}{i+1}{page_suffix}'
    genre_page_urls.append(page_url)

genre_page_urls

# Pack the steps above into a single function

In [None]:
def get_all_genre_urls(url: str):
    """Build the list of page URLs of a genre from its first page.

    The page count is read from the last anchor inside the ``.pagenation``
    element: the first number in that link's document name is taken as the
    number of pages, and one URL per page (1..N) is generated by replacing
    that number.

    Args:
        url: Address of the genre's first page.

    Returns:
        A list of page URLs, or ``None`` when the page cannot be fetched or
        the pagination links cannot be interpreted (a message is printed).
    """
    content = GET(url)

    if content is None:
        return None

    _, content = Encoding.decode(content)
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(content, 'html.parser')
    selector = '.pagenation'
    page_container = soup.select_one(selector)

    if page_container is None:
        print(f'Failed to select {selector} for getting pages container!')
        print('->', url)
        return None

    selector = 'a'
    els = page_container.select(selector)

    if len(els) == 0:
        print(f'Failed to select {selector} for page navigation anchors!')
        print('->', url)
        return None

    last_page_anchor = els[-1]
    if 'href' not in last_page_anchor.attrs:
        print('The last anchor element does not have href attribute!')
        print('->', url)
        return None

    # Resolve the link against the page address so a relative href still
    # yields absolute page URLs; an absolute href is returned unchanged.
    last_page_url = urljoin(url, last_page_anchor.attrs['href'])
    # URLs always use '/', so split on it directly rather than relying on the
    # operating system's path rules.
    genre_base, _, last_page_doc_name = last_page_url.rpartition('/')

    num_sr = re.search(r'\d+', last_page_doc_name)
    if num_sr is None:
        print('Failed to find number of pages!')
        print('->', url)
        return None

    num_pages = int(num_sr.group())
    page_prefix = last_page_doc_name[:num_sr.start()]
    page_suffix = last_page_doc_name[num_sr.end():]

    return [
        f'{genre_base}/{page_prefix}{page_no}{page_suffix}'
        for page_no in range(1, num_pages + 1)
    ]

In [None]:
# Collect the page URLs of every genre; genres whose pagination could not be
# read (None result) are skipped.
url_list = []

for first_page_url in genre_first_page_urls:
    genre_page_urls = get_all_genre_urls(first_page_url)

    if genre_page_urls is not None:
        print(len(genre_page_urls), first_page_url)
        url_list.extend(genre_page_urls)

# Fetch every page once so it is stored in the cache; the progress bar shows
# the address currently being requested.
pbar = tqdm(url_list)
for page_url in pbar:
    pbar.set_description(page_url)
    GET(page_url, verbose=False)

In [None]:
# Number of page URLs collected, duplicates included.
len(url_list)

In [None]:
# Number of distinct page URLs; compare with the count above to spot
# duplicates.
url_set = set(url_list)
len(url_set)

In [None]:
# NOTE(review): this displays the entire list; consider showing a slice if
# the output becomes too long.
url_list