In [1]:
import heapq
import json
import threading
import time
import urllib
import urllib.parse
import urllib.request
from pprint import pprint

In [2]:
# Tuning knobs read by query().
# Maximum number of attempts made for a single search request.
RETRYCOUNT = 20
# Value passed as `timeout` to urlopen for each attempt (seconds).
QUERYTIMEOUT = 10
# Value passed to time.sleep() after each unsuccessful attempt (seconds).
WAITTIME = 0.5

In [3]:
# known app names
# Maps every app name already stored in data.jl (one JSON object per line)
# to its recorded 'appDownCount'.
KNOWN_APPNAMES = dict()
with open('data.jl', encoding='utf-8') as f:
    for line in f:
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Skip blank or corrupt lines. Falling through here would either
            # raise NameError (first line) or silently re-store the previous
            # record under the guise of the current one.
            continue
        KNOWN_APPNAMES[record['appName']] = record['appDownCount']
print('size of known appnames:', len(KNOWN_APPNAMES))

size of known appnames: 73387


In [4]:
# searched appnames
# Keywords that crawl_search_keyword() has already recorded in
# searched_appname.txt, one per line, with surrounding whitespace removed.
SEARCHED_APPNAMES = set()
with open('searched_appname.txt', encoding='utf-8') as f:
    SEARCHED_APPNAMES.update(line.strip() for line in f)
print('size of searched appnames:', len(SEARCHED_APPNAMES))

size of searched appnames: 54796


In [5]:
# create the queue
# Pending search keywords in insertion order; crawl_search_keyword() removes
# entries from the front with QUEUE.pop(0).
QUEUE = []

def queue_append(name, count):
    """Append `name` to the end of QUEUE.

    `count` is accepted (callers pass the app's download count) but is not
    used: the queue is plain first-in-first-out.
    NOTE(review): `heapq` is imported at the top of the notebook and never
    used, so ordering by `count` may have been intended -- confirm.
    """
    QUEUE.append(name)

# Seed the queue with every known app name that has not yet been used as a
# search keyword, then show its size and the first few entries.
for name, count in KNOWN_APPNAMES.items():
    if name in SEARCHED_APPNAMES:
        continue
    queue_append(name, count)
print('size of queue', len(QUEUE))
QUEUE[:15]

size of queue 18606


['迅捷PDF阅读器',
 '英雄的第二个记忆',
 '指尖篮球2',
 '高考物理知识宝典',
 'Poputar',
 '游戏礼包大全',
 'English Thai Dictionary',
 '民泰手机银行',
 '民泰村镇银行',
 '掌上万家商家版',
 '三年级数学上册全解',
 '名将(免费热血版)',
 '華誼兄弟影院',
 '好运鸽',
 '记忆符号']

In [6]:
LOGFILENAME = 'log.txt'
def log(msg):
    """Append `msg`, UTF-8 encoded and followed by a newline, to LOGFILENAME."""
    with open(LOGFILENAME, 'ab') as sink:
        # Both chunks are encoded before anything is written, then emitted
        # in order: the message bytes followed by the line terminator.
        sink.writelines([msg.encode('utf-8'), '\n'.encode('utf-8')])

In [7]:
# add_item() is called concurrently from the crawler's worker threads. This
# lock makes the "is it known? -> remember it -> persist it" sequence atomic,
# so the same app cannot be stored and queued twice and records appended to
# data.jl cannot interleave.
_ADD_ITEM_LOCK = threading.Lock()

def add_item(name, item):
    """Store a newly discovered app and queue its name as a future keyword.

    Args:
        name: the app name, used as the key in KNOWN_APPNAMES.
        item: JSON-serializable dict describing the app; must contain
            'appDownCount'.

    Returns:
        True if the app was new (recorded in KNOWN_APPNAMES, appended to
        QUEUE via queue_append, and appended to data.jl as one JSON line);
        False if `name` was already known, in which case nothing changes.

    Raises:
        KeyError: if `item` has no 'appDownCount' entry (raised before any
            state is modified).
    """
    with _ADD_ITEM_LOCK:
        if name in KNOWN_APPNAMES:
            return False

        down_count = item['appDownCount']
        KNOWN_APPNAMES[name] = down_count
        queue_append(name, down_count)

        # One JSON object per line, written with a single write call.
        line = json.dumps(item, ensure_ascii=False) + '\n'
        with open('data.jl', 'ab') as f:
            f.write(line.encode("UTF-8"))

    return True

In [8]:
BASEURL = 'https://android.myapp.com/myapp/searchAjax.htm?kw={}&pns={}&sid='

# A response body must be longer than this many characters to be accepted.
# NOTE(review): presumably error/empty payloads are shorter than this --
# confirm against the endpoint's actual responses.
MIN_RESPONSE_LENGTH = 300

def query(keyword, pns):
    """Request one page of search results, retrying on failure.

    Args:
        keyword: search term; URL-quoted before being placed in the URL.
        pns: paging token inserted verbatim into the URL ('' for the first
            page).

    Returns:
        The response body decoded as UTF-8 once an attempt yields more than
        MIN_RESPONSE_LENGTH characters, or None if all RETRYCOUNT attempts
        fail or return shorter bodies.
    """
    url = BASEURL.format(urllib.parse.quote(keyword), pns)
    for attempt in range(RETRYCOUNT):
        log(f'query {attempt}th [{keyword}] [{pns}] ')
        resp = ''
        try:
            # The context manager closes the connection even if read() or
            # decode() raises.
            with urllib.request.urlopen(url, timeout=QUERYTIMEOUT) as conn:
                resp = conn.read().decode('utf-8')
        except Exception as exc:
            # Best-effort retry: record why the attempt failed and go on.
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so they still propagate to the caller.
            log(f'query failed [{keyword}] [{pns}]: {exc!r}')

        if len(resp) > MIN_RESPONSE_LENGTH:
            return resp
        time.sleep(WAITTIME)

    return None

def _crawl_keyword(keyword):
    """Fetch every result page for `keyword`; return the number of new items stored."""
    pns = ''
    found_count = 0
    while True:
        resp = query(keyword, pns)
        if resp is None:
            break  # every retry failed; give up on this keyword
        try:
            page = json.loads(resp)['obj']
            for item in page['items']:
                detail = item['appDetail']
                # add_item stores the app and queues its name; it returns
                # True only when the app was not known before.
                if add_item(detail['appName'], detail):
                    found_count += 1

            if page['hasNext'] == 0:
                break  # there's no next page
            pns = page['pageNumberStack']
        except (ValueError, KeyError, TypeError) as exc:
            # Unparseable or unexpectedly shaped payload: stop paging this
            # keyword instead of letting the exception end the worker.
            log(f'keyword [{keyword}] bad response at [{pns}]: {exc!r}')
            break
    return found_count


def crawl_search_keyword():
    """Worker loop: search queued keywords until QUEUE is empty.

    For each keyword taken from the front of QUEUE, all result pages are
    fetched, new apps are stored through add_item, a summary line is logged
    and the keyword is appended to searched_appname.txt.

    The function iterates rather than calling itself once per keyword, so
    the number of keywords a worker can process is not limited by the
    interpreter's recursion depth. It returns when QUEUE is found empty.
    """
    while True:
        try:
            keyword = QUEUE.pop(0)
        except IndexError:
            # Another worker took the last entry (or the queue started
            # empty); nothing left for this worker to do.
            return

        found_count = _crawl_keyword(keyword)

        log(f'keyword [{keyword}] added [{found_count}] new items')
        with open('searched_appname.txt', 'ab') as f:
            f.write((keyword + '\n').encode('utf-8'))

In [9]:
from concurrent.futures import ThreadPoolExecutor
NUMTHREADS = 200

def _log_worker_failure(future):
    """Done-callback: write to the log any exception that ended a worker.

    Without this, an exception raised inside a submitted task is stored on
    its Future and never seen, because the Futures are not kept or inspected.
    """
    if future.cancelled():
        return
    error = future.exception()
    if error is not None:
        log(f'worker stopped with error: {error!r}')

# Start NUMTHREADS workers; each runs crawl_search_keyword until it returns.
# The cell does not wait for them, so later cells can be used to check progress.
executor = ThreadPoolExecutor(max_workers=NUMTHREADS)
for i in range(NUMTHREADS):
    executor.submit(crawl_search_keyword).add_done_callback(_log_worker_failure)

In [650]:
# Progress check: number of known apps, number of keywords still queued,
# and the next keywords to be searched.
print(len(KNOWN_APPNAMES), len(QUEUE), sep='\n')
QUEUE[:15]

76838
0


[]