In [1]:
import concurrent
import json
import random
import threading
import time
from pprint import pprint
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
from selenium.webdriver.chrome.options import Options

# Images are never needed by the crawler, so ask Chrome not to load them
# (the content-setting value 2 is the one the original code used for this).
chrome_prefs = {'profile.managed_default_content_settings.images': 2}

chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_experimental_option('prefs', chrome_prefs)
def get_driver():
    """Start and return a new Chrome WebDriver configured with `chrome_options`."""
    browser = webdriver.Chrome(options=chrome_options)
    return browser

In [3]:
# Maps app id -> app name. Each non-empty line of app_id.txt is one JSON object
# whose keys are app ids and whose values are app names.
KNOWN_IDS = {}
with open('app_id.txt', encoding='utf-8') as f:
    # Iterate the file lazily instead of materialising every line at once.
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. after manual edits) instead of crashing.
            continue
        # A malformed line still raises json.JSONDecodeError, as before.
        KNOWN_IDS.update(json.loads(line))
print(f'there are {len(KNOWN_IDS)} known app ids')

there are 813512 known app ids


In [4]:
# Keywords that were already searched: one per line, whitespace-trimmed.
SEARCHED_APPS = set()
with open('searched_appnames.txt', encoding='utf-8') as f:
    SEARCHED_APPS.update(line.strip() for line in f)
print(f'there are {len(SEARCHED_APPS)} searched apps')

there are 29086 searched apps


In [5]:
# Pending search keywords: every known app name that has not been searched yet,
# in random order.
QUEUE = [
    app_name
    for app_name in KNOWN_IDS.values()
    if app_name not in SEARCHED_APPS
]
random.shuffle(QUEUE)
print(f'there are {len(QUEUE)} apps in the queue, first 10 items are')
QUEUE[:10]

there are 764659 apps in the queue, first 10 items are


['One Direction New Full Offline',
 'Ice Cream Wallpapers - with Free editor',
 'Unity College',
 'MsarTaxi: easy cab rides in Sakakah, Al-Jawf',
 'Quran player offline - القرآن بدون أنترنيت',
 'TaxiPoint',
 'Jus Diet (Manfaat + Resep)',
 'Russian Camera & Voice Translator',
 '🎸 Kunci Gitar Lengkap Lagu Indonesia Offline 2019',
 'Asoebi & Ankara Styles']

In [6]:
def scrolltobottom(driver):
    """Scroll the driver's current page to the bottom of the document body."""
    script = "window.scrollTo(0, document.body.scrollHeight)"
    driver.execute_script(script)
def scrolltotop(driver):
    """Scroll the driver's current page back to the top-left corner."""
    script = "window.scrollTo(0, 0)"
    driver.execute_script(script)

In [7]:
def appendmsg(msg, filename):
    """Append `msg` followed by a newline to `filename`, encoded as UTF-8."""
    payload = (msg + '\n').encode('utf-8')
    with open(filename, 'ab') as sink:
        sink.write(payload)
        
# Number of pending records that triggers a flush to app_id.txt.
BUFSIZ = 1
# (appid, name) records that have been discovered but not yet written to disk.
BUFFER = []
def logid(name, appid):
    """Record a newly discovered app and persist it to app_id.txt.

    The record is buffered; once at least BUFSIZ records are pending, the whole
    buffer is drained to disk, one JSON object ``{appid: name}`` per line.

    The previous version flushed only when ``len(BUFFER) > BUFSIZ`` and wrote
    only BUFSIZ records, so the most recent record always stayed in memory and
    was never written.
    """
    BUFFER.append((appid, name))
    if len(BUFFER) < BUFSIZ:
        return
    while True:
        try:
            # Pop-then-handle instead of check-then-pop: logid is called from
            # several worker threads, and another caller may drain the buffer
            # between a length check and the pop.
            pending_id, pending_name = BUFFER.pop(0)
        except IndexError:
            break
        appendmsg(json.dumps({pending_id: pending_name}, ensure_ascii=False), 'app_id.txt')

In [8]:
BASE_SEARCH_URL = 'https://play.google.com/store/search?q={}&c=apps'
SCROLL_ATTEMPT_COUNT = 15
WAIT_TIME_BETWEEN_SCROLL = 1
# A worker reshuffles the queue each time it has taken this many keywords.
RESHUFFLE_INTERVAL = 100
# Seconds to wait before re-queueing a keyword whose page showed no result cards.
EMPTY_RESULT_WAIT = 5
APP_DETAILS_URL_PREFIX = 'https://play.google.com/store/apps/details?id='
# Serialises access to QUEUE and KNOWN_IDS, which every worker thread mutates.
STATE_LOCK = threading.Lock()

def task(driver):
    """Crawl Play Store search results for queued keywords until QUEUE is empty.

    For each keyword the search page is opened, scrolled SCROLL_ATTEMPT_COUNT
    times to load more results, and every result card is inspected. Apps whose
    id is not in KNOWN_IDS are logged via logid(), added to KNOWN_IDS, and their
    name is queued as a new keyword. The keyword is then recorded in
    searched_appnames.txt and SEARCHED_APPS.

    Args:
        driver: a Selenium WebDriver used exclusively by this worker.

    NOTE(review): a keyword that never yields any card is re-queued every time
    it is reached, so the loop does not end while such keywords exist.
    """
    handled = 0
    while True:
        with STATE_LOCK:
            if not QUEUE:
                break
            handled += 1
            # Uses its own counter: the old code reused `i` for the scroll loop
            # below, so this reshuffle never actually ran.
            if handled % RESHUFFLE_INTERVAL == 0:
                random.shuffle(QUEUE)
            keyword = QUEUE.pop(0)

        # Percent-encode the keyword so characters such as '&', '#' or '+'
        # cannot truncate or split the query string.
        driver.get(BASE_SEARCH_URL.format(quote_plus(keyword)))

        # Scroll down repeatedly to load more results.
        for _ in range(SCROLL_ATTEMPT_COUNT):
            time.sleep(WAIT_TIME_BETWEEN_SCROLL)
            scrolltotop(driver)
            scrolltobottom(driver)
            try:
                driver.find_element(By.ID, 'show-more-button').click()
            except Exception:
                # Best effort: the button is often absent or not clickable.
                pass

        # Parse all cards on the page.
        try:
            cards = driver\
                        .find_element(By.CLASS_NAME, 'search-page')\
                        .find_elements(By.CLASS_NAME, 'title')
        except Exception:
            cards = []

        if not cards:
            # Nothing usable was rendered; back off and retry the keyword later.
            time.sleep(EMPTY_RESULT_WAIT)
            with STATE_LOCK:
                QUEUE.append(keyword)
            continue

        for card in cards:
            href = card.get_attribute('href')
            if not href:
                # Elements without a link cannot yield an app id.
                continue
            name = card.text.strip()
            appid = href.replace(APP_DETAILS_URL_PREFIX, '')

            # Check-and-insert under the lock so two workers cannot both treat
            # the same app id as new.
            with STATE_LOCK:
                is_new = appid not in KNOWN_IDS
                if is_new:
                    KNOWN_IDS[appid] = name
                    QUEUE.append(name)
            if is_new:
                logid(name, appid)
        appendmsg(keyword, 'searched_appnames.txt')
        SEARCHED_APPS.add(keyword)
    

In [9]:
from concurrent.futures import ThreadPoolExecutor
NUMTHREADS = 8

# Start NUMTHREADS crawler workers, each with its own browser.
# The drivers and futures are kept so that a worker failure can be inspected
# (e.g. `[f.exception() for f in FUTURES if f.done()]`) and the browsers can be
# closed later (`for d in DRIVERS: d.quit()`); previously both were discarded,
# so exceptions raised inside task() disappeared silently.
executor = ThreadPoolExecutor(max_workers=NUMTHREADS)
DRIVERS = []
FUTURES = []
for _ in range(NUMTHREADS):
    worker_driver = get_driver()
    DRIVERS.append(worker_driver)
    FUTURES.append(executor.submit(task, worker_driver))
    # Stagger the browser start-ups.
    time.sleep(2)

In [145]:
# Progress snapshot: pending keywords first, then keywords already searched.
print(len(QUEUE), len(SEARCHED_APPS), sep='\n')

963462
47584


In [10]:
# Peek at the first ten keywords currently waiting in the queue.
QUEUE[:10]

['WhoIs Lookup',
 'Thesis Tips',
 'Ahalya Stuti',
 'DNS changer by Blokada',
 'Wash Machine',
 'IPL cricket photo editor - IPL maker 2019',
 'Cocktail Bar Recipes',
 'Radio Hot Offline 2019',
 'US Dollar Australian Dollar USD to AUD Converter',
 'My Credit Cards']

In [24]:
# Manually re-randomise the order of the pending keywords, in place.
random.shuffle(QUEUE)

In [None]:
# NOTE(review): calling a method on None always raises AttributeError. This
# looks like a deliberate tripwire so that "Run All" stops before the cells
# below — confirm, and consider an explicit `raise` with a message instead.
None.something()

In [None]:
# Load data.csv into a DataFrame and display it. Later cells read its
# `numrating`, `pkgname` and `appname` columns.
df = pd.read_csv('data.csv')
df

In [None]:
# Order the apps by `numrating`, largest value first.
df = df.sort_values('numrating', ascending=False)

In [None]:
# Display the frame in its current state.
df

In [None]:
# One single-entry {pkgname: appname} dict per row, in the frame's row order.
appids = [
    {package: title}
    for package, title in zip(df['pkgname'], df['appname'])
]

In [None]:
# Keep only the first 1000 entries of the list and print them for inspection.
appids = appids[:1000]
print(appids)

In [None]:
# Append every selected {pkgname: appname} record to app_id.txt, one JSON
# object per line.
for record in appids:
    serialized = json.dumps(record, ensure_ascii=False)
    appendmsg(serialized, 'app_id.txt')