In [1]:
# Standard-library imports.
import logging
import os
import sys

# Put the parent directory on sys.path (once) so the modules imported below
# can be resolved when the notebook is run from its own folder.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Imports that rely on the sys.path entry added above.
from selenium_alza_client import SeleniumAlzaClient
from utils import get_project_root, read_json, save_json

# INFO-level logging so progress messages show up in the cell output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('alza_data_fetch')



In [2]:
# The category ID is hardcoded; see the README for more info.
category_id = '18852759'

# Raw data for the category is expected under <project root>/data/raw_<category_id>.json.
raw_data_path = os.path.join(get_project_root(), f'data/raw_{category_id}.json')
# Reset first so a failed load never leaves a stale value from an earlier run behind.
raw_data = []
try:
    raw_data = read_json(raw_data_path)
except Exception as e:
    # Chain explicitly so the traceback shows whatever read_json actually raised
    # as the direct cause of this higher-level error.
    raise Exception(f'Failed to find raw data for category id: {category_id}') from e

In [3]:
# IDs of every item present in the raw data.
id_list = [item['id'] for item in raw_data]

review_stats_path = os.path.join(get_project_root(), f'data/review_stats_{category_id}.json')
review_stats = []
if os.path.isfile(review_stats_path):
    # Resume from a previous run: load what is already stored and skip those IDs.
    review_stats = read_json(review_stats_path)
    processed_ids_list = [stat['id'] for stat in review_stats]
    logger.info(f'Found {len(processed_ids_list)} existing review items')
    # A set makes each membership check O(1) instead of scanning the whole list.
    # Assumes ids are hashable (e.g. str/int loaded from JSON) - TODO confirm.
    processed_ids = set(processed_ids_list)
    id_list = [item_id for item_id in id_list if item_id not in processed_ids]

logger.info(f'Items to process {len(id_list)}')


INFO:alza_data_fetch:Found 210 existing review items
INFO:alza_data_fetch:Items to process 876


In [4]:
def merge_review_stats(existing_stats, new_stats):
    """Return a new list holding the existing entries followed by the new ones.

    Neither input list is modified; the result is a shallow combination, so
    the individual entries are shared with the inputs.
    """
    return [*existing_stats, *new_stats]

# Fetch review stats for every id still in id_list and persist them to
# review_stats_path, checkpointing every `batch_size` items.
#
# NOTE: this cell rebinds review_stats but does not shrink id_list. Re-run the
# previous cell before re-running this one, otherwise the same ids are fetched
# and appended a second time.
with SeleniumAlzaClient({ 'delay': 5, 'continue_on_error_delay': 120 }) as alza_client:
    batch_size = 50
    processed_review_stats = []
    try:
        for review_stat in alza_client.get_reviews_stats_generator(id_list):
            processed_review_stats.append(review_stat)
            if len(processed_review_stats) % batch_size == 0:
                logger.info(f'Review stats processed: {len(processed_review_stats)}')
                # Checkpoint: write existing + newly fetched stats without touching review_stats yet.
                save_json(merge_review_stats(review_stats, processed_review_stats), review_stats_path)
    finally:
        # Runs on normal completion AND on errors/interrupts, so items fetched
        # since the last checkpoint are not lost and the next run can resume
        # from them. Any in-flight exception is re-raised after the save.
        review_stats = merge_review_stats(review_stats, processed_review_stats)
        save_json(review_stats, review_stats_path)

logger.info(f'Final review stats count: {len(review_stats)}')

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/juliabadrutdinova/.wdm/drivers/chromedriver/mac64/120.0.6099.109/chromedriver-mac-x64/chromedriver] found in cache
INFO:alza_data_fetch:Review stats processed: 50
INFO:alza_data_fetch:Review stats processed: 100
INFO:alza_data_fetch:Review stats processed: 150
INFO:alza_data_fetch:Review stats processed: 200
INFO:alza_data_fetch:Review stats processed: 250
INFO:alza_data_fetch:Review stats processed: 300
INFO:alza_data_fetch:Review stats processed: 350
INFO:alza_data_fetch:Review stats processed: 400
INFO:alza_data_fetch:Review stats processed: 450
INFO:alza_data_fetch:Review stats processed: 500
INFO:alza_data_fetch:Review stats processed: 550
INFO:alza_data_fetch:Review stats processed: 600
INFO:alza_data_fetch:Review stats processed: 650
INFO:alza_data_fetch:Review stats processed: 700
INFO:alza_data_fetch:Review stats processed: 750
INFO:alza