In [1]:
# Define the auxiliary helpers and
# the main function used to scrape reviews

import numpy as np
from google_play_scraper import reviews, app
import time
import logging
import sys

def setup_logger(silent = False):
    """Return the shared "mylog" logger, streaming to stdout.

    Messages go to stdout rather than the default stderr so that they are
    not rendered on Jupyter's red background. When used in Kaggle, the
    output log can be obtained via the Log page.

    The function is safe to call repeatedly. ``logging.getLogger`` returns
    the same logger object for the same name, so the handler attached by a
    previous call is removed before a fresh one is added. Without that,
    every call would stack another handler and each message would be
    printed once per call.

    Parameters
    ----------
    silent : bool, default False
        If True, only ERROR and above are emitted; otherwise DEBUG and above.

    Returns
    -------
    logging.Logger
        The configured logger named "mylog".
    """
    logger = logging.getLogger("mylog")

    logger.setLevel(logging.ERROR if silent else logging.DEBUG)

    # Remove only the handlers this function installed earlier (identified
    # by a marker attribute); handlers attached by other code are left alone.
    for old_handler in list(logger.handlers):
        if getattr(old_handler, "_installed_by_setup_logger", False):
            logger.removeHandler(old_handler)
            old_handler.close()

    # Bind to the *current* sys.stdout so output lands in the active cell.
    stream_handler = logging.StreamHandler(stream = sys.stdout)
    stream_handler._installed_by_setup_logger = True
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    return logger

def scrape(APP_ID, sleep_second = 0.05, max_time = 8.5*3600, silent = False):
    """Scrape reviews on APP_ID

    Scrape Google Play Store reviews of application with id APP_ID,
    using language and country tag 'id' (Indonesia).

    Each time at least 1000 new reviews have accumulated since the last
    progress message, log the running count and pause for sleep_second
    seconds to avoid dead responses/bottleneck/blocking.

    Halt scraping process when reaches max_time seconds. The deadline is
    checked after each page is fetched, so the run may overshoot max_time
    by the duration of one request.

    Parameters
    ----------
    APP_ID : str
        Play Store application id, e.g. 'com.dafturn.mypertamina'.
    sleep_second : float, default 0.05
        Pause (in seconds) inserted on every progress update.
    max_time : float, default 8.5*3600
        Maximum scraping time in seconds, measured from just before the
        first page of reviews is requested.
    silent : bool, default False
        Forwarded to setup_logger; when True only errors are logged.

    Returns
    -------
    list
        All review records collected, in the order they were fetched.
    """

    logger = setup_logger(silent = silent)

    logger.info("BEGINNING OF FUNCTION CALL")
    logger.info("==================================")
    logger.info(f"Scraping started for: {APP_ID}")
    logger.info("Language and country tag is 'id'")

    app_info = app(APP_ID, lang = 'id', country = 'id')

    logger.info('----------------------------------')
    logger.info("Information")
    logger.info(f"Name: {app_info['title']}")
    logger.info(f"Enlisted reviews count : {app_info['reviews']}")
    logger.info('----------------------------------')

    start_time = time.time()
    timeout = start_time + max_time

    result, continuation_token = reviews(
        APP_ID, lang = 'id', country = 'id'
    )

    _n_prev, _n_current = len(result), len(result)
    logger.info(f"Number of acquired reviews so far: {_n_current}")

    # If the .token attribute is not None, it means there's more to scrape.
    while continuation_token.token:
        _result, continuation_token = reviews(
            APP_ID, lang = 'id', country = 'id',
            continuation_token = continuation_token
        )

        result += _result
        _n_current = len(result)
        # Update counts every 1000 reviews fetched
        # Deliberately delay requests by sleep_second seconds on every count update
        if _n_current - _n_prev >= 1000:
            _n_prev = _n_current
            logger.info(f"Number of acquired reviews so far: {_n_current}")

            time.sleep(sleep_second)

        if time.time() >= timeout:
            logger.warning("Timeout reached, escaping loop.")
            break

    seconds_elapsed = time.time() - start_time
    minutes_elapsed = np.round(seconds_elapsed/60, 4)
    logger.info(f"Scraping finished after {minutes_elapsed} minutes")

    final_count = len(result)
    logger.info(f"Final count of reviews scraped: {final_count}")

    # The summary must never cost us the scraped data: a zero or missing
    # enlisted count would otherwise raise here, after all the work is done.
    enlisted_count = app_info['reviews']
    if enlisted_count:
        # Scale to percent *before* rounding so the logged value does not
        # pick up float noise such as 87.85000000000001.
        fraction = np.round(100*final_count/enlisted_count, 2)
        logger.info(f"{fraction}% of enlisted review count.")
    else:
        logger.warning("Enlisted review count unavailable; skipping percentage.")

    logger.info("==================================")
    logger.info("END OF FUNCTION CALL")

    return result

In [2]:
# Perform scraping
# Despite its name, APP_NAME holds the Play Store application id that is
# passed to scrape() as APP_ID. max_time is in seconds, so this run is
# capped at 2 hours.
APP_NAME = 'com.dafturn.mypertamina'
result = scrape(APP_NAME, max_time = 2*3600)

2022-07-04 17:18:50,992 - mylog - INFO - BEGINNING OF FUNCTION CALL
2022-07-04 17:18:51,042 - mylog - INFO - Scraping started for: com.dafturn.mypertamina
2022-07-04 17:18:51,046 - mylog - INFO - Language and country tag is 'id'
2022-07-04 17:19:05,022 - mylog - INFO - ----------------------------------
2022-07-04 17:19:05,022 - mylog - INFO - Information
2022-07-04 17:19:05,022 - mylog - INFO - Name: MyPertamina
2022-07-04 17:19:05,022 - mylog - INFO - Enlisted reviews count : 140001
2022-07-04 17:19:05,022 - mylog - INFO - ----------------------------------
2022-07-04 17:19:08,866 - mylog - INFO - Number of acquired reviews so far: 100
2022-07-04 17:19:28,125 - mylog - INFO - Number of acquired reviews so far: 1100
2022-07-04 17:20:11,197 - mylog - INFO - Number of acquired reviews so far: 2100
2022-07-04 17:20:26,085 - mylog - INFO - Number of acquired reviews so far: 3100
2022-07-04 17:20:49,115 - mylog - INFO - Number of acquired reviews so far: 4100
2022-07-04 17:21:13,357 - mylo

In [3]:
# Turn the resulting list of dictionaries (JSON-like records)
# into a pandas DataFrame.
import pandas as pd
raw_df = pd.json_normalize(result)

# Drop the user image column, it will never be used.
# Other columns may be useful, so they are kept.
# errors='ignore' keeps this cell from raising KeyError when the scrape
# returned no reviews (an empty frame has no 'userImage' column).
# drop() already returns a new frame, so no extra .copy() is needed.
result_df = raw_df.drop(columns = ['userImage'], errors = 'ignore')

# Optional: restrict to reviews written for one app version.
# Get indexes for latest version
# idx_latest_ver = result_df[result_df['reviewCreatedVersion'] == '4.0.2'].index

# Take latest version
# result_df = result_df.loc[idx_latest_ver, :]

# Show dataframe
result_df

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
0,e3ec75ec-675b-4328-bed5-e2f3dbfe0a91,Radhy Aiman,Ini salah masa buka hp dipertamina,1,0,3.6.2,2022-07-04 17:18:54,,NaT
1,a6923e84-bef9-4c86-8b88-ae6b8d98940b,rara mulan,Mau menyusahkan rakyat ya....?,1,0,3.6.2,2022-07-04 17:18:51,,NaT
2,384c929f-6554-4723-bf83-355656281321,Indramayu TV Official,"Sering nge bug, dan grafiknya juga buriq kayak...",1,0,3.6.2,2022-07-04 17:18:50,,NaT
3,c96d5d1d-3c2f-408b-8f42-2cf2c3e411ea,M.fahri29 29,Uh,1,0,3.6.2,2022-07-04 17:18:44,,NaT
4,512d0b16-8fd4-42b4-b648-6b22c28bc422,Soko Hengky,Ribet,1,0,3.6.2,2022-07-04 17:18:36,,NaT
...,...,...,...,...,...,...,...,...,...
122989,273b017d-1b0a-4b8a-961a-faa7629f40f2,Pengguna Google,Mantap...,5,1,,2017-08-10 21:19:14,Terima kasih sobat Sukrons KRN atas apresiasin...,2018-09-14 13:13:14
122990,7c86bfc3-7d7f-425d-98d9-bde583f2f6d9,Pengguna Google,#bringbackRioHaryantoF12018 @pertaminaracingid,3,0,,2017-08-10 09:50:30,Terima kasih atas masukannya sobat bukhari yul...,2018-09-17 04:29:16
122991,45e977af-e705-4d02-9ad4-4c3d357e0b1a,Pengguna Google,Downloader ke 100... Nice apps... smg ada prog...,5,1,1.0.2,2017-08-10 08:43:30,Terima kasih sobat Dudy Effendi atas attention...,2018-09-14 13:06:49
122992,592690d7-ecdb-4796-ae06-6764c70b3840,Pengguna Google,Good start.. be responsive toward feedback and...,5,0,1.0,2017-08-09 17:58:21,Hai sobat Ilmianto Boediman. Terima kasih atas...,2018-10-03 15:03:26


In [5]:
# Save the reviews to a CSV file in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
result_df.to_csv('mypertamina-review.csv', index=False)