## Scrape Google Play Store reviews
---

In [1]:
import sys 

!{sys.executable} -m pip install google-play-scraper
!{sys.executable} -m pip install tzlocal



In [75]:
import pandas as pd

# for scraping app info and reviews from Google Play
from google_play_scraper import app, Sort, reviews

# for pretty printing data structures
from pprint import pprint

# for keeping track of timing
import datetime as dt
from tzlocal import get_localzone

# for building in wait times
import random
import time

In [76]:
app_name = 'GCash'
app_id = 'com.globe.gcash.android'
    
# Get start time
start = dt.datetime.now(tz=get_localzone())
fmt= "%m/%d/%y - %T %p"    

# Print starting output for app
print('---'*20)
print('---'*20)    
print(f'***** {app_name} started at {start.strftime(fmt)}')
print()

# Empty list for storing reviews
app_reviews = []

# Number of reviews to scrape per batch
count = 200

# To keep track of how many batches have been completed
batch_num = 0


# Retrieve reviews (and continuation_token) with reviews function
rvws, token = reviews(
    app_id,           # found in app's url
    lang='en',        # defaults to 'en'
    country='ph',     # defaults to 'us'
    sort=Sort.NEWEST, # start with most recent
    count=count,       # batch size
    filter_score_with=1 # defaults to None(means all score)
)


# Add the list of review dicts to overall list
app_reviews.extend(rvws)

# Increase batch count by one
batch_num +=1 
print(f'Batch {batch_num} completed.')

# convert list of review dictionaries to a dataframe
gps_df = pd.DataFrame(app_reviews)

# Wait 20 to 30 seconds to start next batch
time.sleep(random.randint(10, 15))

# Append review IDs to list prior to starting next batch
pre_review_ids = []
for rvw in app_reviews:
    pre_review_ids.append(rvw['reviewId'])

# Loop through at most max number of batches
for batch in range(200):
    rvws, token = reviews( # store continuation_token
        app_id,
        lang='en',
        country='ph',
        sort=Sort.NEWEST,
        count=count,
        continuation_token=token, # using token obtained from previous batch
        filter_score_with=1 # defaults to None(means all score)
    )

    # Append unique review IDs from current batch to new list
    new_review_ids = []
    for r in rvws:
        new_review_ids.append(r['reviewId'])

    # Add the list of review dicts to main app_reviews list
    app_reviews.extend(rvws)

    # Increase batch count by one
    batch_num +=1

    # Break loop and stop scraping for current app if most recent batch
      # did not add any unique reviews
    all_review_ids = pre_review_ids + new_review_ids
    if len(set(pre_review_ids)) == len(set(all_review_ids)):
        print(f'No reviews left to scrape. Completed {batch_num} batches.\n')
        break

    # all_review_ids becomes pre_review_ids to check against 
      # for next batch
    pre_review_ids = all_review_ids

    # At every 100th batch
    if batch_num%10==0:

        # print update on number of batches
        print(f'Batch {batch_num} completed.')

        # print update about num reviews inserted
        store_time = dt.datetime.now(tz=get_localzone())
        print(f"""
        Successfully inserted {len(app_reviews)} {app_name} 
        reviews into collection at {store_time.strftime(fmt)}.\n
        """)

#         # empty our list for next round of 100 batches
#         app_reviews = []

    # Wait 20 to 30 seconds to start next batch
    time.sleep(random.randint(10, 15))

# Print update when max number of batches has been reached
  # OR when last batch didn't add any unique reviews
print(f'Done scraping {app_name}.')
print(f'Scraped a total of {len(set(pre_review_ids))} unique reviews.\n')

# Append new reviews to main dataframe
gps_df = gps_df.append(app_reviews, ignore_index=True)

# Get end time
end = dt.datetime.now(tz=get_localzone())

# Print ending output for app
print(f"""
Successfully inserted all {app_name} reviews into collection
at {end.strftime(fmt)}.\n
""")
print(f'Time elapsed for {app_name}: {end-start}')
print('---'*20)
print('---'*20)
print('\n')

------------------------------------------------------------
------------------------------------------------------------
***** GCash started at 06/04/21 - 19:15:32 PM

Batch 1 completed.
Batch 10 completed.

        Successfully inserted 2000 GCash 
        reviews into collection at 06/04/21 - 19:16:21 PM.

        


URLError: <urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>

In [74]:
gps_df.sort_values(by=['at'])

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
4399,gp:AOqpTOH83ytb9EaYwfNkPhN9jn4Pnqk2qqb--w7qMLq...,Patricia Nicole Lambert,https://play-lh.googleusercontent.com/a-/AOh14...,At first this app is good. But now? I can't re...,1,1,5.39.1,2021-04-12 14:23:16,,NaT
4398,gp:AOqpTOF6978YafD0p3TZaVdBkTL0zE5Ev5_frJ0KRsT...,Michael Perfecio,https://play-lh.googleusercontent.com/a-/AOh14...,Disappointing. I trasferred Cash from my BPI A...,1,0,5.39.1,2021-04-12 14:26:21,,NaT
4397,gp:AOqpTOG7ZzVW43WgZXBJoG_bE2APYNEcKJdAUa3LLdd...,Ema Soñas,https://play-lh.googleusercontent.com/a-/AOh14...,Puro update😤😤😤palagi nlng error system,1,0,5.37.0,2021-04-12 14:29:52,,NaT
4396,gp:AOqpTOF4wY_FfThVepsisKtbWStpYOK-chyuv6PPdeR...,Yolly Dabandan,https://play-lh.googleusercontent.com/a-/AOh14...,Bkit diko sya maopen loading lng sya,1,0,5.38.1,2021-04-12 14:30:17,,NaT
4395,gp:AOqpTOHuqAGgzFZVTxcInD_KKgdAhX3QVXd9t9071fr...,Jenelyn Diarios,https://play-lh.googleusercontent.com/a-/AOh14...,Sana Naman po may paraan para ma fully verifie...,1,0,5.38.1,2021-04-12 14:30:56,,NaT
...,...,...,...,...,...,...,...,...,...,...
202,gp:AOqpTOEDDrxsqfu1BDS2oJEsCc1EjmjfPuQ0_Qfi6_c...,Tori Aquino,https://play-lh.googleusercontent.com/a/AATXAJ...,"I didnt get any authentication code,how could ...",1,0,5.40.0,2021-06-04 18:57:05,,NaT
1,gp:AOqpTOEVVxC-qpzK4KIpHwUTsCoBD9GJwmRr335PuTV...,Joselito Manalad,https://play-lh.googleusercontent.com/a/AATXAJ...,Ok,1,0,5.37.0,2021-06-04 19:01:57,,NaT
201,gp:AOqpTOEVVxC-qpzK4KIpHwUTsCoBD9GJwmRr335PuTV...,Joselito Manalad,https://play-lh.googleusercontent.com/a/AATXAJ...,Ok,1,0,5.37.0,2021-06-04 19:01:57,,NaT
200,gp:AOqpTOHSOrxo0ZIYMmDzNE9eIwogfevhbMvvK-SqkKQ...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Ok,1,0,5.41.0,2021-06-04 19:02:55,,NaT


In [65]:
asd_df = gps_df
asd_df.drop_duplicates()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
0,gp:AOqpTOFYzXkeYpU0b4EJOZDSGbnck-BVZKmcVCB2rwo...,Mylene I Bollido,https://play-lh.googleusercontent.com/a/AATXAJ...,hindi ko mabuksan gcash ko,1,0,5.40.0,2021-06-04 17:20:55,,NaT
1,gp:AOqpTOFs1TfHX3rC5OOzSqTPZy6XveVjaBv7KSSfNbj...,MyMy,https://play-lh.googleusercontent.com/a-/AOh14...,No one's replying to my email. I created a tic...,1,0,5.41.0,2021-06-04 17:04:27,,NaT
2,gp:AOqpTOEVUSkwKeKxOwFJfiWwU_l1KK8MN2vPRzlBysW...,rose panidar,https://play-lh.googleusercontent.com/a-/AOh14...,since the latest update i can't open the app a...,1,0,5.41.0,2021-06-04 17:04:01,,NaT
3,gp:AOqpTOGN8otsBdWWDqG1FWPUnLsiy6LpXjRC6lqa2sO...,jade abella,https://play-lh.googleusercontent.com/a-/AOh14...,I cant install gg,1,0,,2021-06-04 17:04:00,,NaT
4,gp:AOqpTOHxbH_wLsJ4zJjM3C_2GZyHS-p36yA5Rl13yjP...,John Isaac Laroya,https://play-lh.googleusercontent.com/a-/AOh14...,Gscore is not INCREASING anymore Now I Cant us...,1,0,5.1.0,2021-06-04 16:55:40,,NaT
...,...,...,...,...,...,...,...,...,...,...
110195,gp:AOqpTOGe-WKaslLTP43zOOwwV79gQhEdChpXHGZaHHj...,uno Gascon,https://play-lh.googleusercontent.com/a-/AOh14...,"I need to update my email,gcash send the codes...",1,0,5.30.2,2020-07-27 09:19:23,,NaT
110196,gp:AOqpTOHNaX4Zv4KgSAu9Ren1HOFLLpghimdQ2gYn__G...,Jasniyah M Manalocon,https://play-lh.googleusercontent.com/a-/AOh14...,nice,1,0,5.30.2,2020-07-27 08:55:05,,NaT
110197,gp:AOqpTOEuMsYD3eImHQuyP-RvrGCJ82ow_MB-j9_ZrMF...,Sherwin Mendoza,https://play-lh.googleusercontent.com/a-/AOh14...,Useless.... I have never reached max limit and...,1,0,5.30.2,2020-07-27 08:53:56,,NaT
110198,gp:AOqpTOEDkwyEHTzPcHtTNLbGRhNhtqTTbGuN1UCadPH...,Hailey Delfino,https://play-lh.googleusercontent.com/a/AATXAJ...,Taas ng fees,1,0,,2020-07-27 08:52:21,,NaT


In [None]:
from google_play_scraper import reviews_all

app_name = 'GCash'
app_id = 'com.globe.gcash.android'

# scrape all 1-star reviews
gps_one_star_reviews = reviews_all(
    app_id,
    sleep_milliseconds=random.randint(10000, 20000), # defaults to 0
    lang='en', # defaults to 'en'
    country='ph', # defaults to 'us'
    sort=Sort.NEWEST, # defaults to Sort.MOST_RELEVANT
    filter_score_with=1 # defaults to None(means all score)
)

In [None]:
gps_five_star_reviews = reviews_all(
    app_id,
    sleep_milliseconds=random.randint(3000, 5000), # defaults to 0
    lang='en', # defaults to 'en'
    country='ph', # defaults to 'us'
    sort=Sort.NEWEST, # defaults to Sort.MOST_RELEVANT
    filter_score_with=5 # defaults to None(means all score)
)

In [None]:
# convert list of review dictionaries to a dataframe
gps_df = pd.DataFrame(gps_reviews)
gps_df.sort_values(by=['at'])

In [12]:
# Retrieve reviews (and continuation_token) with reviews function
app_id = 'com.globe.gcash.android'

# rvws, token = reviews(
#     app_id,           # found in app's url
#     lang='en',        # defaults to 'en'
#     country='ph',     # defaults to 'us'
#     sort=Sort.NEWEST, # start with most recent
#     count=200,      # batch size
#     filter_score_with=1 # defaults to None(means all score)
# )

# df = pd.DataFrame(rvws)

# To keep track of how many batches have been completed
batch_num = 0

app_reviews = []

# Loop through at most max number of batches
for batch in range(5):
    # Retrieve reviews (and continuation_token) with reviews function
    rvws, token = reviews(
        app_id,           # found in app's url
        lang='en',        # defaults to 'en'
        country='us',     # defaults to 'us'
        sort=Sort.NEWEST, # start with most recent
        count=count       # batch size
    )

    # Add the list of review dicts to overall list
    app_reviews.extend(rvws)

    # Increase batch count by one
    batch_num +=1 
    print(f'Batch {batch_num} completed.')

    # Wait 1 to 5 seconds to start next batch
    time.sleep(random.randint(3, 5))

Batch 1 completed.
Batch 2 completed.
Batch 3 completed.
Batch 4 completed.
Batch 5 completed.


In [13]:
df = pd.DataFrame(app_reviews)

In [14]:
df.sort_values(by=['at'], ascending=False)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
0,gp:AOqpTOH5qWAsKwB_W8qlkSNx9ZaHAo97sbOOqnkzv7D...,Kush Rodriguez,https://play-lh.googleusercontent.com/a/AATXAJ...,great,5,0,5.40.0,2021-06-04 13:44:44,,
400,gp:AOqpTOH5qWAsKwB_W8qlkSNx9ZaHAo97sbOOqnkzv7D...,Kush Rodriguez,https://play-lh.googleusercontent.com/a/AATXAJ...,great,5,0,5.40.0,2021-06-04 13:44:44,,
800,gp:AOqpTOH5qWAsKwB_W8qlkSNx9ZaHAo97sbOOqnkzv7D...,Kush Rodriguez,https://play-lh.googleusercontent.com/a/AATXAJ...,great,5,0,5.40.0,2021-06-04 13:44:44,,
600,gp:AOqpTOH5qWAsKwB_W8qlkSNx9ZaHAo97sbOOqnkzv7D...,Kush Rodriguez,https://play-lh.googleusercontent.com/a/AATXAJ...,great,5,0,5.40.0,2021-06-04 13:44:44,,
200,gp:AOqpTOH5qWAsKwB_W8qlkSNx9ZaHAo97sbOOqnkzv7D...,Kush Rodriguez,https://play-lh.googleusercontent.com/a/AATXAJ...,great,5,0,5.40.0,2021-06-04 13:44:44,,
...,...,...,...,...,...,...,...,...,...,...
799,gp:AOqpTOFwlnV-N277IHc-DEmOCV1A6d3CHtJOy0KRQGi...,Eddie boy Reyteran,https://play-lh.googleusercontent.com/a/AATXAJ...,Puro update,1,0,5.40.0,2021-06-04 11:08:26,,
199,gp:AOqpTOFwlnV-N277IHc-DEmOCV1A6d3CHtJOy0KRQGi...,Eddie boy Reyteran,https://play-lh.googleusercontent.com/a/AATXAJ...,Puro update,1,0,5.40.0,2021-06-04 11:08:26,,
399,gp:AOqpTOFwlnV-N277IHc-DEmOCV1A6d3CHtJOy0KRQGi...,Eddie boy Reyteran,https://play-lh.googleusercontent.com/a/AATXAJ...,Puro update,1,0,5.40.0,2021-06-04 11:08:26,,
599,gp:AOqpTOFwlnV-N277IHc-DEmOCV1A6d3CHtJOy0KRQGi...,Eddie boy Reyteran,https://play-lh.googleusercontent.com/a/AATXAJ...,Puro update,1,0,5.40.0,2021-06-04 11:08:26,,
