# Part 1.1 - Scrapping App Reviews

Scrapping Google Play Store app reviews from:
- Syfe
- Endowus
- StashAway

Reference: 
https://python.plainenglish.io/scraping-storing-google-play-app-reviews-with-python-5640c933c476

In [4]:
# Import Libraries
import numpy as np
import pandas as pd

from google_play_scraper import app, Sort, reviews

from pprint import pprint

# for keeping track of timing
import datetime as dt
from tzlocal import get_localzone

# for building in wait times
import random
import time

In [5]:
## Extracting data and relevant app names + Ids
app_df = pd.read_csv('app_info.csv')
print(app_df.head())

## Get list of app names and app IDs
app_names = list(app_df['app_name'])
app_ids = list(app_df['app_id'])

    app_name                 app_id
0       Syfe               com.syfe
1    Endowus  com.endowus.mobileapp
2  StashAway      com.awp.stashaway


In [6]:
## Loop through app IDs to get app info
app_info = []
for i in app_ids:
    info = app(i)
    del info['comments']
    app_info.append(info)

## Pretty print the data for the first app
pprint(app_info[0])

app_infos_df = pd.DataFrame(app_info)
app_infos_df.to_csv('apps.csv', index=None, header=True)
app_infos_df

{'adSupported': None,
 'androidVersion': '5.0',
 'androidVersionText': '5.0 and up',
 'appId': 'com.syfe',
 'containsAds': False,
 'contentRating': 'Everyone',
 'contentRatingDescription': None,
 'currency': 'USD',
 'description': 'Invest better, faster and smarter with Syfe. Syfe is a '
                'digital wealth manager for investors who expect more – '
                'greater transparency, smart\xader personalised portfolios, '
                'and better investment outcomes.\r\n'
                '\r\n'
                'As one of Singapore’s fastest-growing robo advisors, we do '
                'all the heavy lifting for you, from fund selection, '
                'reinvesting dividends to rebalancing your portfolios and '
                'more! All you have to do is sit back and watch your money '
                'grow. Our digital investment advisor platform offers a '
                "smarter way to invest with low fees and no minimums. We're "
                'also licens

Unnamed: 0,title,description,descriptionHTML,summary,summaryHTML,installs,minInstalls,score,ratings,reviews,...,adSupported,containsAds,released,updated,version,recentChanges,recentChangesHTML,editorsChoice,appId,url
0,Syfe: Invest Better,"Invest better, faster and smarter with Syfe. S...","Invest better, faster and smarter with Syfe. S...",Start your investment journey. Invest and grow...,Start your investment journey. Invest and grow...,"50,000+",50000,4.45,311,108,...,,False,"Feb 13, 2020",1628609720,4.72,1. Introducing Syfe Select - Build wealth your...,1. Introducing Syfe Select - Build wealth your...,False,com.syfe,https://play.google.com/store/apps/details?id=...
1,"Endowus: Invest CPF, SRS, Cash","Thousands invest and grow their Cash, CPF & SR...","Thousands invest and grow their Cash, CPF &amp...",Endowus offers access to superior investment p...,Endowus offers access to superior investment p...,"10,000+",10000,4.87963,220,118,...,,False,"Oct 6, 2020",1629383049,1.3.1,Upgrade to v1.3.1 to find the latest bug fixes...,Upgrade to v1.3.1 to find the latest bug fixes...,False,com.endowus.mobileapp,https://play.google.com/store/apps/details?id=...
2,StashAway: Invest and save,StashAway is where intelligent investing meets...,StashAway is where intelligent investing meets...,Personal finance and investing,Personal finance and investing,"100,000+",100000,4.31,3002,1458,...,,False,"Oct 25, 2017",1629966240,11.95.2,A release with no new features?! Why aren’t we...,A release with no new features?! Why aren’t we...,False,com.awp.stashaway,https://play.google.com/store/apps/details?id=...


In [7]:
# Empty list for storing reviews
app_reviews = []

## Loop through apps to get reviews
for app_name, app_id in zip(app_names, app_ids):
    
    # Get start time
    start = dt.datetime.now(tz=get_localzone())
    fmt= "%m/%d/%y - %T %p"    
    
    # Print starting output for app
    print('---'*20)
    print('---'*20)    
    print(f'***** {app_name} started at {start.strftime(fmt)}')
    print()
    
    # Number of reviews to scrape per batch
    count = 200
    
    # To keep track of how many batches have been completed
    batch_num = 0
    
    
    # Retrieve reviews (and continuation_token) with reviews function
    rvws, token = reviews(
        app_id,           # found in app's url
        lang='en',        # defaults to 'en'
        country='us',     # defaults to 'us'
        sort=Sort.NEWEST, # start with most recent
        count=count       # batch size
    )
    
    # For each review obtained
    for r in rvws:
        r['app_name'] = app_name # add key for app's name
        r['app_id'] = app_id     # add key for app's id
     
    
    # Add the list of review dicts to overall list
    app_reviews.extend(rvws)
    
    # Increase batch count by one
    batch_num +=1 
    print(f'Batch {batch_num} completed.')
    
    # Wait 1 to 5 seconds to start next batch
    time.sleep(random.randint(1,5))
    
    # Append review IDs to list prior to starting next batch
    pre_review_ids = []
    for rvw in app_reviews:
        pre_review_ids.append(rvw['reviewId'])
    
    
    # Loop through at most max number of batches
    for batch in range(4999):
        rvws, token = reviews( # store continuation_token
            app_id,
            lang='en',
            country='us',
            sort=Sort.NEWEST,
            count=count,
            # using token obtained from previous batch
            continuation_token=token
        )
        
        # Append unique review IDs from current batch to new list
        new_review_ids = []
        for r in rvws:
            new_review_ids.append(r['reviewId'])
            
            # And add keys for name and id to ea review dict
            r['app_name'] = app_name # add key for app's name
            r['app_id'] = app_id     # add key for app's id
     
        # Add the list of review dicts to main app_reviews list
        app_reviews.extend(rvws)
        
        # Increase batch count by one
        batch_num +=1
        
        # Break loop and stop scraping for current app if most recent batch
          # did not add any unique reviews
        all_review_ids = pre_review_ids + new_review_ids
        if len(set(pre_review_ids)) == len(set(all_review_ids)):
            print(f'No reviews left to scrape. Completed {batch_num} batches.\n')
            break
        
        # all_review_ids becomes pre_review_ids to check against 
          # for next batch
        pre_review_ids = all_review_ids
        
        # Wait 1 to 5 seconds to start next batch
        time.sleep(random.randint(1,5))
      
    
    # Print update when max number of batches has been reached
      # OR when last batch didn't add any unique reviews
    print(f'Done scraping {app_name}.')
    print(f'Scraped a total of {len(set(pre_review_ids))} unique reviews.\n')
    
    # Get end time
    end = dt.datetime.now(tz=get_localzone())
    
    # Print ending output for app
    print(f"""
    Successfully inserted all {app_name} reviews into collection
    at {end.strftime(fmt)}.\n
    """)
    print(f'Time elapsed for {app_name}: {end-start}')
    print('---'*20)
    print('---'*20)
    print('\n')
    
    # Wait 1 to 5 seconds to start scraping next app
    time.sleep(random.randint(1,5))

------------------------------------------------------------
------------------------------------------------------------
***** Syfe started at 08/29/21 - 17:03:53 PM

Batch 1 completed.
No reviews left to scrape. Completed 2 batches.

Done scraping Syfe.
Scraped a total of 106 unique reviews.


    Successfully inserted all Syfe reviews into collection
    at 08/29/21 - 17:03:55 PM.

    
Time elapsed for Syfe: 0:00:02.281415
------------------------------------------------------------
------------------------------------------------------------


------------------------------------------------------------
------------------------------------------------------------
***** Endowus started at 08/29/21 - 17:03:56 PM

Batch 1 completed.
No reviews left to scrape. Completed 2 batches.

Done scraping Endowus.
Scraped a total of 237 unique reviews.


    Successfully inserted all Endowus reviews into collection
    at 08/29/21 - 17:04:00 PM.

    
Time elapsed for Endowus: 0:00:03.386454
--

In [10]:
# Converting output to dataframe
app_reviews_df = pd.DataFrame(app_reviews)
app_reviews_df.head()
print(f'There are {np.shape(app_reviews_df)[0]} rows and {np.shape(app_reviews_df)[1]} columns.')

There are 1490 rows and 12 columns.


## Extracting Google Play Store Reviews

In [12]:
## Extracting to CSV file
app_reviews_df.to_csv('gps_reviews.csv', index=None, header=True)