In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
# Install Google Play Scraper (uncomment and run once if the package is not already installed)
# %pip install google_play_scraper

In [7]:
# Install App Store Scraper (uncomment and run once if the package is not already installed)
# %pip install app-store-scraper

In [31]:
# Imports
import json
from tqdm import tqdm
from google_play_scraper import Sort, reviews, app
from datetime import datetime, date, timedelta

## Scraping Reviews from Google Play Store

Reviews for Final Fantasy VIII Remastered will be scraped from the Google Play Store.

In [32]:
# Google Play package IDs to scrape (Final Fantasy VIII Remastered)
app_packages = [
    'com.square_enix.android_googleplay.FFVIII',
]

# Empty list that will accumulate the scraped review records
playstore_reviews = list()

In [33]:
# Scrape the newest reviews (count=1000 requested per star rating) for every app.
# NOTE: this cell appends to `playstore_reviews`; re-run the cell that
# initializes that list first, otherwise the same reviews are added again.
for ap in tqdm(app_packages):
    # Query each star rating separately: low score = 1, high score = 5
    for score in range(1, 6):
        # Sort from new to old
        for sort_order in [Sort.NEWEST]:
            # The second return value is discarded (per the library docs it is
            # a continuation token; no further pages are requested here)
            rvs, _ = reviews(
                ap,
                sort=sort_order,
                count=1000,
                filter_score_with=score,
            )
            # Derive the label from the sort order actually used
            # (Sort.NEWEST -> 'newest') so the tag cannot drift from the
            # query if another sort order is added to the list above
            sort_label = sort_order.name.lower()
            # Tag every record with how, and for which app, it was fetched
            for r in rvs:
                r['sortOrder'] = sort_label
                r['AppId'] = ap
            # Add this batch of reviews to the running list
            playstore_reviews.extend(rvs)

100%|██████████| 1/1 [00:00<00:00,  1.04it/s]


In [34]:
# Spot-check a single scraped record (list index 50) to see which fields were captured.
# Raises IndexError if fewer than 51 reviews have been collected.
playstore_reviews[50]

{'reviewId': '6f2edd4f-ad80-483e-a41b-78b060882dfa',
 'userName': 'Brian Howard',
 'userImage': 'https://play-lh.googleusercontent.com/a-/AFdZucop-pg-cjtJw5eEtc5bAP4wnYeSzz9pKuDH61c',
 'content': 'App doesnt work right',
 'score': 1,
 'thumbsUpCount': 0,
 'reviewCreatedVersion': '1.0.0 (210309)',
 'at': datetime.datetime(2021, 4, 19, 18, 4, 49),
 'replyContent': None,
 'repliedAt': None,
 'sortOrder': 'newest',
 'AppId': 'com.square_enix.android_googleplay.FFVIII'}

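The sample record above shows that each review carries the expected fields: the review text (`content`), the star rating (`score`), the thumbs-up count, the app version the review was written for, the timestamp (`at`), and any developer reply, plus the `sortOrder` and `AppId` tags added during scraping. All of the relevant data therefore appears to have been collected.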

In [35]:
# Build the Google Play reviews dataframe from the scraped records and
# discard exact duplicate rows, keeping the first occurrence of each
google_play_reviews = (
    pd.DataFrame(playstore_reviews)
    .drop_duplicates(keep='first')
)

# Preview the first rows
google_play_reviews.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,sortOrder,AppId
0,5ba25ca1-6bb0-4a28-8db5-b9286f46d5cc,Jack Agresti,https://play-lh.googleusercontent.com/a-/AFdZu...,"Played FFVII and IX, they played just fine. My...",1,0,1.0.1,2022-07-26 11:44:45,,,newest,com.square_enix.android_googleplay.FFVIII
1,4803f991-793b-4574-af77-52b3c7c51099,Justice Peralta,https://play-lh.googleusercontent.com/a-/AFdZu...,Bad aspect ratio and flubbed on-screen controls.,1,0,,2022-07-16 09:06:41,,,newest,com.square_enix.android_googleplay.FFVIII
2,8b215996-c5d8-4cd5-8b43-e2dbf7da2c24,Brian Puckett,https://play-lh.googleusercontent.com/a-/AFdZu...,"Game runs great for about 5 minutes, then you ...",1,0,1.0.1,2022-07-09 21:07:57,,,newest,com.square_enix.android_googleplay.FFVIII
3,e1d302de-30fa-47f3-bb49-136cc378bc84,B B,https://play-lh.googleusercontent.com/a/AItbvm...,Any updates on this? Why can I install final f...,1,1,1.0.1,2022-06-17 18:15:38,,,newest,com.square_enix.android_googleplay.FFVIII
4,c05044c7-35b3-4f3b-8d5d-6fe733f006ee,James Small,https://play-lh.googleusercontent.com/a-/AFdZu...,My favorite FF but this seriously needs fixing...,1,2,1.0.1,2022-05-29 17:13:56,,,newest,com.square_enix.android_googleplay.FFVIII


In [36]:
# Print a structural summary of the reviews dataframe: entry count, column names,
# non-null counts per column, dtypes, and memory usage
google_play_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 335 entries, 0 to 334
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              335 non-null    object        
 1   userName              335 non-null    object        
 2   userImage             335 non-null    object        
 3   content               335 non-null    object        
 4   score                 335 non-null    int64         
 5   thumbsUpCount         335 non-null    int64         
 6   reviewCreatedVersion  299 non-null    object        
 7   at                    335 non-null    datetime64[ns]
 8   replyContent          0 non-null      object        
 9   repliedAt             0 non-null      object        
 10  sortOrder             335 non-null    object        
 11  AppId                 335 non-null    object        
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 34.0+ KB


In [50]:
# Summarize the unique values for specific columns
gp_cols = google_play_reviews.columns

# Columns whose distinct values are worth listing
summary_cols = ('score', 'reviewCreatedVersion')

# Walk every column in dataframe order and report only the selected ones.
# Iterating the columns directly replaces an index loop that started at 1
# and therefore could never report the first column.
for col in gp_cols:
    if col in summary_cols:
        print(f"{col} unique values: {google_play_reviews[col].unique()}")

score unique values: [1 2 3 4 5]
reviewCreatedVersion unique values: ['1.0.1' None '1.0.0 (210309)']
