# 0. Import

In [1]:
import pandas as pd
import numpy as np
import requests
import time

# 1. Get review function

Official documentation: https://partner.steamgames.com/doc/store/getreviews

In [2]:
def get_reviews(appid, params={'json':1}):
    url = 'https://store.steampowered.com/appreviews/'
    response = requests.get(url=url+appid, params=params, headers={'User-Agent': 'Mozilla/5.0'})
    return response.json()

With appid is the code in the URL of a game:

For example, Cyberpunk 2077's URL is https://store.steampowered.com/app/1091500/Cyberpunk_2077/<br>
-> Cyberpunk 2077's appid is **1091500**

It will return a JSON formatted list of reviews matching the parameters:
- success: 1 if the query was successful
- query_summary: summary about the request, the most important info is total_reviews 
- reviews: list of the reviews, including id, author id, review text, voted_up info and relevant data. 
- cursor: By default, a request can only retrieve a maximum of 100 reviews. We use this value to pass into the next request as the cursor to retrieve the next batch of reviews. 

**Example**

In [3]:
test_get_reviews = get_reviews('1091500', params={'json':1})

In [4]:
test_get_reviews['success']

1

In [5]:
test_get_reviews['query_summary']

{'num_reviews': 20,
 'review_score': 8,
 'review_score_desc': 'Very Positive',
 'total_positive': 187104,
 'total_negative': 41350,
 'total_reviews': 228454}

In [6]:
test_get_reviews['reviews']

[{'recommendationid': '130542522',
  'author': {'steamid': '76561198065823547',
   'num_games_owned': 128,
   'num_reviews': 5,
   'playtime_forever': 12501,
   'playtime_last_two_weeks': 0,
   'playtime_at_review': 10156,
   'last_played': 1674236482},
  'language': 'english',
  'review': "---{ Graphics }---\n☐ You forget what reality is\n☑ Beautiful\n☐ Good\n☐ Decent\n☐ Bad\n☐ Don‘t look too long at it\n☐ MS-DOS\n\n---{ Gameplay }---\n☑ Very good\n☐ Good\n☐ It's just gameplay\n☐ Mehh\n☐ Watch paint dry instead\n☐ Just don't\n\n---{ Audio }---\n☐ Eargasm\n☐ Very good\n☑ Good\n☐ Not too bad\n☐ Bad\n☐ I'm now deaf\n\n---{ Audience }---\n☐ Kids\n☐ Teens\n☑ Adults\n☐ Grandma\n\n---{ PC Requirements }---\n☐ Check if you can run paint\n☐ Potato\n☐ Decent\n☐ Fast\n☑ Rich boi\n☐ Ask NASA if they have a spare computer\n\n---{ Difficulty }---\n☐ Just press 'W'\n☑ Easy\n☐ Easy to learn / Hard to master\n☐ Significant brain usage\n☐ Difficult\n☐ Dark Souls\n\n---{ Grind }---\n☐ Nothing to grind\n

In [7]:
test_get_reviews['cursor']

'AoIIPxBrFnCNl/QD'

# 2. Get all reviews of a game

According to the official doc: https://partner.steamgames.com/doc/store/getreviews

**filter:**
- recent – sorted by creation time
- updated – sorted by last updated time
- all – (default) sorted by helpfulness, with sliding windows based on day_range parameter, will always find results to return.
- If paging through the reviews with cursor then choose either the recent option or the updated option to eventually receive an empty response list.

The function below will take the appid and number of reviews to scrape. It will stop when it either reached the desired number of reviews or scraped all the available reviews matching the filter, whichever comes first.

In [8]:
def get_n_reviews(appid, n_review): # specify the appid and the max number of review
    reviews = [] # list to store review after each request
    cursor = '*' # initial cursor
    params = {
            'json' : 1,
            'filter' : 'recent',     # return reviews by creation time
            'language' : 'english',
            'review_type' : 'all',
            'purchase_type' : 'all'
            }
    page = 0
    
    # While number of reviews to get still > 0, keep scraping
    while n_review > 0:                             
        # Get 100 reviews each request, or the remaining reviews, whichever lower
        params['num_per_page'] = min(100, n_review)  
        n_review -= 100
        params['cursor'] = cursor.encode()
        
        # Request the review, store the new cursor for next batch
        response = get_reviews(appid, params)
        cursor = response['cursor']
        
        # append the new reviews
        reviews += response['reviews']
        
        # print current progress for debugging purpose. comment out 3 lines above if not needed
        page += 1
        new_reviews = response['reviews']
        print(f'Page: {page}, New review: {len(new_reviews)}, Total retrieved: {len(reviews)}')


        if len(response['reviews']) == 0: break # break when returned an empty list

    return reviews

In [22]:
# remember to Enable scrolling for output

start=time.time()

n_review = 1_000_000
appid = '1091500'
scrape_1mil_break0 = get_n_reviews(appid, n_review)

print('Actual reviews scraped:', len(scrape_1mil_break0))
print('Scrape time:', time.time()-start)

Page: 1, New review: 100, Total retrieved: 100
Page: 2, New review: 100, Total retrieved: 200
Page: 3, New review: 100, Total retrieved: 300
Page: 4, New review: 100, Total retrieved: 400
Page: 5, New review: 100, Total retrieved: 500
Page: 6, New review: 100, Total retrieved: 600
Page: 7, New review: 100, Total retrieved: 700
Page: 8, New review: 100, Total retrieved: 800
Page: 9, New review: 100, Total retrieved: 900
Page: 10, New review: 100, Total retrieved: 1000
Page: 11, New review: 100, Total retrieved: 1100
Page: 12, New review: 100, Total retrieved: 1200
Page: 13, New review: 100, Total retrieved: 1300
Page: 14, New review: 100, Total retrieved: 1400
Page: 15, New review: 100, Total retrieved: 1500
Page: 16, New review: 100, Total retrieved: 1600
Page: 17, New review: 100, Total retrieved: 1700
Page: 18, New review: 100, Total retrieved: 1800
Page: 19, New review: 100, Total retrieved: 1900
Page: 20, New review: 100, Total retrieved: 2000
Page: 21, New review: 100, Total retri

Convert scraped data to pandas dataframe and save as csv

In [9]:
def create_df(scraped_data, csv_name = ''):
    pd_scraped_data = pd.Series(scraped_data)
    df_result = pd.DataFrame({
        'recommendationid': pd_scraped_data.apply(lambda x: x['recommendationid']),
        'review': pd_scraped_data.apply(lambda x: x['review']),
        'timestamp_created': pd_scraped_data.apply(lambda x: x['timestamp_created']),
        'voted_up': pd_scraped_data.apply(lambda x: x['voted_up'])
    })
    if csv_name != '':
        df_result.to_csv(f'{csv_name}.csv', encoding='utf-8', index=False)
        print(f'Exported: {csv_name}.csv')
    return df_result

In [35]:
create_df(scrape_1mil_break0, csv_name = 'scrape_cyberpunk_246970_reviews')

Exported: scrape_cyberpunk_246970_reviews.csv


Unnamed: 0,recommendationid,review,timestamp_created,voted_up
0,132354840,"Loe, love, LOVE this game.\n\nI purchased it w...",1675747403,True
1,132354525,shouldve been more,1675746707,False
2,132354176,It's fun,1675745970,True
3,132353890,full gen z game.,1675745336,False
4,132353500,it good that is all,1675744599,True
...,...,...,...,...
246965,81918924,Still waiting to be able to play.\n\nEdit: I'm...,1607559098,True
246966,81918918,"Bigot developer\n\nTo clarify, I got this game...",1607559089,False
246967,81918912,Obligatory It’s breathtaking!\nA wonderful sci...,1607559070,True
246968,81918903,It's a city of dreams... and I'm a big dreamer.,1607559050,True


## Alternative/Troubleshoot: get ALL reviews

For the method above, on a few occasions (I guess due to connection), the returned list can be empty and break the request loop prematurely.

To workaround that, this function finds the total number of available reviews that matches the filter and then loops the requests until it has got all the reviews.

In [10]:
def get_n_reviews_total_review(appid):
    reviews = []
    cursor = '*'
    params = {
            'json' : 1,
            'filter' : 'recent',
            'language' : 'english',
            'review_type' : 'all',
            'purchase_type' : 'all'
            }
    # find the total reviews matching the filter
    total_reviews = get_reviews(appid, params)['query_summary']['total_reviews']
    print(f'Total reviews: {total_reviews}')
    
    page = 0
    while total_reviews > 0: 
        params['num_per_page'] = min(100, total_reviews)
        total_reviews -= 100
        params['cursor'] = cursor.encode()

        response = get_reviews(appid, params)
        cursor = response['cursor']
        reviews += response['reviews']
        
        # print current progress for debugging purpose. comment out 3 lines above if not needed
        page += 1
        new_reviews = response['reviews']
        print(f'Page: {page}, New review: {len(new_reviews)}, Total retrieved: {len(reviews)}')
        
        # no break when returned list = empty

    return reviews

In [30]:
start=time.time()

scrape_total_review = get_n_reviews_total_review(appid)

print('Actual reviews scraped:', len(scrape_total_review))
print('Scrape time:', time.time()-start)

Total reviews: 246984
Page: 1, New review: 100, Total retrieved: 100
Page: 2, New review: 100, Total retrieved: 200
Page: 3, New review: 100, Total retrieved: 300
Page: 4, New review: 100, Total retrieved: 400
Page: 5, New review: 100, Total retrieved: 500
Page: 6, New review: 100, Total retrieved: 600
Page: 7, New review: 100, Total retrieved: 700
Page: 8, New review: 100, Total retrieved: 800
Page: 9, New review: 100, Total retrieved: 900
Page: 10, New review: 100, Total retrieved: 1000
Page: 11, New review: 100, Total retrieved: 1100
Page: 12, New review: 100, Total retrieved: 1200
Page: 13, New review: 100, Total retrieved: 1300
Page: 14, New review: 100, Total retrieved: 1400
Page: 15, New review: 100, Total retrieved: 1500
Page: 16, New review: 100, Total retrieved: 1600
Page: 17, New review: 100, Total retrieved: 1700
Page: 18, New review: 100, Total retrieved: 1800
Page: 19, New review: 100, Total retrieved: 1900
Page: 20, New review: 100, Total retrieved: 2000
Page: 21, New re

## Getting more data from more games with mixed reviews

I was interested to work with just Cyberpunk review data at first. Due to data imbalance, I went back and get more data, this time from games that have more mixed review.

Mixed reviewed games:
usual culprit: Bethesda, Ubisoft, EA for technical issue or mtx / cash grabbing business practice.

https://store.steampowered.com/app/1938090/Call_of_Duty_Modern_Warfare_II/

https://store.steampowered.com/app/1517290/Battlefield_2042/

https://store.steampowered.com/app/1506830/FIFA_22/?curator_clanid=36135791

https://store.steampowered.com/app/1151340/Fallout_76/

https://store.steampowered.com/app/552520/Far_Cry_5/

### Call of Duty

In [None]:
start=time.time()

scrape_total_review_COD = get_n_reviews_total_review('1938090')

print('Actual reviews scraped:', len(scrape_total_review_COD))
print('Scrape time:', time.time()-start)

In [36]:
create_df(scrape_total_review_COD, csv_name = 'scrape_total_review_COD')

Exported: scrape_total_review_COD.csv


Unnamed: 0,recommendationid,review,timestamp_created,voted_up
0,132360103,Server crashed TWICE SAME DAY. I CANT COMPLET...,1675758880,False
1,132359724,"imagine this, Not talking the whole game but s...",1675758262,False
2,132359621,There is no doubt that the staff at IW are a b...,1675758076,False
3,132359018,GOOD GAME,1675756819,True
4,132358907,massive cheating issue and overall lackluster ...,1675756553,False
...,...,...,...,...
146622,124501304,h,1666929517,True
146623,124501299,A Call Of Duty game i was actually excited for...,1666929510,True
146624,124501289,gamr fun cool guyQ!\n,1666929502,True
146625,124501287,The campaign crashes more often than it doesn'...,1666929500,False


In [37]:
df_cod = create_df(scrape_total_review_COD)

In [38]:
df_cod['voted_up'].value_counts()

True     94297
False    52330
Name: voted_up, dtype: int64

### Battlefield 2042

In [11]:
start=time.time()

scrape_total_review_btf = get_n_reviews_total_review('1517290')

print('Actual reviews scraped:', len(scrape_total_review_btf))
print('Scrape time:', time.time()-start)

Total reviews: 85046
Page: 1, New review: 100, Total retrieved: 100
Page: 2, New review: 100, Total retrieved: 200
Page: 3, New review: 100, Total retrieved: 300
Page: 4, New review: 100, Total retrieved: 400
Page: 5, New review: 100, Total retrieved: 500
Page: 6, New review: 100, Total retrieved: 600
Page: 7, New review: 100, Total retrieved: 700
Page: 8, New review: 100, Total retrieved: 800
Page: 9, New review: 100, Total retrieved: 900
Page: 10, New review: 100, Total retrieved: 1000
Page: 11, New review: 100, Total retrieved: 1100
Page: 12, New review: 100, Total retrieved: 1200
Page: 13, New review: 100, Total retrieved: 1300
Page: 14, New review: 100, Total retrieved: 1400
Page: 15, New review: 100, Total retrieved: 1500
Page: 16, New review: 100, Total retrieved: 1600
Page: 17, New review: 100, Total retrieved: 1700
Page: 18, New review: 100, Total retrieved: 1800
Page: 19, New review: 100, Total retrieved: 1900
Page: 20, New review: 100, Total retrieved: 2000
Page: 21, New rev

In [12]:
create_df(scrape_total_review_btf, csv_name = 'scrape_total_review_btf')

Exported: scrape_total_review_btf.csv


Unnamed: 0,recommendationid,review,timestamp_created,voted_up
0,132361652,Would love Breakthrough modes vs. Zombies :p,1675761921,True
1,132357743,It is actually fun now.,1675753896,True
2,132357060,yes,1675752396,True
3,132357051,"i have never had a game crash on me so much, o...",1675752372,True
4,132355598,I can't recommend it at full price. It doesn't...,1675749067,False
...,...,...,...,...
85041,103030696,Battlefield 2042? more like Inshallah peepoo,1637308567,False
85042,103030693,imagine using no brain and effort to develop a...,1637308562,False
85043,103030690,Pog\n,1637308561,True
85044,103030686,Fact: It’s better than all COD titles together!,1637308558,True


### FIFA 22

In [13]:
start=time.time()

scrape_total_review_fifa = get_n_reviews_total_review('1506830')

print('Actual reviews scraped:', len(scrape_total_review_fifa))
print('Scrape time:', time.time()-start)

Total reviews: 36999
Page: 1, New review: 100, Total retrieved: 100
Page: 2, New review: 100, Total retrieved: 200
Page: 3, New review: 100, Total retrieved: 300
Page: 4, New review: 100, Total retrieved: 400
Page: 5, New review: 100, Total retrieved: 500
Page: 6, New review: 100, Total retrieved: 600
Page: 7, New review: 100, Total retrieved: 700
Page: 8, New review: 100, Total retrieved: 800
Page: 9, New review: 100, Total retrieved: 900
Page: 10, New review: 100, Total retrieved: 1000
Page: 11, New review: 100, Total retrieved: 1100
Page: 12, New review: 100, Total retrieved: 1200
Page: 13, New review: 100, Total retrieved: 1300
Page: 14, New review: 100, Total retrieved: 1400
Page: 15, New review: 100, Total retrieved: 1500
Page: 16, New review: 100, Total retrieved: 1600
Page: 17, New review: 100, Total retrieved: 1700
Page: 18, New review: 100, Total retrieved: 1800
Page: 19, New review: 100, Total retrieved: 1900
Page: 20, New review: 100, Total retrieved: 2000
Page: 21, New rev

In [14]:
create_df(scrape_total_review_fifa, csv_name = 'scrape_total_review_fifa')

Exported: scrape_total_review_fifa.csv


Unnamed: 0,recommendationid,review,timestamp_created,voted_up
0,132354527,nice game\n,1675746710,True
1,132346293,Balls,1675731454,True
2,132345663,best game ever,1675730290,True
3,132341121,carrer mode is the best tbh,1675722231,True
4,132338406,Good fun,1675718053,True
...,...,...,...,...
36994,100234232,"GAME WAJIB UNTUK BUDAK EA, SUNGKEM YANG MULIA ...",1633015942,True
36995,100234022,A lot of Change by FIFA afterall. Finally have...,1633015772,True
36996,100233969,This is the first time I played Ultimate Team ...,1633015728,True
36997,100233761,"i loved the game so much, and i've been waitin...",1633015561,True


### Fallout 76


In [15]:
start=time.time()

scrape_total_review_fallout = get_n_reviews_total_review('1151340')

print('Actual reviews scraped:', len(scrape_total_review_fallout))
print('Scrape time:', time.time()-start)

Total reviews: 43482
Page: 1, New review: 100, Total retrieved: 100
Page: 2, New review: 100, Total retrieved: 200
Page: 3, New review: 100, Total retrieved: 300
Page: 4, New review: 100, Total retrieved: 400
Page: 5, New review: 100, Total retrieved: 500
Page: 6, New review: 100, Total retrieved: 600
Page: 7, New review: 100, Total retrieved: 700
Page: 8, New review: 100, Total retrieved: 800
Page: 9, New review: 100, Total retrieved: 900
Page: 10, New review: 100, Total retrieved: 1000
Page: 11, New review: 100, Total retrieved: 1100
Page: 12, New review: 100, Total retrieved: 1200
Page: 13, New review: 100, Total retrieved: 1300
Page: 14, New review: 100, Total retrieved: 1400
Page: 15, New review: 100, Total retrieved: 1500
Page: 16, New review: 100, Total retrieved: 1600
Page: 17, New review: 100, Total retrieved: 1700
Page: 18, New review: 100, Total retrieved: 1800
Page: 19, New review: 100, Total retrieved: 1900
Page: 20, New review: 100, Total retrieved: 2000
Page: 21, New rev

In [16]:
create_df(scrape_total_review_fallout, csv_name = 'scrape_total_review_fallout')

Exported: scrape_total_review_fallout.csv


Unnamed: 0,recommendationid,review,timestamp_created,voted_up
0,132358722,This game is very entertaining and whole. It i...,1675756136,True
1,132357714,"Literally wont even launch anymore, just hangs...",1675753809,False
2,132356159,"Ope world, you can just wander around looking ...",1675750328,True
3,132354648,Now encapsulates almost exactly what you want ...,1675746980,True
4,132352988,"game wont go into fullscreen, trash.",1675743574,False
...,...,...,...,...
43476,67289097,It just works,1586864598,True
43477,67289073,"Even when it (just) works, every patch is like...",1586864569,False
43478,67289056,Fallout but online only. Recommended :),1586864552,True
43479,67289023,Yes!,1586864507,True


# 3. Combine data

In [4]:
df = pd.read_csv('scrape_cyberpunk_246970_reviews.csv')

In [7]:
csv_list = [
    'scrape_total_review_btf.csv',
    'scrape_total_review_COD.csv',
    'scrape_total_review_fallout.csv',
    'scrape_total_review_fifa.csv']
for csv in csv_list:
    df = pd.concat([df, pd.read_csv(csv)])

In [9]:
df.shape

(559123, 4)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 559123 entries, 0 to 36998
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   recommendationid   559123 non-null  int64 
 1   review             557447 non-null  object
 2   timestamp_created  559123 non-null  int64 
 3   voted_up           559123 non-null  bool  
dtypes: bool(1), int64(2), object(1)
memory usage: 17.6+ MB


In [17]:
df.to_csv('scraped_data_559k.csv', index=False)

In [15]:
df.voted_up.value_counts(normalize=True)

True     0.689426
False    0.310574
Name: voted_up, dtype: float64