In [78]:
from datetime import datetime
import os
from typing import List

from bs4 import BeautifulSoup
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy as np
import pandas as pd
import requests

# https://andrew-muller.medium.com/scraping-steam-user-reviews-9a43f9e38c92
# To get more than 100 reviews you have to paginate with the cursor: each response returns a cursor marking where it left off,
# and sending that cursor with the next request fetches the following batch of up to 100 reviews.

# Everything works. TODO: grab the game name during the bs4 scrape step instead of keeping only the app id.

In [83]:
def get_reviews(appid, params={'json': 1}):
    """Fetch one page of user reviews for a Steam app from the store's appreviews endpoint.

    Args:
        appid: Steam application id; interpolated into the request URL.
        params (dict): Query-string parameters sent with the request. The
            default dict is only read here, never mutated.

    Returns:
        The decoded JSON body of the response, or None when the request,
        the HTTP status check, or the JSON decoding fails (the error is printed).
    """
    url = f'https://store.steampowered.com/appreviews/{appid}'
    try:
        response = requests.get(
            url=url,
            params=params,
            headers={'User-Agent': 'Mozilla/5.0'},
            # Without a timeout a stalled connection would block the notebook indefinitely.
            timeout=30,
        )
        # Report HTTP errors explicitly instead of as an opaque JSON decoding failure.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        # Exception rather than BaseException: KeyboardInterrupt / SystemExit must
        # still be able to stop a long-running scrape.
        print(f"Error Occurred, {e}")
        return None

def get_n_reviews(appid, n = 1000):
    """Collect up to ``n`` English reviews for one Steam app by following the pagination cursor.

    Reviews are requested in pages of at most 100 through ``get_reviews``.
    Each review dict is tagged with an ``app_id`` key holding ``appid``.

    Args:
        appid: Steam application id passed through to ``get_reviews``.
        n (int): Maximum number of reviews to collect.

    Returns:
        list: The review dicts collected. Always a list: if a request fails
        part-way, the reviews gathered up to that point are returned
        (possibly an empty list) rather than None.
    """
    reviews = []
    cursor = '*'
    params = {
        'json' : 1,
        'filter' : 'all',
        'language' : 'english',
        # 2**63 - 1; presumably chosen so no limit is placed on review age.
        'day_range' : 9223372036854775807,
        'review_type' : 'all',
        'purchase_type' : 'all'
    }
    remaining = n

    try:
        while remaining > 0:
            page_size = min(100, remaining)
            params['cursor'] = cursor.encode()
            params['num_per_page'] = page_size
            remaining -= page_size

            response = get_reviews(appid, params)
            if not response:
                # get_reviews signals failure with None; keep what we already have.
                break

            batch = response.get('reviews') or []
            for review in batch:
                review['app_id'] = appid
            reviews += batch

            next_cursor = response.get('cursor')
            # A short page means there is nothing left; a missing or unchanged
            # cursor cannot advance, so stop instead of re-requesting the same page.
            if len(batch) < page_size or not next_cursor or next_cursor == cursor:
                break
            cursor = next_cursor
    except Exception as e:
        # Exception rather than BaseException so Ctrl-C still interrupts the scrape.
        print(f"Error Occurred, {e}")

    return reviews


def get_app_id(game_name):
    """Return the Steam app id of the first store-search result for ``game_name``.

    Args:
        game_name (str): Search term sent to the Steam store search page.

    Returns:
        The value of the first result row's ``data-ds-appid`` attribute, or
        None when the search has no results or the request/parsing fails
        (a message is printed in both cases).
    """
    try:
        response = requests.get(
            url='https://store.steampowered.com/search/',
            # Passing the term via params lets requests URL-encode it, so names
            # containing spaces, '&', '#', etc. do not corrupt the query string.
            params={'term': game_name, 'category1': 998},
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=30,
        )
        soup = BeautifulSoup(response.text, 'html.parser')
        first_hit = soup.find(class_='search_result_row')
        if first_hit is None:
            print(f"Error Occurred, no search results for {game_name!r}")
            return None
        return first_hit['data-ds-appid']
    except Exception as e:
        # Exception rather than BaseException so Ctrl-C is not swallowed.
        print(f"Error Occurred, {e}")
        return None


def get_n_appids(n=100, filter_by='topsellers'):
    """Collect up to ``n`` app ids from successive pages of the Steam store search.

    Pages are requested one after another until ``n`` ids have been collected
    or a page yields no ids.

    Args:
        n (int): Maximum number of app ids to return.
        filter_by (str): Value of the ``filter`` query parameter of the search URL.

    Returns:
        list: Up to ``n`` values of the result rows' ``data-ds-appid``
        attribute, in page order. Always a list: if a request fails part-way
        the ids gathered so far are returned rather than None.
    """
    appids = []
    url = f'https://store.steampowered.com/search/?category1=998&filter={filter_by}&page='
    page = 0

    try:
        # Count collected ids instead of assuming a fixed number of rows per page.
        while len(appids) < n:
            page += 1
            response = requests.get(
                url=url + str(page),
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=30,
            )
            soup = BeautifulSoup(response.text, 'html.parser')
            # .get() skips rows lacking the attribute instead of raising and
            # discarding every id collected so far.
            found = [row.get('data-ds-appid') for row in soup.find_all(class_='search_result_row')]
            found = [appid for appid in found if appid]
            if not found:
                # Nothing usable on this page: stop rather than keep paging.
                break
            appids.extend(found)
    except Exception as e:
        # Exception rather than BaseException so Ctrl-C is not swallowed.
        print(f"Error Occurred, {e}")

    return appids[:n]

def gather_reviews(app_ids_num: int, num_reviews: int) -> pd.DataFrame:
    """
    Scrape Steam reviews for a number of games and score each review with NLTK's VADER analyzer.

    The games come from ``get_n_appids(app_ids_num)`` and up to ``num_reviews``
    reviews are fetched per game, e.g. 50 games with 100 reviews each yields
    up to 5000 rows.

    Args:
        app_ids_num (int) - The Number of Games to scrape

        num_reviews (int) - The Number of Reviews per game to scrape

    Returns:
        DataFrame with one row per review. The nested ``author`` dict is
        expanded into ``author_*`` columns; ``timestamp_created``,
        ``timestamp_updated`` and ``author_last_played`` are converted from
        epoch seconds to datetimes; ``compound``/``neg``/``neu``/``pos`` hold
        the VADER scores and ``sentiment`` is 1 when ``compound > 0`` else 0.
        An empty DataFrame is returned when no reviews could be gathered.
        None is returned (after printing the error) if building the frame fails.
    """
    try:
        reviews = []
        # `or []` tolerates helpers that signal failure by returning None.
        for appid in get_n_appids(app_ids_num) or []:
            # Honour num_reviews; this used to be a hard-coded 100.
            reviews += get_n_reviews(appid, num_reviews) or []

        if not reviews:
            return pd.DataFrame()

        df = pd.DataFrame(reviews)
        author = pd.DataFrame(df['author'].values.tolist(), index=df.index).add_prefix('author_')
        df = df.join(author).drop('author', axis=1)

        # Vectorised epoch-seconds -> naive UTC datetimes (replaces the
        # deprecated datetime.utcfromtimestamp and maps missing values to NaT).
        for col in ('timestamp_created', 'timestamp_updated', 'author_last_played'):
            df[col] = pd.to_datetime(pd.to_numeric(df[col]), unit='s')

        # Score every review exactly once and reuse the result for all four columns.
        analyzer = SentimentIntensityAnalyzer()
        scores = pd.DataFrame(
            [analyzer.polarity_scores(text) for text in df['review']],
            index=df.index,
        )
        for key in ('compound', 'neg', 'neu', 'pos'):
            df[key] = scores[key]
        df['sentiment'] = np.where(df['compound'] > 0, 1, 0)

        return df
    except Exception as e:
        # Exception rather than BaseException so Ctrl-C is not swallowed.
        print(f"Error Occurred, {e}")
        return None


In [79]:
# Scrape 50 games with 100 reviews each and keep the scored result as `df`.
df = gather_reviews(
    app_ids_num=50,
    num_reviews=100,
)

In [82]:
# Preview the first five rows of the scraped reviews.
df.head(n=5)

Unnamed: 0,recommendationid,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,...,author_num_reviews,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,compound,neg,neu,pos,sentiment
0,43549825,english,"[h1]ɢʀᴇᴀᴛ ɢᴀᴍᴇ, ʀᴜɪɴᴇᴅ ʙʏ ᴄʜᴇᴀᴛᴇʀꜱ[/h1]",2018-07-15 17:31:13,2021-03-18 21:28:11,False,1231,141,0.9740669131278992,1337,...,1,2018941,20128,731994,2022-05-09 17:20:45,0.0,0.0,1.0,0.0,0
1,26884882,english,I NOT CAN PLAY,2016-11-23 21:09:08,2018-12-07 20:38:12,False,2258,175,0.9714574813842772,150,...,6,101658,0,87354,2021-03-30 18:39:35,-0.2584,0.504,0.496,0.0,0
2,82020816,english,"After 8 years playing it, I didn't improve my ...",2020-12-10 22:07:43,2020-12-10 22:07:43,True,1558,1436,0.9714266657829284,43,...,7,149209,0,146079,2022-04-11 00:32:14,0.7329,0.121,0.705,0.175,1
3,102066585,english,The Best game on steam but it has the worst an...,2021-11-02 02:20:14,2021-11-02 02:20:14,True,479,1,0.9711211919784544,5,...,6,204951,0,181968,2022-01-02 11:43:32,-0.6187,0.328,0.522,0.151,0
4,26109315,english,Where can I get the English version ?,2016-10-19 02:01:26,2016-11-23 19:13:48,True,7188,2350,0.9652948975563048,412,...,14,162706,0,50595,2017-02-20 01:25:17,0.0,0.0,1.0,0.0,0
