# Steam Reviews

## Initilization Code

### Imports

In [30]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import datetime
import time

### Selenium Navigation 

In [31]:
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By

In [32]:
# Using Selenium to get the page source of the reviews from the steam store page url 
# after loading the page an amount of time dictated by 'load_time', in which every 1 
# increase of load_time leads to another 10 reviews. Default is 1000 reviews

def get_page_source(url, load_time = 100):
    
    ## create an object of the chrome webdriver
    driver = webdriver.Chrome()

    ## open selenium URL in chrome browser
    driver.get(url)
    time.sleep(3)
    
    ## Check for Age Verification
    if ('agecheck' in driver.current_url):
        age_verify(driver)
        
    ## Scroll down to the bottom of the page, wait 2 seconds,
    ## then scroll down further
    scroller(driver, 2)

    ## Select the more reviews button and let load
    review_page = driver.find_element('id', 'ViewAllReviewssummary')
    button = review_page.find_element('xpath', "./child::*")
    driver.execute_script("arguments[0].click();", button);
    time.sleep(3)
    
    ## Getting past the second age verification
    ## Checking to see if there is a response page, which would be age verification
    page = driver.find_element('id', 'responsive_page_template_content')
    if (page is not None):
        ## If there is a response page, click the buttons from the page
        buttonList = page.find_elements(By.TAG_NAME,"button")
        driver.execute_script("arguments[0].click();", buttonList[0])
        time.sleep(1)

    ## Load more reviews
    scroller(driver, 1, load_time)
    
    # obtains the page source with additional reviews loaded
    source = driver.page_source
    
    # Closes the original webpage
    driver.close()
    
    ## Return the page source to be parsed into Beautiful Soup
    return source

In [33]:
## Takes the webpage if locked by age verification and bypasses it
def age_verify(driver):

    select = Select(driver.find_element('id', 'ageYear'))

    select.select_by_value('2000')

    button = driver.find_element('id', 'view_product_page_btn')

    driver.execute_script("arguments[0].click();", button)

In [34]:
## Scrolls pages to the bottom, from @Cuong Tran on Stack Overflow
def scroller(driver, SCROLL_PAUSE_TIME = 0.5, scroll_limit = 100):

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while (scroll_limit > 0):
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        
        scroll_limit = scroll_limit - 1

### Data Scraping

In [35]:
def scrape_page(page_url):
    """Extracts HTML from a webpage"""
    
    answer = requests.get(page_url)
    content = answer.content
    soup = BeautifulSoup(content, features='html.parser')
    
    return soup

In [36]:
# Gets a list of steam reviews, given the URL of the reviews page
# TODO; only works with first page so far 
def find_reviews_url(page_url):
    scrapped_page = scrape_page(page_url)
    
    # Getting the attribute containing the first page of reviews
    page_1 = scrapped_page.find("div", {"id": "page1"})
    
    # Isolating the reviews and putting them into a list
    # pg_1_reviews = page_1.findAll('div', {"class": "apphub_UserReviewCardContent"})
    pg_1_reviews = page_1.findAll('div', {"class": "apphub_Card modalContentLink interactable"})
    
    return pg_1_reviews

In [37]:
# Gets a list of steam reviews, given the page source of the reviews page
# Used with Selenium
def find_reviews_source(page_source):
    scrapped_page = BeautifulSoup(page_source, 'lxml')
    
    page_num = 1
    reviews = []
    
    # Iterating through the pages
    while True:
        
        page_id = "page" + str(page_num) 

        # Getting the attribute containing the first page of reviews
        page = scrapped_page.find("div", {"id": page_id})
        
        if page is None:
            break

        # Isolating the reviews and putting them into a list
        pg_reviews = page.findAll('div', {"class": "apphub_Card modalContentLink interactable"})
                
        reviews = reviews + pg_reviews
        
        page_num = page_num + 1
    
    return reviews

In [38]:
# Returns int values for the number of helpful and funny votes on a review
def get_helpful_funny(review):
    
    # Getting the helpful/funny text attribute
    text = review.find("div", {"class": "found_helpful"}).text
    
    # Using regex to isolate the numbers for each vote
    regex_text = re.findall(r'([\d]?[.,\d]+)(\W+people\b|\W+person\b)', text)
    
    # Isolating the numbers and turning them into ints
    try:
        helpful = regex_text[0][0].replace(",", "")
    except:
        helpful = 0
    
    try:
        funny = regex_text[1][0].replace(",", "")
    except:
        funny = 0
    
    return int(helpful), int(funny)
    

In [39]:
# Returns true if the review is postive
def get_recommend(review):
    return (review.find("div", {"class": "title"}).text)

In [40]:
# Returns *Current* number of played hours as a float
# Does not include hours at time of review
def get_hours(review):
    
    # Getting hours attribute
    text = review.find("div", {"class": "hours"}).text
    
    # Using regex to isolate the number of hours
    hours = re.findall(r'([\d]+[.,\d]+)', text)[0]
    
    # Removing the commas
    hours = hours.replace(",", "")
    
    return float(hours)

In [41]:
# Returns a string of the date the review was posted
# The Format of the date is: Month, Day, Year
# Month is typed out while day and year are nums
# Only works in year 2023
def get_date(review):
    
    # Getting date attribute
    text = review.find("div", {"class": "date_posted"}).text
    
    # Using Regex to isolate date
    date = re.findall(r':[\s\S]*$', text)[0]
    
    # Adding the year if the review was this year
    # Steam leaves out the year otherwise
    if (re.search(r'\d{4}$', date) is None):
        date = date + ', ' + str(datetime.date.today().year)
    
    # Skipping the ': ' of the date
    return date[2:]

In [42]:
# Getting the text from the review
# Definitely pretty wonky way of doing this but it works
def get_text(review):
    
    text_attribute = review.find("div", {"class": "apphub_CardTextContent"})
    
    str_text = str(text_attribute)
    
    try:
        # Isolating the main text of the review from the class attributes
        isolated_text = re.findall(r'<\/div>([^;]*)<\/div>', str_text)[0]
    
    except:
        # If there is weird formatting, do some weird stuff
        
        # Grab all text between two '(<\/div>)'
        isolated_text = re.findall(r'(<\/div>)([\s\S]*)(<\/div>)', str_text)[0]
        
        # Run through each possible text and select the largest one
        maxed_i = 0
        for i in range(len(isolated_text)):
            if (len(isolated_text[i]) > len(isolated_text[maxed_i])):
                maxed_i = i
        isolated_text = isolated_text[maxed_i]
    
    # Prettifying the text and removing the listed characters
    for to_remove in ['\r', '\t', '\n', ' ']:
        isolated_text = isolated_text.replace(to_remove, " ")
    
    # Reformatting >
    isolated_text = isolated_text.replace("&gt;", ">")
    
    # Removing the steam formatted newlines
    isolated_text = isolated_text.replace("<br/><br/>", " ")
    isolated_text = isolated_text.replace("<br/>", " ")
    
    # Cleaning up the tabs and whitespaces
    isolated_text = re.sub('\s+',' ', isolated_text)

    # Removing the extra whitespaces at the begining and end of the text
    return isolated_text[1:-1]

In [43]:
# Getting Steam ID and Name of the review Author
def get_names(review):
    
    # Try/Except to handle case if user is Online or Offline
    try:
        # Getting steam nickname (non-unique) from attribute text
        nickname = review.find("div", {"class": "apphub_CardContentAuthorName offline ellipsis"}).text

        # Getting class attribute that contains steam id (unique)
        text_attribute = review.find("div", {"class": "apphub_CardContentAuthorName offline ellipsis"})
        
    except:
        
        try:
            # Getting steam nickname (non-unique) from attribute text
            nickname = review.find("div", {"class": "apphub_CardContentAuthorName online ellipsis"}).text

            # Getting class attribute that contains steam id (unique)
            text_attribute = review.find("div", {"class": "apphub_CardContentAuthorName online ellipsis"})
            
        except:
            
            # Getting steam nickname (non-unique) from attribute text
            nickname = review.find("div", {"class": "apphub_CardContentAuthorName in-game ellipsis"}).text

            # Getting class attribute that contains steam id (unique)
            text_attribute = review.find("div", {"class": "apphub_CardContentAuthorName in-game ellipsis"})
    
    # Isolating text attribute containing steam id
    steam_link_attribute = text_attribute.find_all('a', href=True)
    
    # Getting raw string text of steam id
    raw_text = str(steam_link_attribute[0])
    
    # Using Regex to isolate everying in between the ""
    steam_id_link = re.findall(r'\"([^;]*)\"', raw_text)[0]
    
    # Using Regex to then isolate the unique steam ID from the link
    # Removing 'https://steamcommunity.com/id/*Steam_ID*/' 
    try:
        steam_id = re.findall(r'\/([^\/]+)\/[^\/]*$', steam_id_link)[0]
    except:
        print(steam_id_link)
    
    return nickname, steam_id

In [44]:
# Getting total number of products the author owns as an int
def get_total_prod(review):

    text_attribute = review.find("div", {"class": "apphub_CardContentMoreLink ellipsis"}).text
    
    # Checking to see if total products are hidden
    # If they are, use dummy variable '-1'
    if (len(text_attribute) == 0):
        return -1
    
    # Using Regex to get all numbers including commas
    products = re.findall(r'([\d]+[.,\d]*)', text_attribute)[0]
    
    # Removing the commas
    products = products.replace(",", "")
    
    return int(products)

In [45]:
# Getting total number of comments on the review
def get_total_comments(review):
    
    text_attribute = review.find("div", {"class": "apphub_CardCommentCount alignNews"})
    
    total_comments = text_attribute.text
    
    # Removing the commas
    total_comments = total_comments.replace(",", "")
    
    return int(total_comments)
    

### Database Instalization

In [46]:
# Creating a dataframe of steam reviews given the steam store page
# Loadtime is how long selenium will scroll, with every int increase
# increasing the number of reviews by 10. 

def create_df(url, load_time = 99):
    
    # Using Selenium to load the additional reviews into source lxml
    source = get_page_source(url, load_time)
    
    # Using Beautiful Soup to parse the source into a list of reviews
    review_list = find_reviews_source(source)
    
    # Using Beautiful Soup to parse the list of reviews into a dataframe
    df = Review_df(review_list)
    
    return df
    

In [47]:
# Initilize a pandas dataframe given a list of reviews in beautiful soup format
def Review_df(review_list):    
    
    df = pd.DataFrame(columns=['Steam Id', 'Nickname', 'Product_Library', 'Review', 'Date', 'Hours', 'Review_Text', 'Helpful_Votes', 'Funny_Votes', 'Comments'])
    
    for review in review_list:
    
        names = get_names(review)

        helpful, funny = get_helpful_funny(review)

        new_row = {'Steam Id':names[1], 
                   'Nickname':names[0], 
                   'Product_Library':get_total_prod(review), 
                   'Review':get_recommend(review), 
                   'Date':get_date(review), 
                   'Hours':get_hours(review), 
                   'Review_Text':get_text(review), 
                   'Helpful_Votes':helpful, 
                   'Funny_Votes':funny, 
                   'Comments':get_total_comments(review)}

        # To avoid using Append as it is depreciated
        new_df = pd.DataFrame([new_row])                      
        df = pd.concat([df, new_df])
    
    return df

### Testing

In [48]:
df = create_df('https://store.steampowered.com/app/1086940/Baldurs_Gate_3/')

In [49]:
csgo_reviews = df

## Analysis

In [61]:
csgo_reviews.iloc[998].Review_Text

'<div class="early_access_review">Early Access Review</div> If you love sex then this game isn\'t for you, every time you play this game you will cream your Jorts. This game will replace the touch of a woman/man and give you a major climax that will make you come back for more.'

In [51]:
csgo_reviews

Unnamed: 0,Steam Id,Nickname,Product_Library,Review,Date,Hours,Review_Text,Helpful_Votes,Funny_Votes,Comments
0,clockworkarchangel13,clockworkarchangel13,-1,Recommended,"September 12, 2023",203.7,my work is really cutting into my Baldur's Gat...,7498,2690,0
0,Insta_Coffee,InstaCoffee,-1,Recommended,"August 11, 2023",121.6,Possessed a guard and had them open a gate con...,3007,1177,11
0,76561198053712363,Cranch,151,Recommended,"August 22, 2023",121.2,I like the part where the game doesn't have mi...,2711,190,10
0,76561198006614254,Skets,-1,Recommended,"August 24, 2023",60.8,To sum up this game in a way only players will...,10092,394,0
0,76561198055540898,SaVioR iS RaW,-1,Recommended,"August 11, 2023",222.8,This game got me away from League of Legends.,5023,1530,0
...,...,...,...,...,...,...,...,...,...,...
0,76561198258789000,alpa,106,Recommended,"August 29, 2023",189.4,shadowheart makes me twirl my hair and kick my...,5,0,0
0,76561197995796223,Bruce Wayne,-1,Recommended,"August 14, 2023",221.1,BEST GAME EVER!,6,0,0
0,Dr_Bungo,Dr. Bungo,-1,Recommended,"August 12, 2023",85.1,yeah its p good,4,0,0
0,76561198400300154,Nelson_BROS,57,Recommended,"January 22, 2023",146.3,"<div class=""early_access_review"">Early Access ...",18,18,1
