Visits the 'recommendations' page of each user's profile and logs records with the following:
1. 'user' ---- user's id
2. 'app_id' ---- game's appid
3. 'positive' ---- 1 if review is positive, 0 if negative
4. 'total_playtime' ---- user's playtime in the game
5. 'review_playtime' ---- user's playtime in the game at the time of review (same as 'total_playtime' when the page lists only one value)
6. 'text' ---- text of review, if any
7. 'helpful_count' ---- how many people found the review helpful
8. 'review_date' ---- date of the review
9. 'edit_date' ---- date the review was last edited (same as 'review_date' if it was never edited)
10. 'date_scraped' ---- date scraped


In [1]:
# Basic DS stuff
import numpy as np
import pandas as pd
import json
import pyarrow as pa
import pyarrow.parquet as pq

# String manipulation
import re

# Web scraping
from bs4 import BeautifulSoup
from urllib.request import urlopen

# For labeling records, tracking files, and formatting
from datetime import datetime
import time

# For Rick
import pickle

In [2]:
# # # NOTE: Only use once. Instantiates holder files.

# scraped_users = set([0])
# with open('../data/raw/review_scraped_users.pkl', 'wb+') as file :
#     pickle.dump(scraped_users, file)

# skipped_users = {0:'hello world'}
# with open('../data/raw/review_skipped_users.pkl', 'wb+') as file :
#     pickle.dump(skipped_users, file)

# users_with_no_reviews = set([0])
# with open('../data/raw/users_with_no_reviews.pkl', 'wb+') as file :
#     pickle.dump(users_with_no_reviews, file)

In [3]:
# Load data
#
# Reads the three pickles that carry state between scraping sessions:
#   all_users              -- every known user id (coerced to a set)
#   review_scraped_users   -- user ids whose review page was already processed
#   users_with_no_reviews  -- user ids whose review page had no review boxes
#
# The files are opened read-only ('rb'): nothing is written back in this cell,
# so read/write mode ('rb+') would only add a needless write-permission requirement.
#
# NOTE: pickle.load can execute arbitrary code. Only load pickles produced by
# this project; never point these paths at files from an untrusted source.

with open('../data/raw/all_users', 'rb') as file :
    all_users = set(pickle.load(file))

with open('../data/raw/review_scraped_users.pkl', 'rb') as file :
    review_scraped_users = pickle.load(file)

with open('../data/raw/users_with_no_reviews.pkl', 'rb') as file :
    users_with_no_reviews = pickle.load(file)

In [4]:
# Establish vars

# URL template for a user's reviews page; fill in with url.format(userid)
url = 'https://steamcommunity.com/profiles/{}/recommended/'

# Failures recorded during this session, keyed by user id (filled in by log_failure)
skipped_users = {}

# Users still to visit: every known user minus those already scraped
unscraped_users = all_users.difference(review_scraped_users)

In [5]:
# Useful functions

def log_failure(user, message) :
    """Record that `user` could not be scraped and persist the failure log.

    user    -- id of the user whose page failed
    message -- short description of what went wrong; also printed

    The in-memory `skipped_users` dict starts empty every session, so dumping it
    directly would overwrite failures logged by earlier sessions. Instead, the
    on-disk log is loaded first, merged with this session's failures, and then
    written back (the same load -> update -> dump pattern the main loop uses for
    its other pickles). A missing or empty log file is treated as an empty log.
    """
    print(message)
    skipped_users[user] = message

    log_path = '../data/raw/review_skipped_users.pkl'
    try :
        with open(log_path, 'rb') as file :
            logged_failures = pickle.load(file)
    except (FileNotFoundError, EOFError) :
        logged_failures = {}

    # This session's entries win over older entries for the same user
    logged_failures.update(skipped_users)
    with open(log_path, 'wb') as file :
        pickle.dump(logged_failures, file)

In [6]:
# Main loop
#
# For every user in `unscraped_users`: fetch the user's reviews page, turn each
# 'review_box' div into one flat record, and periodically append the collected
# records to the parquet table while marking the processed users as scraped.
#
# Changes from the earlier version of this cell:
#   * the final partial batch is now flushed after the loop (it used to be lost),
#   * handlers are `except Exception` so a kernel interrupt actually stops the run
#     instead of being logged as a parse failure,
#   * the HTTP response is closed via `with`.
#
# NOTE(review): fields that fail to parse are stored as the string 'Failed', which
# mixes strings into otherwise numeric/date columns; presumably pyarrow may reject
# such a batch on conversion -- confirm before relying on it.

def _flush_batch(records, users) :
    """Append `records` to the review parquet table and mark `users` as scraped.

    records -- list of per-review dicts collected since the last flush
    users   -- set of user ids fully processed since the last flush

    The parquet step is skipped when `records` is empty (possible only for the
    final flush, when the remaining users had no reviews). Rebinds the
    module-level `review_scraped_users` to the freshly merged set and prints a
    progress report.
    """
    global review_scraped_users

    all_records = None
    if records :
        # Save to disk
        this_round_df = pd.DataFrame(records)
        this_round_table = pa.Table.from_pandas(this_round_df)
        extant_table = pq.read_table('../data/raw/review_table.parquet')
        total_table = pa.concat_tables([extant_table, this_round_table])
        # Release the memory
        extant_table = None
        pq.write_table(total_table, '../data/raw/review_table.parquet')
        # Store len and release memory
        all_records = len(total_table)
        total_table = None

    # Re-read the pickle before updating so the file on disk stays the source of truth
    with open('../data/raw/review_scraped_users.pkl', 'rb') as file :
        review_scraped_users = pickle.load(file)
    review_scraped_users.update(users)
    with open('../data/raw/review_scraped_users.pkl', 'wb') as file :
        pickle.dump(review_scraped_users, file)

    # Report
    print(f'Scraped {len(users)} users this round')
    print(f'Got {len(records)} reivews')
    if all_records is not None :
        print(f'{all_records} reviews are now in the bag, so to speak')
    print(datetime.now().replace(microsecond=0))
    print('')


counter = 0               # users with at least one review processed so far
holding_list = []         # review records waiting to be flushed
temp_scraped_users = set()  # users processed since the last flush

for user in unscraped_users :

    # Yes soup for you!
    try :
        # `with` closes the HTTP response once the page has been parsed
        with urlopen(url.format(user)) as html :
            soup = BeautifulSoup(html, 'lxml')
        review_blocks = soup.find_all('div', class_='review_box')
        if len(review_blocks) == 0 :
            # No reviews: remember the user so they are not visited again
            with open('../data/raw/users_with_no_reviews.pkl', 'rb') as file :
                users_with_no_reviews = pickle.load(file)
            users_with_no_reviews.add(user)
            with open('../data/raw/users_with_no_reviews.pkl', 'wb') as file :
                pickle.dump(users_with_no_reviews, file)
            temp_scraped_users.add(user)
            continue
    except Exception :
        log_failure(user, 'Unable to parse review blocks.')
        continue

    for review_block in review_blocks :

        holding_dict = {}

        # Specify user
        holding_dict['user'] = user

        # Get appid: first run of digits in the block's first link
        try :
            app_link = review_block.find('a').get('href')
            app_id = re.findall(r'\d+', app_link)
            holding_dict['app_id'] = int(app_id[0])
        except Exception :
            holding_dict['app_id'] = 'Failed'

        # Get 'positive': title text starting with 'R' counts as positive
        # (presumably 'Recommended' vs 'Not Recommended' -- confirm against the page)
        try :
            r_or_n = review_block.find('div', class_='title').find('a').get_text()
            holding_dict['positive'] = 1 if r_or_n[0] == 'R' else 0
        except Exception :
            holding_dict['positive'] = 'Failed'

        # Get playtimes: first number is total playtime; a second number, when
        # present, is playtime at review time (otherwise reuse the first)
        try :
            playtimes = review_block.find('div', class_='hours').get_text().replace(',', '')
            pruned_playtimes = [
                float(found) for found in re.findall(r'[-+]?\d*\.?\d+|\d+', playtimes)
            ]
            holding_dict['total_playtime'] = pruned_playtimes[0]
            if len(pruned_playtimes) > 1 :
                holding_dict['review_playtime'] = pruned_playtimes[1]
            else :
                holding_dict['review_playtime'] = pruned_playtimes[0]
        except Exception :
            holding_dict['total_playtime'] = 'Failed'
            holding_dict['review_playtime'] = 'Failed'

        # Get review text
        try :
            review_text = review_block.find('div', class_='content').get_text().strip()
            holding_dict['text'] = review_text
        except Exception :
            holding_dict['text'] = 'Failed'

        # Get 'helpful' counts: first integer in the header, or 0 if there is none
        try :
            helpful_str = review_block.find('div', class_='header').get_text()
            helpful_count = re.findall(r'\d+', helpful_str)
            if len(helpful_count) != 0 :
                holding_dict['helpful_count'] = int(helpful_count[0])
            else :
                holding_dict['helpful_count'] = 0
        except Exception :
            holding_dict['helpful_count'] = 'Failed'

        # Get review date
        try :
            posted_text = review_block.find('div', class_='posted').get_text().strip().replace('.', '')
            # Drop the 7-character prefix (presumably 'Posted ' -- confirm against the page)
            review_date_text = posted_text[7:]
            # If the review was edited, this str will contain the original date,
            # then a bunch of weird, un-strip()-able whitespace, then the edited date.
            # Let's split these two dates.
            if len(review_date_text) > 20 :
                date_texts = review_date_text.split('Last edited')
                date_texts[0] = date_texts[0][:-9]
                date_texts[1] = date_texts[1][1:]
            else :
                # Never edited: the edit date is the same as the review date
                date_texts = [review_date_text, review_date_text]
            # Dates for reviews made in the current year do not include the year.
            # For consistency, we can add it manually.
            for i in range(2) :
                if ',' not in date_texts[i] :
                    year = datetime.now().year
                    date_texts[i] = date_texts[i] + f", {year}"
                date_texts[i] = datetime.strptime(date_texts[i], '%B %d, %Y')
            holding_dict['review_date'] = date_texts[0].date()
            holding_dict['edit_date'] = date_texts[1].date()
        except Exception :
            holding_dict['review_date'] = 'Failed'
            holding_dict['edit_date'] = 'Failed'

        # Log the scraped date
        holding_dict['date_scraped'] = datetime.now().date()

        # A fresh dict is built per review, so it can be stored directly
        holding_list.append(holding_dict)

    temp_scraped_users.add(user)

    counter += 1

    # Flush every 50 users that had at least one review. Users without reviews
    # hit `continue` above and do not advance the counter, which is why a round
    # can report more than 50 scraped users.
    if counter % 50 == 0 :
        _flush_batch(holding_list, temp_scraped_users)

        # Reset
        temp_scraped_users = set()
        holding_list = []

# Flush whatever is left from the last, partial round so it is not lost
if holding_list or temp_scraped_users :
    _flush_batch(holding_list, temp_scraped_users)

    # Reset
    temp_scraped_users = set()
    holding_list = []

Scraped 98 users this round
Got 175 reivews
315258 reviews are now in the bag, so to speak
2024-03-25 23:04:49

Scraped 111 users this round
Got 179 reivews
315437 reviews are now in the bag, so to speak
2024-03-25 23:07:20

Scraped 100 users this round
Got 161 reivews
315598 reviews are now in the bag, so to speak
2024-03-25 23:09:34

Scraped 125 users this round
Got 166 reivews
315764 reviews are now in the bag, so to speak
2024-03-25 23:12:26

Scraped 136 users this round
Got 186 reivews
315950 reviews are now in the bag, so to speak
2024-03-25 23:15:38

Scraped 113 users this round
Got 177 reivews
316127 reviews are now in the bag, so to speak
2024-03-25 23:18:22

Scraped 101 users this round
Got 197 reivews
316324 reviews are now in the bag, so to speak
2024-03-25 23:20:31

Scraped 111 users this round
Got 173 reivews
316497 reviews are now in the bag, so to speak
2024-03-25 23:23:00

Scraped 129 users this round
Got 194 reivews
316691 reviews are now in the bag, so to speak
2024-