# Tag list replacer

When I initially scraped records (17,000+ records extant at the time of this notebook's creation, scraping at a rate of 200 games/hr), I only pulled the 7 most common tags from each game. Those were easier to get from the Steam store, since Steam displays them in a convenient location for scraping.

However, once I had enough records to begin analysis, I discovered that this made my feature space too sparse. Fewer than half of the records had 2 or more usable tags, and that's with a very generous definition of "usable" (specifically, "appears in >5% of records", a threshold that is already quite sparse in its own right).

To avoid re-scraping everything from scratch, I now need a way to grab an extended tag list for each game and use it to replace the tag list in the existing data.

In [1]:
# Basic DS stuff
import numpy as np
import pandas as pd

# Trying not to get blocked while scraping by inputting
# random delays between Get requests.
import random
import time

# Web scraping
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen

# To help see if we have existing data or not.
import os

# For Rick
import pickle

In [39]:
# Load the necessary data.

%store -r tags_dict

try :
    %store -r skipped_indexes
except :
    skipped_indexes = []

with open('../data/raw/0 - Scraped Games DF.pkl', 'rb') as file :
    games_df = pickle.load(file)

no stored variable or alias skipped_indexes


In [40]:
# Batch bookkeeping for the scraping loop.
# `iterations` is the batch size: once `counter` reaches it, the loop prints
# a progress report, writes a checkpoint, and resets the per-batch state.
iterations = 20
counter = 0

# Per-batch record of which DataFrame indexes were updated / could not be
# updated. `total_touched` is initialised here as well, although the cells
# shown in this notebook never append to it.
successful_indexes, failed_indexes, total_touched = [], [], []

------

### Step 1: Find the Full Tag List

Turns out there's no tag-key list, but the full TEXT of each tag value does appear in a specific div:

In [41]:
for index, row in games_df.iterrows() :

    # Check to see if we already have tags for this game.
    if (row['tag_list'] != []) :
        continue

    # Try to call the page. Sometimes this fails randomly
    try :
        url = row['game_page_link']
        html = urlopen(url)
        current_page_soup = BeautifulSoup(html, 'lxml')
    except :
        print("Failed call. Retrying in 2 min...")
        skipped_indexes.append(index)
        %store skipped_indexes
        time.sleep(120)
        continue

    try :
        code_block = current_page_soup.find('div', attrs={'class':'glance_tags popular_tags'})
        successful_indexes.append(index)
        counter += 1
    except :
        games_df.at[index, 'tag_list'] = ['Failed']
        failed_indexes.append(index)
        counter += 1
        continue

    tag_names_list = []

    for tag_section in code_block.find_all('a', class_='app_tag') :
        tag_name = tag_section.get_text().strip()
        tag_names_list.append(tag_name)
    
    games_df.at[index, 'tag_list'] = tag_names_list.copy()

    if counter == iterations :
        print(f"Batch complete.")
        print(f"Successfully updated: {len(successful_indexes)}")
        print(f"Failed to update: {len(failed_indexes)}")
        print(f"Total touched: {len(successful_indexes) + len(failed_indexes)}")
        print(f"Current index: {index}")
        print(f"")
        counter = 0
        successful_indexes = []
        failed_indexes = []
        with open('../data/raw/0.5 - Scraped Games DF with Tag Lists.pkl', 'wb') as file :
            pickle.dump(games_df, file)
    
    delay = 0.3 + random.random() * 0.3
    time.sleep(delay)

print("COMPLETE!")

Failed call. Retrying in 2 min...


NameError: name 'skipped_indexes' is not defined