# Webscrape

This notebook is the final webscrape product of an iterative process of our EDA and data collection / preparation.

Once we decided to do a Recommendation System on top of our NLP model, we realized that we needed additional data in order to effectively produce and interpret a recommendation system.

Original dataset at:  https://www.kaggle.com/forgemaster/steam-reviews-dataset?select=reviews-1-115.csv

In this notebook I use the app ID for each game to webscrape for that game's Title (needed for interpretation) and that game's tags (things like FPS, action, adventure, puzzle, etc.).

After some discussion we decided to narrow down our dataset to just games tagged with FPS because when performing NLP there might be some key words that have a lot of meaning for FPS games, but not for others.

After webscraping, assigning titles and tags to each row, and then trimming for FPS games, I store the trimmed csvs in a folder outside of my local repo.

#### It's important to note here that the original dataset csvs and the trimmed csvs are too large to push to github, and that's why they're stored in folders outside of the local repo.

In [1]:
import numpy as np 
import pandas as pd 
import string
import time

import re
import requests
from bs4 import BeautifulSoup

pd.set_option('display.max_colwidth', 100)

import IPython
sound_file = '../../../data/sounds/puzzle_solved_jingle.wav'

import sys
if not 'Notebooks/Individual/Jake' in sys.path:
    sys.path.append('Notebooks/Individual/jake')
from functions import FetchTitlesTags

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ultim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ultim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df0 = pd.read_csv("../../../../archive/reviews-1-115.csv")
df1 = pd.read_csv("../../../../archive/reviews-115-1230.csv")
df2 = pd.read_csv("../../../../archive/reviews-1230-2345.csv")
df3 = pd.read_csv("../../../../archive/reviews-2345-4575.csv")
df4 = pd.read_csv("../../../../archive/reviews-4575-6805.csv")
df5 = pd.read_csv("../../../../archive/reviews-6805-9035.csv")
df6 = pd.read_csv("../../../../archive/reviews-9035-11265.csv")
df7 = pd.read_csv("../../../../archive/reviews-11265-13495.csv")
df8 = pd.read_csv("../../../../archive/reviews-13495-13500.csv")
df9 = pd.read_csv("../../../../archive/reviews-13500-13537.csv")
df10 = pd.read_csv("../../../../archive/reviews-13537-27075.csv")

In [3]:
df0 = df0.dropna()
df1 = df1.dropna()
df2 = df2.dropna()
df3 = df3.dropna()
df4 = df4.dropna()
df5 = df5.dropna()
df6 = df6.dropna()
df7 = df7.dropna()
df8 = df8.dropna()
df9 = df9.dropna()
df10 = df10.dropna()

In [4]:
df2.isna().sum()

steamid                   0
appid                     0
voted_up                  0
votes_up                  0
votes_funny               0
weighted_vote_score       0
playtime_forever          0
playtime_at_review        0
num_games_owned           0
num_reviews               0
review                    0
unix_timestamp_created    0
unix_timestamp_updated    0
dtype: int64

In [5]:
def get_ids_urls(df):
    game_ids = list(df["appid"].unique())

    url_list = []
    for each in game_ids:
        url = "https://store.steampowered.com/app/{}/".format(each)
        url_list.append(url)
    
    return game_ids, url_list

In [6]:
def get_title(soup):
    title = str(soup.find('div', class_="apphub_AppName"))
    title = title.replace('<div class="apphub_AppName" id="appHubAppName">', "")
    title = title.replace('</div>', "")
    return title
    
def get_tags(soup):
    warning = soup.find('div', class_="glance_tags popular_tags")
    tags = [p.text for p in warning.findAll('a', class_="app_tag")]
    
    for index in range(len(tags)):
        tags[index] = tags[index].replace("\t", "")
        tags[index] = tags[index].replace("\r\n", "")
    return tags

In [7]:
def get_titles_tags(url_list):
    titles = []
    tags = []

    for url in url_list:
        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content, 'html.parser')
        titles.append(get_title(soup))
        tags.append(get_tags(soup))
        time.sleep(1)
        
    return titles, tags

In [8]:
def add_titles(appid, game_ids, titles):
    for index in range(len(game_ids)):
        if appid == game_ids[index]:
            title = titles[index]
    return title

In [9]:
def add_tags(appid, game_ids, tags):
    for index in range(len(game_ids)):
         if appid == game_ids[index]:
                tag = tags[index]
    return tag

In [10]:
def add_features(df):
    game_ids, url_list = get_ids_urls(df)
    titles, tags = get_titles_tags(url_list)
    
    df["app_title"] = df["appid"].apply(lambda x: add_titles(x, game_ids, titles))
    df["app_tags"] = df["appid"].apply(lambda x: add_tags(x, game_ids, tags))
    
    return df

In [11]:
col = ["steamid", "appid", "app_title", "app_tags", "review", "voted_up"]

def get_FPS(tags):
    result = False
    if "FPS" in tags:
        result = True
    return result

In [12]:
df0_added = add_features(df0)
df0_trim = df0_added[col]
df0_trim["fps"] = df0_trim["app_tags"].apply(lambda x: get_FPS(x))
df0_trim = df0_trim[df0_trim["fps"] == True]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0_trim["fps"] = df0_trim["app_tags"].apply(lambda x: get_FPS(x))


In [13]:
df1_added = add_features(df1)
df1_trim = df1_added[col]
df1_trim["fps"] = df1_trim["app_tags"].apply(lambda x: get_FPS(x))
df1_trim = df1_trim[df1_trim["fps"] == True]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_trim["fps"] = df1_trim["app_tags"].apply(lambda x: get_FPS(x))


In [14]:
df2_added = add_features(df2)
df2_trim = df2_added[col]
df2_trim["fps"] = df2_trim["app_tags"].apply(lambda x: get_FPS(x))
df2_trim = df2_trim[df2_trim["fps"] == True]

AttributeError: 'NoneType' object has no attribute 'findAll'

In [None]:
df3_added = add_features(df3)
df3_trim = df3_added[col]
df3_trim["fps"] = df3_trim["app_tags"].apply(lambda x: get_FPS(x))
df3_trim = df3_trim[df3_trim["fps"] == True]

In [None]:
df4_added = add_features(df4)
df4_trim = df4_added[col]
df4_trim["fps"] = df4_trim["app_tags"].apply(lambda x: get_FPS(x))
df4_trim = df4_trim[df4_trim["fps"] == True]

In [None]:
df5_added = add_features(df5)
df5_trim = df5_added[col]
df5_trim["fps"] = df5_trim["app_tags"].apply(lambda x: get_FPS(x))
df5_trim = df5_trim[df5_trim["fps"] == True]

In [None]:
df6_added = add_features(df6)
df6_trim = df6_added[col]
df6_trim["fps"] = df6_trim["app_tags"].apply(lambda x: get_FPS(x))
df6_trim = df6_trim[df6_trim["fps"] == True]

In [None]:
df7_added = add_features(df7)
df7_trim = df7_added[col]
df7_trim["fps"] = df7_trim["app_tags"].apply(lambda x: get_FPS(x))
df7_trim = df7_trim[df7_trim["fps"] == True]

In [None]:
df8_added = add_features(df8)
df8_trim = df8_added[col]
df8_trim["fps"] = df8_trim["app_tags"].apply(lambda x: get_FPS(x))
df8_trim = df8_trim[df8_trim["fps"] == True]

In [None]:
df9_added = add_features(df9)
df9_trim = df9_added[col]
df9_trim["fps"] = df9_trim["app_tags"].apply(lambda x: get_FPS(x))
df9_trim = df9_trim[df9_trim["fps"] == True]

In [None]:
df10_added = add_features(df10)
df10_trim = df10_added[col]
df10_trim["fps"] = df10_trim["app_tags"].apply(lambda x: get_FPS(x))
df10_trim = df10_trim[df10_trim["fps"] == True]

In [None]:
df0_trim.to_csv("../../../../trimmed/fps_trimmed_00.csv", index=False)
df1_trim.to_csv("../../../../trimmed/fps_trimmed_01.csv", index=False)
df2_trim.to_csv("../../../../trimmed/fps_trimmed_02.csv", index=False)
df3_trim.to_csv("../../../../trimmed/fps_trimmed_03.csv", index=False)
df4_trim.to_csv("../../../../trimmed/fps_trimmed_04.csv", index=False)
df5_trim.to_csv("../../../../trimmed/fps_trimmed_05.csv", index=False)
df6_trim.to_csv("../../../../trimmed/fps_trimmed_06.csv", index=False)
df7_trim.to_csv("../../../../trimmed/fps_trimmed_07.csv", index=False)
df8_trim.to_csv("../../../../trimmed/fps_trimmed_08.csv", index=False)
df9_trim.to_csv("../../../../trimmed/fps_trimmed_09.csv", index=False)
df10_trim.to_csv("../../../../trimmed/fps_trimmed_10.csv", index=False)

In [None]:
IPython.display.Audio(sound_file, autoplay=True, rate=1000)

In [None]:
df_trim0.head()