# Steam Sale!

The excersise is to webscrape data from steam store sales page. The data needs to be parsed. We will save the data to a csv file that we can analyse and show to people whom can use excel or csv files. 
The site we will use is : [https://store.steampowered.com/search/?specials=1&page=1](https://store.steampowered.com/search/?specials=1&page=1)

## Import libraries

In [1]:
import pandas as pd
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import re
from datetime import datetime
import os

## Create a dataframe
Game|Rating|#Reviews|Discount%|Price|Original_price|Release_year|Win|Lin|OSX|Time


In [2]:
def createGameDf():
    '''Creates game dataframe with "Game|Rating|#Reviews|Discount%|Price|Original_price|Release_year|Win|Lin|OSX|Time" columns '''
    games_df = pd.DataFrame(columns=['Game', 'Rating', 'Review_amount','Discount%', 'Price', 'Original_price', 'Release_year', 
                                 'Win', 'Lin','OSX','Time'])
    games_df = games_df.astype({'Game': 'object','Rating': 'int64','Review_amount': 'int64','Discount%': 'int64','Price': 'object',
                            'Original_price': 'object','Release_year': 'int64', 'Win': 'int64', 'Lin': 'int64',
                            'OSX': 'int64', 'Time': 'object'})
    return games_df

## Download page 1 of Steam sales


In [3]:
def fecthSite(page_nr):
    '''Fetches steam sale site. Takes in aparameter page_nr that defines what page it fetches'''
    base_steam_url = "https://store.steampowered.com/search/?specials=1&page="
    uClient = uReq( base_steam_url + str(page_nr))
    page_html = uClient.read()
    uClient.close()
    return page_html

In [4]:
page_html = fecthSite(1)
fetch_timestamp = str(datetime.now())
# preview of page
page_html



### Parse the data to a bs4 soup objekt

In [5]:
page_soup = soup(page_html, 'html.parser')
# preview of page
page_soup

<!DOCTYPE html>

<html class="responsive" lang="en">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="#171a21" name="theme-color"/>
<title>Steam Search</title>
<link href="/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="https://steamstore-a.akamaihd.net/public/shared/css/motiva_sans.css?v=FAK4O46_mOLB&amp;l=english" rel="stylesheet" type="text/css"/>
<link href="https://steamstore-a.akamaihd.net/public/shared/css/shared_global.css?v=S1TAP5-hzwa6&amp;l=english" rel="stylesheet" type="text/css"/>
<link href="https://steamstore-a.akamaihd.net/public/shared/css/buttons.css?v=6uRURryOh96m&amp;l=english" rel="stylesheet" type="text/css"/>
<link href="https://steamstore-a.akamaihd.net/public/css/v6/store.css?v=ZI_5VEPYpykL&amp;l=english" rel="stylesheet" type="text/css"/>
<link href="https://steamstore-a.akamaihd.net/public/css/v6/browse.css?v=mPT3iBtgO8s7&amp;l=e

## Search all games

In [6]:
games = page_soup.findAll("a", {"class":"search_result_row"})
# preview of page
#len(games) #should be 25
games

[<a class="search_result_row ds_collapse_flag" data-ds-appid="359550" data-ds-crtrids="[33075774]" data-ds-itemkey="App_359550" data-ds-tagids="[1663,3859,1708,1774,5711,19,3839]" href="https://store.steampowered.com/app/359550/Tom_Clancys_Rainbow_Six_Siege/?snr=1_7_7_2300_150_1" onmouseout="HideGameHover( this, event, 'global_hover' )" onmouseover="GameHover( this, event, 'global_hover', {&quot;type&quot;:&quot;app&quot;,&quot;id&quot;:359550,&quot;public&quot;:1,&quot;v6&quot;:1} );">
 <div class="col search_capsule"><img src="https://steamcdn-a.akamaihd.net/steam/apps/359550/capsule_sm_120.jpg?t=1568227548" srcset="https://steamcdn-a.akamaihd.net/steam/apps/359550/capsule_sm_120.jpg?t=1568227548 1x, https://steamcdn-a.akamaihd.net/steam/apps/359550/capsule_231x87.jpg?t=1568227548 2x"/></div>
 <div class="responsive_search_name_combined">
 <div class="col search_name ellipsis">
 <span class="title">Tom Clancy's Rainbow Six® Siege</span>
 <p>
 <span class="platform_img win"></span> </

#### Lets look at the first game and start searching for the data we need

In [7]:
games[0]

<a class="search_result_row ds_collapse_flag" data-ds-appid="359550" data-ds-crtrids="[33075774]" data-ds-itemkey="App_359550" data-ds-tagids="[1663,3859,1708,1774,5711,19,3839]" href="https://store.steampowered.com/app/359550/Tom_Clancys_Rainbow_Six_Siege/?snr=1_7_7_2300_150_1" onmouseout="HideGameHover( this, event, 'global_hover' )" onmouseover="GameHover( this, event, 'global_hover', {&quot;type&quot;:&quot;app&quot;,&quot;id&quot;:359550,&quot;public&quot;:1,&quot;v6&quot;:1} );">
<div class="col search_capsule"><img src="https://steamcdn-a.akamaihd.net/steam/apps/359550/capsule_sm_120.jpg?t=1568227548" srcset="https://steamcdn-a.akamaihd.net/steam/apps/359550/capsule_sm_120.jpg?t=1568227548 1x, https://steamcdn-a.akamaihd.net/steam/apps/359550/capsule_231x87.jpg?t=1568227548 2x"/></div>
<div class="responsive_search_name_combined">
<div class="col search_name ellipsis">
<span class="title">Tom Clancy's Rainbow Six® Siege</span>
<p>
<span class="platform_img win"></span> </p>
</di

In [8]:
game_name = games[0].find("span", {"class":"title"}).getText()
game_name

"Tom Clancy's Rainbow Six® Siege"

In [9]:
rating = games[0].find("div", {"class":"col search_reviewscore responsive_secondrow"}).span['data-tooltip-html']
rating = re.search("(?<=)(.*?)(?=\<)", rating).group(0)
rating

'Very Positive'

In [10]:
review_amount = games[0].find("div", {"class":"col search_reviewscore responsive_secondrow"}).span['data-tooltip-html']
review_amount = re.search("(?<= )([0-9,]+)(?= )", review_amount).group(0)
review_amount

'304,716'

In [11]:
discount = games[0].find("div", {"class":"col search_discount responsive_secondrow"}).span.getText()
discount = re.search("(?<=-)(.*?)(?=%)", discount).group(0) # Remove - and % from text
discount

'60'

In [12]:
og_price = games[0].find("div", {"class":"col search_price discounted responsive_secondrow"}).span.strike.getText()[:-1]
og_price

'19,99'

In [13]:
price = games[0].find("div", {"class":"col search_price discounted responsive_secondrow"}).getText()
price = re.search("(?<=€)(.*?)(?=€)", price).group(0) # Takes only the price that is between two € chars
price

'7,99'

In [14]:
release_year = games[0].find("div", {"class":"col search_released responsive_secondrow"}).getText()[-4:] # last 4 digits is year
release_year

'2015'

In [15]:
win_support = games[4].find("span", {"class":"platform_img win"})
if(win_support != None):
   win_support = 1
else:
   win_support = 0
win_support

1

One liner if else 

In [16]:
# Vi förbättrar övre koden till en kortare
win_support = games[0].find("span", {"class":"platform_img win"})
win_support = 1 if win_support != None else 0
win_support

1

In [17]:
lin_support = games[0].find("span", {"class":"platform_img linux"})
lin_support = 1 if lin_support != None else 0
lin_support

0

In [18]:
mac_support = games[0].find("span", {"class":"platform_img mac"})
mac_support = 1 if mac_support != None else 0
mac_support

0

In [19]:
fetch_timestamp

'2019-10-28 10:29:50.566358'

### Create methods of some of the searches

In [20]:
def game_support(game,platform_class):
    '''game_support - Takes in game and searches for a span including the platform class. Returns a 1 if 
    game is supported on platform, else it returns 0'''
    platform_support = game.find("span", {"class": platform_class})
    platform_support = 1 if platform_support != None else 0
    return platform_support

In [21]:
def reviewScore(game):
    ''' Takes game as a parameter and returns a score from 1-9 depending on the review tooltip'''
    rating_text = game.find("div", {"class":"col search_reviewscore responsive_secondrow"}).span['data-tooltip-html']
    rating_text = re.search("(?<=)(.*?)(?=\<)", rating_text).group(0)
    
    switcher = {
        'Overwhelmingly Positive': 9,
        'Very Positive':8,
        'Positive':7,
        'Mostly Positive':6,
        'Mixed':5,
        'Mostly Negative':4,
        'Negative':3,
        'Mostly Negative':2,
        'Overwhelmingly Negative':1
    }
    
    return switcher.get(rating_text, 0)



#### Fix exceptions and create method for extracting data from each game

In [22]:
def gameDataFrame(game, fetch_timestamp):
    '''gameDataFrame - takes in game as parameter and timestamp data was fetched. Returns a data  '''
    # GAME NAME
    game_name = game.find("span", {"class":"title"}).getText()
    # RATING
    try:
        rating = reviewScore(game)
    except:
        rating = 0
    # REVIEW AMOUNT
    try:
        review_amount = game.find("div", {"class":"col search_reviewscore responsive_secondrow"}).span['data-tooltip-html']
        review_amount = re.search("(?<= )([0-9,]+)(?= )", review_amount).group(0)
        review_amount = review_amount.replace(",","")
    except:
        review_amount = 0
    # DISCOUNT
    try:
        discount = game.find("div", {"class":"col search_discount responsive_secondrow"}).span.getText()
        discount = re.search("(?<=-)(.*?)(?=%)", discount).group(0) # Remove - and % from text
    except:
        discount = 0
    # PRICE
    try:
        price = game.find("div", {"class":"col search_price discounted responsive_secondrow"}).getText()
        price = re.search("(?<=€)(.*?)(?=€)", price).group(0) # Takes only the price that is between two € chars
    except:
        price = 0
    # ORIGINAL PRICE
    try:
        og_price = game.find("div", {"class":"col search_price discounted responsive_secondrow"}).span.strike.getText()[:-1]
    except:
        og_price = 0
    # RELEASE YEAR
    release_year = game.find("div", {"class":"col search_released responsive_secondrow"}).getText()[-4:] # last 4 digits is year
    # WIN
    win_support = game_support(game, "platform_img win")
    # LIN
    lin_support = game_support(game, "platform_img linux")
    # OSX
    mac_support = game_support(game, "platform_img mac")
    # TIME
    time = fetch_timestamp
    
    data = {
        'Game': game_name, 
        'Rating': rating,
        'Review_amount': review_amount,
        'Discount%': discount,
        'Price': price, 
        'Original_price': og_price, 
        'Release_year': release_year, 
        'Win': win_support, 
        'Lin': lin_support,
        'OSX': mac_support,
        'Time': time
      }
    return data
    

## Show data from first page to check that everything works fine

In [23]:
one_page = createGameDf()

for game in games:
    game_data = gameDataFrame(game, fetch_timestamp)
    one_page = one_page.append(game_data, ignore_index=True)

In [24]:
one_page

Unnamed: 0,Game,Rating,Review_amount,Discount%,Price,Original_price,Release_year,Win,Lin,OSX,Time
0,Tom Clancy's Rainbow Six® Siege,8,304716,60,799,1999,2015.0,1,0,0,2019-10-28 10:29:50.566358
1,PLAYERUNKNOWN'S BATTLEGROUNDS,5,881898,50,1499,2999,2017.0,1,0,0,2019-10-28 10:29:50.566358
2,MONSTER HUNTER: WORLD,6,78255,50,2999,5999,2018.0,1,0,0,2019-10-28 10:29:50.566358
3,A Plague Tale: Innocence,8,4931,40,2699,4499,2019.0,1,0,0,2019-10-28 10:29:50.566358
4,Dying Light,8,76188,66,1019,2999,2015.0,1,1,1,2019-10-28 10:29:50.566358
5,HITMAN™ - Game of The Year Edition,6,19916,75,1791,7191,,1,1,1,2019-10-28 10:29:50.566358
6,Dying Light Enhanced Edition,8,79693,70,1499,4999,,1,1,1,2019-10-28 10:29:50.566358
7,Fallout 4,6,96380,60,1199,2999,2015.0,1,0,0,2019-10-28 10:29:50.566358
8,HITMAN™,6,18982,0,0,0,2016.0,1,1,1,2019-10-28 10:29:50.566358
9,Assetto Corsa Ultimate Edition,8,19129,78,1739,7988,,1,0,0,2019-10-28 10:29:50.566358


## Loop through the 5 first pages

In [25]:
games_df = createGameDf()

for x in range(1, 6):
    # Fetch site 
    page_html = fecthSite(x)
    fetch_timestamp = str(datetime.now())
    page_soup = soup(page_html, 'html.parser')
    games = page_soup.findAll("a", {"class":"search_result_row"})
    # save games to dataframe from site page
    for game in games:
        game_data = gameDataFrame(game, fetch_timestamp)
        games_df = games_df.append(game_data, ignore_index=True)

In [26]:
# preview of data
games_df.head()

Unnamed: 0,Game,Rating,Review_amount,Discount%,Price,Original_price,Release_year,Win,Lin,OSX,Time
0,Tom Clancy's Rainbow Six® Siege,8,304716,60,799,1999,2015,1,0,0,2019-10-28 10:29:52.229951
1,PLAYERUNKNOWN'S BATTLEGROUNDS,5,881898,50,1499,2999,2017,1,0,0,2019-10-28 10:29:52.229951
2,MONSTER HUNTER: WORLD,6,78255,50,2999,5999,2018,1,0,0,2019-10-28 10:29:52.229951
3,A Plague Tale: Innocence,8,4931,40,2699,4499,2019,1,0,0,2019-10-28 10:29:52.229951
4,Dying Light,8,76188,66,1019,2999,2015,1,1,1,2019-10-28 10:29:52.229951


## Save Dataframe to csv file

In [27]:
cwd = os.getcwd()
path = cwd + "\\SteamSale.csv"
if(os.path.exists(path)):
    # SAVE in same file
    old_csv = pd.read_csv(path)
    # CSV is release_year is read as float and  has ".0" after each number so that must be removed
    old_csv['Release_year'] = old_csv['Release_year'].map(lambda x: str(x).rstrip(".0"))
    # Don't want "nan" in my release year so i'll remove them
    old_csv['Release_year'] = old_csv['Release_year'].map(lambda x: str(x).rstrip(".0") if (str(x) != "nan") else str(x).replace("nan",""))
    concatenated_csv = pd.concat([old_csv,games_df])
    concatenated_csv.to_csv( path, index=False, header=True)
else:
    # CREATE new file
    export_csv = games_df.to_csv (path, index = None, header=True)
    