In [501]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from IPython.core.display import HTML
import re
from datetime import date

### First, I extract the relevant section of the page and use it as a basis to determine what information I need and what the attribute names are

In [436]:
# Fetch the first page of Steam's "specials" search so its markup can be inspected.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here rather than as empty results later.
res = requests.get("https://store.steampowered.com/search/?specials=1&page=1", timeout=30)
res.raise_for_status()
today = date.today()
# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser is
# installed (and warns), so the resulting tree can differ between machines.
soup = bs(res.text, "html.parser")

# Every search result row is an element carrying this class.
games = soup.find_all(class_="responsive_search_name_combined")
games_list = list(games)

# Pretty-print the first row to see which tags and attributes hold the data.
if games_list:
    print(games_list[0].prettify())
else:
    print("No search results found - check the URL or the page markup.")



<div class="responsive_search_name_combined">
 <div class="col search_name ellipsis">
  <span class="title">
   UNCHARTED™: Legacy of Thieves Collection
  </span>
  <div>
   <span class="platform_img win">
   </span>
  </div>
 </div>
 <div class="col search_released responsive_secondrow">
  19 Oct, 2022
 </div>
 <div class="col search_reviewscore responsive_secondrow">
  <span class="search_review_summary positive" data-tooltip-html="Very Positive&lt;br&gt;89% of the 15,382 user reviews for this game are positive.">
  </span>
 </div>
 <div class="col search_price_discount_combined responsive_secondrow" data-price-final="1999">
  <div class="col search_discount_and_price responsive_secondrow">
   <div aria-label="60% off. 49,99€ normally, discounted to 19,99€" class="discount_block search_discount_block" data-bundlediscount="0" data-discount="60" data-price-final="1999" role="link">
    <div class="discount_pct">
     -60%
    </div>
    <div class="discount_prices">
     <div class="di

### In this function we extract the ratings (the percentage of positive user reviews).
Using regex isn't the most intuitive approach, but I managed to make it work.

In [465]:
def ratings(rows=None):
    """Return the positive-review percentage for every search result row.

    Args:
        rows: Iterable of result-row elements exposing BeautifulSoup's ``find``.
            Defaults to the module-level ``games`` list, which is what the
            zero-argument call has always used.

    Returns:
        A list with one string per row, in order: the first ``<digits>%`` token
        found in the row's review tooltip (e.g. ``"89%"``), or ``"N/A"`` when
        the row has no review cell, no ``<span>``, no ``data-tooltip-html``
        attribute, or no percentage in the tooltip text.
    """
    if rows is None:
        rows = games  # notebook global, rebound by the scraping cells
    ratings_list = []
    for game in rows:
        # The cell that holds the review summary for this row.
        review_cell = game.find(class_="col search_reviewscore responsive_secondrow")
        span = review_cell.find("span") if review_cell is not None else None
        tooltip = span.get("data-tooltip-html") if span is not None else None
        # A span lacking the attribute yields None; never hand that to
        # re.search, which would raise TypeError instead of producing "N/A".
        match = re.search(r'(\d+%)', tooltip) if tooltip else None  # digits followed by '%'
        ratings_list.append(match.group(1) if match else "N/A")
    return ratings_list
# Preview: ratings for the rows currently held in the module-level `games`.
ratings()

['95%',
 '69%',
 '72%',
 '93%',
 '95%',
 '92%',
 '91%',
 '78%',
 '87%',
 '97%',
 '85%',
 '90%',
 'N/A',
 '76%',
 '73%',
 '88%',
 '66%',
 '88%',
 '81%',
 '88%',
 '74%',
 '52%',
 '91%',
 '74%',
 '79%']

### This function retrieves the supported OS platforms for each game. It took me a while to get working.
I initially thought it was overly complex, but in the end it was just a matter of using a few for loops. The biggest challenge was figuring out how to separate the different OS names into their respective lists.

In [504]:
def osCheck(oss, rows=None):
    """Flag, per search result row, whether the game lists platform ``oss``.

    Args:
        oss: Class name to look for on the row's ``platform_img`` spans,
            e.g. ``"win"``, ``"mac"`` or ``"linux"``.
        rows: Iterable of result-row elements exposing BeautifulSoup's ``find``.
            Defaults to the module-level ``games`` list.

    Returns:
        A list with one int per row, in order: 1 if any ``platform_img`` span
        inside the row's first ``<div>`` has ``oss`` among its classes, else 0.
        A row without any ``<div>`` also yields 0 instead of raising.
    """
    if rows is None:
        rows = games  # notebook global, rebound by the scraping cells
    flags = []
    for game in rows:
        name_block = game.find("div")
        if name_block is None:
            # Unexpected markup: keep the output aligned with the other columns.
            flags.append(0)
            continue
        platform_spans = name_block.find_all("span", class_="platform_img")
        # Each span's "class" attribute is a list such as ["platform_img", "win"].
        supported = any(oss in span.get("class", []) for span in platform_spans)
        flags.append(1 if supported else 0)
    return flags
# Preview: macOS flags for the rows currently in the module-level `games`.
# The notebook also calls this with "win" and "linux".
osCheck("mac")


[0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0]

### Here we extract the number of user reviews.
The regex was tricky, but I got some help to make it work.

In [463]:
def reviewCount(rows=None):
    """Return the user-review count for every search result row.

    Args:
        rows: Iterable of result-row elements exposing BeautifulSoup's ``find``.
            Defaults to the module-level ``games`` list, which is what the
            zero-argument call has always used.

    Returns:
        A list with one string per row, in order: the number that appears in
        the tooltip phrase ``"the <number> user reviews"`` exactly as written
        (thousands separators kept, e.g. ``"15,382"``), or ``"N/A"`` when the
        row has no review cell, no ``<span>``, no ``data-tooltip-html``
        attribute, or the phrase is absent.
    """
    if rows is None:
        rows = games  # notebook global, rebound by the scraping cells
    counts = []
    for game in rows:
        # The cell that holds the review summary for this row.
        review_cell = game.find(class_="col search_reviewscore responsive_secondrow")
        span = review_cell.find("span") if review_cell is not None else None
        tooltip = span.get("data-tooltip-html") if span is not None else None
        # A span lacking the attribute yields None; never hand that to
        # re.search, which would raise TypeError instead of producing "N/A".
        match = (
            re.search(r'the (\d{1,3}(?:,\d{3})*) user reviews', tooltip)
            if tooltip
            else None
        )
        counts.append(match.group(1) if match else "N/A")
    return counts
# Preview: review counts for the rows currently held in the module-level `games`.
reviewCount()

['93,393',
 '559',
 '2,300',
 '81,622',
 '35,253',
 '206,625',
 '591,533',
 '47',
 '18,209',
 '11,378',
 '773',
 '48,549',
 'N/A',
 '3,209',
 '1,369',
 '144,294',
 '12,963',
 '87,802',
 '7,516',
 '4,112',
 '85,297',
 '1,614',
 '3,991',
 '2,994',
 '14,689']

### Here we bring everything together and write the data to a CSV file.


In [503]:
# Scrape pages 1-5 of Steam's "specials" search, build one row per game and
# write the combined table to out.csv.


def _text_or_na(game, css_class):
    """Return the stripped text of the first ``css_class`` element in ``game``, or 'N/A'."""
    element = game.find(class_=css_class)
    return element.text.strip() if element is not None else 'N/A'


today = date.today()
time = today.strftime("%d/%m/%Y")  # scrape date (dd/mm/yyyy), stamped on every row
page_frames = []  # one frame per page; concatenated once after the loop

for page in range(1, 6):

    res = requests.get(f"https://store.steampowered.com/search/?specials=1&page={page}", timeout=30)
    res.raise_for_status()  # fail loudly instead of parsing an error page
    soup = bs(res.text, "html.parser")  # explicit parser: same tree on every machine

    # The element that contains the game information. ratings(), reviewCount()
    # and osCheck() read this module-level name, so it must be rebound before
    # they are called for the current page.
    games = soup.find_all(class_="responsive_search_name_combined")

    # Every column yields exactly one value per row ('N/A' when the element is
    # missing); a shorter column would make pd.DataFrame raise a length error.
    title = [_text_or_na(game, "title") for game in games]
    rating = ratings()
    review = reviewCount()
    sale = [_text_or_na(game, "discount_pct") for game in games]
    price = [_text_or_na(game, "discount_final_price") for game in games]
    normal_price = [_text_or_na(game, "discount_original_price") for game in games]
    release = [_text_or_na(game, "col search_released responsive_secondrow") for game in games]
    win = osCheck("win")
    lin = osCheck("linux")
    osx = osCheck("mac")

    data = {
        'Spelnamn': title,
        'Rating': rating,
        '#Reviews': review,
        'Rabatt%': sale,
        'Pris': price,
        'OrdinariePris': normal_price,
        'Utgivningsår': release,
        'Win': win,
        'Lin': lin,
        'OSX': osx,
        'Tid': time,
    }
    page_frames.append(pd.DataFrame(data))

# A single concat avoids re-copying the accumulated frame on every page.
df = pd.concat(page_frames, ignore_index=True)

df.to_csv('out.csv', index=False)
print("csv. created")


print(df.head(3))
# Backslashes are doubled so the ASCII arrows are not read as (invalid) escape
# sequences; the printed text is unchanged.
print("\033[1m \n \n top 3 /\\  sista 3 \\/ \033[0m")
print(df.tail(3))


csv. created
                                   Spelnamn Rating #Reviews Rabatt%    Pris  \
0  UNCHARTED™: Legacy of Thieves Collection    89%   15,385    -60%  19,99€   
1                         Hearts of Iron IV    91%  224,247    -70%  14,99€   
2                          EA SPORTS FC™ 25    45%   22,763    -50%  34,99€   

  OrdinariePris  Utgivningsår  Win  Lin  OSX         Tid  
0        49,99€  19 Oct, 2022    1    0    0  27/11/2024  
1        49,99€   9 Apr, 2024    1    1    1  27/11/2024  
2        69,99€  26 Sep, 2024    1    0    0  27/11/2024  
[1m 
 
 top 3 /\  sista 3 \/ [0m
                             Spelnamn Rating #Reviews Rabatt%    Pris  \
122                      HoT Potatoes    96%  105,751     N/A  10,48€   
123                  ICBM: Escalation    88%      302    -10%  26,09€   
124  Master of Magic Classic Complete    95%      709     N/A  11,03€   

    OrdinariePris  Utgivningsår  Win  Lin  OSX         Tid  
122           N/A                  1    0    

Unnamed: 0,Win,Lin,OSX
count,125.0,125.0,125.0
mean,1.0,0.224,0.288
std,0.0,0.4186,0.454653
min,1.0,0.0,0.0
25%,1.0,0.0,0.0
50%,1.0,0.0,0.0
75%,1.0,0.0,1.0
max,1.0,1.0,1.0
