# Games Research


## Table of contents

1. [Introduction](#Introduction)
2. [Imports](#Imports)
3. [Data Acquisition](#Data_aqu)

# <a class="anchor" id="Introduction"></a>1. Introduction



# <a class="anchor" id="Imports"></a>2. Imports

In [1]:
!pip install selenium
!pip install pydot
!pip install pydotplus
from bs4 import BeautifulSoup
from bs4.element import Tag as HtmlTag
import requests
import os
import re
from random import randint
import time
from time import sleep
from abc import ABC, abstractmethod
from enum import Enum
from functools import partial
from typing import Callable, List
from numbers import Number
import pandas as pd
import numpy as np
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from IPython.display import Image, display  
import pydotplus 
from scipy import misc
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from selenium import webdriver



# <a class="anchor" id="Data_aqu"></a>3. Data Acquisition

In order to create a prediction model, we first needed to gather relevant data.

Considering our options for data sources, we decided to look for the biggest video game digital distribution service and storefront, scrape the data we thought would be helpful, and save it as a dataframe.

## The platform we chose as our data source: [Steam](https://store.steampowered.com "Steam")

We decided on Steam because of the scale of its community and the number of features provided with each game.
When we started looking for a way to scrape data from Steam, we immediately faced a problem: the search page only loads more games as you scroll down. To solve this, we used Selenium.

## Getting Games Links

In [2]:
# Collect game page links from the Steam search results and save them to
# 'game_links.csv' (single column: link_to_game_page).

SEARCH_URL = "https://store.steampowered.com/search?category1=998&supportedlang=english&ndl=1"

# Number of scroll-to-bottom passes. The search page only loads more results
# as it is scrolled, so 0 captures just the initially loaded results; about
# 900 passes were chosen to get near 50k links.
SCROLL_PASSES = 0

# Set up the web driver
driver = webdriver.Firefox()
try:
    driver.get(SEARCH_URL)

    # Scroll down to load more search results, pausing one second after each
    # scroll so the newly loaded rows are present before the next pass
    for _ in range(SCROLL_PASSES):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

    # Grab the rendered HTML while the browser is still open
    page_html = driver.page_source
finally:
    # Always quit the web driver, even if loading or scrolling failed,
    # so no browser process is left behind
    driver.quit()

# Extract game links from search results
soup = BeautifulSoup(page_html, "html.parser")
game_tags = soup.find_all("a", {"class": "search_result_row"})
game_links = [tag.get("href") for tag in game_tags]

# Save game links to df
df = pd.DataFrame({"link_to_game_page": game_links})

# Print number of links
print(f"Total links scraped: {len(game_links)}")
df.to_csv('game_links.csv', index=False)


Total links scraped: 50


## Read game links and scrape data from each game page

In [3]:
# Scrape one row of data per game page listed in 'game_links.csv' and save
# the result to 'game_data_df.csv'.

# Read the CSV file containing the URLs
url_df = pd.read_csv('game_links.csv')

# Seconds to wait for a response before giving up, so a stalled connection
# cannot hang the whole scrape
REQUEST_TIMEOUT = 30

# Column order of the resulting dataframe
columns = ['game_name', 'game_price', 'release_date', 'publisher', 'developer', 'all_review_score', 'all_review_count', 'genre', 'features']

# One dict per scraped game; the dataframe is built once after the loop
# instead of being re-concatenated on every iteration
game_data_list = []

# Loop through each URL in the CSV file
for url in url_df['link_to_game_page']:
    # Send a GET request to the URL
    response = requests.get(url, timeout=REQUEST_TIMEOUT)

    # Parse the HTML of the response with Beautiful Soup
    soup = BeautifulSoup(response.content, "html.parser")
    # Check if the age verification prompt is present on the page
    age_gate = soup.find('div', {'class': 'agegate_text_container'})

    # If age verification is required, set the year to 1995 and submit the form
    if age_gate:
        print(f"Age verification required for {url}")
        # Extract the form data from the age verification prompt
        action_url = age_gate.find('form')['action']
        session_id = age_gate.find('input', {'name': 'sessionid'})['value']
        age_check = age_gate.find('input', {'name': 'ageDay'})['value']
        # Set the year to 1995
        age_year = '1995'
        # Send a POST request to the age verification form URL with the form data
        age_verification_data = {'snr': '1_agecheck_agecheck__age-gate', 'ageDay': age_check, 'ageMonth': '1', 'ageYear': age_year, 'sessionid': session_id}
        verification_response = requests.post(action_url, data=age_verification_data, timeout=REQUEST_TIMEOUT)
        # Parse the HTML of the age verification response with Beautiful Soup
        soup = BeautifulSoup(verification_response.content, "html.parser")

    # Extract the game name from the page
    game_name = soup.select_one('.apphub_AppName').text.strip()

    # Extract the game price from the page: when the game is on sale use the
    # price before the discount, otherwise the regular price
    price_element = soup.find("div", {"class": "discount_original_price"})
    if not price_element:
        price_element = soup.find("div", {"class": "game_purchase_price"})

    # Extract the price text and remove the currency symbol
    if price_element:
        game_price = price_element.text.strip().replace("₪", "")
    else:
        # No price element found on the page
        game_price = "f2p"

    # Extract the release date from the page
    release_date = soup.find('div', {'class': 'date'}).text.strip()

    # Extract the developer (first 'dev_row') and the publisher (the div
    # that follows it) from the page
    dev_row = soup.find('div', {'class': 'dev_row'})
    publisher = dev_row.find_next_sibling('div').find('a').text
    developer = dev_row.find('a').text

    # Extract the all review score (second 'game_review_summary' span) and
    # the number of all reviews from the page
    all_review_score = soup.find_all('span', {'class': 'game_review_summary'})[1].text.strip()
    all_review_num = soup.find('meta', {'itemprop': 'reviewCount'})['content']

    # Extract the game genre from the page (only the first genre link is kept)
    genres = soup.find('span', {'data-panel': '{"flow-children":"row"}'}).find('a').text.strip()

    # Find the div containing the game features and collect every label in it
    features_div = soup.find('div', {'class': 'game_area_features_list_ctn'})
    features = [feature.text.strip() for feature in features_div.find_all('div', {'class': 'label'})]

    # Store the scraped data for this game
    game_data_list.append({
        'game_name': game_name,
        'game_price': game_price,
        'release_date': release_date,
        'publisher': publisher,
        'developer': developer,
        'all_review_score': all_review_score,
        'all_review_count': all_review_num,
        'genre': genres,
        'features': features,
    })

# Build the dataframe from all scraped rows
game_data_df = pd.DataFrame(game_data_list, columns=columns)

# Save the new dataframe to a CSV file
game_data_df.to_csv('game_data_df.csv', index=False)

# Display the resulting dataframe with the scraped data
print(game_data_df.head())


                           game_name    game_price  release_date  \
0   Counter-Strike: Global Offensive  Free to Play  21 Aug, 2012   
1                      Apex Legends™  Free to Play   4 Nov, 2020   
2              Red Dead Redemption 2        230.00   5 Dec, 2019   
3                          Destiny 2  Free To Play   1 Oct, 2019   
4  Call of Duty®: Modern Warfare® II        395.00  27 Oct, 2022   

         publisher              developer all_review_score all_review_count  \
0            Valve                  Valve    Very Positive          7155319   
1  Electronic Arts  Respawn Entertainment    Very Positive           638200   
2   Rockstar Games         Rockstar Games    Very Positive           361295   
3          Bungie                  Bungie    Very Positive           538286   
4       Activision          Infinity Ward            Mixed           205084   

    genre                                           features  
0  Action  [Steam Achievements, Full controller suppo