# Data Cleaning

In [None]:
# install this if u havent

# !pip install beautifulsoup4 fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [2]:
import re
import requests
import pandas as pd
import numpy as np
import seaborn as sns
from bs4 import BeautifulSoup
from fuzzywuzzy import process
import time



In [None]:
# set a path there if the files cant be read
path = ""

# for game name and id
# quotechar='"' and escapechar='\\' used for handling nested quotes
df_game = pd.read_csv(
    path + 'steam-insights-main/games.csv', 
    usecols=["app_id", "name"], 
    quotechar='"', 
    escapechar='\\'
)

# for viewing the number of recomendations for the games
df_review = pd.read_csv(path + 'steam-insights-main/reviews.csv', usecols=[0, 8])

# for retrieving genres (not needed for now but when we start including the recc engine for games this will be used)
df_genre = pd.read_csv(path + 'steam-insights-main/genres.csv')


In [3]:
# first change all the app_id into numeric to avoid issues
df_game['app_id'] = pd.to_numeric(df_game['app_id'], errors='coerce')
df_review['app_id'] = pd.to_numeric(df_review['app_id'], errors='coerce')

In [None]:
df_game.head()

In [None]:
df_review.head()

In [4]:
# now these 2 dataframes will be merged
df = pd.merge(df_game, df_review, on='app_id', how='inner')

In [None]:
df.head()

In [None]:
# Now to fix the recommendations column because theres non numeric values

# Convert non-numeric values to NaN 
df['recommendations'] = pd.to_numeric(df['recommendations'], errors='coerce')

# Replace NaN to 0
df['recommendations'].fillna(0, inplace=True)

# change recommendations to int64 to remove decimals
df['recommendations'] = df['recommendations'].astype('int64')

In [None]:
df.head()

In [41]:
# Now with the these columns cleaned we need to include the requirements of the games
# CPU, GPU and ram

# To test this only the top 100 recommended games will be used.
# I am using the top recommended games because there is no way to get the top rated games on steam
# The most relevent games dont have enough variaty in game requirements
# Sorting by "User Reviews" doesnt give the most recommended games for some reason (????)

df_test = df.sort_values(by='recommendations', ascending=False).head(100)

df_test = df_test.reset_index(drop=True)

In [None]:
df_test.head(10)
# print(df_test.columns)

In [43]:
# I will be webscrapping the game requirements from steam.com
# Thankfully wescraping for public information on this website is allowed

# Add new columns for the game requirments
df_test['genres'] = None
df_test['CPU'] = None
df_test['GPU'] = None
df_test['memory'] = None

for index, row in df_test.iterrows():
    id = str(row['app_id'])
    url = "https://store.steampowered.com/app/" + id

    cookies = {'birthtime': '568022401'}
    response = requests.get(url)

    if response.status_code != 200:
        print("Failed to retrieve the page for " + row['name'])

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract genres
    genres = []
    genre_section = soup.find('div', id='genresAndManufacturer')

    if genre_section:
        genre_span = genre_section.find('span')
        if genre_span:
            genres = [genre.text.strip() for genre in genre_span.find_all('a')]

    # Extract recommended system requirements
    suggested_reqs = {}
    reqs_section = soup.find('div', class_='game_area_sys_req_rightCol')

    # Check for the specific phrase and switch to left column if found
    if reqs_section and "Requires a 64-bit processor and operating system" in reqs_section.get_text():
        reqs_section = soup.find('div', class_='game_area_sys_req_leftCol')

    # If only minimum requirements are stated
    if not reqs_section:
        reqs_section = soup.find('div', class_='game_area_sys_req_full')
        
    if reqs_section:
        # Find the <ul class="bb_ul"> containing the requirements
        reqs_list = reqs_section.find('ul', class_='bb_ul')
        if reqs_list:
            # Iterate over the list items to extract the requirement details
            for li in reqs_list.find_all('li'):
                text = li.get_text(strip=True)
                if text:
                    # Identify and store relevant information
                    if text.startswith('OS'):
                        suggested_reqs['OS'] = text
                    elif text.startswith('Processor'):
                        suggested_reqs['Processor'] = text.replace('Processor:', '', 1).strip()
                    elif text.startswith('Memory'):
                        suggested_reqs['Memory'] = text.replace('Memory:', '', 1).replace(' RAM', '', 1).strip()
                    elif text.startswith('Graphics'):
                        suggested_reqs['Graphics'] = text.replace('Graphics:', '', 1).strip()
                    if not suggested_reqs.get('Graphics') and text.startswith('Video Card'):
                        suggested_reqs['Graphics'] = text.replace('Video Card:', '', 1).strip()
    
    # Update columns with scraped data
    df_test.at[index, 'genres'] = genres 
    df_test.at[index, 'CPU'] = suggested_reqs.get('Processor', 'Not Available')
    df_test.at[index, 'GPU'] = suggested_reqs.get('Graphics', 'Not Available')
    df_test.at[index, 'memory'] = suggested_reqs.get('Memory', 'Not Available')


In [45]:
df_test.to_csv('top100.csv', index=False)

In [89]:
df100 = pd.read_csv(path + 'top100.csv')

In [None]:
df100.head(10)

In [91]:
# Now with a csv file with the game's name, genres and hardware requirements 
# The next step is to get a benchmark dataset to map then gauge what pc's to suggest to the user
# But first since theres multiple cpu/gpu recommendation it has to be cleaned and made into a list for ease of checking
# or i could do it manually (may be the case honestly)

In [92]:
phrases = ['or or equivalent', 'or higher', 'or better']

def remove_phrase(text):
    if isinstance(text, str):
        for phrase in phrases:
            text = text.replace(phrase, '').strip()
    return text

df100['CPU'] = df100['CPU'].apply(remove_phrase)
df100['GPU'] = df100['GPU'].apply(remove_phrase)

In [None]:
df100.head(30)

In [None]:
# Even with a few repeated phrases removed there are too many inconsistencies in the GPU and CPU columns so I will resort to manual data cleaning for those columns
# i will be uinsg passmark benchmarks to replace those 2 columns
# *These scores are the closest thing we can get to easily estimate performance
# *There are also drivers and other settings that could affect the performance depending on the game

In [8]:
pc_build = pd.read_csv('data/pc_build/pc_prebuilds_clean.csv')

pc_build['CPU'] = pc_build['CPU'].astype('int64')

ValueError: invalid literal for int() with base 10: '25,844'