In [13]:
import pandas as pd
from typing import Union

In [14]:
# Function boolean_error_safe is used as a converter to get non-bool-like values to return NA
def boolean_error_safe(value) -> Union[bool, pd.NA]:
    if str(value).lower() in ["true", "1"]:
        return True
    elif str(value).lower() in ["false", "0"]:
        return False
    else:
        return pd.NA 

# TODO: Need a converter function to parse list-like objects typed as 'object' to Python list

# Function parse_df needed since dataset must be parsed in two parts to mitigate GitHub size restriction
def parse_df(filepath:str) -> pd.core.frame.DataFrame:
    df : pd.core.frame.DataFrame = pd.read_csv(filepath, 
                                               index_col="AppID",
                                               dtype={"AppID" : "category",
                                                    "name" : "string",
                                                    "required_age":"string",
                                                    "dlc_count":"string",
                                                    "detailed_description":"string",
                                                    "about_the_game":"string",
                                                    "short_description":"string",
                                                    "reviews":"string",
                                                    "header_image":"string",
                                                    "website":"string",
                                                    "support_url":"string",
                                                    "support_email":"string",
                                                    "metacritic_score":"string",
                                                    "metacritic_url":"string",
                                                    "achievements":"string",
                                                    "notes":"string",
                                                    "supported_languages":"object",
                                                    "full_audio_languages":"object",
                                                    "packages":"string",
                                                    "developers":"object",
                                                    "publishers":"object",
                                                    "categories":"object",
                                                    "genres":"object",
                                                    "screenshots":"string",
                                                    "movies":"string",
                                                    "user_score":"string",
                                                    "score_rank":"string",
                                                    "estimated_owners":"string",
                                                    "average_playtime_forever":"string",
                                                    "average_playtime_2weeks":"string",
                                                    "median_playtime_forever":"string",
                                                    "median_playtime_2weeks":"string",
                                                    "tags":"string"
                                                    },
                                                converters = {"price": lambda x: pd.to_numeric(x, errors="coerce"),
                                                    "positive": lambda x: pd.to_numeric(x, errors="coerce"),
                                                    "negative": lambda x: pd.to_numeric(x, errors="coerce"),
                                                    "peak_ccu": lambda x: pd.to_numeric(x, errors="coerce"),
                                                    "pct_pos_total": lambda x: pd.to_numeric(x, errors="coerce"),
                                                    "num_reviews_total": lambda x: pd.to_numeric(x, errors="coerce"),
                                                    "pct_pos_recent": lambda x: pd.to_numeric(x, errors="coerce"),
                                                    "num_reviews_recent": lambda x: pd.to_numeric(x, errors="coerce"),
                                                    "release_date": lambda x: pd.to_datetime(x, errors="coerce"),
                                                    "windows" : boolean_error_safe,
                                                    "mac" : boolean_error_safe,
                                                    "linux" : boolean_error_safe
                                                    },
                                                encoding='latin1',
                                                encoding_errors='ignore')

    # Drop columns that are not needed for the analysis
    df : pd.core.frame.DataFrame = df.drop(columns=["required_age","detailed_description","about_the_game","short_description","reviews","support_url","support_email","estimated_owners",
                                                                    "metacritic_score","metacritic_url","achievements","notes","packages","screenshots","movies","user_score","score_rank","tags",
                                                                    "average_playtime_forever", "average_playtime_2weeks", "median_playtime_forever", "median_playtime_2weeks", "header_image",
                                                                    "website"]) 

    return df





# Parse the dataset parts into DataFrames and append them into a single DataFrame
games_sub1 : pd.core.frame.DataFrame = parse_df("datasets/games_may2024_cleaned_1of2.zip")
games_sub2 : pd.core.frame.DataFrame = parse_df("datasets/games_may2024_cleaned_2of2.zip")

games : pd.core.frame.DataFrame = pd.concat([games_sub1, games_sub2])
games.head()

  df : pd.core.frame.DataFrame = pd.read_csv(filepath,


Unnamed: 0_level_0,name,release_date,price,dlc_count,windows,mac,linux,recommendations,supported_languages,full_audio_languages,...,publishers,categories,genres,positive,negative,peak_ccu,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
730,Counter-Strike 2,2012-08-21,0.0,1,True,False,True,4111974,"['Czech', 'Danish', 'Dutch', 'English', 'Finni...","['English', 'Vietnamese']",...,['Valve'],"['Multi-player', 'Cross-Platform Multiplayer',...","['Action', 'Free to Play']",7024836.0,1029779.0,1362469.0,87.0,8071426.0,80.0,56072.0
578080,PUBG: BATTLEGROUNDS,2017-12-21,0.0,0,True,False,False,1701431,"['English', 'Korean', 'Simplified Chinese', 'F...",[],...,"['KRAFTON, Inc.']","['Multi-player', 'PvP', 'Online PvP', 'Stats',...","['Action', 'Adventure', 'Massively Multiplayer...",1379580.0,981860.0,590582.0,58.0,2365012.0,68.0,19517.0
570,Dota 2,2013-07-09,0.0,2,True,True,True,14329,"['Bulgarian', 'Czech', 'Danish', 'Dutch', 'Eng...","['English', 'Korean', 'Simplified Chinese', 'V...",...,['Valve'],"['Multi-player', 'Co-op', 'Steam Trading Cards...","['Action', 'Strategy', 'Free to Play']",1832477.0,406030.0,668192.0,81.0,2247365.0,71.0,23832.0
271590,Grand Theft Auto V,2015-04-13,0.0,1,True,False,False,1641404,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'Spanish - Latin America']",...,['Rockstar Games'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Action', 'Adventure']",1557234.0,236827.0,133571.0,87.0,1643791.0,92.0,16181.0
359550,Tom Clancy's Rainbow SixÂ® Siege,2015-12-01,19.99,9,True,False,False,1088708,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'French', 'Italian', 'German', 'Sp...",...,['Ubisoft'],"['Single-player', 'Multi-player', 'PvP', 'Onli...",['Action'],1094330.0,190046.0,68162.0,85.0,1091695.0,77.0,10566.0
