In [1]:
# Relative path to the raw Steam dataset CSV. Forward slashes are used because
# they are accepted on Windows as well as on POSIX systems, whereas backslash
# separators only resolve on Windows.
steam_data_filepath = "../../data/raw/Steam_2024_bestRevenue_1500.csv"

In [2]:
#Function to tidy some of the language data from the API. Remove all text in between (and including) < > 
def remove_text_in_triangle_brackets(string):
    """Return *string* with every ``<...>`` span (brackets included) removed.

    The text is scanned one character at a time:

    * ``<`` starts a skipped span and ``>`` ends it; neither is kept.
    * A ``>`` with no preceding ``<`` is dropped as well.
    * An unclosed ``<`` causes everything after it to be dropped.

    Args:
        string: The text to clean (any iterable of single characters works).

    Returns:
        A new ``str`` containing only the characters found outside brackets.
    """
    # Named ``kept_chars`` rather than ``list`` so the builtin is not shadowed.
    kept_chars = []
    inside_brackets = False

    for char in string:
        if char == '<':
            inside_brackets = True
        elif char == '>':
            inside_brackets = False
        elif not inside_brackets:
            kept_chars.append(char)
    return ''.join(kept_chars)


In [3]:
#Function to take a steamID and use the API to get data related to that id
def get_game_data(appid, timeout=10):
    """Fetch the Steam store ``appdetails`` JSON for one app id.

    Args:
        appid: Steam app id, sent as the ``appids`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The decoded JSON body of the response. The caller in this file
        expects a dict keyed by the app id as a string.

    Raises:
        requests.RequestException: On connection errors or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # Imported in function scope so the name is available here regardless of
    # whether ``requests`` was imported at module level.
    import requests

    response = requests.get(
        'https://store.steampowered.com/api/appdetails',
        params={'appids': appid},
        timeout=timeout,
    )
    return response.json()

In [4]:
# Function to iterate through a list of IDs and collect the results into a DataFrame
# This will take at least a couple of seconds per ID

def get_dataframe_from_id_list_by_api(steamID_list, delay=1.5):
    """Query the Steam API for every id and collect the results in a DataFrame.

    One row is produced per id, in input order. A successful lookup fills the
    row from the API payload. A failed lookup (request error, malformed
    response, or ``success`` not truthy) yields a row holding only the id,
    as a string, in ``steam_id`` with ``None`` in every other column.

    A running progress line is printed to stdout after each id.

    Args:
        steamID_list: Sized collection of Steam app ids (``len()`` is used
            for the progress percentage).
        delay: Seconds to sleep after each id so the API rate limit is not
            hit. Defaults to the original 1.5 s.

    Returns:
        A ``pandas.DataFrame`` with the columns ``game_name``, ``steam_id``,
        ``required_age``, ``is_free``, ``description``, ``languages``,
        ``developers``, ``publishers``, ``release_date``, ``price``,
        ``metacritic_score``, ``genres`` and ``categories``. The columns are
        present even when ``steamID_list`` is empty.
    """
    # Imported in function scope so these names resolve here regardless of
    # what has been imported at module level.
    import time

    import pandas as pd
    import requests

    columns = (
        'game_name', 'steam_id', 'required_age', 'is_free', 'description',
        'languages', 'developers', 'publishers', 'release_date', 'price',
        'metacritic_score', 'genres', 'categories',
    )
    total = len(steamID_list)
    game_data_list = []
    successes = 0
    failures = 0

    for attempt, appid in enumerate(steamID_list, start=1):
        appid_str = str(appid)  # the API response is keyed by the id as a string

        # A single bad request must not abort a crawl that can run for a long
        # time; treat it like any other failed lookup.
        try:
            appdata = get_game_data(appid_str)
        except (requests.RequestException, ValueError):
            appdata = None

        # Guard against non-dict payloads (e.g. a JSON ``null`` body).
        entry = appdata.get(appid_str) if isinstance(appdata, dict) else None

        if isinstance(entry, dict) and entry.get('success'):
            successes += 1
            outcome = "Success"
            record = _game_record_from_api_data(entry.get('data') or {})
        else:
            failures += 1
            outcome = "Failure"
            # Keep the id so the row can still be matched to the input list.
            record = dict.fromkeys(columns)
            record['steam_id'] = appid_str

        game_data_list.append(record)
        print(f"\rAttempt {attempt}: {appid_str} {outcome}. {failures} Failures, {successes} Successes. {100*attempt/total:.2f}% complete ", end="")

        # Wait after each id to ensure the API limit is not hit.
        time.sleep(delay)

    # Passing ``columns`` keeps the schema stable for an empty id list.
    return pd.DataFrame(game_data_list, columns=list(columns))


def _game_record_from_api_data(data):
    """Map the ``data`` dict of one successful API response to a row dict."""
    lang_string = data.get('supported_languages')
    if lang_string:
        # Strip HTML tags, the '*' audio markers and their footnote text.
        cleaned = (
            remove_text_in_triangle_brackets(lang_string)
            .replace("*", "")
            .replace("languages with full audio support", "")
        )
    else:
        cleaned = None
    languages = [item.strip() for item in cleaned.split(',')] if cleaned else []

    # ``or {}`` / ``or []`` also covers keys that are present but null.
    price_overview = data.get('price_overview') or {}
    metacritic = data.get('metacritic') or {}

    return {
        'game_name': data.get('name'),
        'steam_id': data.get('steam_appid'),
        'required_age': data.get('required_age'),
        'is_free': data.get('is_free'),
        'description': data.get('short_description'),
        'languages': languages,
        'developers': data.get('developers'),
        'publishers': data.get('publishers'),
        'release_date': data.get('release_date'),
        'price': price_overview.get('final_formatted'),
        'metacritic_score': metacritic.get('score'),
        'genres': [genre['description'] for genre in data.get('genres') or []],
        'categories': [cat['description'] for cat in data.get('categories') or []],
    }

In [5]:
def tidy_list_cols(df):
    """Return a copy of *df* whose list-valued columns are flattened to text.

    The columns ``languages``, ``developers``, ``publishers``, ``genres`` and
    ``categories`` are rewritten cell by cell: a list becomes its items
    joined with ``', '``, and any other value becomes the empty string. The
    input frame is not modified.
    """
    def _join_if_list(value):
        if isinstance(value, list):
            return ', '.join(value)
        return ''

    tidied = df.copy()
    for name in ('languages', 'developers', 'publishers', 'genres', 'categories'):
        tidied[name] = tidied[name].apply(_join_if_list)
    return tidied

In [6]:
# Tidies release_date column to just return the date
def clean_release_date(df):
    """Replace each ``release_date`` cell with just its ``'date'`` value.

    Cells holding a dict are reduced to ``cell.get('date')``. Any other value
    becomes ``None``; this covers the rows built for failed API lookups,
    whose ``release_date`` is ``None``, instead of raising ``TypeError``.

    Args:
        df: DataFrame with a ``release_date`` column. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _extract_date(value):
        # Only dicts can be unpacked; everything else has no date to offer.
        return value.get('date') if isinstance(value, dict) else None

    df['release_date'] = df['release_date'].apply(_extract_date)
    return df

In [7]:
#Master function that from the original dataset csv, runs the api and returns a dataframe of the api data

def get_api_data_df_from_csv(csv_filepath):
    """Build a DataFrame of Steam API data for the ids in the dataset CSV.

    Steps, in order:

    1. ``import_data_from_csv`` loads the CSV into a DataFrame.
    2. ``get_list_of_ids`` extracts the Steam ids from it.
    3. ``get_dataframe_from_id_list_by_api`` queries the API for each id.
    4. ``tidy_list_cols`` and ``clean_release_date`` tidy the result.

    This takes a long time to run: about 2s per id, so the dataset of 1500
    ids takes 40-50 minutes.

    NOTE: the helpers resolve ``pd``, ``requests`` and ``time`` from their own
    or the module scope. Importing those modules inside this function would
    only bind local names here and would not make them visible to the
    helpers, so no imports are performed in this function.

    Args:
        csv_filepath: Path of the dataset CSV, passed to
            ``import_data_from_csv``.

    Returns:
        The tidied DataFrame of API data.
    """
    # Load the dataset and pull out the ids to look up.
    steam_df = import_data_from_csv(csv_filepath)
    steamID_list = get_list_of_ids(steam_df)

    # The slow part: a couple of seconds per id.
    game_api_df = get_dataframe_from_id_list_by_api(steamID_list)

    # Tidy the list-valued columns, then the release date.
    game_api_df = tidy_list_cols(game_api_df)
    game_api_df = clean_release_date(game_api_df)

    return game_api_df