# Steam Games Keywords Linker
This is a sample jupyter notebook that shows how the project works. We use the Steam Games Store Dataset from Kaggle to extract keywords from the games store short description. This allows us to link games based on their keywords. The result of this notebook is a .ttl file that you can use to create queries, in this project we use Sparql.

In [None]:
# if on colab, you can install the requirements with the following command
!pip install -r requirements.txt

## Process the raw dataset

In this section we will process the raw steam data, by doing this we will be able to add each game's present keyphrases.

In [None]:
# standard library imports
import itertools
import re

# third-party imports
import numpy as np
import pandas as pd


def remove_non_english(df):
    # keep only rows marked as supporting english
    df = df[df['english'] == 1].copy()

    # keep rows which don't contain 3 or more non-ascii characters in succession
    df = df[~df['name'].str.contains('[^\u0001-\u007F]{3,}')]

    # remove english column, now redundant
    df = df.drop('english', axis=1)

    return df

def calc_rating(row):
    """Calculate rating score based on SteamDB method."""
    import math

    pos = row['positive_ratings']
    neg = row['negative_ratings']

    total_reviews = pos + neg
    average = pos / total_reviews

    # pulls score towards 50, pulls more strongly for games with few reviews
    score = average - (average*0.5) * 2**(-math.log10(total_reviews + 1))

    return score * 100

def pre_process(df):
    # keep english only
    df = remove_non_english(df)

    # keep windows only, and remove platforms column
    df = df[df['platforms'].str.contains('windows')].drop('platforms', axis=1).copy()

    # keep lower bound of owners column, as integer
    df['owners'] = df['owners'].str.split('-').apply(lambda x: x[0]).astype(int)

    # calculate rating, as well as simple ratio for comparison
    df['total_ratings'] = df['positive_ratings'] + df['negative_ratings']
    df['rating_ratio'] = df['positive_ratings'] / df['total_ratings']
    df['rating'] = df.apply(calc_rating, axis=1)

    # convert release_date to datetime type and create separate column for release_year
    df['release_date'] = df['release_date'].astype('datetime64[ns]')
    df['release_year'] = df['release_date'].apply(lambda x: x.year)

    return df

In [None]:
df_steam = pd.read_csv('steam.csv')
df_steam.head()

In [None]:
# Extraction of games with high owners
df_steam_key = df_steam.nlargest(5000, 'owners')
# We remove all columns except appid and rating
df_steam_key = df_steam_key[['appid','name','rating']]

df_steam_description = pd.read_csv('steam_description_data.csv')

# Add the detailed_description column of steam_description to the df_steam_key (based on appid=steam_appid)
df_steam_key = df_steam_key.merge(df_steam_description[['steam_appid', 'short_description']], left_on='appid', right_on='steam_appid', how='left')
df_steam_key = df_steam_key.drop('steam_appid', axis=1)
df_steam_key = df_steam_key.drop('rating', axis=1)

df_steam_key.head()

In [None]:
# Extract keyphrases:
# Define a function to extract keyphrases from the descriptions
def extract_top_keyphrases(description):
    keywords = kw_model.extract_keywords(description, keyphrase_ngram_range=(2, 4), stop_words='english', top_n=8)
    return ', '.join([keyword[0] for keyword in keywords])

# Apply the function to the 'detailed_description' column to create a new column 'top_keyphrases'
df_steam_key['top_keyphrases'] = df_steam_key['short_description'].apply(extract_top_keyphrases)
# Now, df_steam_key contains a new 'top_keyphrases' column with the top 5 keyphrases for each game description.

df_steam_key.head()

In [None]:
# Extract keyphrases:
# Define a function to extract keyphrases from the descriptions
def extract_top_keyphrases(description):
    keywords = kw_model.extract_keywords(description, keyphrase_ngram_range=(2, 4), stop_words='english', top_n=8)
    return ', '.join([keyword[0] for keyword in keywords])

# Apply the function to the 'detailed_description' column to create a new column 'top_keyphrases'
df_steam_key['top_keyphrases'] = df_steam_key['short_description'].apply(extract_top_keyphrases)
# Now, df_steam_key contains a new 'top_keyphrases' column with the top 5 keyphrases for each game description.

df_steam_key.head()

In [3]:
df = pd.read_csv("df_kp_rating.csv", encoding="utf8")

In [4]:
df.head()

Unnamed: 0,appid,name,short_description,top_keyphrases,rating,developer,publisher,median_playtime,owners,steam_appid,detailed_description
0,570,Dota 2,"Every day, millions of players worldwide enter...","battle dota heroes matter, battle dota heroes,...",85.201281,Valve,Valve,801,100000000,570,<strong>The most-played game on Steam.</strong...
1,730,Counter-Strike: Global Offensive,Counter-Strike: Global Offensive (CS: GO) expa...,"counter strike global offensive, strike global...",86.310312,Valve;Hidden Path Entertainment,Valve,6502,50000000,730,Counter-Strike: Global Offensive (CS: GO) expa...
2,578080,PLAYERUNKNOWN'S BATTLEGROUNDS,PLAYERUNKNOWN'S BATTLEGROUNDS is a battle roya...,"playerunknown battlegrounds battle royale, pla...",50.066901,PUBG Corporation,PUBG Corporation,12434,50000000,578080,<strong>PLAYERUNKNOWN'S BATTLEGROUNDS</strong>...
3,440,Team Fortress 2,Nine distinct classes provide a broad range of...,"tactical abilities, classes provide broad rang...",92.933233,Valve,Valve,623,20000000,440,"<h1>The Jungle Inferno Update</h1><p><a href=""..."
4,230410,Warframe,Warframe is a cooperative free-to-play third p...,"warframe cooperative free play, warframe coope...",90.695302,Digital Extremes,Digital Extremes,394,20000000,230410,"<h1>Just Updated</h1><p><img src=""https://stea..."


In [5]:
df['top_keyphrases'] = df.apply(lambda row: str(row[3]).split(',') if not pd.isnull(row[3]) else [], axis=1)

  df['top_keyphrases'] = df.apply(lambda row: str(row[3]).split(',') if not pd.isnull(row[3]) else [], axis=1)


In [6]:
df['developer'] = df.apply(lambda row: str(row[5]).split(';') if not pd.isnull(row[5]) else [], axis=1)

  df['developer'] = df.apply(lambda row: str(row[5]).split(';') if not pd.isnull(row[5]) else [], axis=1)


In [25]:
df.groupby(['name']).count()

Unnamed: 0_level_0,appid,rating,detailed_description,top_keyphrases
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Story About My Uncle,5,5,5,5
APB Reloaded,5,5,5,5
ARK: Survival Evolved,5,5,5,5
Age of Empires II HD,5,5,5,5
Alien Swarm,5,5,5,5
...,...,...,...,...
Warface,5,5,5,5
Warframe,5,5,5,5
"Warhammer 40,000: Dawn of War II",5,5,5,5
Z1 Battle Royale,5,5,5,5


In [7]:
df

Unnamed: 0,appid,name,short_description,top_keyphrases,rating,developer,publisher,median_playtime,owners,steam_appid,detailed_description
0,570,Dota 2,"Every day, millions of players worldwide enter...","[battle dota heroes matter, battle dota heroe...",85.201281,[Valve],Valve,801,100000000,570,<strong>The most-played game on Steam.</strong...
1,730,Counter-Strike: Global Offensive,Counter-Strike: Global Offensive (CS: GO) expa...,"[counter strike global offensive, strike glob...",86.310312,"[Valve, Hidden Path Entertainment]",Valve,6502,50000000,730,Counter-Strike: Global Offensive (CS: GO) expa...
2,578080,PLAYERUNKNOWN'S BATTLEGROUNDS,PLAYERUNKNOWN'S BATTLEGROUNDS is a battle roya...,"[playerunknown battlegrounds battle royale, p...",50.066901,[PUBG Corporation],PUBG Corporation,12434,50000000,578080,<strong>PLAYERUNKNOWN'S BATTLEGROUNDS</strong>...
3,440,Team Fortress 2,Nine distinct classes provide a broad range of...,"[tactical abilities, classes provide broad ra...",92.933233,[Valve],Valve,623,20000000,440,"<h1>The Jungle Inferno Update</h1><p><a href=""..."
4,230410,Warframe,Warframe is a cooperative free-to-play third p...,"[warframe cooperative free play, warframe coo...",90.695302,[Digital Extremes],Digital Extremes,394,20000000,230410,"<h1>Just Updated</h1><p><img src=""https://stea..."
...,...,...,...,...,...,...,...,...,...,...,...
4995,600750,Star Trek Timelines,Explore the Final Frontier in the ultimate sci...,"[rpg star trek timelines, strategy rpg star t...",47.631427,[Disruptor Beam Inc.],Disruptor Beam Inc.,2473,50000,600750,"<h1>We are the Borg</h1><p><img src=""https://s..."
4996,601180,Doug and Lily,Doug and Lily go to rescue the child from wizard.,"[doug lily rescue child, lily rescue child wi...",65.795414,[the_dobrokot],the_dobrokot,315,50000,601180,Doug and Lily go to rescue the child from wiza...
4997,601340,Project of the Developer,Project of the Developer is a unique first per...,"[project developer unique, project developer,...",29.242639,[AL-GAME],AL-GAME,244,50000,601340,"<img src=""https://steamcdn-a.akamaihd.net/stea..."
4998,601530,Darkarta: A Broken Heart's Quest Collector's E...,Haunted by strange curses from her orphanage d...,"[netherworld destiny written blood, enchanted...",80.408835,[Tuttifrutti Interactive],Tuttifrutti Interactive,0,50000,601530,"<img src=""https://steamcdn-a.akamaihd.net/stea..."


In [13]:
from urllib.parse import quote, unquote

def keep_alphanumeric(input_string):
    return ''.join(char for char in input_string if char.isalnum() or char == ' ')


def encode_as_iri(input_string):
    return quote(input_string, safe='')

def to_rdf(df):
    prelude = """@prefix : <http://ex.org/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#>.
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
"""
    retstr = ""
    for _,row in df.iterrows():
        name = row["name"]
        name = keep_alphanumeric(name)
        retstr += f':{row["appid"]} rdfs:label "{name}"; :rating "{row["rating"]}"^^xsd:float; :med_play "{row["median_playtime"]}"^^xsd:int; :owners "{row["owners"]}"^^xsd:int;'
        s= ""
        for i,kp in enumerate(row['top_keyphrases']):
            kp2add = kp.strip().replace(" ", "_")
            kp2add = encode_as_iri(kp2add)
            if i==4:
                s+= f' :hasKP :{kp2add};'
                break
            s += f' :hasKP :{kp2add} ;'
        retstr += s
        s=""
        for i,dev in enumerate(row['developer']):
            dev2add = dev.strip().replace(" ", "_").replace(".","")
            dev2add = encode_as_iri(dev2add)
            if i == (len(row['developer']) - 1):
                s+= f' :dev :{dev2add}'
                break
            s += f' :dev :{dev2add} ;'
        retstr += s
        retstr += " .\n"
    return prelude+retstr

In [14]:
with open("steam_games.ttl", "w", encoding="utf-8") as f:
    f.write(to_rdf(df))

In [112]:
app_id=570
df[df.appid==app_id].name[0]

'Dota 2'