In [3]:
import numpy as np
import pandas as pd

## Read Files

In [35]:
# Read games.csv
game_details = pd.read_csv('final_data/game_details_subset.csv')
game_details.drop(['date_release', 'price', 'image_url', 'web_url', 'rating'], axis='columns', inplace=True)
game_details.set_index('app_id', inplace=True)
print(game_details.head())

                                    name  \
app_id                                     
440                      Team Fortress 2   
550                        Left 4 Dead 2   
570                               Dota 2   
620                             Portal 2   
730     Counter-Strike: Global Offensive   

                                              description  \
app_id                                                      
440     The most fun you can have online - PC Gamer Is...   
550     Set in the zombie apocalypse, Left 4 Dead 2 (L...   
570     The most-played game on Steam. Every day, mill...   
620     Portal 2 draws from the award-winning formula ...   
730     Counter-Strike: Global Offensive (CS: GO) expa...   

                                                languages  platform_windows  \
app_id                                                                        
440     ['English', 'Danish', 'Dutch', 'Finnish', 'Fre...              True   
550     ['Danish', 'Dutch'

In [36]:
# Split dataset for separate processing
name_df = game_details.loc[:,['name']]                  #map
desc_df = game_details.loc[:,['description']]           #tdidf
lang_df = game_details.loc[:,['languages']]             #ohe
plat_df = game_details.loc[:,['platform_windows',
                              'platform_mac',
                              'platform_linux']]        #ohe
dev_df = game_details.loc[:,['developers']]             #ohe
cat_df = game_details.loc[:,['categories']]             #ohe
genres_df = game_details.loc[:,['genres']]              #ohe
genres_df.head()

Unnamed: 0_level_0,genres
app_id,Unnamed: 1_level_1
440,"action,freetoplay"
550,action
570,"action,freetoplay,strategy"
620,"action,adventure"
730,"action,freetoplay"


In [37]:
# Genre - One Hot Encoding (by CountVectorizer)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
feature_vectors = vectorizer.fit_transform(genres_df['genres']).toarray()
vocab = vectorizer.get_feature_names_out()
genres_ohe = pd.DataFrame(data = feature_vectors, index = genres_df.index, columns = vocab)
genres_ohe.head(10)

Unnamed: 0_level_0,action,adventure,animationmodeling,casual,designillustration,earlyaccess,freetoplay,indie,massivelymultiplayer,photoediting,racing,rpg,simulation,sports,strategy,utilities
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
440,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
550,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
570,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
620,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
730,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4000,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
49520,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
72850,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
105600,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0
107410,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0


In [100]:
# Perform Cosine Similarity Matrix
from sklearn.metrics.pairwise import cosine_similarity
cs_matrix = pd.DataFrame(cosine_similarity(genres_ohe), index=genres_ohe.index, columns=genres_ohe.index)
cs_matrix.head(10)

# Save to File
cs_matrix.to_csv('final_data/content-based-similarity-matrix.csv', index=True)

In [111]:
# Given App_Id, Find Most Similar Games (by Genres)
def get_similar_list(query_app_id, index_start, index_end):
    query_app_id = str(query_app_id)
    cs_matrix = pd.read_csv('final_data/content-based-similarity-matrix.csv')
    result = cs_matrix.loc[:,['app_id', query_app_id]].sort_values(by=query_app_id,ascending=False)
    return result['app_id'][index_start:index_end].values

query_app_id = 550
final = get_similar_list(query_app_id, 0, 10)
final

array([ 582010,  381210,  374320,  359550,     550,     440,  239140,
       1174180, 1172620,  444090], dtype=int64)