In [2]:
import numpy as np
import pandas as pd

## Read Files

In [3]:
# Load the cleaned game-details subset, discard the columns that are not
# used further down, and key the table by app_id.
game_details = pd.read_csv('final_data/game_details_subset_clean.csv')
game_details = (
    game_details
    .drop(columns=['date_release', 'price', 'image_url', 'web_url', 'rating'])
    .set_index('app_id')
)
print(game_details.head())

                                    name  \
app_id                                     
1                        Team Fortress 2   
2                          Left 4 Dead 2   
3                                 Dota 2   
4                               Portal 2   
5       Counter-Strike: Global Offensive   

                                              description  \
app_id                                                      
1       Play as much as you want, as long as you like!...   
2       Set in the zombie apocalypse, Left 4 Dead 2 (L...   
3       The most-played game on Steam. Every day, mill...   
4       Portal 2 draws from the award-winning formula ...   
5       Counter-Strike: Global Offensive (CS: GO) expa...   

                                                languages  platform_windows  \
app_id                                                                        
1       ['English', 'Danish', 'Dutch', 'Finnish', 'Fre...              True   
2       ['Danish', 'Dutch'

In [4]:
# Split dataset for separate processing: one frame per feature group. Every
# selection keeps all rows (`.loc[:, ...]`), so each frame shares
# game_details' index. The trailing notes record the treatment the original
# author planned for each group.
name_df = game_details.loc[:,['name']]                  # map (per original note; presumably an id -> name lookup — confirm)
desc_df = game_details.loc[:,['description']]           # TF-IDF
lang_df = game_details.loc[:,['languages']]             # one-hot encoding
plat_df = game_details.loc[:,['platform_windows',
                              'platform_mac',
                              'platform_linux']]        # one-hot encoding
dev_df = game_details.loc[:,['developers']]             # one-hot encoding
cat_df = game_details.loc[:,['categories']]             # one-hot encoding
genres_df = game_details.loc[:,['genres']]              # one-hot encoding
# Preview the genres frame (last expression, so the notebook displays it).
genres_df.head()

Unnamed: 0_level_0,genres
app_id,Unnamed: 1_level_1
1,"action,f2p"
2,action
3,"action,f2p,strategy"
4,"action,adventure"
5,"action,f2p"


In [5]:
# Genre - One Hot Encoding (by CountVectorizer)
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer produces term *counts* by default; binary=True caps every
# entry at 1 so the result is a genuine one-hot/multi-hot matrix even if a
# genre token is repeated within a row (a count of 2 would otherwise skew the
# cosine similarity computed from this matrix).
# NOTE(review): the default tokenizer lowercases, splits on punctuation and
# ignores single-character tokens, so this relies on every genre being a single
# alphanumeric word of 2+ characters (e.g. "action,f2p") — confirm for new data.
vectorizer = CountVectorizer(binary=True)

# Dense 0/1 array with one row per game and one column per genre token.
feature_vectors = vectorizer.fit_transform(genres_df['genres']).toarray()
vocab = vectorizer.get_feature_names_out()

# Re-attach the app_id index and the genre names as column labels.
genres_ohe = pd.DataFrame(data=feature_vectors, index=genres_df.index, columns=vocab)
genres_ohe.head(10)

Unnamed: 0_level_0,action,adventure,casual,design,earlyaccess,f2p,indie,mmo,racing,rpg,simulation,sports,strategy,utilities
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,0,0,1,0
4,1,1,0,0,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,1,0,0,0
7,1,0,0,0,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,0,1,0,0,0,0
9,1,1,0,0,0,0,1,0,0,1,0,0,0,0
10,1,0,0,0,0,0,0,0,0,0,1,0,1,0


In [6]:
# Perform Cosine Similarity Matrix
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of genres_ohe; both axes are
# labelled with genres_ohe's index so a score is looked up as
# cs_matrix.loc[id_a, id_b].
cs_matrix = pd.DataFrame(cosine_similarity(genres_ohe), index=genres_ohe.index, columns=genres_ohe.index)

# Save to File
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a fresh checkout may not have it).
Path('model_output').mkdir(parents=True, exist_ok=True)
cs_matrix.to_csv('model_output/content_based_game_similarity_matrix.csv', index=True)

# Preview. This has to be the cell's last expression to be displayed; placed
# before the save it was evaluated and silently discarded.
cs_matrix.head(10)

In [7]:
# Given App_Id, Find Most Similar Games (by Genres)
def get_similar_list(query_app_id, index_start, index_end,
                     matrix_path='model_output/content_based_game_similarity_matrix.csv'):
    """Return app_ids ranked by similarity to ``query_app_id``, best match first.

    The similarity matrix is read from ``matrix_path``, a CSV holding an
    ``app_id`` column plus one score column per game whose header is that
    game's app_id.

    Parameters
    ----------
    query_app_id : int or str
        Id of the game to find neighbours for. It is matched against the CSV
        header as ``str(query_app_id)`` and against the ``app_id`` column as
        ``int(query_app_id)``.
    index_start, index_end : int
        Positional slice bounds applied to the ranking after the query game
        itself has been removed (``0, 10`` gives the first ten entries).
    matrix_path : str or path-like, optional
        Location of the similarity-matrix CSV. Defaults to the file written
        by this notebook.

    Returns
    -------
    numpy.ndarray
        The ``app_id`` values of the selected slice, ordered by descending
        similarity score.

    Raises
    ------
    KeyError
        If the CSV has no ``app_id`` column or no column for ``query_app_id``.
    """
    # Column labels come back from CSV as strings, hence the str() lookup key.
    query_col = str(query_app_id)

    # Read only the header row first: this is cheap and lets us fail with a
    # clear message instead of an opaque indexing error.
    available = pd.read_csv(matrix_path, nrows=0).columns
    missing = [col for col in ('app_id', query_col) if col not in available]
    if missing:
        raise KeyError(
            f"column(s) {missing} not found in similarity matrix {matrix_path!r}"
        )

    # Only two columns are needed, so skip parsing the rest of the N x N matrix.
    scores = pd.read_csv(matrix_path, usecols=['app_id', query_col])
    ranked = scores.sort_values(by=query_col, ascending=False)

    # A game is always maximally similar to itself; drop it from its own ranking.
    ranked = ranked[ranked['app_id'] != int(query_app_id)]

    # .iloc makes it explicit that the bounds are positions in the ranking,
    # not index labels.
    return ranked['app_id'].iloc[index_start:index_end].values

# Example query: the first ten entries (at most) of the similarity ranking
# for app_id 1.
query_app_id = 1
final = get_similar_list(query_app_id, index_start=0, index_end=10)
final

array([ 5, 36, 24,  3, 47, 44, 14, 39, 38, 33], dtype=int64)