In [188]:
import numpy as np
import pandas as pd

## Read Files
Note: These data should come from the DB instead. These are just temp steps for working on implementing a recommender system.

In [189]:
# Read games_metadata.json
#raw_tags = pd.read_json('data/games_metadata.json', lines=True)
#raw_tags.drop(['description'], axis='columns', inplace=True)
#print(raw_tags.head(), '\n')
#print('--------------------')

# Read games.csv
raw_games = pd.read_csv('data/games.csv')
raw_games.drop(['date_release', 'rating', 'price_original', 'discount', 'user_reviews'], axis='columns', inplace=True)
#print(raw_games.head(), '\n')
#print('--------------------')

# Read game_detail.csv
raw_details = pd.read_csv('data/game_detail.csv')
raw_details = raw_details[['AppID', 'Developers', 'Genres']]
raw_details.rename(columns={'AppID': 'app_id', 'Developers': 'developers', 'Genres': 'genres'}, inplace=True)
raw_details.dropna(inplace=True)
#print(raw_developers.shape)


In [276]:
# Join
games_df = raw_games.merge(raw_details, on='app_id', how='inner')
games_df.replace({True:1, False:0}, inplace=True)
games_df.set_index('app_id', inplace=True)

platform_df = games_df.iloc[:, 1:5]
numerical_df = games_df.iloc[:, 5:7]
developer_df = games_df.iloc[:, 7:8]
genres_df = games_df.iloc[:, 8:9]

print(games_df.shape)
#print(platform_df.head())       # Categorical
#print(numerical_df.head())      # Numerical
#print(developer_df.head())      # Requires OHE
#print(genres_df.head())         # Requires OHE

# Lowercase + Remove Spaces
genres_df['genres'] = genres_df['genres'].str.lower()
genres_df['genres'] = genres_df['genres'].replace(' ', '', regex=True)
genres_df['genres']


(39698, 9)


app_id
13500                            action,adventure
113020     action,adventure,casual,indie,strategy
226560                                  adventure
249050               adventure,indie,rpg,strategy
250180                                     action
                            ...                  
632470                                        rpg
1599660                                 adventure
250900                                     action
920210                           action,adventure
1361510             action,adventure,casual,indie
Name: genres, Length: 39698, dtype: object

In [271]:
# Genre - One Hot Encoding (by CountVectorizer)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
feature_vectors = vectorizer.fit_transform(genres_df['genres']).toarray()
vocab = vectorizer.get_feature_names_out()
genres_ohe = pd.DataFrame(data = feature_vectors, index = genres_df.index, columns= vocab)
genres_ohe

Unnamed: 0_level_0,accounting,action,adventure,animation,audioproduction,casual,design,earlyaccess,education,freetoplay,...,rpg,sexualcontent,simulation,softwaretraining,sports,strategy,utilities,videoproduction,violent,webpublishing
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13500,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
113020,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
226560,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
249050,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
250180,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632470,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1599660,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250900,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
920210,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [191]:
# Scale Data
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler()
#scaled_df = numerical_df.drop(['positive_ratio', 'price_final'], axis='columns')

#for column in numerical_df.columns:
#    data = games_df.loc[:, [column]].values
#    new = scaler.fit_transform(data)
#    scaled_df[column+'_scaled'] = new

#print(numerical_df.head())
#print(scaled_df.head())

In [278]:
# Perform Cosine Similarity w/ Query
query_id = 113020
query_item_genres = genres_ohe[genres_ohe.index == query_id]

from sklearn.metrics.pairwise import cosine_similarity
games_df['cs'] = cosine_similarity(query_item_genres, genres_ohe)[0]
games_df[['title', 'genres', 'cs']]

Unnamed: 0_level_0,title,genres,cs
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
13500,Prince of Persia: Warrior Within™,"Action,Adventure",0.632456
113020,Monaco: What's Yours Is Mine,"Action,Adventure,Casual,Indie,Strategy",1.000000
226560,Escape Dead Island,Adventure,0.447214
249050,Dungeon of the ENDLESS™,"Adventure,Indie,RPG,Strategy",0.670820
250180,METAL SLUG 3,Action,0.447214
...,...,...,...
632470,Disco Elysium - The Final Cut,RPG,0.000000
1599660,Sackboy™: A Big Adventure,Adventure,0.447214
250900,The Binding of Isaac: Rebirth,Action,0.447214
920210,LEGO® Star Wars™: The Skywalker Saga,"Action,Adventure",0.632456


In [279]:
# Given App_Id, Find Top 5 Most Similar (by Genres)
sorted = games_df.sort_values(['cs', 'positive_ratio'], ascending=False)  # Sort by Est Owners instead
sorted = sorted[sorted.index != query_id]
print(sorted.head())
print(sorted.head(5).index)

                              title  win  mac  linux  steam_deck  \
app_id                                                             
1361270                Flying Slime    1    0      0           1   
1757560               One Pixel TWB    1    1      1           1   
1588910           Peppy's Adventure    1    1      1           1   
778780   Desktop Agents - Cov1d-999    1    0      0           1   
1271170     The Wizard and The Slug    1    0      0           1   

         positive_ratio  price_final        developers  \
app_id                                                   
1361270             100         2.99         JoinGames   
1757560             100         1.99            Ireero   
1588910             100         6.99    Pepite Studios   
778780              100         0.99  Microblast Games   
1271170             100         4.99          Silkworm   

                                         genres   cs  
app_id                                                
136127