In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

In [2]:
# Read the three cleaned Steam game CSV parts and stack them row-wise into one
# frame. Each part keeps its own row labels, so labels repeat after the concat.
# The 'Unnamed: 0' column (presumably a previously saved index) is discarded.
csv_parts = [
    '../data/steam_games_cleaned_1.csv',
    '../data/steam_games_cleaned_2.csv',
    '../data/steam_games_cleaned_3.csv',
]
part_frames = [pd.read_csv(csv_path) for csv_path in csv_parts]
games = pd.concat(part_frames, axis=0)
games = games.drop('Unnamed: 0', axis=1)
print(games.shape)
games.head()

(56763, 469)


Unnamed: 0,appid,name,genre,developer,publisher,owners,average_forever,median_forever,pos_rating_pct,total_ratings,...,Web Publishing,Well-Written,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,e-sports
0,10,Counter-Strike,Action,Valve,Valve,"10,000,000 .. 20,000,000",11666.0,244.0,0.974693,189081.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217837
1,20,Team Fortress Classic,Action,Valve,Valve,"2,000,000 .. 5,000,000",91.0,18.0,0.857002,6105.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30,Day of Defeat,Action,Valve,Valve,"5,000,000 .. 10,000,000",403.0,26.0,0.900055,5423.0,...,0.0,0.0,0.0,0.0,0.0,0.015287,0.313376,0.0,0.0,0.0
3,40,Deathmatch Classic,Action,Valve,Valve,"5,000,000 .. 10,000,000",33.0,6.0,0.816689,2193.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50,Half-Life: Opposing Force,Action,Gearbox Software,Valve,"5,000,000 .. 10,000,000",322.0,127.0,0.951749,13119.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Largest value per game across every column from position 42 onward.
# NOTE(review): position 42 is presumably where the tag-ratio columns begin —
# confirm against the CSV layout (a label slice would be less brittle).
# Computed as a single vectorised row-wise max instead of a Python loop that
# builds one Series per row. `.to_numpy()` makes the assignment positional,
# which matters because the concatenated frame has repeated row labels.
# Note: DataFrame.max skips NaN by default.
games['has_tag'] = games.iloc[:, 42:].max(axis=1).to_numpy()
games['has_tag'].value_counts()

1.000000    48286
0.000000     8463
0.962500        1
0.925926        1
0.968750        1
0.923077        1
0.877551        1
0.950000        1
0.981013        1
0.983010        1
0.958333        1
0.965714        1
0.941176        1
0.953488        1
0.956522        1
0.967213        1
Name: has_tag, dtype: int64

In [4]:
# The recommender works on tag ratios, so only rows whose `has_tag` value is
# exactly 1 are kept; every other row (including any with a value strictly
# between 0 and 1) is removed here.
keep_mask = games['has_tag'] == 1
games = games.loc[keep_mask]

# Two lookups built from the same (appid, name) pairs:
#   appid_dict : appid -> name
#   search_df  : indexed by name, with a single 'appid' column
id_name_pairs = games[['appid', 'name']]
appid_dict = id_name_pairs.set_index('appid')['name'].to_dict()
search_df = id_name_pairs.set_index('name')

In [5]:
# Index the frame by appid rather than by name, because game names may be
# similar or duplicated.
games = games.set_index('appid')

In [6]:
# Order the rows by their index labels, then show the frame's shape and a preview.
games = games.sort_index()
print(games.shape)
games.head()

(48286, 469)


Unnamed: 0_level_0,name,genre,developer,publisher,owners,average_forever,median_forever,pos_rating_pct,total_ratings,price,...,Well-Written,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,e-sports,has_tag
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,Counter-Strike,Action,Valve,Valve,"10,000,000 .. 20,000,000",11666.0,244.0,0.974693,189081.0,999.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217837,1.0
20,Team Fortress Classic,Action,Valve,Valve,"2,000,000 .. 5,000,000",91.0,18.0,0.857002,6105.0,499.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30,Day of Defeat,Action,Valve,Valve,"5,000,000 .. 10,000,000",403.0,26.0,0.900055,5423.0,499.0,...,0.0,0.0,0.0,0.0,0.015287,0.313376,0.0,0.0,0.0,1.0
40,Deathmatch Classic,Action,Valve,Valve,"5,000,000 .. 10,000,000",33.0,6.0,0.816689,2193.0,499.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50,Half-Life: Opposing Force,Action,Gearbox Software,Valve,"5,000,000 .. 10,000,000",322.0,127.0,0.951749,13119.0,499.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Text to look for; every game whose name contains it is listed so the exact
# title can be picked.
search = 'Beat Hazard'
# regex=False matches the text literally, so titles containing regex
# metacharacters (e.g. '+', '(', '[') can be searched without escaping.
# na=False treats a missing name as "no match" instead of putting NaN into
# the boolean mask.
titles = search_df[search_df.index.str.contains(search, regex=False, na=False)]
print(titles)

                appid
name                 
Beat Hazard     49600
Beat Hazard 2  618740


In [8]:
# appid stored under the name that exactly equals `search`.
search_df['appid'].loc[search]

49600

In [9]:
%%time
recommender = games.loc[ : , 'Action' : 'e-sports']
sparse_rec = sparse.csr_matrix(recommender)
dists = pairwise_distances(sparse_rec, metric='cosine')
recommender_df = pd.DataFrame(dists, columns=recommender.index, index=recommender.index)

Wall time: 4min 18s


In [10]:
# appid of the game whose name exactly equals `search`.
query_appid = search_df.loc[search, 'appid']

# 20 closest games by cosine distance. The query game is removed by label
# rather than by skipping the first sorted row: if another game is also at
# distance 0, the sort order among the tied rows is not guaranteed, so a
# positional skip could drop a real match and keep the query game itself.
top_recommendations = recommender_df[query_appid].drop(query_appid).sort_values().head(20)

# Attach readable names and present the name next to the distance.
top_recs_df = pd.DataFrame(top_recommendations)
top_recs_df['name'] = [appid_dict[appid] for appid in top_recs_df.index]
top_recs_df[['name', top_recommendations.name]].rename(columns={'name':'Game Name', top_recommendations.name : f'Similarity to {search} (lower is better)'})

Unnamed: 0_level_0,Game Name,Similarity to Beat Hazard (lower is better)
appid,Unnamed: 1_level_1,Unnamed: 2_level_1
207750,Symphony,0.099637
618740,Beat Hazard 2,0.133285
1180620,Bullet Beat: Musical Shoot'em up,0.192314
12900,AudioSurf,0.206729
235800,Audiosurf 2,0.227873
67000,The Polynomial - Space of the music,0.231299
290000,DubWars,0.259847
513510,Intralism,0.284387
255370,KickBeat Steam Edition,0.29086
301190,Frederic: Resurrection of Music,0.296324


In [36]:
%%time
top_100_frame = pd.DataFrame(columns = ['games'])
for game_index in list(recommender_df.index):
    top100 = recommender_df[game_index].sort_values()[1:101]
    top_20_frame.loc[game_index] = str(list(top100.index))
    print(f'{round(top_100_frame.shape[0]/games.shape[0],2)*100}% complete', end= '\r')

48286

In [37]:
# Write the per-game neighbour lists to disk (index included).
# NOTE(review): the file name says "top20" while the frame is named
# top_100_frame — confirm what downstream readers expect before renaming.
simils_path = '../data/top20_simils.csv'
top_100_frame.to_csv(simils_path)

In [40]:
# Show the full rows for a hand-picked list of appids, in the order listed.
inspect_appids = [
    301200, 301190, 453100, 762500, 329320, 357720, 708890,
    394140, 656600, 231040, 1268860, 290000, 412970, 67000,
    340480, 63700, 1077600, 372690, 1006470, 583760, 1021680,
]
games.loc[inspect_appids]

Unnamed: 0_level_0,name,genre,developer,publisher,owners,average_forever,median_forever,pos_rating_pct,total_ratings,price,...,Well-Written,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,e-sports,has_tag
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
301200,Frederic: Evil Strikes Back,"Action, Casual, Indie",Forever Entertainment S. A.,Forever Entertainment S. A.,"100,000 .. 200,000",164.0,181.0,0.844037,763.0,799.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
301190,Frederic: Resurrection of Music,"Action, Casual, Indie",Forever Entertainment S. A.,Forever Entertainment S. A.,"100,000 .. 200,000",156.0,194.0,0.839305,921.0,299.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
453100,Frederic: Resurrection of Music Director's Cut,"Action, Casual, Indie",Forever Entertainment S. A.,Forever Entertainment S. A.,"20,000 .. 50,000",168.0,213.0,0.823729,295.0,599.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
762500,Rhythm Girl,"Action, Casual, Indie",Pauline Game,Pauline Game,"0 .. 20,000",0.0,0.0,0.846154,26.0,199.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
329320,QbQbQb,"Action, Casual, Indie",Rezoner,Rezoner,"20,000 .. 50,000",0.0,0.0,0.894231,104.0,499.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
357720,Inside My Radio,"Action, Casual, Indie",Seaven Studio,Iceberg Interactive,"50,000 .. 100,000",360.0,360.0,0.884876,443.0,499.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
708890,BEATris,"Action, Casual, Indie",Rhythmical Badass,Rhythmical Badass,"0 .. 20,000",0.0,0.0,0.860465,43.0,299.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
394140,Sound Shift,"Action, Casual, Indie",Matthew Brown,Matthew Brown,"20,000 .. 50,000",14.0,14.0,0.826087,138.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
656600,Slash It Ultimate,"Action, Casual, Indie",EGAMER,EGAMER,"100,000 .. 200,000",130.0,119.0,0.717213,488.0,1099.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
231040,Beatbuddy: Tale of the Guardians,"Action, Adventure, Casual, Indie",Threaks,Threaks,"200,000 .. 500,000",251.0,289.0,0.770889,1113.0,999.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
