In [1]:
import pandas as pd
import numpy as np
import ast
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

In [2]:
games = pd.concat([pd.read_csv('../data/steam_games_cleaned_1.csv'), pd.read_csv('../data/steam_games_cleaned_2.csv'), pd.read_csv('../data/steam_games_cleaned_3.csv')], axis=0)
print(games.shape)
games.head()

(56763, 502)


Unnamed: 0,appid,name,developer,publisher,owners,average_forever,median_forever,pos_rating_pct,total_ratings,price,...,Web Publishing,Well-Written,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,e-sports
0,10,Counter-Strike,Valve,Valve,"10,000,000-20,000,000",11666,244,0.974693,189081,9.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217837
1,20,Team Fortress Classic,Valve,Valve,"2,000,000-5,000,000",91,18,0.857002,6105,4.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30,Day of Defeat,Valve,Valve,"5,000,000-10,000,000",403,26,0.900055,5423,4.99,...,0.0,0.0,0.0,0.0,0.0,0.015287,0.313376,0.0,0.0,0.0
3,40,Deathmatch Classic,Valve,Valve,"5,000,000-10,000,000",33,6,0.816689,2193,4.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,"5,000,000-10,000,000",322,127,0.951749,13119,4.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
games['has_tag'] = [games.iloc[i,72:].drop(columns = ['Free to Play','Early Access']).values.max() for i in range(len(games))]
games['has_tag'].value_counts()

1.0    48300
0.0     8463
Name: has_tag, dtype: int64

In [4]:
# recommender will use tag ratios so we'll need to trim out any games missing tags
games = games.loc[games['has_tag'] > 0]

search_df = games.loc[ : , ['appid','name']]
search_df['name'] = [name.lower() for name in search_df['name']]
search_df.set_index('name', inplace=True)

In [5]:
# set appids as our index since we may have similar or duplicate game names
games.set_index('appid', inplace=True)
games.sort_index(inplace=True)
print(games.shape)
games.head()

(48300, 502)


Unnamed: 0_level_0,name,developer,publisher,owners,average_forever,median_forever,pos_rating_pct,total_ratings,price,languages_arabic,...,Well-Written,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,e-sports,has_tag
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,Counter-Strike,Valve,Valve,"10,000,000-20,000,000",11666,244,0.974693,189081,9.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217837,1.0
20,Team Fortress Classic,Valve,Valve,"2,000,000-5,000,000",91,18,0.857002,6105,4.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30,Day of Defeat,Valve,Valve,"5,000,000-10,000,000",403,26,0.900055,5423,4.99,0,...,0.0,0.0,0.0,0.0,0.015287,0.313376,0.0,0.0,0.0,1.0
40,Deathmatch Classic,Valve,Valve,"5,000,000-10,000,000",33,6,0.816689,2193,4.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50,Half-Life: Opposing Force,Gearbox Software,Valve,"5,000,000-10,000,000",322,127,0.951749,13119,4.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
search = 'Beat Hazard'.lower()
titles = search_df[search_df.index.str.contains(search)]
print(titles)

                appid
name                 
beat hazard     49600
beat hazard 2  618740


In [7]:
search_df.loc[search,'appid']

49600

In [8]:
%%time
recommender = games.loc[ : , '1980s' : 'e-sports'].drop(columns = ['Free to Play','Early Access'])
sparse_rec = sparse.csr_matrix(recommender)
dists = pairwise_distances(sparse_rec, metric='cosine')
recommender_df = pd.DataFrame(dists, columns=recommender.index, index=recommender.index)

Wall time: 5min 32s


In [9]:
top_recommendations = recommender_df[search_df.loc[search,'appid']].sort_values()[1:21]
top_recs_df = pd.DataFrame(top_recommendations)
top_recs_df['name'] = [games.loc[ind]['name'] for ind in top_recs_df.index]
top_recs_df[['name',top_recommendations.name]].rename(columns={'name':'Game Name', top_recommendations.name : f'Similarity to {search} (lower is better)'})

Unnamed: 0_level_0,Game Name,Similarity to beat hazard (lower is better)
appid,Unnamed: 1_level_1,Unnamed: 2_level_1
207750,Symphony,0.099637
618740,Beat Hazard 2,0.140993
1180620,Bullet Beat: Musical Shoot'em up,0.203391
12900,AudioSurf,0.206729
235800,Audiosurf 2,0.227873
67000,The Polynomial - Space of the music,0.231299
290000,DubWars,0.259847
513510,Intralism,0.284387
255370,KickBeat Steam Edition,0.29086
301190,Frederic: Resurrection of Music,0.296324


In [10]:
%%time
top100_df = pd.DataFrame(columns = ['games'])
for game_index in list(recommender_df.index):
    top100 = recommender_df[game_index].sort_values()[1:101]
    top100_df.loc[game_index] = str(list(top100.index))
    print(' '*50, end = '\r')
    print(f'{int(round(top100_df.shape[0]/games.shape[0]*100,0))}% complete', end= '\r')
print(' '*50, end = '\r')
top100_df.to_csv('../data/top100_simils.csv')

Wall time: 6min 19s                               


In [11]:
top100_df.loc[search_df.loc[search]]

Unnamed: 0,games
49600,"[207750, 618740, 1180620, 12900, 235800, 67000..."


In [12]:
top100_df.loc[search_df.loc[search]].values[0][0]

'[207750, 618740, 1180620, 12900, 235800, 67000, 290000, 513510, 255370, 301190, 205080, 357720, 930620, 762500, 222660, 247140, 1020340, 301200, 892930, 823730, 231040, 63700, 1222930, 980360, 921630, 499890, 301540, 268260, 453100, 259530, 329320, 708890, 372690, 1019400, 807960, 865250, 614030, 16300, 878180, 945100, 323040, 1352730, 369030, 1014710, 980610, 463150, 420160, 205060, 246800, 735570, 205070, 381320, 684680, 348280, 397690, 911580, 1291720, 1001970, 926870, 1309980, 1152440, 1048300, 531510, 297110, 977950, 282760, 34920, 351990, 520470, 201570, 818620, 412740, 565660, 394140, 924560, 1132840, 1250350, 446560, 298280, 338000, 342650, 697600, 566780, 438460, 58400, 893030, 1021680, 281860, 727450, 1262180, 691160, 380550, 774171, 920700, 1088960, 266250, 397570, 461230, 218060, 744060]'

In [13]:
results = ast.literal_eval(top100_df.loc[search_df.loc[search]].values[0][0])
search_range = 20
games.loc[results][ : search_range]

Unnamed: 0_level_0,name,developer,publisher,owners,average_forever,median_forever,pos_rating_pct,total_ratings,price,languages_arabic,...,Well-Written,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,e-sports,has_tag
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
207750,Symphony,Empty Clip Studios,Empty Clip Studios,"100,000-200,000",311,379,0.846369,1432,4.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
618740,Beat Hazard 2,Cold Beam Games,Cold Beam Games,"100,000-200,000",207,231,0.876216,1850,18.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1180620,Bullet Beat: Musical Shoot'em up,TERNOX,TERNOX,"0-20,000",0,0,0.764706,17,4.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12900,AudioSurf,Dylan Fitterer,"Codemasters, Electronic Arts","1,000,000-2,000,000",1804,276,0.95847,10354,9.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
235800,Audiosurf 2,Dylan Fitterer,Dylan Fitterer,"200,000-500,000",973,294,0.817476,5539,14.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
67000,The Polynomial - Space of the music,Dmytry Lavrov,Dmytry Lavrov,"100,000-200,000",15,13,0.735417,480,6.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
290000,DubWars,MURA Interactive,WOBBL3 Entertainment LLC,"50,000-100,000",144,144,0.787234,282,9.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
513510,Intralism,KHB-Soft,KHB-Soft,"500,000-1,000,000",216,265,0.877168,4844,2.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
255370,KickBeat Steam Edition,Zen Studios,Zen Studios,"100,000-200,000",119,154,0.664894,752,9.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
301190,Frederic: Resurrection of Music,Forever Entertainment S. A.,Forever Entertainment S. A.,"100,000-200,000",156,194,0.839305,921,2.99,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
search_df.to_csv('../data/search_keys.csv')

In [19]:
search_df.head()

Unnamed: 0_level_0,appid
name,Unnamed: 1_level_1
counter-strike,10
team fortress classic,20
day of defeat,30
deathmatch classic,40
half-life: opposing force,50


In [27]:
search = 'Counter'.lower()
titles = list(search_df[search_df.index.str.contains(search)].index)
list(games.loc[search_df.loc[titles]['appid'].values]['name'].values)


['Counter-Strike',
 'Counter-Strike: Condition Zero',
 'Counter-Strike: Condition Zero',
 'Counter-Strike: Condition Zero',
 'Counter-Strike: Condition Zero',
 'Counter-Strike: Source',
 'Counter-Strike: Global Offensive',
 'Harvest: Massive Encounter',
 'Serious Sam HD: The First Encounter',
 'Serious Sam HD: The Second Encounter',
 'Serious Sam Classic: The First Encounter',
 'Serious Sam Classic: The Second Encounter',
 'Serious Sam: The Random Encounter',
 'Counter-Strike Nexon: Studio',
 'Counter Spell',
 "Ghost Encounters: Deadwood - Collector's Edition",
 'Jet Racing Extreme: The First Encounter',
 'CTU: Counter Terrorism Unit',
 'OASE - Other Age Second Encounter',
 'Rencounter',
 'Pub Encounter',
 'CounterAttack',
 'Counter Agents',
 'Counter Fight',
 'Serious Sam VR: The First Encounter',
 'Serious Sam VR: The Second Encounter',
 'Counter Fight: Samurai Edition',
 'Encounter of Galaxies',
 'Graze Counter',
 'Last Encounter',
 'Hallowed Encounter',
 'The Ball Encounter',
 'Cou

In [None]:
games[(games['Action'] > 0) & (games['e-sports'] > 0)]