In [1]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
steam_games = pd.read_csv('dataset/steam-store-games/steam.csv')
steam_des = pd.read_csv('dataset/steam-store-games/steam_description_data.csv')

steam_200 = pd.read_csv('dataset/steam-200k.csv')

#steam_games = steam_games[['appid', 'name', 'developer', 'platforms', 'categories', 'genres', 'steamspy_tags']]
steam_games = steam_games[['appid', 'name', 'platforms', 'categories', 'steamspy_tags']]
steam_games.head()

Unnamed: 0,appid,name,platforms,categories,steamspy_tags
0,10,Counter-Strike,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action;FPS;Multiplayer
1,20,Team Fortress Classic,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action;FPS;Multiplayer
2,30,Day of Defeat,windows;mac;linux,Multi-player;Valve Anti-Cheat enabled,FPS;World War II;Multiplayer
3,40,Deathmatch Classic,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action;FPS;Multiplayer
4,50,Half-Life: Opposing Force,windows;mac;linux,Single-player;Multi-player;Valve Anti-Cheat en...,FPS;Action;Sci-fi


In [3]:
steam_des.head()

Unnamed: 0,steam_appid,detailed_description,about_the_game,short_description
0,10,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...
1,20,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...
2,30,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...
3,40,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...
4,50,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...


In [4]:
steam_des.columns = columns = ['appid', 'detailed_description', 'about_the_game', 'description']
steam_des = steam_des[['appid', 'description']]
steam_des.head()

Unnamed: 0,appid,description
0,10,Play the world's number 1 online action game. ...
1,20,One of the most popular online action games of...
2,30,Enlist in an intense brand of Axis vs. Allied ...
3,40,Enjoy fast-paced multiplayer gaming with Death...
4,50,Return to the Black Mesa Research Facility as ...


In [5]:
df = pd.merge(steam_games, steam_des, on = 'appid')
df = df[:2000]
df.head()

Unnamed: 0,appid,name,platforms,categories,steamspy_tags,description
0,10,Counter-Strike,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action;FPS;Multiplayer,Play the world's number 1 online action game. ...
1,20,Team Fortress Classic,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action;FPS;Multiplayer,One of the most popular online action games of...
2,30,Day of Defeat,windows;mac;linux,Multi-player;Valve Anti-Cheat enabled,FPS;World War II;Multiplayer,Enlist in an intense brand of Axis vs. Allied ...
3,40,Deathmatch Classic,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action;FPS;Multiplayer,Enjoy fast-paced multiplayer gaming with Death...
4,50,Half-Life: Opposing Force,windows;mac;linux,Single-player;Multi-player;Valve Anti-Cheat en...,FPS;Action;Sci-fi,Return to the Black Mesa Research Facility as ...


In [6]:
df.drop(columns = ['appid'], inplace = True)
df.head()

Unnamed: 0,name,platforms,categories,steamspy_tags,description
0,Counter-Strike,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action;FPS;Multiplayer,Play the world's number 1 online action game. ...
1,Team Fortress Classic,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action;FPS;Multiplayer,One of the most popular online action games of...
2,Day of Defeat,windows;mac;linux,Multi-player;Valve Anti-Cheat enabled,FPS;World War II;Multiplayer,Enlist in an intense brand of Axis vs. Allied ...
3,Deathmatch Classic,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action;FPS;Multiplayer,Enjoy fast-paced multiplayer gaming with Death...
4,Half-Life: Opposing Force,windows;mac;linux,Single-player;Multi-player;Valve Anti-Cheat en...,FPS;Action;Sci-fi,Return to the Black Mesa Research Facility as ...


In [7]:
# discarding the commas and hyphens
df['platforms'] = df['platforms'].map(lambda x: x.split(';'))

df['categories'] = df['categories'].map(lambda x: x.lower().split(';'))

df['steamspy_tags'] = df['steamspy_tags'].map(lambda x: x.lower().split(';'))

#df['developer'] = df['developer'].map(lambda x: x.lower())

#df['genres'] = df['genres'].map(lambda x: x.lower())

df.set_index('name', inplace = True)

# merging together first and last name for each actor and director, so it's considered as one word 
# and there is no mix up between people sharing a first name
for index, row in df.iterrows():
    row['categories'] = [x.lower().replace(' ','') for x in row['categories']]
    #row['Director'] = ''.join(row['Director']).lower()


df.head()


Unnamed: 0_level_0,platforms,categories,steamspy_tags,description
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Counter-Strike,"[windows, mac, linux]","[multi-player, onlinemulti-player, localmulti-...","[action, fps, multiplayer]",Play the world's number 1 online action game. ...
Team Fortress Classic,"[windows, mac, linux]","[multi-player, onlinemulti-player, localmulti-...","[action, fps, multiplayer]",One of the most popular online action games of...
Day of Defeat,"[windows, mac, linux]","[multi-player, valveanti-cheatenabled]","[fps, world war ii, multiplayer]",Enlist in an intense brand of Axis vs. Allied ...
Deathmatch Classic,"[windows, mac, linux]","[multi-player, onlinemulti-player, localmulti-...","[action, fps, multiplayer]",Enjoy fast-paced multiplayer gaming with Death...
Half-Life: Opposing Force,"[windows, mac, linux]","[single-player, multi-player, valveanti-cheate...","[fps, action, sci-fi]",Return to the Black Mesa Research Facility as ...


In [8]:
# initializing the new column
df['Key_words'] = ""

for index, row in df.iterrows():
    des = row['description']
    
    # instantiating Rake, by default is uses english stopwords from NLTK
    # and discard all puntuation characters
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(des)

    # getting the dictionary whith key words and their scores
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column
    row['Key_words'] = list(key_words_dict_scores.keys())

# dropping the Plot column
df.drop(columns = ['description'], inplace = True)
df.head()

Unnamed: 0_level_0,platforms,categories,steamspy_tags,Key_words
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Counter-Strike,"[windows, mac, linux]","[multi-player, onlinemulti-player, localmulti-...","[action, fps, multiplayer]","[play, teammates, success, affects, complete, ..."
Team Fortress Classic,"[windows, mac, linux]","[multi-player, onlinemulti-player, localmulti-...","[action, fps, multiplayer]","[abilities, time, game, play, modes, one, spy,..."
Day of Defeat,"[windows, mac, linux]","[multi-player, valveanti-cheatenabled]","[fps, world war ii, multiplayer]","[machine, sniper, disposal, light, missions, h..."
Deathmatch Classic,"[windows, mac, linux]","[multi-player, onlinemulti-player, localmulti-...","[action, fps, multiplayer]","[dmc, invites, players, futuristic, settings, ..."
Half-Life: Opposing Force,"[windows, mac, linux]","[single-player, multi-player, valveanti-cheate...","[fps, action, sci-fi]","[return, black, mesa, research, facility, new,..."


In [9]:
df['bag_of_words'] = ''
columns = df.columns
for index, row in df.iterrows():
    words = ''
    for col in columns:
        if col != 'developer' and col != 'genres':
            words = words + ' '.join(row[col])+ ' '
        else:
            words = words + row[col]+ ' '
    row['bag_of_words'] = words
    
df.drop(columns = [col for col in df.columns if col!= 'bag_of_words'], inplace = True)
df.head()
    

Unnamed: 0_level_0,bag_of_words
name,Unnamed: 1_level_1
Counter-Strike,windows mac linux multi-player onlinemulti-pla...
Team Fortress Classic,windows mac linux multi-player onlinemulti-pla...
Day of Defeat,windows mac linux multi-player valveanti-cheat...
Deathmatch Classic,windows mac linux multi-player onlinemulti-pla...
Half-Life: Opposing Force,windows mac linux single-player multi-player v...


In [10]:
#short_df = df[:250]

# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use later to match the indexes
indices = pd.Series(df.index)
indices[:5]


0               Counter-Strike
1        Team Fortress Classic
2                Day of Defeat
3           Deathmatch Classic
4    Half-Life: Opposing Force
Name: name, dtype: object

In [11]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim


array([[1.        , 0.55260655, 0.34166514, ..., 0.21606791, 0.102262  ,
        0.27500955],
       [0.55260655, 1.        , 0.26978155, ..., 0.18898224, 0.0993808 ,
        0.26726124],
       [0.34166514, 0.26978155, 1.        , ..., 0.1176552 , 0.0556846 ,
        0.16638958],
       ...,
       [0.21606791, 0.18898224, 0.1176552 , ..., 1.        , 0.11268723,
        0.16835876],
       [0.102262  , 0.0993808 , 0.0556846 , ..., 0.11268723, 1.        ,
        0.07968191],
       [0.27500955, 0.26726124, 0.16638958, ..., 0.16835876, 0.07968191,
        1.        ]])

In [12]:
# function that takes in movie title as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim = cosine_sim):
    
    recommended_games = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_games.append(list(df.index)[i])
        
    return recommended_games

In [25]:
games = recommendations('Counter-Strike')

In [26]:
games_similarities = pd.DataFrame(games, columns=['name'])

In [27]:
games_similarities

Unnamed: 0,name
0,Team Fortress Classic
1,Ricochet
2,Half-Life
3,Counter-Strike: Condition Zero
4,Deathmatch Classic
5,The Ship: Murder Party
6,Natural Selection 2
7,Counter-Strike: Source
8,Call of Duty®: Modern Warfare® 2
9,Talisman: Digital Edition


## Filtrado colaborativo

In [28]:
steam_200.head()

Unnamed: 0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0


In [29]:
steam_200.columns = columns = ['user_id', 'name', 'behavior', 'hours_playing', 'o']
steam_200.drop(columns=['o'], inplace=True)

In [30]:
steam_200 = steam_200[steam_200['behavior']=='play']
steam_200.head()

Unnamed: 0,user_id,name,behavior,hours_playing
0,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,play,87.0
4,151603712,Spore,play,14.9
6,151603712,Fallout New Vegas,play,12.1
8,151603712,Left 4 Dead 2,play,8.9


In [31]:
df.head()

Unnamed: 0_level_0,bag_of_words
name,Unnamed: 1_level_1
Counter-Strike,windows mac linux multi-player onlinemulti-pla...
Team Fortress Classic,windows mac linux multi-player onlinemulti-pla...
Day of Defeat,windows mac linux multi-player valveanti-cheat...
Deathmatch Classic,windows mac linux multi-player onlinemulti-pla...
Half-Life: Opposing Force,windows mac linux single-player multi-player v...


In [32]:
games_CF = pd.merge(df, steam_200, on='name')

In [33]:
games_CF.head()

Unnamed: 0,name,bag_of_words,user_id,behavior,hours_playing
0,Counter-Strike,windows mac linux multi-player onlinemulti-pla...,30695285,play,0.2
1,Counter-Strike,windows mac linux multi-player onlinemulti-pla...,48845802,play,211.0
2,Counter-Strike,windows mac linux multi-player onlinemulti-pla...,62923086,play,91.0
3,Counter-Strike,windows mac linux multi-player onlinemulti-pla...,54103616,play,1008.0
4,Counter-Strike,windows mac linux multi-player onlinemulti-pla...,38763767,play,0.8


In [35]:
games_CF = games_CF[games_CF['behavior']=='play']
games_CF.head()

Unnamed: 0,name,bag_of_words,user_id,behavior,hours_playing
0,Counter-Strike,windows mac linux multi-player onlinemulti-pla...,30695285,play,0.2
1,Counter-Strike,windows mac linux multi-player onlinemulti-pla...,48845802,play,211.0
2,Counter-Strike,windows mac linux multi-player onlinemulti-pla...,62923086,play,91.0
3,Counter-Strike,windows mac linux multi-player onlinemulti-pla...,54103616,play,1008.0
4,Counter-Strike,windows mac linux multi-player onlinemulti-pla...,38763767,play,0.8


In [36]:
table_CF = steam_200.pivot_table(index='name', columns='user_id', values='hours_playing')

In [37]:
table_CF.head()

user_id,5250,76767,86540,144736,181212,229911,298950,381543,547685,554278,...,309228590,309255941,309262440,309265377,309404240,309434439,309554670,309626088,309824202,309903146
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
007 Legends,,,,,,,,,,,...,,,,,,,,,,
0RBITALIS,,,,,,,,,,,...,,,,,,,,,,
1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),,,,,,,,,,,...,,,,,,,,,,
10 Second Ninja,,,,,,,,,,,...,,,,,,,,,,
10000000,,,,,,,,,,,...,,,,,,,,,,


In [38]:
def create_corr(user_id):
    
    user_with_user_id = table_CF[int(user_id)]
    
    #creating coorrelation
    users_like_user_u = table_CF.corrwith(user_with_user_id)
    #creating a dataframe
    corr_user_u = pd.DataFrame(users_like_user_u, columns=['correlation'])
    corr_user_u.dropna(inplace=True)
    
    df_users_corr = corr_user_u[corr_user_u['correlation']>0.99]
    
    games_users_similarity = pd.merge(df_users_corr, steam_200, on='user_id')
    
    return games_users_similarity

In [39]:
users_similars= create_corr(76767)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


In [40]:
users_similars.sort_values('correlation', ascending=False).head()

Unnamed: 0,user_id,correlation,name,behavior,hours_playing
0,5250,1.0,Cities Skylines,play,144.0
3897,81160212,1.0,Red Faction,play,2.1
3904,81160212,1.0,"Papers, Please",play,1.2
3903,81160212,1.0,Borderlands The Pre-Sequel,play,1.6
3902,81160212,1.0,Company of Heroes Opposing Fronts,play,1.6


## hybrid part

In [41]:
merge_games = pd.merge(users_similars, games_similarities, on='name' )

In [43]:
merge_games.head()

Unnamed: 0,user_id,correlation,name,behavior,hours_playing
0,76767,1.0,Half-Life,play,1.2
1,4812175,1.0,Half-Life,play,0.2
2,7249363,1.0,Half-Life,play,2.3
3,14417857,0.997619,Half-Life,play,8.1
4,14544587,0.999796,Half-Life,play,1.5


In [50]:
games_of_user = table_CF[76767]
games_of_user= games_of_user.dropna()


In [56]:
games_of_user.index

Index(['Age of Empires II HD Edition', 'Alien Swarm', 'Banished',
       'Call of Duty Black Ops', 'Call of Duty Black Ops - Multiplayer',
       'Call of Duty Modern Warfare 2',
       'Call of Duty Modern Warfare 2 - Multiplayer',
       'Call of Duty Modern Warfare 3',
       'Call of Duty Modern Warfare 3 - Multiplayer',
       'Call of Duty World at War', 'Counter-Strike',
       'Counter-Strike Global Offensive', 'Counter-Strike Source', 'Half-Life',
       'Portal 2', 'Rise of Nations Extended Edition', 'The Stanley Parable',
       'Thief Deadly Shadows', 'Total War ATTILA', 'Worms Armageddon'],
      dtype='object', name='name')

In [70]:
recommended_games = []

for game in list(merge_games['name'].unique()):
    
    if game not in games_of_user.index:
        recommended_games.append(game)
    
    else:
        pass

In [71]:
recommended_games

['Natural Selection 2',
 'Team Fortress Classic',
 'Deathmatch Classic',
 'Ricochet']

In [73]:
def hybrid(user_id, game_name):
    
    games = recommendations(game_name)
    games_similarities = pd.DataFrame(games, columns=['name'])
    
    users_similars= create_corr(user_id)
    merge_games = pd.merge(users_similars, games_similarities, on='name')
    
    games_of_user = table_CF[user_id]
    games_of_user= games_of_user.dropna()
    
    recommended_games = []

    for game in list(merge_games['name'].unique()):
    
        if game not in games_of_user.index:
            recommended_games.append(game)
    
        else:
            pass
        
    return recommended_games   
    

In [74]:
recommend_for_user = hybrid(76767, 'Counter-Strike')

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


In [75]:
recommend_for_user

['Natural Selection 2',
 'Team Fortress Classic',
 'Deathmatch Classic',
 'Ricochet']