In [None]:
#default_exp content_based 

In [2]:
#export 

import scipy
import pandas as pd
import numpy as np


from sklearn import compose, feature_extraction, metrics
from functools import reduce, partial
import attr
from typing import Union

from game_recommender import steam_data

In [3]:
%cd ..

/home/kuba/Projects/game_recommender


In [4]:
??steam_data.load_steam_df

[0;31mSignature:[0m
[0msteam_data[0m[0;34m.[0m[0mload_steam_df[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msteam_games_metadata_path[0m[0;34m=[0m[0;34m'data/steam_games_metadata/steam_games.csv'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mload_steam_df[0m[0;34m([0m[0msteam_games_metadata_path[0m[0;34m=[0m[0mSTEAM_GAME_METADATA_PATH[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0mclean_steam_df[0m[0;34m([0m[0mpd[0m[0;34m.[0m[0mread_csv[0m[0;34m([0m[0msteam_games_metadata_path[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/Projects/game_recommender/game_recommender/steam_data.py
[0;31mType:[0m      function


In [5]:
steam_df = steam_data.load_steam_df() 

In [6]:
chosen_games_substring = 's.t.a.l.k'

In [7]:
chosen_games_df = steam_data.get_games_by_name(steam_df, chosen_games_substring) 

In [8]:
chosen_games_df['name']

376     s.t.a.l.k.e.r. shadow of chernobyl
1276        s.t.a.l.k.e.r. call of pripyat
7253              s.t.a.l.k.e.r. clear sky
Name: name, dtype: object

In [9]:
chosen_games_df.iloc[0]['popular_tags']

'atmospheric,post-apocalyptic,open world,fps,survival,masterpiece,action,rpg,survival horror,shooter,horror,first-person,singleplayer,classic,based on a novel,moddable,adventure,sci-fi,inventory management,difficult'

In [10]:
chosen_games_df.iloc[0]['game_details']

'single-player,multi-player'

In [11]:
chosen_games_df.iloc[0]['genre']

'action,rpg'

In [12]:
chosen_games_df['game_description'].iloc[0]

" About This Game In 1986, the world's worst nuclear disaster occurred at the Chernobyl Nuclear Power Plant. Soviet authorities established an 'Exclusion Zone' around, but a second explosion hit the reactor in 2006, creating The Zone as we know it – dangerous place, filled with mutated creatures, deadly radiation, and a strange, anomalous energy. The Zone was cordoned off by the military, who would shoot on sight anyone foolish enough to get inside.  Year 2012. The Zone is still a dangerous place and a threat to all mankind. Mercenaries, bounty hunters and explorers ventured further and further into the heart of the Zone driven by reports of strange 'artifacts' imbued with anomalous energy. To sell them on the black market or trying to find the 'truth' behind the Zone. Whatever their motivation, over time these individuals - Scavengers, Trespassers, Adventurers, Loners, Killers, Explorers and Robbers - have become known as S.T.A.L.K.E.R.s.  You will have to find your own way to survive

In [13]:
chosen_games_df['popular_tags'].iloc[0]

'atmospheric,post-apocalyptic,open world,fps,survival,masterpiece,action,rpg,survival horror,shooter,horror,first-person,singleplayer,classic,based on a novel,moddable,adventure,sci-fi,inventory management,difficult'

In [14]:
game_descriptions = steam_df['game_description']

# Finding games with similar descriptions

In [15]:
from collections import defaultdict

In [16]:
#export

def split_by_comma_tokenizer(text):
    return text.split(',') 


def _regroup_weights(weights, column_groups):
    assert len(weights) == len(column_groups), 'specify weight for each column group'
    return {
        col: weight / len(col_group)
        for (weight, col_group) in zip(weights, column_groups)
        for col in col_group
    }


def make_df_column_vectorizer(
        column_groups,
        vectorizer_classes=feature_extraction.text.TfidfVectorizer,
        weights=None,
        tokenizers=None
    ):
    if type(vectorizer_classes) is not list:
        vectorizer_classes = [vectorizer_classes for __ in range(len(column_groups))]
    if tokenizers is None:
        tokenizers = [None for __ in range(len(column_groups))]
    vectorizers = [vectorizer_class(tokenizer=tokenizer) for (vectorizer_class, grp, tokenizer) in zip(vectorizer_classes, column_groups, tokenizers)] 
    if type(weights) is list:
        weights = _regroup_weights(weights, column_groups)
    
    return compose.ColumnTransformer(
        [
            (col + '_tfidf', vectorizer, col)
            for columns, vectorizer in zip(column_groups, vectorizers)
            for col in columns
        ],
        transformer_weights=weights
    )

In [17]:
vectorizer = make_df_column_vectorizer([
        ['popular_tags'],
        ['game_description', 'desc_snippet'],
    ],
    vectorizer_classes=partial(
            feature_extraction.text.TfidfVectorizer,
            binary=True,
            min_df=5
    ),
    tokenizers=[split_by_comma_tokenizer, None]
)

steam_metadata_vectors = vectorizer.fit_transform(steam_df)

In [18]:
steam_df.shape

(24210, 21)

In [19]:
steam_metadata_vectors.shape

(24210, 31455)

In [20]:
#export

@attr.s
class SimilaritySearcher:
    
    df: pd.DataFrame = attr.ib()
    vectors: Union[scipy.sparse.csr.csr_matrix, np.array] = attr.ib()
    similarity = attr.ib(default=metrics.pairwise.cosine_similarity)
    name_col = attr.ib(default='name')
    
    def find_similar(
            self,
            chosen_items,
            n_similar=20):
        def get_names_from_indices(indices):
            return self.df[self.name_col].iloc[indices].values

        idxs = self.df[self.df[self.name_col].isin(chosen_items)].index
        chosen_vectors = self.vectors[idxs]
        mean_vector = chosen_vectors.mean(axis=0)
        similarities = self.similarity(chosen_vectors, self.vectors)
        similarity_sorted_indices = similarities.argsort(axis=1)[:,::-1]
        similarities_sorted = np.array([similarities[i, similarity_sorted_indices[i]] for i in range(len(chosen_items))])
        mean_similarity = self.similarity(mean_vector.reshape(1,-1), self.vectors)
        mean_similarity_sorted_indices = mean_similarity.argsort(axis=1)[:,::-1]
        mean_similarities_sorted = mean_similarity[0,mean_similarity_sorted_indices.reshape(-1)]

        mean_similar = get_names_from_indices(mean_similarity_sorted_indices[0][:n_similar])
        game_names = {}
        similarities = {}
    
        for (i, name) in enumerate(chosen_items):
            game_names[str(name)] = ( 
                    get_names_from_indices(
                        similarity_sorted_indices[i][1:n_similar+1]
                    )
            )
            similarities[str(name) + '_similarities'] = similarities_sorted[i][:n_similar]

        game_names['mean'] = mean_similar
        similarities['mean_similarity'] = mean_similarities_sorted[1:n_similar+1] 
        return pd.DataFrame(game_names), pd.DataFrame(similarities)

In [21]:
#export

def make_stacked_results_df(game_names, similarities):
    df = pd.concat([game_names, similarities], axis=1)
    return df.reindex(sorted(df.columns), axis=1)

In [22]:
metadata_df = steam_df
n_similar=20
metadata_vectors = steam_metadata_vectors


In [23]:
from sklearn import decomposition

n_components = 250


tsvd = decomposition.TruncatedSVD(n_components=n_components)
nmf = decomposition.NMF(n_components=n_components, solver='mu')

In [24]:
%%time
steam_metadata_reduced_vectors = tsvd.fit_transform(steam_metadata_vectors)

CPU times: user 33.4 s, sys: 16.5 s, total: 49.8 s
Wall time: 11.8 s


In [25]:
similarity_searcher = SimilaritySearcher(steam_df, steam_metadata_reduced_vectors)

# Searching by SVD reduced data

In [26]:
make_stacked_results_df(*similarity_searcher.find_similar(chosen_games_df['name']))

Unnamed: 0,mean,mean_similarity,s.t.a.l.k.e.r. call of pripyat,s.t.a.l.k.e.r. call of pripyat_similarities,s.t.a.l.k.e.r. clear sky,s.t.a.l.k.e.r. clear sky_similarities,s.t.a.l.k.e.r. shadow of chernobyl,s.t.a.l.k.e.r. shadow of chernobyl_similarities
0,s.t.a.l.k.e.r. clear sky,0.94767,s.t.a.l.k.e.r. clear sky,1.0,s.t.a.l.k.e.r. call of pripyat,1.0,s.t.a.l.k.e.r. clear sky,1.0
1,s.t.a.l.k.e.r. call of pripyat,0.918316,s.t.a.l.k.e.r. shadow of chernobyl,0.899797,s.t.a.l.k.e.r. shadow of chernobyl,0.899797,s.t.a.l.k.e.r. call of pripyat,0.830202
2,s.t.a.l.k.e.r. shadow of chernobyl,0.737528,metro last light redux,0.775911,metro last light redux,0.830202,metro last light redux,0.775911
3,metro last light redux,0.721175,metro 2033 redux,0.687445,metro 2033 redux,0.696545,metro 2033 redux,0.704464
4,metro 2033 redux,0.671775,dayz,0.684979,time ramesside a new reckoning,0.687418,metro exodus,0.668832
5,time ramesside a new reckoning,0.664509,far cry,0.676159,zombi,0.678284,nosferatu the wrath of malachi,0.630517
6,unreal 2 the awakening,0.658309,shadows of kurgansk,0.672839,doom 3,0.670487,creature hunt,0.628873
7,fallout 3,0.657512,time ramesside a new reckoning,0.649991,unreal 2 the awakening,0.651587,fallout 3,0.608828
8,far cry,0.65684,unreal 2 the awakening,0.649415,dead effect,0.646362,fallout new vegas,0.607839
9,dead effect,0.656077,doom 3 bfg edition,0.647965,doom 3 bfg edition,0.645923,fallout 3 game of the year edition,0.602699


# Searching by NMF reduced data

In [27]:
#%%time
#steam_metadata_nmf_vectors = nmf.fit_transform(steam_metadata_vectors)

In [28]:
#similar_games(chosen_games_df, steam_df, steam_metadata_nmf_vectors)

In [29]:
# Recommending

In [30]:
#export


@attr.s
class ContentBasedRecommender:
    
    similarity_searcher: SimilaritySearcher = attr.ib()
        
    def recommend_similar_games(self, rated_games, ratings, n_recommended, n_similar=20):
        similar_games, similarities = self.similarity_searcher.find_similar(rated_games, n_similar=n_similar)
        mean_similar_games = similar_games.pop('mean')
        mean_similarities = similarities.pop('mean_similarity')
        
        predicted_ratings = ratings * similarities / similarities.max()
        predicted_ratings_flat = pd.concat([predicted_ratings[col] for col in predicted_ratings.columns])
        similar_games_flat = pd.concat([similar_games[col] for col in similar_games.columns])
        # TODO: is this best way to handle duplicates?
        pred_df = pd.DataFrame({
            'game': similar_games_flat,
            'rating ': predicted_ratings_flat
        }).groupby('game').agg('mean')
        predicted_ratings_flat = pred_df.iloc[:,0].values
        similar_games_flat = pred_df.index.values
        
        best_rated_indices = np.argsort(predicted_ratings_flat)[::-1][:n_recommended]
        best_ratings = predicted_ratings_flat[best_rated_indices]
        best_rated_games = similar_games_flat[best_rated_indices]
        return best_rated_games, best_ratings
        
    def recommend_mean_similar_games(self, rated_games, ratings, n_recommended, n_similar=20):
        similar_games, similarities = self.similarity_searcher.find_similar(rated_games_df, n_similar=n_similar)
        mean_similar_games = similar_games.pop('mean')
        mean_similarities = similarities.pop('mean_similarity')
        return mean_similar_games, mean_similarities

In [31]:
ratings = [4, 5, 6]
rated_games = chosen_games_df['name']

In [32]:
content_recommender = ContentBasedRecommender(similarity_searcher)
recommended_games, ratings = content_recommender.recommend_similar_games(rated_games, ratings, 20)

In [33]:
recommended_games

array(['s.t.a.l.k.e.r. shadow of chernobyl',
       's.t.a.l.k.e.r. call of pripyat', 's.t.a.l.k.e.r. clear sky',
       'metro last light redux', 'zombi', 'doom 3', 'far cry', 'dayz',
       'doom 3 bfg edition', 'half life 2 episode two', 'estranged act i',
       'metro 2033 redux', 'arizona sunshine', 'shadows of kurgansk',
       'time ramesside a new reckoning', 'metro exodus',
       'unreal 2 the awakening', 'dying light', 'dead effect',
       'nosferatu the wrath of malachi'], dtype=object)

# User-based

In [34]:
import os
import pickle

game_dataset_pickle_path = 'data/game_dataset.pkl'
if os.path.exists(game_dataset_pickle_path):
    game_dataset_raw = pickle.load(open(game_dataset_pickle_path, 'rb'))
else:
    steam_ratings_df = steam_data.filter_ratings_with_metadata(raw_steam_ratings_df)
    game_dataset_raw = steam_data.RecommenderDataset.make_implicit_feedback_dataset(steam_ratings_df)
    pickle.dump(game_dataset_raw, open(game_dataset_pickle_path, 'wb'))

game_dataset = (
    game_dataset_raw
    .filter_out_insufficient_reviews('user_id', 5)
    .filter_out_insufficient_reviews('name', 2)
)

In [35]:
user_game_matrix = steam_data.get_item_user_matrix(game_dataset)

In [36]:
steam_ratings_df, target = game_dataset.data, game_dataset.target


In [38]:
#export


@attr.s
class UserBasedRecommender:
    
    user_similarity_searcher: SimilaritySearcher = attr.ib()
    ratings_matrix = attr.ib()
    games = attr.ib()
        
    def recommend_games_from_similar_users(self, user_id, n_recommended, n_similar=5):
        similar_users, similarities = self.user_similarity_searcher.find_similar([user_id], n_similar=n_similar)
        mean_similar_users = similar_users.pop('mean')
        mean_similarities = similarities.pop('mean_similarity')
        similarities = similarities.values.reshape(-1)
        similar_user_idxs = self.user_similarity_searcher.df[self.user_similarity_searcher.df['user_id'].isin(similar_users.values.reshape(-1))].index
        similar_users_ratings = self.ratings_matrix[similar_user_idxs]
        predicted_ratings_by_user = (np.diag(similarities / similarities.max())) @ similar_users_ratings
        predicted_ratings = np.nan_to_num(predicted_ratings_by_user.sum(axis=0) / (predicted_ratings_by_user > 0).sum(axis=0))
        best_rated_games_idxs = predicted_ratings.argsort()[::-1][:n_recommended]
        best_rated_games = self.games[best_rated_games_idxs]
        return best_rated_games, predicted_ratings[best_rated_games_idxs]
    
    @classmethod
    def make_from_ratings(cls, ratings_df, target, target_col):
        ratings_df = pd.concat([steam_ratings_df, target], axis=1)
        item_user_pivoted_df = ratings_df.pivot_table(index='user_id', columns='name', values=target_col)
        user_game_matrix = scipy.sparse.csr_matrix(item_user_pivoted_df.fillna(0))   
        user_df = pd.DataFrame({'user_id': item_user_pivoted_df.index})
        user_similarity_searcher = SimilaritySearcher(user_df, user_game_matrix, name_col='user_id')
        games = item_user_pivoted_df.columns
        return UserBasedRecommender(user_similarity_searcher, user_game_matrix, games)

In [39]:
user_based_recommender = UserBasedRecommender.make_from_ratings(game_dataset.data, game_dataset.target, 'log_hours')

In [40]:
ratings_df = pd.concat([game_dataset.data, game_dataset.target], axis=1)
item_user_pivoted_df = ratings_df.pivot_table(index='name', columns='user_id', values='log_hours')

In [41]:
user_based_recommender.user_similarity_searcher.df

Unnamed: 0,user_id
0,5250
1,76767
2,86540
3,103360
4,144736
...,...
2864,303129589
2865,303525289
2866,304971849
2867,306547522


In [42]:
# Sanity check: 

In [43]:
i = 10
user_id = user_based_recommender.user_similarity_searcher.df.iloc[10,0]
user_ratings = item_user_pivoted_df.iloc[:,10]

In [44]:
user_ratings[~user_ratings.isna()]

name
counter strike                     3.135494
counter strike global offensive    6.194405
counter strike nexon zombies       0.693147
counter strike source              0.693147
day of defeat                      0.693147
deathmatch classic                 0.693147
eldevin                            0.693147
half life                          0.693147
half life blue shift               0.693147
half life opposing force           0.693147
planetside 2                       0.693147
raceroom racing experience         0.336472
realm of the mad god               0.693147
ricochet                           0.693147
team fortress classic              0.693147
unturned                           0.693147
warface                            0.693147
Name: 547685, dtype: float64

In [45]:
user_idx = user_based_recommender.user_similarity_searcher.df[user_based_recommender.user_similarity_searcher.df['user_id'] == user_id].index[0]

In [46]:
user_based_recommender.user_similarity_searcher.df['user_id']

0            5250
1           76767
2           86540
3          103360
4          144736
          ...    
2864    303129589
2865    303525289
2866    304971849
2867    306547522
2868    309404240
Name: user_id, Length: 2869, dtype: int64

In [54]:
user_based_recommender.recommend_games_from_similar_users(user_id, n_recommended=10, n_similar=6)



(Index(['counter strike global offensive', 'counter strike',
        'counter strike nexon zombies', 'team fortress 2', 'dota 2',
        'counter strike source', 'half life blue shift',
        'half life opposing force', 'half life', 'team fortress classic'],
       dtype='object', name='name'),
 array([4.96166702, 3.15514907, 2.34060134, 0.98518951, 0.73594537,
        0.68583722, 0.66945441, 0.66945441, 0.66945441, 0.66945441]))