In [1]:
#default_exp content_based 

In [2]:
#export 

import scipy
import pandas as pd
import numpy as np


from sklearn import compose, feature_extraction, metrics
from functools import reduce, partial
import attr
from typing import Union
import umap
import altair


from game_recommender import steam_data

In [3]:
%cd ..

/home/kuba/Projects/game_recommender


In [4]:
??steam_data.load_steam_df

[0;31mSignature:[0m
[0msteam_data[0m[0;34m.[0m[0mload_steam_df[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msteam_games_metadata_path[0m[0;34m=[0m[0;34m'data/steam_games_metadata/steam_games.csv'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mload_steam_df[0m[0;34m([0m[0msteam_games_metadata_path[0m[0;34m=[0m[0mSTEAM_GAME_METADATA_PATH[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0mclean_steam_df[0m[0;34m([0m[0mpd[0m[0;34m.[0m[0mread_csv[0m[0;34m([0m[0msteam_games_metadata_path[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/Projects/game_recommender/game_recommender/steam_data.py
[0;31mType:[0m      function


In [5]:
steam_df = steam_data.load_steam_df() 

In [6]:
chosen_games_substring = 's.t.a.l.k'
chosen_games_df = steam_data.get_games_by_name(steam_df, chosen_games_substring) 

In [7]:
chosen_games_df['name']

376     s.t.a.l.k.e.r. shadow of chernobyl
1276        s.t.a.l.k.e.r. call of pripyat
7253              s.t.a.l.k.e.r. clear sky
Name: name, dtype: object

In [8]:
chosen_games_df.iloc[0]['popular_tags']

'atmospheric,post-apocalyptic,open world,fps,survival,masterpiece,action,rpg,survival horror,shooter,horror,first-person,singleplayer,classic,based on a novel,moddable,adventure,sci-fi,inventory management,difficult'

In [9]:
chosen_games_df.iloc[0]['game_details']

'single-player,multi-player'

In [10]:
chosen_games_df.iloc[0]['genre']

'action,rpg'

In [11]:
chosen_games_df['game_description'].iloc[0]

" About This Game In 1986, the world's worst nuclear disaster occurred at the Chernobyl Nuclear Power Plant. Soviet authorities established an 'Exclusion Zone' around, but a second explosion hit the reactor in 2006, creating The Zone as we know it â€“ dangerous place, filled with mutated creatures, deadly radiation, and a strange, anomalous energy. The Zone was cordoned off by the military, who would shoot on sight anyone foolish enough to get inside.  Year 2012. The Zone is still a dangerous place and a threat to all mankind. Mercenaries, bounty hunters and explorers ventured further and further into the heart of the Zone driven by reports of strange 'artifacts' imbued with anomalous energy. To sell them on the black market or trying to find the 'truth' behind the Zone. Whatever their motivation, over time these individuals - Scavengers, Trespassers, Adventurers, Loners, Killers, Explorers and Robbers - have become known as S.T.A.L.K.E.R.s.  You will have to find your own way to survi

In [12]:
chosen_games_df['popular_tags'].iloc[0]

'atmospheric,post-apocalyptic,open world,fps,survival,masterpiece,action,rpg,survival horror,shooter,horror,first-person,singleplayer,classic,based on a novel,moddable,adventure,sci-fi,inventory management,difficult'

In [13]:
game_descriptions = steam_df['game_description']

# Finding games with similar descriptions

In [14]:
from collections import defaultdict

In [15]:
#export

def split_by_comma_tokenizer(text):
    return text.split(',') 


def _regroup_weights(weights, column_groups):
    assert len(weights) == len(column_groups), 'specify weight for each column group'
    return {
        col: weight / len(col_group)
        for (weight, col_group) in zip(weights, column_groups)
        for col in col_group
    }


def make_df_column_vectorizer(
        column_groups,
        vectorizer_classes=feature_extraction.text.TfidfVectorizer,
        weights=None,
        tokenizers=None
    ):
    if type(vectorizer_classes) is not list:
        vectorizer_classes = [vectorizer_classes for __ in range(len(column_groups))]
    if tokenizers is None:
        tokenizers = [None for __ in range(len(column_groups))]
    vectorizers = [vectorizer_class(tokenizer=tokenizer) for (vectorizer_class, grp, tokenizer) in zip(vectorizer_classes, column_groups, tokenizers)] 
    if type(weights) is list:
        weights = _regroup_weights(weights, column_groups)
    
    return compose.ColumnTransformer(
        [
            (col + '_tfidf', vectorizer, col)
            for columns, vectorizer in zip(column_groups, vectorizers)
            for col in columns
        ],
        transformer_weights=weights
    )


def get_steam_metadata_column_groups():
    return [
        ['popular_tags'],
        ['game_description', 'desc_snippet'],
    ]


def make_steam_metadata_vectorizer(column_groups=get_steam_metadata_column_groups()):
    return make_df_column_vectorizer(
        column_groups,
        vectorizer_classes=partial(
                feature_extraction.text.TfidfVectorizer,
                binary=True,
                min_df=5
        ),
        tokenizers=[split_by_comma_tokenizer, None]
    )


def get_steam_metadata_vectors(steam_df):
    vectorizer = make_steam_metadata_vectorizer()
    return vectorizer.fit_transform(steam_df)

In [16]:
steam_metadata_vectors = get_steam_metadata_vectors(steam_df)

In [17]:
steam_df.shape

(24210, 21)

In [18]:
steam_metadata_vectors.shape

(24210, 31455)

In [19]:
x = None
if 1 is not None:
    print(x)

None


In [20]:
#export

@attr.s
class SimilaritySearcher:
    
    df: pd.DataFrame = attr.ib()
    vectors: Union[scipy.sparse.csr.csr_matrix, np.array] = attr.ib()
    similarity = attr.ib(default=metrics.pairwise.cosine_similarity)
    name_col = attr.ib(default='name')
    
    def find_similar(
            self,
            chosen_items=None,
            chosen_vectors=None,
            n_similar=20):
        def get_names_from_indices(indices):
            return self.df[self.name_col].iloc[indices].values
    
        assert chosen_items is not None or chosen_vectors is not None
        if not chosen_items is None:
            idxs = self.df[self.df[self.name_col].isin(chosen_items)].index
            chosen_vectors = self.vectors[idxs]
        else:
            chosen_items = list(map(str, range(chosen_vectors.shape[0])))
        mean_vector = chosen_vectors.mean(axis=0)
        similarities = self.similarity(chosen_vectors, self.vectors)
        similarity_sorted_indices = similarities.argsort(axis=1)[:,::-1]
        similarities_sorted = np.array([similarities[i, similarity_sorted_indices[i]] for i in range(len(chosen_items))])
        mean_similarity = self.similarity(mean_vector.reshape(1,-1), self.vectors)
        mean_similarity_sorted_indices = mean_similarity.argsort(axis=1)[:,::-1]
        mean_similarities_sorted = mean_similarity[0,mean_similarity_sorted_indices.reshape(-1)]

        mean_similar = get_names_from_indices(mean_similarity_sorted_indices[0][:n_similar])
        game_names = []
        similarities = [] 
        
        for (i, name) in enumerate(chosen_items):
            game_names.append(
                pd.Series(name=str(name), 
                    data=get_names_from_indices(
                        similarity_sorted_indices[i][1:n_similar+1]
                    )
                 )
            )
            similarities.append(pd.Series(name=str(name) + '_similarities', data= similarities_sorted[i][:n_similar]))

        game_names.append(pd.Series(name='mean', data=mean_similar))
        similarities.append(pd.Series(name='mean_similarity', data=mean_similarities_sorted[:n_similar]))
        return pd.DataFrame(game_names).T, pd.DataFrame(similarities).T

In [21]:
#export

def make_stacked_results_df(game_names, similarities):
    df = pd.concat([game_names, similarities], axis=1)
    return df.reindex(sorted(df.columns), axis=1)

In [22]:
metadata_df = steam_df
n_similar=20

In [23]:
from sklearn import decomposition

n_components = 250


tsvd = decomposition.TruncatedSVD(n_components=n_components)
nmf = decomposition.NMF(n_components=n_components, solver='mu')

In [24]:
%%time
steam_metadata_reduced_vectors = tsvd.fit_transform(steam_metadata_vectors)

CPU times: user 38.3 s, sys: 22 s, total: 1min
Wall time: 12.5 s


In [25]:
#export

def get_similar_game_names_from_results(results):
    return set(results.values.reshape(-1))


In [26]:
similarity_searcher = SimilaritySearcher(steam_df, steam_metadata_reduced_vectors)

In [27]:
get_similar_game_names_from_results(similarity_searcher.find_similar(chosen_games_df['name'])[0])

{'a.i.m.2 clan wars',
 'alien isolation',
 'arizona sunshine',
 'atomic heart',
 'brigand oaxaca',
 'creature hunt',
 'dayz',
 'dead effect',
 'doom 3',
 'doom 3 bfg edition',
 'dying light',
 'estranged act i',
 'fallout 3',
 'fallout 3 game of the year edition',
 'fallout new vegas',
 'far cry',
 'half life 2 episode two',
 'i am alive',
 'metro 2033 redux',
 'metro exodus',
 'metro last light redux',
 'nosferatu the wrath of malachi',
 'red faction',
 's.t.a.l.k.e.r. call of pripyat',
 's.t.a.l.k.e.r. clear sky',
 's.t.a.l.k.e.r. shadow of chernobyl',
 'shadows of kurgansk',
 'singularity',
 'system shock 2',
 'time ramesside a new reckoning',
 'unreal 2 the awakening',
 'unturned',
 'zombi'}

# Searching by SVD reduced data

In [28]:
make_stacked_results_df(*similarity_searcher.find_similar(chosen_games_df['name']))

Unnamed: 0,mean,mean_similarity,s.t.a.l.k.e.r. call of pripyat,s.t.a.l.k.e.r. call of pripyat_similarities,s.t.a.l.k.e.r. clear sky,s.t.a.l.k.e.r. clear sky_similarities,s.t.a.l.k.e.r. shadow of chernobyl,s.t.a.l.k.e.r. shadow of chernobyl_similarities
0,s.t.a.l.k.e.r. clear sky,0.96491,s.t.a.l.k.e.r. clear sky,1.0,s.t.a.l.k.e.r. call of pripyat,1.0,s.t.a.l.k.e.r. clear sky,1.0
1,s.t.a.l.k.e.r. call of pripyat,0.948868,s.t.a.l.k.e.r. shadow of chernobyl,0.898535,s.t.a.l.k.e.r. shadow of chernobyl,0.898535,s.t.a.l.k.e.r. call of pripyat,0.837679
2,s.t.a.l.k.e.r. shadow of chernobyl,0.92239,metro last light redux,0.785954,metro last light redux,0.837679,metro last light redux,0.785954
3,metro last light redux,0.728765,metro 2033 redux,0.681939,time ramesside a new reckoning,0.679098,metro 2033 redux,0.707117
4,metro 2033 redux,0.712735,dayz,0.680488,metro 2033 redux,0.675035,nosferatu the wrath of malachi,0.66607
5,time ramesside a new reckoning,0.672267,far cry,0.675927,zombi,0.674889,metro exodus,0.63096
6,unreal 2 the awakening,0.664012,doom 3 bfg edition,0.670886,unreal 2 the awakening,0.662971,creature hunt,0.625489
7,dead effect,0.660192,time ramesside a new reckoning,0.649384,dead effect,0.648057,a.i.m.2 clan wars,0.610693
8,zombi,0.659066,unreal 2 the awakening,0.647997,doom 3 bfg edition,0.645309,fallout 3,0.603616
9,far cry,0.65198,zombi,0.647528,doom 3,0.645188,i am alive,0.600783


# Searching by NMF reduced data

In [29]:
#%%time
#steam_metadata_nmf_vectors = nmf.fit_transform(steam_metadata_vectors)

In [30]:
#similar_games(chosen_games_df, steam_df, steam_metadata_nmf_vectors)

In [31]:
# Recommending

In [32]:
#export


@attr.s
class ContentBasedRecommender:
    
    similarity_searcher: SimilaritySearcher = attr.ib()
        
    def recommend_similar_games(self, user_ratings, n_recommended, n_similar=5):
        rated_games = user_ratings.index
        ratings = user_ratings.values
        similar_games, similarities = self.similarity_searcher.find_similar(rated_games, n_similar=n_similar)
        mean_similar_games = similar_games.pop('mean')
        mean_similarities = similarities.pop('mean_similarity')
        
        predicted_ratings = ratings * similarities / similarities.max()
        predicted_ratings_flat = pd.concat([predicted_ratings[col] for col in predicted_ratings.columns])
        similar_games_flat = pd.concat([similar_games[col] for col in similar_games.columns])
        # TODO: is this best way to handle duplicates?
        pred_df = pd.DataFrame({
            'game': similar_games_flat,
            'rating ': predicted_ratings_flat
        }).groupby('game').agg('mean')
        predicted_ratings_flat = pred_df.iloc[:,0].values
        similar_games_flat = pred_df.index.values
        
        best_rated_indices = np.argsort(predicted_ratings_flat)[::-1][:n_recommended]
        best_ratings = predicted_ratings_flat[best_rated_indices]
        best_rated_games = similar_games_flat[best_rated_indices]
        return pd.Series(index=best_rated_games, data=best_ratings)
        
    def recommend_mean_similar_games(self, user_ratings, n_recommended, n_similar=5):
        rated_games = user_ratings.index
        ratings = user_ratings.values
        similar_games, similarities = self.similarity_searcher.find_similar(rated_games, n_similar=n_similar)
        mean_similar_games = similar_games.pop('mean')
        mean_similarities = similarities.pop('mean_similarity')
        return pd.Series(index=mean_similar_games, data=ratings.mean() * mean_similarities.values)
    
    @staticmethod
    def make_from_steam_metadata(steam_df=None, steam_metadata_vectors=None):
        if steam_df is None:
            steam_df = steam_data.load_steam_df()
        if steam_metadata_vectors is None:
            steam_metadata_vectors = get_steam_metadata_vectors(steam_df)
        similarity_searcher = SimilaritySearcher(steam_df, steam_metadata_vectors)
        return ContentBasedRecommender(similarity_searcher)

In [33]:
ratings = [4, 5, 6]
rated_games = chosen_games_df['name']

user_ratings = pd.Series(index=rated_games, data=ratings)

In [34]:
content_recommender = ContentBasedRecommender(similarity_searcher)
recommendations = content_recommender.recommend_similar_games(user_ratings, 20)

In [35]:
recommendations

s.t.a.l.k.e.r. shadow of chernobyl    4.941942
s.t.a.l.k.e.r. call of pripyat        4.675357
s.t.a.l.k.e.r. clear sky              4.500000
time ramesside a new reckoning        4.074589
metro last light redux                4.033220
metro 2033 redux                      3.429458
dayz                                  3.402438
nosferatu the wrath of malachi        2.664278
dtype: float64

In [36]:
recommendations = content_recommender.recommend_mean_similar_games(user_ratings, 20)
recommendations

mean
s.t.a.l.k.e.r. clear sky              4.824548
s.t.a.l.k.e.r. call of pripyat        4.744342
s.t.a.l.k.e.r. shadow of chernobyl    4.611952
metro last light redux                3.643827
metro 2033 redux                      3.563677
dtype: float64

# Visualizing similar games with UMAP

In [105]:
%%time
umapper = umap.UMAP(
    random_state=0,
    n_neighbors=15,
    n_components=2,
    metric='cosine')
viz_data = umapper.fit_transform(steam_metadata_vectors)

KeyboardInterrupt: 

In [None]:
viz_df = pd.DataFrame({
    'X': viz_data[:,0],
    'Y': viz_data[:,1],
    'name': steam_df['name'],
    'is_similar': steam_df['name'].isin(recommended_games)
})


altair.data_transformers.disable_max_rows()


similar_points_altair_scatterplot = (
    altair.Chart(viz_df[viz_df['is_similar']])
        .mark_circle(size=100, color='red', opacity=1.0)
        .encode(x='X', y='Y', tooltip=['name'])
        .interactive()
)

not_similar_points_altair_scatterplot = (
    altair.Chart(viz_df[~viz_df['is_similar']])
        .mark_circle(size=10, color='blue', opacity=0.2)
        .encode(x='X', y='Y', tooltip=['name'])
        .interactive()
)


similar_points_altair_scatterplot + not_similar_points_altair_scatterplot

In [None]:
# persisting stuff for streamlit
pd.DataFrame(viz_df).to_csv('data/steam_metadata_umap_vectors.csv.gz', index=False)
pd.DataFrame(steam_df).to_csv('data/steam_games.csv.gz', index=False)
pd.DataFrame(steam_metadata_reduced_vectors.astype('float16')).to_csv('data/steam_metadata_reduced_vectors.csv.gz', index=False)