In [None]:
#default_exp content_based 

In [2]:
#export 

import scipy
import pandas as pd
import numpy as np


from sklearn import compose, feature_extraction, metrics
from functools import reduce, partial
import attr
from typing import Union

from game_recommender import steam_data

In [3]:
%cd ..

/home/kuba/Projects/game_recommender


In [4]:
steam_df = steam_data.load_steam_df() 

In [5]:
chosen_games_substring = 's.t.a.l.k'

In [6]:
chosen_games_df = steam_data.get_games_by_name(steam_df, chosen_games_substring) 

In [7]:
chosen_games_df['name']

376     s.t.a.l.k.e.r. shadow of chernobyl
1276        s.t.a.l.k.e.r. call of pripyat
7253              s.t.a.l.k.e.r. clear sky
Name: name, dtype: object

In [8]:
chosen_games_df.iloc[0]['popular_tags']

'atmospheric,post-apocalyptic,open world,fps,survival,masterpiece,action,rpg,survival horror,shooter,horror,first-person,singleplayer,classic,based on a novel,moddable,adventure,sci-fi,inventory management,difficult'

In [9]:
chosen_games_df.iloc[0]['game_details']

'single-player,multi-player'

In [10]:
chosen_games_df.iloc[0]['genre']

'action,rpg'

In [11]:
chosen_games_df['game_description'].iloc[0]

" About This Game In 1986, the world's worst nuclear disaster occurred at the Chernobyl Nuclear Power Plant. Soviet authorities established an 'Exclusion Zone' around, but a second explosion hit the reactor in 2006, creating The Zone as we know it – dangerous place, filled with mutated creatures, deadly radiation, and a strange, anomalous energy. The Zone was cordoned off by the military, who would shoot on sight anyone foolish enough to get inside.  Year 2012. The Zone is still a dangerous place and a threat to all mankind. Mercenaries, bounty hunters and explorers ventured further and further into the heart of the Zone driven by reports of strange 'artifacts' imbued with anomalous energy. To sell them on the black market or trying to find the 'truth' behind the Zone. Whatever their motivation, over time these individuals - Scavengers, Trespassers, Adventurers, Loners, Killers, Explorers and Robbers - have become known as S.T.A.L.K.E.R.s.  You will have to find your own way to survive

In [12]:
chosen_games_df['popular_tags'].iloc[0]

'atmospheric,post-apocalyptic,open world,fps,survival,masterpiece,action,rpg,survival horror,shooter,horror,first-person,singleplayer,classic,based on a novel,moddable,adventure,sci-fi,inventory management,difficult'

In [13]:
game_descriptions = steam_df['game_description']

# Finding games with similar descriptions

In [14]:
from collections import defaultdict

In [15]:
#export

def split_by_comma_tokenizer(text):
    return text.split(',') 


def _regroup_weights(weights, column_groups):
    assert len(weights) == len(column_groups), 'specify weight for each column group'
    return {
        col: weight / len(col_group)
        for (weight, col_group) in zip(weights, column_groups)
        for col in col_group
    }


def make_df_column_vectorizer(
        column_groups,
        vectorizer_classes=feature_extraction.text.TfidfVectorizer,
        weights=None,
        tokenizers=None
    ):
    if type(vectorizer_classes) is not list:
        vectorizer_classes = [vectorizer_classes for __ in range(len(column_groups))]
    if tokenizers is None:
        tokenizers = [None for __ in range(len(column_groups))]
    vectorizers = [vectorizer_class(tokenizer=tokenizer) for (vectorizer_class, grp, tokenizer) in zip(vectorizer_classes, column_groups, tokenizers)] 
    if type(weights) is list:
        weights = _regroup_weights(weights, column_groups)
    
    return compose.ColumnTransformer(
        [
            (col + '_tfidf', vectorizer, col)
            for columns, vectorizer in zip(column_groups, vectorizers)
            for col in columns
        ],
        transformer_weights=weights
    )

In [16]:
vectorizer = make_df_column_vectorizer([
        ['popular_tags'],
        ['game_description', 'desc_snippet'],
    ],
    vectorizer_classes=partial(
            feature_extraction.text.TfidfVectorizer,
            binary=True,
            min_df=5
    ),
    tokenizers=[split_by_comma_tokenizer, None]
)

steam_metadata_vectors = vectorizer.fit_transform(steam_df)

In [17]:
steam_df.shape

(24210, 21)

In [18]:
steam_metadata_vectors.shape

(24210, 31455)

In [19]:
#export

@attr.s
class MetadataSimilaritySearcher:
    
    metadata_df: pd.DataFrame = attr.ib()
    metadata_vectors: Union[scipy.sparse.csr.csr_matrix, np.array] = attr.ib()
    similarity = attr.ib(metrics.pairwise.cosine_similarity)
    
    def find_similar(
            self,
            chosen_games,
            n_similar=20):
        def get_names_from_indices(indices):
            return self.metadata_df['name'].iloc[indices].values

        chosen_metadata_vectors = self.metadata_vectors[chosen_games.index]
        mean_metadata_vector = chosen_metadata_vectors.mean(axis=0)
        similarities = self.similarity(chosen_metadata_vectors, self.metadata_vectors)
        similarity_sorted_indices = similarities.argsort(axis=1)[:,::-1]
        similarities_sorted = np.array([similarities[i, similarity_sorted_indices[i]] for i in range(len(chosen_games))])
        mean_similarity = self.similarity(mean_metadata_vector.reshape(1,-1), self.metadata_vectors)
        mean_metadata_similarity_sorted_indices = mean_similarity.argsort(axis=1)[:,::-1]
        mean_similarities_sorted = mean_similarity[0,mean_metadata_similarity_sorted_indices.reshape(-1)]

        mean_similar = get_names_from_indices(mean_metadata_similarity_sorted_indices[0][:n_similar])
        game_names = {}
        similarities = {}
    
        for (i, name) in enumerate(chosen_games.values):
            game_names[name] = ( 
                    get_names_from_indices(
                        similarity_sorted_indices[i][1:n_similar+1]
                    )
            )
            similarities[name + '_similarities'] = similarities_sorted[i][1:n_similar+1]

        game_names['mean'] = mean_similar
        similarities['mean_similarity'] = mean_similarities_sorted[1:n_similar+1] 
        return pd.DataFrame(game_names), pd.DataFrame(similarities)

In [20]:
#export

def make_stacked_results_df(game_names, similarities):
    df = pd.concat([game_names, similarities], axis=1)
    return df.reindex(sorted(df.columns), axis=1)

In [21]:
metadata_df = steam_df
n_similar=20
metadata_vectors = steam_metadata_vectors


In [22]:
from sklearn import decomposition

n_components = 250


tsvd = decomposition.TruncatedSVD(n_components=n_components)
nmf = decomposition.NMF(n_components=n_components, solver='mu')

In [23]:
%%time
steam_metadata_reduced_vectors = tsvd.fit_transform(steam_metadata_vectors)

CPU times: user 32.5 s, sys: 14.9 s, total: 47.3 s
Wall time: 11.5 s


In [24]:
similarity_searcher = MetadataSimilaritySearcher(steam_df, steam_metadata_reduced_vectors)

# Searching by SVD reduced data

In [25]:
make_stacked_results_df(*similarity_searcher.find_similar(chosen_games_df['name']))

Unnamed: 0,mean,mean_similarity,s.t.a.l.k.e.r. call of pripyat,s.t.a.l.k.e.r. call of pripyat_similarities,s.t.a.l.k.e.r. clear sky,s.t.a.l.k.e.r. clear sky_similarities,s.t.a.l.k.e.r. shadow of chernobyl,s.t.a.l.k.e.r. shadow of chernobyl_similarities
0,s.t.a.l.k.e.r. clear sky,0.948348,s.t.a.l.k.e.r. clear sky,0.903892,s.t.a.l.k.e.r. call of pripyat,0.903892,s.t.a.l.k.e.r. clear sky,0.835435
1,s.t.a.l.k.e.r. call of pripyat,0.919325,s.t.a.l.k.e.r. shadow of chernobyl,0.777306,s.t.a.l.k.e.r. shadow of chernobyl,0.835435,s.t.a.l.k.e.r. call of pripyat,0.777306
2,s.t.a.l.k.e.r. shadow of chernobyl,0.734108,metro last light redux,0.683559,metro last light redux,0.691622,metro last light redux,0.70656
3,metro last light redux,0.71281,metro 2033 redux,0.679452,metro 2033 redux,0.682702,metro 2033 redux,0.657964
4,metro 2033 redux,0.674541,dayz,0.677325,time ramesside (a new reckoning),0.679482,nosferatu the wrath of malachi,0.633565
5,time ramesside (a new reckoning),0.669246,far cry,0.675317,zombi,0.675415,metro exodus,0.623927
6,zombi,0.661252,zombi,0.657139,halflife 2 episode two,0.651758,creature hunt,0.611045
7,unreal 2 the awakening,0.655796,shadows of kurgansk,0.656761,doom 3,0.648996,a.i.m.2 clan wars,0.60455
8,far cry,0.655257,halflife 2 episode two,0.652912,unreal 2 the awakening,0.646886,fallout 3,0.600223
9,halflife 2 episode two,0.65327,time ramesside (a new reckoning),0.65217,doom 3 bfg edition,0.634805,fallout new vegas,0.593673


# Searching by NMF reduced data

In [26]:
#%%time
#steam_metadata_nmf_vectors = nmf.fit_transform(steam_metadata_vectors)

In [27]:
#similar_games(chosen_games_df, steam_df, steam_metadata_nmf_vectors)

In [28]:
# Recommending

In [29]:
#export


@attr.s
class ContentBasedRecommender:
    
    similarity_searcher: MetadataSimilaritySearcher = attr.ib()
        
    def recommend_similar_games(self, rated_games, ratings, n_recommended, n_similar=20):
        similar_games, similarities = self.similarity_searcher.find_similar(rated_games, n_similar=n_similar)
        mean_similar_games = similar_games.pop('mean')
        mean_similarities = similarities.pop('mean_similarity')
        
        predicted_ratings = ratings * similarities / similarities.max()
        predicted_ratings_flat = pd.concat([predicted_ratings[col] for col in predicted_ratings.columns])
        similar_games_flat = pd.concat([similar_games[col] for col in similar_games.columns])
        # TODO: is this best way to handle duplicates?
        pred_df = pd.DataFrame({
            'game': similar_games_flat,
            'rating ': predicted_ratings_flat
        }).groupby('game').agg('mean')
        predicted_ratings_flat = pred_df.iloc[:,0].values
        similar_games_flat = pred_df.index.values
        
        best_rated_indices = np.argsort(predicted_ratings_flat)[::-1][:n_recommended]
        best_ratings = predicted_ratings_flat[best_rated_indices]
        best_rated_games = similar_games_flat[best_rated_indices]
        return best_rated_games, best_ratings
        
    def recommend_mean_similar_games(self, rated_games, ratings, n_recommended, n_similar=20):
        similar_games, similarities = self.similarity_searcher.find_similar(rated_games_df, n_similar=n_similar)
        mean_similar_games = similar_games.pop('mean')
        mean_similarities = similarities.pop('mean_similarity')
        return mean_similar_games, mean_similarities

In [30]:
ratings = [4, 5, 6]
rated_games = chosen_games_df['name']

In [31]:
content_recommender = ContentBasedRecommender(similarity_searcher)
recommended_games, ratings = content_recommender.recommend_similar_games(rated_games, ratings, 20)

In [32]:
recommended_games

array(['s.t.a.l.k.e.r. shadow of chernobyl',
       's.t.a.l.k.e.r. call of pripyat', 's.t.a.l.k.e.r. clear sky',
       'arizona sunshine', 'halflife source', 'zombi', 'halflife 2',
       'halflife 2 episode two', 'far cry', 'metro last light redux',
       'dayz', 'doom 3', 'doom 3 bfg edition', 'estranged act i',
       'dead effect', 'metro 2033 redux', 'shadows of kurgansk',
       'time ramesside (a new reckoning)',
       'nosferatu the wrath of malachi', 'unreal 2 the awakening'],
      dtype=object)