In [1]:
#default_exp content_based 

In [35]:
#export 

import scipy
import pandas as pd
import numpy as np


from sklearn import compose, feature_extraction, metrics
from functools import reduce, partial
import attr
from typing import Union

from game_recommender import steam_data

In [3]:
%cd ..

/home/kuba/Projects/game_recommender


In [4]:
steam_df = steam_data.load_steam_df() 

In [5]:
chosen_games_substring = 'S.T.A.L.K'

In [6]:
chosen_games_df = steam_data.get_games_by_name(steam_df, chosen_games_substring) 

In [7]:
chosen_games_df['name']

376     S.T.A.L.K.E.R.: Shadow of Chernobyl
1276        S.T.A.L.K.E.R.: Call of Pripyat
7253              S.T.A.L.K.E.R.: Clear Sky
Name: name, dtype: object

In [8]:
chosen_games_df.iloc[0]['popular_tags']

'atmospheric,post-apocalyptic,open world,fps,survival,masterpiece,action,rpg,survival horror,shooter,horror,first-person,singleplayer,classic,based on a novel,moddable,adventure,sci-fi,inventory management,difficult'

In [9]:
chosen_games_df.iloc[0]['game_details']

'single-player,multi-player'

In [10]:
chosen_games_df.iloc[0]['genre']

'action,rpg'

In [11]:
chosen_games_df['game_description'].iloc[0]

" About This Game In 1986, the world's worst nuclear disaster occurred at the Chernobyl Nuclear Power Plant. Soviet authorities established an 'Exclusion Zone' around, but a second explosion hit the reactor in 2006, creating The Zone as we know it – dangerous place, filled with mutated creatures, deadly radiation, and a strange, anomalous energy. The Zone was cordoned off by the military, who would shoot on sight anyone foolish enough to get inside.  Year 2012. The Zone is still a dangerous place and a threat to all mankind. Mercenaries, bounty hunters and explorers ventured further and further into the heart of the Zone driven by reports of strange 'artifacts' imbued with anomalous energy. To sell them on the black market or trying to find the 'truth' behind the Zone. Whatever their motivation, over time these individuals - Scavengers, Trespassers, Adventurers, Loners, Killers, Explorers and Robbers - have become known as S.T.A.L.K.E.R.s.  You will have to find your own way to survive

In [12]:
chosen_games_df['popular_tags'].iloc[0]

'atmospheric,post-apocalyptic,open world,fps,survival,masterpiece,action,rpg,survival horror,shooter,horror,first-person,singleplayer,classic,based on a novel,moddable,adventure,sci-fi,inventory management,difficult'

In [13]:
game_descriptions = steam_df['game_description']

# Finding games with similar descriptions

In [14]:
from collections import defaultdict

In [15]:
#export

def split_by_comma_tokenizer(text):
    return text.split(',') 


def _regroup_weights(weights, column_groups):
    assert len(weights) == len(column_groups), 'specify weight for each column group'
    return {
        col: weight / len(col_group)
        for (weight, col_group) in zip(weights, column_groups)
        for col in col_group
    }


def make_df_column_vectorizer(
        column_groups,
        vectorizer_classes=feature_extraction.text.TfidfVectorizer,
        weights=None,
        tokenizers=None
    ):
    if type(vectorizer_classes) is not list:
        vectorizer_classes = [vectorizer_classes for __ in range(len(column_groups))]
    if tokenizers is None:
        tokenizers = [None for __ in range(len(column_groups))]
    vectorizers = [vectorizer_class(tokenizer=tokenizer) for (vectorizer_class, grp, tokenizer) in zip(vectorizer_classes, column_groups, tokenizers)] 
    if type(weights) is list:
        weights = _regroup_weights(weights, column_groups)
    
    return compose.ColumnTransformer(
        [
            (col + '_tfidf', vectorizer, col)
            for columns, vectorizer in zip(column_groups, vectorizers)
            for col in columns
        ],
        transformer_weights=weights
    )

In [16]:
vectorizer = make_df_column_vectorizer([
        ['popular_tags'],
        ['game_description', 'desc_snippet'],
    ],
    vectorizer_classes=partial(
            feature_extraction.text.TfidfVectorizer,
            binary=True,
            min_df=5
    ),
    tokenizers=[split_by_comma_tokenizer, None]
)

steam_metadata_vectors = vectorizer.fit_transform(steam_df)

In [17]:
steam_df.shape

(24210, 20)

In [18]:
steam_metadata_vectors.shape

(24210, 31455)

In [40]:


type(scipy.sparse.csc_matrix([]))

scipy.sparse.csc.csc_matrix

In [59]:
#export

@attr.s
class MetadataSimilaritySearcher:
    
    metadata_df: pd.DataFrame = attr.ib()
    metadata_vectors: Union[scipy.sparse.csr.csr_matrix, np.array] = attr.ib()
    similarity = attr.ib(metrics.pairwise.cosine_similarity)
    
    def find_similar(
            self,
            chosen_games_df,
            n_similar=20):
        def get_names_from_indices(indices):
            return self.metadata_df['name'].iloc[indices].values

        chosen_metadata_vectors = self.metadata_vectors[chosen_games_df.index]
        mean_metadata_vector = chosen_metadata_vectors.mean(axis=0)
        similarities = self.similarity(chosen_metadata_vectors, self.metadata_vectors)
        similarity_sorted_indices = similarities.argsort(axis=1)[:,::-1]
        similarities_sorted = np.array([similarities[i, similarity_sorted_indices[i]] for i in range(len(chosen_games_df))])
        mean_similarity = self.similarity(mean_metadata_vector.reshape(1,-1), self.metadata_vectors)
        mean_metadata_similarity_sorted_indices = mean_similarity.argsort(axis=1)[:,::-1]
        mean_similarities_sorted = mean_similarity[0,mean_metadata_similarity_sorted_indices.reshape(-1)]

        mean_similar = get_names_from_indices(mean_metadata_similarity_sorted_indices[0][:n_similar])
        game_names = {}
        similarities = {}

        for (i, name) in enumerate(chosen_games_df['name'].values):
            game_names[name] = ( 
                    get_names_from_indices(
                        similarity_sorted_indices[i][1:n_similar+1]
                    )
            )
            similarities[name + '_similarities'] = similarities_sorted[i][1:n_similar+1]

        game_names['mean'] = mean_similar
        similarities['mean_similarity'] = mean_similarities_sorted[1:n_similar+1] 
        return pd.DataFrame(game_names), pd.DataFrame(similarities)

In [53]:
#export

def make_stacked_results_df(game_names, similarities):
    df = pd.concat([game_names, similarities], axis=1)
    return df.reindex(sorted(df.columns), axis=1)

In [54]:
metadata_df = steam_df
n_similar=20
metadata_vectors = steam_metadata_vectors


In [55]:
from sklearn import decomposition

n_components = 250


tsvd = decomposition.TruncatedSVD(n_components=n_components)
nmf = decomposition.NMF(n_components=n_components, solver='mu')

In [30]:
%%time
steam_metadata_reduced_vectors = tsvd.fit_transform(steam_metadata_vectors)

CPU times: user 31.8 s, sys: 15.1 s, total: 46.8 s
Wall time: 11.3 s


In [60]:
similarity_searcher = MetadataSimilaritySearcher(steam_df, steam_metadata_reduced_vectors)

In [58]:
import pickle

pickle.dump(similarity_searcher, open('data/tsvd_similarity_searcher.pkl', 'wb'))

# Searching by SVD reduced data

In [61]:
show_stacked_results(*similarity_searcher.find_similar(chosen_games_df))

Unnamed: 0,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Call of Pripyat_similarities,S.T.A.L.K.E.R.: Clear Sky,S.T.A.L.K.E.R.: Clear Sky_similarities,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Shadow of Chernobyl_similarities,mean,mean_similarity
0,S.T.A.L.K.E.R.: Clear Sky,0.897464,S.T.A.L.K.E.R.: Call of Pripyat,0.897464,S.T.A.L.K.E.R.: Clear Sky,0.838078,S.T.A.L.K.E.R.: Clear Sky,0.94743
1,S.T.A.L.K.E.R.: Shadow of Chernobyl,0.78053,S.T.A.L.K.E.R.: Shadow of Chernobyl,0.838078,S.T.A.L.K.E.R.: Call of Pripyat,0.78053,S.T.A.L.K.E.R.: Call of Pripyat,0.921327
2,Metro: Last Light Redux,0.679446,Metro: Last Light Redux,0.684372,Metro: Last Light Redux,0.705726,S.T.A.L.K.E.R.: Shadow of Chernobyl,0.72982
3,DayZ,0.675076,Time Ramesside (A New Reckoning),0.676534,Metro 2033 Redux,0.665219,Metro: Last Light Redux,0.710673
4,Metro 2033 Redux,0.672855,Metro 2033 Redux,0.676193,Nosferatu: The Wrath of Malachi,0.62662,Metro 2033 Redux,0.671281
5,Far Cry®,0.671834,ZOMBI,0.668388,Metro Exodus,0.622124,Time Ramesside (A New Reckoning),0.667812
6,Unreal 2: The Awakening,0.651053,DOOM 3,0.658594,Creature Hunt,0.614082,Unreal 2: The Awakening,0.658057
7,DOOM 3,0.648683,Unreal 2: The Awakening,0.655083,Fallout 3,0.607511,DOOM 3,0.656788
8,Doom 3: BFG Edition,0.648503,Doom 3: BFG Edition,0.64895,A.I.M.2 Clan Wars,0.600061,Dead Effect,0.656701
9,Shadows of Kurgansk,0.648089,Dead Effect,0.648827,Fallout: New Vegas,0.595463,Fallout 3,0.655331


# Searching by NMF reduced data

In [None]:
#%%time
#steam_metadata_nmf_vectors = nmf.fit_transform(steam_metadata_vectors)

In [32]:
#similar_games(chosen_games_df, steam_df, steam_metadata_nmf_vectors)

In [33]:
# Recommending

In [None]:
def recommend_similar_games(chosen_games, ratings):
    