In [1]:
#default_exp content_based 

In [2]:
#export 

from game_recommender import steam_data
import pandas as pd
from sklearn import compose, feature_extraction, metrics
from functools import reduce, partial

In [3]:
%cd ..

/home/kuba/Projects/game_recommender


In [4]:
# Load the Steam games table through the project's steam_data helper.
steam_df = steam_data.load_steam_df() 

In [5]:
# Name fragment used to pick the query games for the similarity searches below.
chosen_games_substring = 'S.T.A.L.K'

In [6]:
# Rows of steam_df selected by the name fragment (presumably a substring match
# on the name column - confirm against steam_data.get_games_by_name).
chosen_games_df = steam_data.get_games_by_name(steam_df, chosen_games_substring) 

In [7]:
chosen_games_df['name']

376     S.T.A.L.K.E.R.: Shadow of Chernobyl
1276        S.T.A.L.K.E.R.: Call of Pripyat
7253              S.T.A.L.K.E.R.: Clear Sky
Name: name, dtype: object

In [8]:
chosen_games_df.iloc[0]['popular_tags']

'atmospheric,post-apocalyptic,open world,fps,survival,masterpiece,action,rpg,survival horror,shooter,horror,first-person,singleplayer,classic,based on a novel,moddable,adventure,sci-fi,inventory management,difficult'

In [16]:
chosen_games_df.iloc[0]['game_details']

'single-player,multi-player'

In [17]:
chosen_games_df.iloc[0]['genre']

'action,rpg'

In [18]:
chosen_games_df['game_description'].iloc[0]

" About This Game In 1986, the world's worst nuclear disaster occurred at the Chernobyl Nuclear Power Plant. Soviet authorities established an 'Exclusion Zone' around, but a second explosion hit the reactor in 2006, creating The Zone as we know it – dangerous place, filled with mutated creatures, deadly radiation, and a strange, anomalous energy. The Zone was cordoned off by the military, who would shoot on sight anyone foolish enough to get inside.  Year 2012. The Zone is still a dangerous place and a threat to all mankind. Mercenaries, bounty hunters and explorers ventured further and further into the heart of the Zone driven by reports of strange 'artifacts' imbued with anomalous energy. To sell them on the black market or trying to find the 'truth' behind the Zone. Whatever their motivation, over time these individuals - Scavengers, Trespassers, Adventurers, Loners, Killers, Explorers and Robbers - have become known as S.T.A.L.K.E.R.s.  You will have to find your own way to survive

In [19]:
chosen_games_df['popular_tags'].iloc[0]

'atmospheric,post-apocalyptic,open world,fps,survival,masterpiece,action,rpg,survival horror,shooter,horror,first-person,singleplayer,classic,based on a novel,moddable,adventure,sci-fi,inventory management,difficult'

In [20]:
# Description column of the full games table.
# NOTE(review): this name is not referenced again in the visible cells - confirm it is still needed.
game_descriptions = steam_df['game_description']

# Finding games with similar descriptions

In [21]:
from collections import defaultdict

In [22]:
#export

def split_by_comma_tokenizer(text):
    """Tokenize `text` by cutting it at every comma.

    The pieces are returned exactly as they appear between the commas: no
    whitespace is stripped and empty pieces are kept.
    """
    tokens = text.split(',')
    return tokens


def _regroup_weights(weights, column_groups):
    assert len(weights) == len(column_groups), 'specify weight for each column group'
    return {
        col: weight / len(col_group)
        for (weight, col_group) in zip(weights, column_groups)
        for col in col_group
    }


def make_df_column_vectorizer(
        column_groups,
        vectorizer_classes=feature_extraction.text.TfidfVectorizer,
        weights=None,
        tokenizers=None
    ):
    """Build a ColumnTransformer that vectorizes text columns of a DataFrame.

    Parameters
    ----------
    column_groups : list of lists of column names
        Columns in the same group share one vectorizer configuration
        (class and tokenizer).
    vectorizer_classes : callable or list/tuple of callables
        Called as ``vectorizer_class(tokenizer=...)`` to build the vectorizer
        of a group. A single callable is reused for every group; a list/tuple
        must have one entry per group.
    weights : list/tuple, dict or None
        A list/tuple holds one weight per column group; each group's weight is
        split evenly between its columns. A dict is passed through unchanged
        as ``transformer_weights`` and therefore has to be keyed by
        transformer name, i.e. ``<column>_tfidf``.
    tokenizers : list/tuple or None
        One tokenizer per group (``None`` entries are passed on as
        ``tokenizer=None``). ``None`` means no custom tokenizer anywhere.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Unfitted transformer with one ``<column>_tfidf`` entry per column.

    Raises
    ------
    ValueError
        If a per-group list does not have exactly one entry per column group.
    """
    n_groups = len(column_groups)
    if not isinstance(vectorizer_classes, (list, tuple)):
        vectorizer_classes = [vectorizer_classes] * n_groups
    if tokenizers is None:
        tokenizers = [None] * n_groups
    # zip() would silently drop the trailing groups on a length mismatch.
    if len(vectorizer_classes) != n_groups or len(tokenizers) != n_groups:
        raise ValueError('specify a vectorizer class and a tokenizer for each column group')

    vectorizers = [
        vectorizer_class(tokenizer=tokenizer)
        for vectorizer_class, tokenizer in zip(vectorizer_classes, tokenizers)
    ]

    if isinstance(weights, (list, tuple)):
        # ColumnTransformer looks weights up by transformer *name*, so the
        # per-column weights must use the same '<column>_tfidf' keys as the
        # transformers registered below; keyed by bare column name they would
        # never be applied.
        weights = {
            col + '_tfidf': weight
            for col, weight in _regroup_weights(weights, column_groups).items()
        }

    return compose.ColumnTransformer(
        [
            (col + '_tfidf', vectorizer, col)
            for columns, vectorizer in zip(column_groups, vectorizers)
            for col in columns
        ],
        transformer_weights=weights
    )

In [40]:
# Two column groups: the comma-separated tag list, and the two free-text
# description columns. Only the tag group gets the comma tokenizer; the
# text group is built with tokenizer=None.
vectorizer = make_df_column_vectorizer(
    column_groups=[
        ['popular_tags'],
        ['game_description', 'desc_snippet'],
    ],
    # Every group uses TfidfVectorizer with binary=True and min_df=5.
    vectorizer_classes=partial(
        feature_extraction.text.TfidfVectorizer, binary=True, min_df=5
    ),
    tokenizers=[split_by_comma_tokenizer, None],
)

# Fit on the whole Steam frame and keep the resulting feature matrix.
steam_metadata_vectors = vectorizer.fit_transform(steam_df)

In [41]:
steam_df.shape

(24210, 20)

In [42]:
steam_metadata_vectors.shape

(24210, 31455)

In [43]:
#export

def similar_games(
        chosen_games_df,
        metadata_df,
        metadata_vectors,
        n_similar=20,
        distance=metrics.pairwise.cosine_distances):
    """List the games closest to each chosen game and to their average.

    Parameters
    ----------
    chosen_games_df : pandas.DataFrame
        Query games: rows taken from ``metadata_df`` (same index labels).
        Its ``name`` column provides the output column labels.
    metadata_df : pandas.DataFrame
        All games; must have a ``name`` column. The row at position ``i`` is
        taken to describe ``metadata_vectors[i]``.
    metadata_vectors : 2-D array or sparse matrix
        One feature row per row of ``metadata_df``.
    n_similar : int
        Maximum number of names per output column.
    distance : callable
        ``distance(X, Y)`` returning the pairwise distance matrix between the
        rows of ``X`` and the rows of ``Y``.

    Returns
    -------
    pandas.DataFrame
        One column per chosen game holding the names of the nearest *other*
        games, closest first, plus a ``'mean'`` column with the games nearest
        to the average vector of the chosen games (the chosen games are not
        removed from that column). Shorter columns are padded with NaN.

    Raises
    ------
    ValueError
        If ``chosen_games_df`` is empty.
    KeyError
        If a chosen row is not present in ``metadata_df``.
    """
    import numpy as np  # numpy is not among this module's top-level imports

    if len(chosen_games_df) == 0:
        raise ValueError('chosen_games_df is empty, nothing to compare against')

    # The vectors are addressed by row position, not by index label, so the
    # labels are translated first; the two only coincide for a default
    # RangeIndex.
    positions = metadata_df.index.get_indexer(chosen_games_df.index)
    if (positions < 0).any():
        raise KeyError('chosen_games_df contains rows missing from metadata_df')

    def names_at(row_positions):
        return metadata_df['name'].iloc[row_positions].values

    chosen_vectors = metadata_vectors[positions]
    # The mean of a sparse matrix is an np.matrix; convert it to a plain
    # ndarray before handing it to the distance function.
    mean_vector = np.asarray(chosen_vectors.mean(axis=0)).reshape(1, -1)

    per_game_order = distance(chosen_vectors, metadata_vectors).argsort(axis=1)
    mean_order = distance(mean_vector, metadata_vectors).argsort(axis=1)[0]

    cols = {}
    game_names = chosen_games_df['name'].values
    for own_position, order, name in zip(positions, per_game_order, game_names):
        # Drop the query game explicitly rather than assuming it sorts first:
        # an equally distant row (e.g. a duplicate entry) may precede it.
        neighbours = order[order != own_position][:n_similar]
        cols[name] = names_at(neighbours)
    cols['mean'] = names_at(mean_order[:n_similar])

    # Series instead of raw arrays so that columns of unequal length (corpus
    # smaller than n_similar) are NaN-padded instead of raising.
    return pd.DataFrame({label: pd.Series(names) for label, names in cols.items()})

In [44]:
similar_games(chosen_games_df, steam_df, steam_metadata_vectors)

Unnamed: 0,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Clear Sky,mean
0,S.T.A.L.K.E.R.: Clear Sky,S.T.A.L.K.E.R.: Clear Sky,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Clear Sky
1,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Call of Pripyat
2,Zone Anomaly,Shadows of Kurgansk,Zone Anomaly,S.T.A.L.K.E.R.: Shadow of Chernobyl
3,Fear The Wolves,Zone Anomaly,CHERNOBYL HISTORY OF NUCLEAR DISASTER,Zone Anomaly
4,Metro: Last Light Redux,CHERNOBYL HISTORY OF NUCLEAR DISASTER,Time Ramesside (A New Reckoning),CHERNOBYL HISTORY OF NUCLEAR DISASTER
5,Metro 2033 Redux,Fallout 3,Shadows of Kurgansk,Shadows of Kurgansk
6,CHERNOBYL HISTORY OF NUCLEAR DISASTER,Survival: Last Day,Arizona Sunshine,Fear The Wolves
7,Fallout 3,Far Cry®,DOOM 3,Fallout 3
8,Metro Exodus,Time Ramesside (A New Reckoning),Fallout 3,Metro: Last Light Redux
9,Into the Radius VR,Left 4 Dead 2,Fallout 3: Game of the Year Edition,Time Ramesside (A New Reckoning)


In [50]:
from sklearn import decomposition

n_components = 250
# TruncatedSVD uses a randomized solver by default and NMF accepts a seed for
# its initialisation; pinning it keeps the reduced vectors, and the rankings
# built on them, the same from one run to the next.
random_state = 42


tsvd = decomposition.TruncatedSVD(n_components=n_components, random_state=random_state)
nmf = decomposition.NMF(n_components=n_components, solver='mu', random_state=random_state)

In [51]:
%%time
# Fit the truncated SVD on the vectorized metadata and keep the reduced rows.
steam_metadata_reduced_vectors = tsvd.fit_transform(steam_metadata_vectors)

CPU times: user 1min 21s, sys: 36.3 s, total: 1min 57s
Wall time: 26.3 s


In [102]:
# Read a hosted CSV of reduced vectors and display it (presumably a saved copy
# of the SVD output, judging by the file name - confirm).
# NOTE(review): the result is not assigned to any name, and this cell's
# execution count (102) is out of sequence with its neighbours.
pd.read_csv('https://storage.googleapis.com/lambdastruck_bucket/datasets/steam/steam_metadata_reduced_vectors.csv.gz')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.28100,-0.31670,0.0737,-0.029700,-0.27880,-0.16490,0.000873,0.04360,0.178000,-0.093000,...,-0.003540,0.011400,-0.000414,-0.025770,0.062230,-0.044430,0.002079,-0.030120,0.021350,-0.02260
1,0.28050,-0.23320,0.2096,0.096300,-0.16490,0.01701,0.018200,0.03973,0.214800,0.051030,...,0.000730,-0.026670,-0.032230,-0.020570,0.008410,-0.082340,-0.034000,0.000309,0.017150,0.00820
2,0.30830,-0.28560,0.1503,-0.098750,-0.08330,0.08920,-0.092160,-0.01001,-0.055800,-0.022490,...,0.020140,-0.001319,0.080500,-0.054300,-0.039250,0.027510,-0.008570,0.004074,0.064400,-0.06097
3,0.38260,-0.29320,0.1388,0.054320,-0.05728,-0.07530,0.114600,0.06080,0.299300,0.081700,...,-0.074770,-0.026350,0.020660,0.003618,-0.035980,-0.016710,0.015370,-0.015850,0.034550,0.01344
4,0.30500,-0.19920,0.2261,-0.002722,-0.03980,0.07580,-0.000929,0.08640,-0.007740,-0.003506,...,0.027660,-0.026460,-0.015480,0.015070,0.009390,-0.002850,-0.004364,0.024750,0.016300,-0.01648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24205,0.11725,-0.05260,0.0543,-0.009070,-0.02544,-0.03452,-0.012660,0.02448,0.005253,0.019210,...,0.042000,-0.009610,0.019030,-0.029050,0.000728,0.001472,-0.026660,-0.001333,0.006270,-0.02583
24206,0.69970,-0.04767,-0.4163,0.425000,-0.26320,0.18740,0.098940,-0.08230,-0.084050,-0.135100,...,0.013275,-0.007420,0.006790,-0.050480,0.024770,0.032600,0.045500,0.008340,0.027330,0.01251
24207,0.63960,-0.02760,-0.4329,0.463600,-0.26640,0.22470,0.109440,-0.05870,-0.097600,-0.152500,...,0.027980,-0.040900,0.013880,0.010010,-0.039920,0.025400,-0.026410,0.005410,0.033940,-0.05520
24208,0.38350,-0.26730,0.1589,0.084000,-0.04694,0.23430,-0.169800,0.17590,0.072500,-0.085900,...,-0.086300,0.036100,-0.030820,-0.002900,-0.034580,0.035800,-0.001870,-0.047450,-0.007423,0.03680


# Searching by SVD-reduced data

In [52]:
similar_games(chosen_games_df, steam_df, steam_metadata_reduced_vectors)

Unnamed: 0,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Clear Sky,mean
0,S.T.A.L.K.E.R.: Clear Sky,S.T.A.L.K.E.R.: Clear Sky,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Clear Sky
1,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Call of Pripyat
2,Metro: Last Light Redux,Metro: Last Light Redux,Metro: Last Light Redux,S.T.A.L.K.E.R.: Shadow of Chernobyl
3,Metro 2033 Redux,Metro 2033 Redux,Time Ramesside (A New Reckoning),Metro: Last Light Redux
4,Nosferatu: The Wrath of Malachi,DayZ,Metro 2033 Redux,Metro 2033 Redux
5,Metro Exodus,Far Cry®,ZOMBI,Time Ramesside (A New Reckoning)
6,Creature Hunt,Half-Life 2: Episode Two,Unreal 2: The Awakening,Unreal 2: The Awakening
7,A.I.M.2 Clan Wars,Unreal 2: The Awakening,DOOM 3,ZOMBI
8,Fallout 3,Time Ramesside (A New Reckoning),Half-Life 2: Episode Two,Half-Life 2: Episode Two
9,Fallout: New Vegas,ZOMBI,Doom 3: BFG Edition,Fallout 3


# Searching by NMF-reduced data

In [53]:
%%time
# Fit NMF on the same vectorized metadata and keep the transformed rows.
# Slow: the recorded run of this cell took about five minutes of wall time.
steam_metadata_nmf_vectors = nmf.fit_transform(steam_metadata_vectors)

CPU times: user 12min 3s, sys: 3min 35s, total: 15min 39s
Wall time: 5min 18s


In [54]:
similar_games(chosen_games_df, steam_df, steam_metadata_nmf_vectors)

Unnamed: 0,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Clear Sky,mean
0,S.T.A.L.K.E.R.: Clear Sky,S.T.A.L.K.E.R.: Clear Sky,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Clear Sky
1,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Call of Pripyat
2,Metro: Last Light Redux,Survival: Last Day,Arizona Sunshine,S.T.A.L.K.E.R.: Shadow of Chernobyl
3,Metro 2033 Redux,DayZ,Estranged: Act I,Metro 2033 Redux
4,Arizona Sunshine,Half-Life 2: Episode Two,Metro 2033 Redux,Arizona Sunshine
5,Estranged: Act I,Time Ramesside (A New Reckoning),Half-Life 2: Episode Two,Half-Life 2: Episode Two
6,Half-Life 2: Episode Two,ZOMBI,Time Ramesside (A New Reckoning),Metro: Last Light Redux
7,35MM,Metro 2033 Redux,ZOMBI,Time Ramesside (A New Reckoning)
8,Singularity™,Arizona Sunshine,Metro: Last Light Redux,Estranged: Act I
9,Metro Exodus,Fallout 3: Game of the Year Edition,Left 4 Dead 2,ZOMBI
