In [1]:
# Download the Steam dataset CSV into ../../assets; -nc makes wget skip the
# download when the target file is already present.
!wget -nc https://query.data.world/s/cqkss4aadchksn4uwdlwuusnvpihrs -O ../../assets/steam.csv

File ‘../../assets/steam.csv’ already there; not retrieving.


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

In [3]:
# Load the CSV fetched by the wget cell into a DataFrame.
steam_df = pd.read_csv('../../assets/steam.csv')

In [4]:
# Drop games without a QueryName, then renumber the rows 0..n-1.
# The TF-IDF matrix built from this frame is addressed by row position, and
# later cells index it with this frame's index labels, so labels must equal
# positions; without the reset, every dropped row shifts the two apart.
steam_df = steam_df[~steam_df['QueryName'].isna()].reset_index(drop=True)

In [5]:
# regex=False: match the literal text 'S.T.A.L.K'. As a regular expression each
# '.' would match any character, so unrelated titles could slip through.
chosen_game_names = steam_df.loc[
    steam_df['QueryName'].str.contains('S.T.A.L.K', regex=False),
    'QueryName',
]

In [6]:
# Keep the complete rows (all columns) of the games whose name was selected.
chosen_games_df = steam_df.loc[
    steam_df['QueryName'].isin(chosen_game_names)
]

In [7]:
# Show the description text of the first selected game.
chosen_games_df['AboutText'].iat[0]

'In 1986 the worlds worst nuclear disaster occurred at the Chernobyl power station. Soviet authorities established a 30km Exclusion Zone around this nuclear wasteland but in 2006 a second explosion rocked the stricken reactor obliterating all living things and causing the Zones boundaries to ripple outwards. From this epicenter came waves of mutated creatures deadly radiation and a strange anomalous energy. The Zone was cordoned off by the military who would shoot on sight anyone foolish enough to brave the horrors within.It is now 2012 - man has ventured further and further into the heart of the Zone driven by reports of strange artifacts imbued with anomalous energy. Mercenaries and bounty hunters compete to recover these artifacts which command extortionate prices on the black market. Others seek to find the truth behind the Zone while some merely revel in the desolate lawlessness of the place. Whatever their motivation over time these individuals - Scavengers Trespassers Adventurer

In [8]:
# One description per game, in steam_df row order. Missing descriptions become
# empty strings so the vectorizer does not fail on NaN documents; the row count
# and order are unchanged.
game_descriptions = steam_df['AboutText'].fillna('')

In [9]:
# Fit a TF-IDF model with default settings on all descriptions and vectorise them.
vectorizer = TfidfVectorizer()
# Resulting matrix has one row per entry of game_descriptions, in the same order.
description_vectors = vectorizer.fit_transform(game_descriptions)

In [29]:
# List the names of the selected games.
chosen_games_df['QueryName'].values

array(['S.T.A.L.K.E.R.: Shadow of Chernobyl', 'S.T.A.L.K.E.R.: Clear Sky',
       'S.T.A.L.K.E.R.: Call of Pripyat'], dtype=object)

In [69]:
def similar_games_titles(chosen_games_df, description_vectors, n_similar=20):
    """Tabulate the games whose descriptions are closest to the chosen games.

    Closeness is the cosine distance between rows of ``description_vectors``.
    Names are read from the notebook-level ``steam_df``.

    Parameters
    ----------
    chosen_games_df : pandas.DataFrame
        Rows taken from ``steam_df`` (same index labels); must have a
        ``'QueryName'`` column.
    description_vectors : matrix, one row per row of ``steam_df``
        Assumed to be built from ``steam_df`` in row order, as the cell that
        calls ``vectorizer.fit_transform`` does.
    n_similar : int, default 20
        Number of closest games to list per column.

    Returns
    -------
    pandas.DataFrame
        One column per chosen game (named by its ``QueryName``) plus a
        ``'mean'`` column for the average of the chosen description vectors.
        Row ``k`` holds the name of the k-th closest game; the chosen game
        itself normally comes first because its distance to itself is zero.

    Raises
    ------
    ValueError
        If ``chosen_games_df`` is empty.
    KeyError
        If ``chosen_games_df`` contains index labels that are not in ``steam_df``.
    """
    if len(chosen_games_df) == 0:
        raise ValueError("chosen_games_df must contain at least one game")

    # Matrix rows are addressed by position while DataFrames carry labels;
    # translate labels to positions so filtered frames stay aligned.
    chosen_positions = steam_df.index.get_indexer(chosen_games_df.index)
    if (chosen_positions < 0).any():
        raise KeyError("chosen_games_df contains rows that are not in steam_df")

    def get_names_from_positions(positions):
        # argsort yields row positions, so look names up positionally too.
        return steam_df['QueryName'].iloc[positions].values

    chosen_description_vectors = description_vectors[chosen_positions]
    # The mean of a scipy sparse matrix is an np.matrix, which scikit-learn
    # rejects; np.asarray turns it into a plain ndarray (no-op for dense input).
    mean_description_vector = np.asarray(
        chosen_description_vectors.mean(axis=0)
    ).reshape(1, -1)

    # Each row: positions of all games ordered from closest to farthest.
    per_game_order = cosine_distances(
        chosen_description_vectors, description_vectors
    ).argsort(axis=1)
    mean_order = cosine_distances(
        mean_description_vector, description_vectors
    ).argsort(axis=1)

    cols = {
        name: get_names_from_positions(per_game_order[i, :n_similar])
        for i, name in enumerate(chosen_games_df['QueryName'].values)
    }
    cols['mean'] = get_names_from_positions(mean_order[0, :n_similar])
    return pd.DataFrame(cols)

In [70]:
# Rank every game against each selected title and against their average.
similar_games_titles(
    chosen_games_df=chosen_games_df,
    description_vectors=description_vectors,
)

Unnamed: 0,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Clear Sky,S.T.A.L.K.E.R.: Call of Pripyat,mean
0,S.T.A.L.K.E.R.: Shadow of Chernobyl,S.T.A.L.K.E.R.: Clear Sky,S.T.A.L.K.E.R.: Call of Pripyat,S.T.A.L.K.E.R.: Shadow of Chernobyl
1,Red Faction: Guerrilla Steam Edition,Company of Heroes: Tales of Valor,The Great Escape,S.T.A.L.K.E.R.: Clear Sky
2,Death and the Fly,Shapes of Gray,Subspace Continuum,S.T.A.L.K.E.R.: Call of Pripyat
3,Wave of Darkness,Disney Planes,Mahluk:Dark demon,Subspace Continuum
4,Detective Hank and the Golden Sneeze,"Invisible, Inc.",Stories of Bethem: Full Moon,Red Faction: Guerrilla Steam Edition
5,Sign Motion,VERGE:Lost chapter,Alien Robot Monsters,Shapes of Gray
6,World Ship Simulator,Homefront,Space Overlords,Solar Struggle
7,Subspace Continuum,Solar Struggle,STAR WARS™: X-Wing Special Edition,Mahluk:Dark demon
8,Muddy Heights 2,Supraball,World of Soccer online,Abduction Action! Plus
9,Solar Struggle,LEGO® Star Wars™ III: The Clone Wars™,Arelite Core,DNO Rasa's Journey
