### Testing various vectorizer parameters and distance metrics
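Change the game `title` below to the name of any game in the dataset to run the tests for that game. Results are appended to `<title>_tfidf.log` and `<title>_countvec.log`.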

In [10]:

import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# manhattan is pretty useless and takes time
# from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Load the cleaned Steam games DataFrame. The context manager closes the file
# handle as soon as unpickling finishes (a bare open() call would leave it open).
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles that
# were produced by a trusted source.
with open('../data/steam_games_clean.pkl', 'rb') as pkl_file:
    games_df = pickle.load(pkl_file)

In [12]:
# Make the description column type string, otherwise tfidf cries.
# Missing descriptions are replaced with '' first: astype(str) alone can turn a
# missing value into the literal text 'nan' / 'None', which the vectorizers would
# then count as a real word shared by every game that has no description.
games_df['description_clean'] = games_df['description_clean'].fillna('').astype(str)

In [13]:
# Build the frame of extra model features: everything in games_df except the
# identifier, title and description-text columns listed here.
non_feature_columns = ['appid', 'description_clean', 'name']
game_tags = games_df.drop(non_feature_columns, axis='columns')

In [14]:
# Construct a reverse map from game title to row position.
# Row positions (0..n-1) are stored instead of the games_df.index labels because
# the looked-up value is later used to select a row of a NumPy feature matrix;
# labels and positions only coincide when games_df has a default RangeIndex.
indices = pd.Series(np.arange(len(games_df)), index=games_df['name'])

In [15]:
# Parameter grid shared by both vectorizers: every combination of n-gram range,
# vocabulary size cap and minimum document frequency (3 x 3 x 2 = 18 settings).
# Loop nesting fixes the order: ngram_range varies slowest, min_df fastest.
vectorizer_params = [
    {'max_features': vocab_size, 'min_df': min_doc_freq, 'ngram_range': ngrams}
    for ngrams in [(1, 1), (1, 2), (1, 3)]
    for vocab_size in [500, 1000, 2000]
    for min_doc_freq in [2, 5]
]

# (name, function) pairs for the metrics under comparison, in the order they
# are evaluated. The name is what the loops branch on and write to the log.
distance_metrics = list({
    'cosine': cosine_similarity,
    'euclidean': euclidean_distances,
    # 'manhattan': manhattan_distances,
    'jaccard': pairwise_distances,
}.items())

In [16]:
# change title to desired game name
title = 'Alien: Isolation'
# Fail early with a readable message when the title is not in the dataset.
if title not in indices.index:
    raise KeyError(f'{title!r} is not a game name in games_df')
game_idx = indices[title]
# Several games can share one name, in which case the lookup returns a Series
# instead of a single value; fall back to the first match so that game_idx
# always selects exactly one row.
if isinstance(game_idx, pd.Series):
    game_idx = game_idx.iloc[0]

In [17]:
# Run through every distance metric / TF-IDF vectorizer parameter combination
# and append the 20 games most similar to `title` to '<title>_tfidf.log'.
for metric_name, metric_func in distance_metrics:
    for tfidf_param in vectorizer_params:
        tfidf = TfidfVectorizer(**tfidf_param)
        tfidf_matrix = tfidf.fit_transform(games_df['description_clean'])
        # feature matrix: dense TF-IDF columns followed by the game_tags columns
        matrix = np.concatenate((tfidf_matrix.toarray(), game_tags.values), axis=1)
        query_row = matrix[game_idx].reshape(1, -1)
        # get similarity scores for our game; every branch yields "higher = more similar"
        if metric_name == 'jaccard':
            # NOTE(review): scikit-learn's Jaccard metric works on boolean data, so the
            # TF-IDF weights are presumably reduced to presence/absence here -- confirm.
            game_sim_scores = 1 - pairwise_distances(query_row, matrix, metric="jaccard")
        elif metric_name == 'euclidean':
            # turn a distance in [0, inf) into a similarity in (0, 1]
            game_sim_scores = 1 / (1 + euclidean_distances(query_row, matrix))
        else:
            game_sim_scores = metric_func(query_row, matrix)
        # sort scores, leaving out the query game by its own row number; skipping
        # "whatever sorted first" is wrong when another game ties with it or
        # floating-point rounding puts the query game below the top spot
        sorted_game_sim_scores = sorted(
            ((row, score) for row, score in enumerate(game_sim_scores[0]) if row != game_idx),
            key=lambda pair: pair[1],
            reverse=True,
        )
        # (title, score) pairs for the 20 best matches; kept as a list rather than a
        # dict so that two different games with the same name are both reported
        content_similar_scores = [
            (indices.index[row], score) for row, score in sorted_game_sim_scores[:20]
        ]
        # write results log
        with open(title + '_tfidf.log', 'a') as f:
            f.write(f'\n{metric_name} {tfidf_param}\n')
            for key, value in content_similar_scores:
                f.write(f'{key} {value}\n')



In [18]:
# Run through every distance metric / CountVectorizer parameter combination
# and append the 20 games most similar to `title` to '<title>_countvec.log'.
for metric_name, metric_func in distance_metrics:
    for countvec_param in vectorizer_params:
        countvec = CountVectorizer(**countvec_param)
        countvec_matrix = countvec.fit_transform(games_df['description_clean'])
        # feature matrix: dense term-count columns followed by the game_tags columns
        matrix = np.concatenate((countvec_matrix.toarray(), game_tags.values), axis=1)
        query_row = matrix[game_idx].reshape(1, -1)
        # get similarity scores for our game; every branch yields "higher = more similar"
        if metric_name == 'jaccard':
            # NOTE(review): scikit-learn's Jaccard metric works on boolean data, so the
            # term counts are presumably reduced to presence/absence here -- confirm.
            game_sim_scores = 1 - pairwise_distances(query_row, matrix, metric="jaccard")
        elif metric_name == 'euclidean':
            # turn a distance in [0, inf) into a similarity in (0, 1]
            game_sim_scores = 1 / (1 + euclidean_distances(query_row, matrix))
        else:
            game_sim_scores = metric_func(query_row, matrix)
        # sort scores, leaving out the query game by its own row number; skipping
        # "whatever sorted first" is wrong when another game ties with it or
        # floating-point rounding puts the query game below the top spot
        sorted_game_sim_scores = sorted(
            ((row, score) for row, score in enumerate(game_sim_scores[0]) if row != game_idx),
            key=lambda pair: pair[1],
            reverse=True,
        )
        # (title, score) pairs for the 20 best matches; kept as a list rather than a
        # dict so that two different games with the same name are both reported
        content_similar_scores = [
            (indices.index[row], score) for row, score in sorted_game_sim_scores[:20]
        ]

        # write results to log
        with open(title + '_countvec.log', 'a') as f:
            f.write(f'\n{metric_name} {countvec_param}\n')
            for key, value in content_similar_scores:
                f.write(f'{key} {value}\n')

