In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from surprise import SVD
from surprise import SVDpp
from scipy.sparse import csr_matrix

In [3]:
# Load the Steam reviews dataset.
# Point REVIEWS_CSV at your local copy of steam_reviews.csv instead of
# commenting per-user paths in and out of this cell.
REVIEWS_CSV = 'steam_reviews.csv'
df_reviews = pd.read_csv(REVIEWS_CSV, low_memory=False)

# Keep only the columns used to build the recommendation matrix;
# .copy() makes user_rec_df independent of df_reviews.
user_rec_df = df_reviews[["app_id", "author.steamid", "recommended"]].copy()

# Fixed seed so the preview shows the same rows on every run.
user_rec_df.sample(5, random_state=0)

In [3]:
# These counts are used as the dimensions of the user review matrix.
n_ratings = user_rec_df.shape[0]
n_games = user_rec_df['app_id'].unique().size
n_users = user_rec_df['author.steamid'].unique().size

print(n_ratings, n_games, n_users)

21747371 315 12406560


In [4]:
# Map user and game IDs to contiguous 0-based indices, and back again.
# np.unique returns the sorted distinct values; compute it once per column and
# reuse it for both directions instead of re-scanning the whole frame.
unique_users = np.unique(user_rec_df["author.steamid"])
unique_games = np.unique(user_rec_df["app_id"])

# ID -> index
user_map = dict(zip(unique_users, range(n_users)))
game_map = dict(zip(unique_games, range(n_games)))

# Sanity check: every distinct ID received exactly one index.
print(len(user_map) == n_users)
print(len(game_map) == n_games)

# index -> ID (inverse of the maps above)
user_i_map = dict(zip(range(n_users), unique_users))
game_i_map = dict(zip(range(n_games), unique_games))

True
True


In [5]:
# Translate each review row's user / game ID into its matrix index for csr_matrix.
# Series.map performs the dictionary lookup in vectorised form instead of a
# Python-level loop over every review; .tolist() keeps the result a plain list.
# Every ID comes from the same columns the maps were built from, so no lookup
# is expected to miss.
user_index = user_rec_df['author.steamid'].map(user_map).tolist()
game_index = user_rec_df['app_id'].map(game_map).tolist()

In [6]:
# Build the sparse review matrix: one row per game, one column per user,
# with the "recommended" value of each review as the stored entry.
review_values = user_rec_df["recommended"]
review_coords = (game_index, user_index)
matrix = csr_matrix((review_values, review_coords), shape=(n_games, n_users))

In [7]:
# Dictionary for looking up a game's name from its app ID.
# De-duplicate on app_id first so the dict is built from one row per game
# rather than by iterating every review row. keep='last' matches the previous
# dict(zip(...)) behaviour, where the last name seen for an ID wins.
game_names = (
    df_reviews
    .drop_duplicates(subset='app_id', keep='last')
    .set_index('app_id')['app_name']
    .to_dict()
)

In [23]:
def find_related_games(game_id, total_matches):
    """Print the games whose review vectors are closest to ``game_id``.

    Takes the row of the module-level ``matrix`` that belongs to ``game_id``,
    runs a brute-force cosine nearest-neighbour query against every row of
    ``matrix`` and prints each related game with its cosine distance
    (smaller distance = more similar), closest first.

    Parameters
    ----------
    game_id
        App ID of the query game; must be a key of ``game_map`` and
        ``game_names``.
    total_matches : int
        Maximum number of related games to print. The query game itself is
        never listed.

    Returns
    -------
    None
        The results are printed, not returned.

    Raises
    ------
    KeyError
        If ``game_id`` is not present in ``game_map``.
    """
    query_row = game_map[game_id]
    query_vec = matrix[query_row].reshape(1, -1)

    # Request one extra neighbour because the query game is normally returned
    # as its own closest match, but never more rows than the matrix contains
    # (kneighbors raises if n_neighbors exceeds the number of fitted samples).
    n_neighbors = min(total_matches + 1, matrix.shape[0])

    kNN = NearestNeighbors(algorithm='brute', metric='cosine')
    kNN.fit(matrix)
    distances, indices = kNN.kneighbors(query_vec, n_neighbors=n_neighbors)

    # Exclude the query game by matrix row rather than by display name, and keep
    # results in a list so two different games sharing a name are not merged.
    related = [
        (game_names[game_i_map[row]], distance)
        for distance, row in zip(distances.ravel(), indices.ravel())
        if row != query_row
    ]

    # Closest (smallest cosine distance) first. If the query game was not among
    # its own neighbours there is one result too many, so trim to the request.
    related.sort(key=lambda pair: pair[1])
    related = related[:total_matches]

    # Print the games and their cosine distance to the query game.
    print(f"Games related to: {game_names[game_id]}\n")
    for game_name, distance in related:
        print(f"{game_name}: {distance:.2f}")

In [24]:
# Example query: app ID 225540 (Just Cause 3), ten related games.
game_id = 225540
find_related_games(game_id, total_matches=10)

Games related to: Just Cause 3

Just Cause 4: 0.94
Grand Theft Auto V: 0.95
Watch_Dogs 2: 0.95
Dying Light: 0.95
Rise of the Tomb Raider: 0.96
Far Cry 5: 0.96
DOOM: 0.96
The Witcher 3: Wild Hunt: 0.96
Fallout 4: 0.96
Garry's Mod: 0.96


In [25]:
# Second example query: app ID 4000 (Garry's Mod), ten related games.
game_id = 4000
find_related_games(game_id, total_matches=10)

Games related to: Garry's Mod

PAYDAY 2: 0.90
Terraria: 0.91
Portal 2: 0.91
Counter-Strike: Source: 0.93
Half-Life: 0.93
Grand Theft Auto V: 0.93
Tom Clancy's Rainbow Six Siege: 0.93
Among Us: 0.93
Rust: 0.94
People Playground: 0.94


In [26]:
# Third example query: app ID 413150 (Stardew Valley), ten related games.
game_id = 413150
find_related_games(game_id, total_matches=10)

Games related to: Stardew Valley

Terraria: 0.93
Don't Starve Together: 0.95
Undertale: 0.96
Slime Rancher: 0.96
My Time At Portia: 0.96
Hollow Knight: 0.96
The Witcher 3: Wild Hunt: 0.96
RimWorld: 0.96
The Elder Scrolls V: Skyrim: 0.96
The Binding of Isaac: Rebirth: 0.96
