In [30]:
# Import the required libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# English stop-word set, built once so membership tests are O(1).
# NOTE(review): presumably this needs the NLTK 'stopwords' corpus to have been
# downloaded beforehand (e.g. nltk.download('stopwords')) — confirm on a fresh kernel.
stop_words = set(stopwords.words('english'))

In [31]:
# Load the Steam reviews export into a dataframe.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
#
# Alternate local source used by Josh G (kept for reference):
# df_reviews_dummies = pd.read_csv('../../../../../Downloads/Prepped_test_out.csv', low_memory=False)
#
# Source used by Gabe:
df_reviews = pd.read_csv(
    'steam_reviews.csv',
    low_memory=False,
)

In [32]:
# Show the available columns so the ones needed for the recommender can be picked out.
df_reviews.columns

Index(['Unnamed: 0', 'app_id', 'app_name', 'review_id', 'language', 'review',
       'timestamp_created', 'timestamp_updated', 'recommended',
       'votes_helpful', 'votes_funny', 'weighted_vote_score', 'comment_count',
       'steam_purchase', 'received_for_free', 'written_during_early_access',
       'author.steamid', 'author.num_games_owned', 'author.num_reviews',
       'author.playtime_forever', 'author.playtime_last_two_weeks',
       'author.playtime_at_review', 'author.last_played'],
      dtype='object')

In [33]:
# Keep only what the game x user recommendation matrix needs: which game,
# which reviewer, and whether the reviewer recommended it. `.copy()` detaches
# the slice from df_reviews so later edits cannot touch the raw frame.
user_rec_df = df_reviews[["app_id", "author.steamid", "recommended"]].copy()

# Preview a few rows. The seed is fixed so the displayed sample is the same on
# every Restart & Run All instead of changing with each execution.
user_rec_df.sample(10, random_state=42)

Unnamed: 0,app_id,author.steamid,recommended
7101146,945360,76561198370080884,True
14109834,578080,76561198926007617,False
4633169,431960,76561199025213862,True
6610381,359550,76561198308213607,True
1074224,4000,76561198435821504,True
4818198,787860,76561198079572865,True
6124015,359550,76561198320300927,True
12625224,35140,76561198159559630,True
5853831,413150,76561198100928867,True
12937927,271590,76561198253439004,True


In [34]:
# These counts define the dimensions of the user-review matrix built below:
# total reviews, distinct games, and distinct reviewers.
n_ratings = user_rec_df.shape[0]
n_games = user_rec_df['app_id'].nunique(dropna=False)
n_users = user_rec_df['author.steamid'].nunique(dropna=False)
print(n_ratings, n_games, n_users)

21747371 315 12406560


In [35]:
# Map raw user / game IDs to contiguous matrix indices, and back again.
#
# np.unique returns the sorted distinct IDs. It has to sort the whole column
# (one entry per review), so compute it once per column and reuse the result
# for both the forward and the inverse map.
unique_users = np.unique(user_rec_df["author.steamid"])
unique_games = np.unique(user_rec_df["app_id"])

# Forward maps: raw ID -> row/column position in the sparse matrix.
user_map = dict(zip(unique_users, range(n_users)))
game_map = dict(zip(unique_games, range(n_games)))
# Sanity check: every distinct ID received exactly one index.
print(len(user_map) == n_users)
print(len(game_map) == n_games)

# Inverse maps: matrix position -> raw ID.
user_i_map = dict(zip(range(n_users), unique_users))
game_i_map = dict(zip(range(n_games), unique_games))

True
True


In [36]:
# Translate every review's raw IDs into matrix coordinates for csr_matrix:
# one user (column) index and one game (row) index per review, in row order.
user_index = list(map(user_map.__getitem__, user_rec_df['author.steamid']))
game_index = list(map(game_map.__getitem__, user_rec_df['app_id']))

In [37]:
# Build the sparse game x user matrix: one row per game, one column per user,
# with the review's `recommended` flag as the cell value.
# NOTE(review): a False recommendation is stored as 0, which is also the value
# of a cell with no review at all, so the similarity search below can only see
# positive recommendations — confirm this is the intended signal.
recommended_flags = user_rec_df["recommended"]
matrix = csr_matrix(
    (recommended_flags, (game_index, user_index)),
    shape=(n_games, n_users),
)
matrix

<315x12406560 sparse matrix of type '<class 'numpy.bool_'>'
	with 21612012 stored elements in Compressed Sparse Row format>

In [38]:
def find_related_games(game_id, k):
    """Return the IDs of the ``k`` games most similar to ``game_id``.

    Every game is represented by its row of the notebook-level sparse
    ``matrix`` (one column per user, holding that user's ``recommended``
    flag). Two games are considered similar when their rows have a small
    cosine distance, i.e. when largely the same users recommended both.
    The neighbours are found with a brute-force nearest-neighbour search.

    Parameters
    ----------
    game_id
        Raw app ID of the query game; must be a key of ``game_map``.
    k : int
        Number of related games wanted. If fewer than ``k`` other games
        exist, all of the other games are returned.

    Returns
    -------
    list
        Up to ``k`` raw app IDs, nearest first. The query game itself is
        never included.

    Raises
    ------
    KeyError
        If ``game_id`` is not present in ``game_map``.
    ValueError
        If ``k`` is negative.
    """
    # Row of the query game in the matrix (raises KeyError for unknown IDs).
    query_row = game_map[game_id]

    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k == 0:
        return []

    # Ask for one extra neighbour because the query game is normally its own
    # closest match, but never ask for more rows than the matrix holds.
    n_neighbors = min(k + 1, matrix.shape[0])

    # Brute force is the search strategy used with the cosine metric on this
    # sparse input; fitting just registers the matrix with the estimator.
    kNN = NearestNeighbors(algorithm='brute', metric='cosine')
    kNN.fit(matrix)

    query_vector = matrix[query_row].reshape(1, -1)
    _, indices = kNN.kneighbors(query_vector, n_neighbors=n_neighbors)

    # Drop the query game by identity rather than by position: with tied
    # distances (or an all-zero row) it is not guaranteed to come back first,
    # and blindly removing element 0 would discard a real neighbour instead.
    neighbour_ids = [
        game_i_map[row] for row in indices.ravel() if row != query_row
    ]

    # If the query game was not among the results we hold k + 1 candidates.
    return neighbour_ids[:k]
  

In [39]:
# Lookup table from app_id to game name, used to print readable results.
# Collapse to one row per app_id with vectorised pandas operations before
# building the dict, rather than iterating over every review row in Python.
# keep='last' mirrors plain dict construction, where a later row's name would
# overwrite an earlier one for the same app_id.
game_names = (
    df_reviews[['app_id', 'app_name']]
    .drop_duplicates(subset='app_id', keep='last')
    .set_index('app_id')['app_name']
    .to_dict()
)

In [40]:
# Demo: the ten games most similar to app 225540.
game_id = 225540
recommended_games = find_related_games(game_id, k=10)

# Print the query game's name, then one related game name per line.
print(f"Games related to: {game_names[game_id]}")
for related_id in recommended_games:
    print(game_names[related_id])

Games related to: Just Cause 3
Just Cause 4
Grand Theft Auto V
Watch_Dogs 2
Dying Light
Rise of the Tomb Raider
Far Cry 5
DOOM
The Witcher 3: Wild Hunt
Fallout 4
Garry's Mod


In [41]:
# Demo: the ten games most similar to app 4000.
game_id = 4000
recommended_games = find_related_games(game_id, k=10)

# Print the query game's name, then one related game name per line.
print(f"Games related to: {game_names[game_id]}")
for related_id in recommended_games:
    print(game_names[related_id])

Games related to: Garry's Mod
PAYDAY 2
Terraria
Portal 2
Counter-Strike: Source
Half-Life
Grand Theft Auto V
Tom Clancy's Rainbow Six Siege
Among Us
Rust
People Playground


In [42]:
# Demo: the ten games most similar to app 413150.
game_id = 413150
recommended_games = find_related_games(game_id, k=10)

# Print the query game's name, then one related game name per line.
print(f"Games related to: {game_names[game_id]}")
for related_id in recommended_games:
    print(game_names[related_id])

Games related to: Stardew Valley
Terraria
Don't Starve Together
Undertale
Slime Rancher
My Time At Portia
Hollow Knight
The Witcher 3: Wild Hunt
RimWorld
The Elder Scrolls V: Skyrim
The Binding of Isaac: Rebirth
