# Modeling

Describes a system for taking a single user_id as an input and generating game recommendations.

### TODO:

- I'm getting a lot of ties. Integrate something like "positive review percent" that will prevent this.

- Combine all inference into a single function

- Develop a way to verify (remove a game from a user with 10+ games, try to infer it)

In [34]:
import pandas as pd
# import numpy as np
from scipy import stats
# import datatable as dt

import pickle
# import pyarrow as pa
import pyarrow.parquet as pq

from collections import Counter
from sklearn.preprocessing import MinMaxScaler

# import scipy.sparse as sp
# from scipy.sparse import coo_matrix, csr_matrix, lil_matrix
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity

import time

import warnings
warnings.filterwarnings('ignore')

%store -r tags_dict

### Load data

In [21]:
# Load the dfs to display the rec results in a human-readable way
with open('../data/interim/1 - Games DF - Wrangled.pkl', 'rb') as file :
    games_df = pickle.load(file)
with open('../data/processed/usable_recently_played.pkl', 'rb') as file :
    recently_played_df = pickle.load(file)
review_table = pq.read_table('../data/processed/usable_review_table.parquet')
with open('../data/raw/all_users', 'rb') as file :
    all_users = pickle.load(file)

# Load the tables used to define input
game_tags_matrix = load_npz('../data/processed/full_game_tag_matrix.npz')
user_info_matrix = load_npz('../data/processed/user_info_matrix.npz')

# Load the tables used for inference
game_tags_matrix_reduced = load_npz('../data/processed/reduced_game_tag_matrix.npz')
user_info_matrix_reduced = load_npz('../data/processed/user_info_matrix_reduced.npz')

# Load the index converters for the games/tags matrix
with open('../data/processed/tag_to_col_index.pkl', "rb") as file :
    tag_to_col_index = pickle.load(file)
with open('../data/processed/game_reduced_index_to_full_index.pkl', 'rb') as file :
    game_reduced_index_to_full_index = pickle.load(file)
with open('../data/processed/game_to_full_index.pkl', 'rb') as file :
    game_to_full_index = pickle.load(file)

# Load the index converters for the users/games matrix
with open('../data/processed/user_to_full_index.pkl', 'rb') as file :
    user_to_full_index = pickle.load(file)
with open('../data/processed/game_to_col_index.pkl', 'rb') as file :
    game_to_col_index = pickle.load(file)
with open('../data/processed/user_reduced_index_to_full_index.pkl', 'rb') as file :
    user_reduced_index_to_full_index = pickle.load(file)

### Define our input

In [89]:
# Let's-a-go!

begin = time.time()

In [90]:
# Get user ID
target_user_index = 10000
target_user_id = user_to_full_index.inverse[target_user_index]
print(f"User id: {target_user_id}\n")

target_user_index = user_to_full_index[target_user_id]
target_user_row = user_info_matrix[target_user_index]
target_user_touched_games = set(target_user_row.indices)

print("User profile:")
for game in target_user_touched_games :
    app_id = game_to_col_index.inverse[game]
    score = user_info_matrix[target_user_index, game]
    print(f"{score} - {games_df[games_df['app_id']==app_id]['title'].values[0]}")

User id: 76561198074110258

User profile:
0.5 - Baldur's Gate 3
0.5 - Mass Effect™ Legendary Edition
0.5 - Overwatch® 2
0.5 - The Elder Scrolls® Online


In [91]:
# Load that user's info
target_user_index = user_to_full_index[target_user_id]
target_user_row = user_info_matrix[target_user_index]



### Collaborative recommendations

In [92]:
# Get an arbitrary number of similar users
def get_X_similar_users(user, X, verbose=False) :
    """
    Takes a user id and returns a sorted descending series of the X most similar users:
        keys = user index (in the reduced matrix)
        values = cosine similarity
    Rows that have played no games that the target user hasn't also played are removed,*
    as they have no novel info for prediction.
    """

    # Establish the indices of users that can be considered
    # (they must have played at least one game that the target user has not)
    # All users in the reduced table have played at least 5 games, so 
    if len(target_user_touched_games) < 5 :
        usable_indices = [index for index in range(user_info_matrix_reduced.shape[0]) if len(set(user_info_matrix_reduced[index].indices) - target_user_touched_games) > 0]
    else :
        usable_indices = range(0, user_info_matrix_reduced.shape[0])
    # NOTE: THIS IS WHERE I'M STUCK FOR NOW
    # I NEED TO FILTER THE MATRIX OR ASCERTAIN THE INDICES FOR USERS THAT HAVE PLAYED
    # AT LEAST ONE GAME THAT THE TARGET HAS NOT.
    # THIS MAY BE A NONISSUE, SINCE ALL USERS IN THE PREDICITON SET HAVE PLAYED AT
    # LEAST 5 GAMES, AND WE ARE REMOVING COSINE SIM SCORES OF 1
    # SO SOMEONE WHO HAS PLAYED 1 FEWER GAME MIGHT SCORE HIGHER THAN SOMEONE PLAYING 2 MORE
    # BUT ALL THE OPERATIONS I CAN THINK OF TO DO THIS ON A CSR ARE WAAAAY TOO SLOW

    # Go ahead and run cosine similarity now, so that the scores can be associated with the correct index.
    row_cosine_similarities = pd.Series(cosine_similarity(target_user_row, user_info_matrix_reduced)[0], name="similarity_score")

    # Remove the 1s
    row_cosine_similarities = row_cosine_similarities[row_cosine_similarities < 1]

    row_cosine_similarities.sort_values(ascending=False, inplace=True)
    top_X_similar = row_cosine_similarities[:X]

    if verbose==True :
        print(f"Top {X} most similar users to {user}:")
        for index, value in top_X_similar.items() :
            print(f"{round(value, 9)} -- {index}")
            
    return(top_X_similar)

In [93]:
similar_users = get_X_similar_users(target_user_id, 50, verbose=True)

Top 50 most similar users to 76561198074110258:
0.670820393 -- 614255
0.612372436 -- 437876
0.603022689 -- 407885
0.577350269 -- 254315
0.577350269 -- 486036
0.56694671 -- 66675
0.554700196 -- 141993
0.534522484 -- 469312
0.534522484 -- 521334
0.530330086 -- 161030
0.530330086 -- 377736
0.530330086 -- 47871
0.530330086 -- 125292
0.530330086 -- 532320
0.530330086 -- 474596
0.530330086 -- 527720
0.530330086 -- 311291
0.530330086 -- 136824
0.530330086 -- 332179
0.530330086 -- 322545
0.510310363 -- 130851
0.5 -- 69078
0.5 -- 465097
0.5 -- 446821
0.5 -- 281461
0.5 -- 13563
0.5 -- 362956
0.5 -- 175784
0.48507125 -- 578186
0.48507125 -- 158786
0.48507125 -- 158663
0.481125224 -- 111431
0.474341649 -- 127694
0.474341649 -- 237449
0.474341649 -- 99822
0.474341649 -- 581089
0.471404521 -- 374384
0.471404521 -- 417087
0.471404521 -- 174928
0.471404521 -- 236127
0.471404521 -- 599797
0.458831468 -- 54173
0.458831468 -- 604994
0.452267017 -- 115749
0.452267017 -- 335809
0.452267017 -- 498344
0.4522

In [94]:
# Generate suggestions based on that.
# First, find out all the games these "similar users" have played.
# Then, weight those playes by the similarity score, then sum them across users.
# I should probably normalize the similarity scores before doing this.
# All of this will be easier in a df.

# NOTE：I could speed all this up by just using arrays instead of rows/columns

normalized_similar_users = pd.DataFrame(similar_users)

normalized_similar_users['similarity_z_score'] = stats.zscore(normalized_similar_users['similarity_score'])

# Now I add all their played games as columns
relevant_games = set()
for user in similar_users.keys() :
    for game in user_info_matrix[user].indices :
        if game not in target_user_touched_games :
            relevant_games.add(game)
        
for game in relevant_games :
    normalized_similar_users[game] = 0

# Now fill those columns with the scores
for user in similar_users.keys() :
    for game in user_info_matrix[user].indices :
        normalized_similar_users.loc[user, game] = user_info_matrix[user, game]

# Now multiply the scores by the normalized similarity score
for user, row in normalized_similar_users.iterrows() :
    for game in relevant_games :
        if row[game] != 0 :
            normalized_similar_users.loc[user, game] = row['similarity_z_score'] * row[game]

# Now collect those scores
# NOTE: CONVERTS GAME COL INDEX BACK TO APP_ID AT THIS STEP
collab_filt_rec_scores = {}
for game in relevant_games:
    collab_filt_rec_scores[game_to_col_index.inverse[game]] = normalized_similar_users[game].sum()

collab_filt_rec_scores = pd.Series(collab_filt_rec_scores, name="collab_rec_score")

collab_filt_rec_scores = collab_filt_rec_scores.sort_values(ascending=False)

# To make them interactible with later scores, let's standardize them
scaler = MinMaxScaler()
collab_filt_scores = pd.Series(scaler.fit_transform(collab_filt_rec_scores.values.reshape(-1,1)).flatten(), index=collab_filt_rec_scores.index)

for app_id, score in collab_filt_rec_scores.items() :
    print(f"{round(score, 3)} -- {games_df[games_df['app_id']==app_id]['title'].values[0]}")

1.172 -- OUTRIDERS
1.172 -- Amnesia: A Machine for Pigs
1.172 -- Realm of the Mad God Exalt
1.028 -- Legend of Grimrock 2
1.028 -- Xanadu Next
1.028 -- Divinity: Original Sin 2 - Definitive Edition
1.028 -- King Arthur: Knight's Tale
1.028 -- Fallout: New Vegas
1.028 -- Moonring
1.028 -- Outcast - Second Contact
1.028 -- Baldur's Gate II: Enhanced Edition
1.028 -- Batman: Arkham City - Game of the Year Edition
1.028 -- Pillars of Eternity
1.028 -- Balatro
1.028 -- ATOM RPG: Post-apocalyptic indie game
1.028 -- FINAL FANTASY VII
1.028 -- SAND LAND Demo
1.028 -- Caves of Lore
0.762 -- War Thunder
0.762 -- MONOPOLY® PLUS
0.762 -- Another World – 20th Anniversary Edition
0.762 -- Assassin’s Creed® IV Black Flag™
0.762 -- The Witcher® 3: Wild Hunt
0.762 -- Max Payne 3
0.753 -- It Takes Two
0.753 -- TaleSpire
0.589 -- Wreckfest
0.558 -- Horizon Zero Dawn™ Complete Edition
0.536 -- Dota 2
0.511 -- Arma 3
0.5 -- Stygian: Reign of the Old Ones
0.5 -- Monster Sanctuary
0.5 -- The Wild Eight
0.5 

### Content-based recommendations

In [95]:
# Get user's most recent games.

# I already have the games in terms of col indices in the users matrix.
# I need the indices for the games matrix, to do cos similarity.
played_game_app_ids = [game_to_col_index.inverse[game] for game in target_user_touched_games]
played_game_full_matrix_row_indices = [game_to_full_index[game] for game in games_played_app_ids]
played_game_full_matrix_row_indices

[400]

In [96]:
# NOTE: STRETCH: Determine multimodality, possibly split the process here

In [97]:
# Determine most similar games for each, multiply by user preference, MinMaxScale.

def get_content_scores(user_row, limit, verbose=False) :

    """
    Takes a row from the users/games table, then does the following:
        1. Finds recs for each from the reduced game/tags matrix
        2. Creates a descending-sorted 10-row Series:
            keys = game's index (relative to main games_df)
            values = queried game's cosine similarity score to the queried game
        3. Weights all values in the series by the user's preference for the game
        4. Combines all resulting series into a single series with sim scores summed
        5. Returns the series
    """

    # Find the sim scores for each game, adding them to the main list

    similarity_series_list = []
    full_row_indexes = []

    for query_index in user_row.indices :

        # Get the reduced game/tags matrix index
        current_app_id = game_to_col_index.inverse[query_index] 
        current_full_row_index = game_to_full_index[current_app_id]
        # Let's grab the full row index of each game for use later
        full_row_indexes.append(current_full_row_index)
        try :
            reduced_row_index = game_reduced_index_to_full_index.inverse[current_full_row_index]
        except :
            ####
            continue 

        # Find the similarity score between games
        row_cosine_similarities = pd.Series(cosine_similarity(game_tags_matrix[current_full_row_index], game_tags_matrix_reduced)[0])
        # Reindex the predictions back to the main matrix index
        row_cosine_similarities.index = [game_reduced_index_to_full_index[index] for index in row_cosine_similarities.index]
        row_cosine_similarities.sort_values(ascending=False, inplace=True)
        top_10_similar = row_cosine_similarities[:11]

        # Get a coefficient to represent the user's preference for the game in question
        # First we have to find the game's column in the full user matrix
        preference_coefficient = user_row[0, query_index]
        # Then we just multiply.
        top_10_similar = top_10_similar * preference_coefficient

        # That's all we need for the score! Let's append.
        similarity_series_list.append(top_10_similar)

        if verbose == True :
            print(f"Recs for {games_df.loc[current_full_row_index]['title']}:")
            for rec in top_10_similar.items() :
                print(f"{round(rec[1], 3)} -- {games_df.loc[rec[0]]['title']}")

    # Combine the serieses into the main series
    final_scores = pd.Series()
    for similarity_series in similarity_series_list :
        final_scores = final_scores.add(similarity_series, fill_value=0)
    final_scores = final_scores.sort_values(ascending=False)

    # Remove already-played games from the main series
    for game in full_row_indexes :
        try :
            final_scores = final_scores.drop(labels=game)
        except :
            continue

    # Scale the series to make it similar to the collaborative score series
    scaler = MinMaxScaler()
    final_scores = pd.Series(scaler.fit_transform(final_scores.values.reshape(-1,1)).flatten(), index=final_scores.index)

    # Return the desired number of values
    if len(final_scores) < limit :
        limit = len(final_scores)
    final_scores = final_scores[:limit]

    # Set index to app_id
    final_scores.index = [game_to_full_index.inverse[game] for game in final_scores.index]

    if verbose == True :
        print('------------------')
        for game, score in final_scores.items() :
            print(f"{round(score, 3)} -- {games_df[games_df['app_id']==game]['title'].values[0]}")     
    
    return(final_scores)

In [98]:
content_filt_scores = get_content_scores(target_user_row, 50, verbose=True)


Recs for Mass Effect™ Legendary Edition:
0.5 -- Mass Effect™ Legendary Edition
0.394 -- Mass Effect (2007)
0.373 -- Mass Effect 2 (2010) Edition
0.371 -- Mass Effect™: Andromeda Deluxe Edition
0.325 -- STAR WARS™ Knights of the Old Republic™ II - The Sith Lords™
0.317 -- Ghostcon: Elementals
0.308 -- GreedFall
0.308 -- Dragon Age™ Inquisition
0.303 -- Jade Empire™: Special Edition
0.298 -- INSOMNIA: The Ark
0.298 -- STAR WARS™ Knights of the Old Republic™
Recs for Baldur's Gate 3:
0.5 -- Baldur's Gate 3
0.436 -- Hexxen: Hunters
0.435 -- The Way of Wrath
0.416 -- The Way of Wrath Demo
0.333 -- Dragonheir: Silent Gods
0.333 -- Solasta: Crown of the Magister
0.313 -- Dragon Age™ Inquisition
0.312 -- New Arc Line
0.304 -- Warhammer 40,000: Rogue Trader
0.3 -- Pathfinder: Wrath of the Righteous - Enhanced Edition
0.293 -- Forsaken Champions
Recs for The Elder Scrolls® Online:
0.5 -- The Elder Scrolls® Online
0.497 -- The Elder Scrolls Online: Gold Road
0.456 -- The Elder Scrolls Online: Nec

### Combine the recs for final rec

In [112]:
# It may be that tweaking the relative weights of the collaborative and content-based filter results
# can improve accuracy. Let's define this as a function so we can play with that programmatically
# later, if need be.

def combine_scores(collaborative, content_based, ratio=0.5, recs=10) :
    """
    Takes a series of collaborative filtering scores (key=app_id, value=score)
    And a series of content based filtering scores with the same schema
    And a 0-1 ratio of importance between the two (higher ratio favors collaborative scores)
    And the number of recommendations to be returned

    Returns a series of game names and recommendation scores
    """
    collaborative = collaborative * ratio
    content_based = content_based * (1-ratio)

    final_recs = collaborative.add(content_based, fill_value=0)

    final_recs.index = [games_df[games_df['app_id']==index]['title'].values[0] for index in final_recs.index]

    final_recs = final_recs.sort_values(ascending=False)

    if len(final_recs) < recs :
        recs = len(final_recs)
    
    final_recs = final_recs[:recs]

    return final_recs

In [121]:
# Do that
end_result = combine_scores(collab_filt_scores, content_filt_scores, ratio=0.4)

# Display the result!
print("------ User profile:")
for game in target_user_touched_games :
    app_id = game_to_col_index.inverse[game]
    score = user_info_matrix[target_user_index, game]
    print(f"{score} - {games_df[games_df['app_id']==app_id]['title'].values[0]}")

print('')
print('------ Recommendations')
print(end_result.head(10))


------ User profile:
0.5 - Baldur's Gate 3
0.5 - Mass Effect™ Legendary Edition
0.5 - Overwatch® 2
0.5 - The Elder Scrolls® Online

------ Recommendations
Dragon Age™ Inquisition               0.600000
Shatterline                           0.467377
FINAL FANTASY XIV Online              0.437352
OUTRIDERS                             0.400000
Realm of the Mad God Exalt            0.400000
Amnesia: A Machine for Pigs           0.400000
Outcast - Second Contact              0.392317
Legend of Grimrock 2                  0.392317
FINAL FANTASY VII                     0.392317
Baldur's Gate II: Enhanced Edition    0.392317
dtype: float64


In [102]:
print(f"Inference took {round(time.time()-begin, 2)}s")

Inference took 43.54s


### Testing

The basic idea is to remove a game or games from a user's profile (preferably a user with a significant number of touched games) and see if the engine recommends that game for the modified user profile.

In [None]:
# Programmatically assemble a set of usable user profiles (10+ touched games)

In [None]:
# Randomly remove one or more game(s) from each

In [None]:
# Run inference on each modified profile

In [None]:
# Assess error