### Preprocessing

Uses the following data:  
1. games_df, a table of general information about (almost) all Steam games
2. review_table, a table of 6+million reviews scraped from the Steam store, listing the user, game, etc
3. recently_played_df, a table of users' recently played games with playtimes

Accomplishes the following:
1. Generate tables containing vectors for every game and every user. Our inputs will be limited to items in this table.
2. Generate reduced tables containing vectors of only the games and users that have rich enough info to be useful for prediction.

In [74]:
import pandas as pd
import numpy as np

from bidict import bidict

import pickle
import pyarrow as pa
import pyarrow.parquet as pq


import scipy.sparse as sp
from scipy.sparse import coo_matrix, csr_matrix, lil_matrix, save_npz

%store -r tags_dict

In [75]:
# Load

with open('../data/interim/1 - Games DF - Wrangled.pkl', 'rb') as file :
    games_df = pickle.load(file)

review_table = pq.read_table('../data/interim/cleaned_reviews.parquet')

with open('../data/interim/recently_played_cleaned.pkl', 'rb') as file :
    recently_played_df = pickle.load(file)

### Make games vectors/matrices

We'll produce one matrix with all known games. This will be used to vectorize input.  
  
Then we'll produce a matrix that contains only those games with sufficient information to be subjects of recommendation.  

In [76]:
# Isolate the info we'll use for the vectors
games_df_tags_only = games_df[['app_id', 'tags', 'tag_list']]
games_df_tags_only.head()

Unnamed: 0,app_id,tags,tag_list
0,730,"[FPS, Shooter, Multiplayer, Competitive, Actio...","[FPS, Shooter, Multiplayer, Competitive, Actio..."
1,553850,"[Action, Online Co-Op, Multiplayer, Third-Pers...","[Action, Online Co-Op, Third-Person Shooter, M..."
2,1086940,"[RPG, Choices Matter, Story Rich, Character Cu...","[RPG, Choices Matter, Story Rich, Character Cu..."
3,1245620,"[Souls-like, Dark Fantasy, Open World, RPG, Di...","[Souls-like, Dark Fantasy, Open World, RPG, Di..."
4,1623730,"[Multiplayer, Open World, Survival, Creature C...","[Multiplayer, Open World, Survival, Creature C..."


In [77]:
# Make a handy dict for the app_ids and their new indexes
game_to_full_index = bidict()
for index, row in games_df_tags_only.iterrows() :
    game_to_full_index[index] = row['app_id']
game_to_full_index = game_to_full_index.inverse

In [78]:
# Get our list of columns
used_tags = set(tags_dict.values())

In [79]:
# Prepare our sparse matrix
# NOTE: We will weight PRIMARY tags more strongly than non-primary tags.

weight_ratio = 0.8

matrix_values = []

for index, row in games_df_tags_only.iterrows() :
    skips = set()
    for tag in row['tags'] :
        skips.add(tag)
        tup = (index, tag, 1)
        matrix_values.append(tup)
    for tag in row['tag_list'] :
        if tag not in skips :
                    tup = (index, tag, weight_ratio)
                    matrix_values.append(tup)

matrix_values[0]

(0, 'FPS', 1)

In [80]:
# To make the matrix, we must index our tags.
tag_to_col_index = bidict()
i = 0
for value in tags_dict.values() :
    tag_to_col_index[value] = i
    i += 1

In [81]:
# Make the matrix

rows = [row[0] for row in matrix_values]
columns = [tag_to_col_index[row[1]] for row in matrix_values]
values = [row[2] for row in matrix_values]

matrix_row_count = max(rows)+1
matrix_col_count = max(columns)+1

game_tags_matrix = coo_matrix((values, (rows, columns)), shape=(matrix_row_count, matrix_col_count))
game_tags_matrix = csr_matrix(game_tags_matrix)

In [82]:
# Good job, everybody! Let's save it and move on.
save_npz('../data/processed/full_game_tag_matrix.npz', game_tags_matrix)

# And also the index dicts.
with open('../data/processed/tag_to_col_index.pkl', "wb") as file :
    pickle.dump(tag_to_col_index, file)

with open('../data/processed/game_to_full_index.pkl', "wb") as file :
    pickle.dump(game_to_full_index, file)


Now we subset this matrix to include only games with 10+ tags.

This index will be smaller, but we must be careful to note the original index values. That's the only way we can relate the rows in this matrix to any other matrix.

In [83]:
nonzero_counts = pd.Series(game_tags_matrix.getnnz(axis=1))
can_keep = nonzero_counts >= 10
game_tags_matrix_reduced = game_tags_matrix[can_keep]

## This creates a dict with:
##  KEYS == reduced matrix index
##  VALUES == corresponding full matrix index
game_reduced_index_to_full_index = bidict()
i=0
for index, value in can_keep.items() :
    if value==True :
        game_reduced_index_to_full_index[i]=index
        i += 1

In [84]:
# Save! That! Matrix!
save_npz('../data/processed/reduced_game_tag_matrix.npz', game_tags_matrix_reduced)

# And the dict, of course.
with open('../data/processed/game_reduced_index_to_full_index.pkl', 'wb') as file :
    pickle.dump(game_reduced_index_to_full_index, file)

### Now we make the users matrices...

In [85]:
recently_played_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4181501 entries, 0 to 4181500
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   user         object
 1   app_id       int64 
 2   playtime_2w  int64 
 3   playtime_f   int64 
dtypes: int64(3), object(1)
memory usage: 127.6+ MB


In [86]:
review_table.shape

(6747619, 11)

In [87]:
# Because our games_df does not contain every single game on Steam, it's possible
# that a game will be touched in a review or recently_played about which we cannot
# make inference.
# Let's make a set of all usable games to help limit our tables to legal values.
usable_app_ids = set(games_df['app_id'].values)
len(usable_app_ids)

100894

In [88]:
# Now let's reduce the above datasets to only those which touch usable games.
recently_played_df = recently_played_df[recently_played_df['app_id'].isin(usable_app_ids)].reset_index(drop=True)
recently_played_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3593525 entries, 0 to 3593524
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   user         object
 1   app_id       int64 
 2   playtime_2w  int64 
 3   playtime_f   int64 
dtypes: int64(3), object(1)
memory usage: 109.7+ MB


In [89]:
# To subset the review_talbe, we'll have to pandacize it first.

review_df = review_table.to_pandas()
review_df = review_df[review_df['app_id'].isin(usable_app_ids)].reset_index(drop=True)
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5726093 entries, 0 to 5726092
Data columns (total 10 columns):
 #   Column           Dtype  
---  ------           -----  
 0   user             object 
 1   app_id           int64  
 2   positive         int64  
 3   total_playtime   float64
 4   review_playtime  float64
 5   text             object 
 6   helpful_count    int64  
 7   review_date      object 
 8   edit_date        object 
 9   date_scraped     object 
dtypes: float64(2), int64(3), object(5)
memory usage: 436.9+ MB


In [90]:
# We should save processed versions of these for use in the modeling notebook.
usable_review_table = pa.Table.from_pandas(review_df)
pq.write_table(usable_review_table, '../data/processed/usable_review_table.parquet')

with open('../data/processed/usable_recently_played.pkl', 'wb') as file :
    pickle.dump(recently_played_df, file)

In [91]:
# The two tables above contain different but overlapping sets of users.
# In order to combine them into a single users matrix, we must first create
# a unified index for all touched users.

recently_users = set(recently_played_df['user'].values)
review_users = set(review_table['user'].to_pylist())
touched_users = recently_users | review_users

user_to_full_index = bidict()
i=0
for user in touched_users :
    user_to_full_index[user] = i
    i+=1

In [92]:
# We also need a unified index for games.

recently_games =set(recently_played_df['app_id'].values)
review_games = set(review_table['app_id'].to_pylist())
touched_games = recently_games | review_games

game_to_col_index = bidict()
i=0
for game in touched_games :
    game_to_col_index[game] = i
    i+=1

In [93]:
# Now let's prepare our points.
recently_played_points = recently_played_df.apply( \
                                lambda row: (user_to_full_index[row['user']], game_to_col_index[row['app_id']], 0.5), axis=1) \
                                .tolist()

In [94]:
# Since we'll have to update this matrix in a sec, let's make it a lil_matrix first.

user_info_matrix = lil_matrix((len(touched_users), len(touched_games)), dtype=float)

for point in recently_played_points :
    user_info_matrix[point[0], point[1]] = point[2]

In [95]:
# Now let's prepare the review data to update that matrix.

positive_reviews = review_df[review_df['positive']==1][['user', 'app_id']]
negative_reviews = review_df[review_df['positive']==0][['user', 'app_id']]

positive_points = positive_reviews.apply( \
                lambda row: (user_to_full_index[row['user']], game_to_col_index[row['app_id']], 1), axis=1) \
                .tolist()
                                
negative_points = negative_reviews.apply( \
                lambda row: (user_to_full_index[row['user']], game_to_col_index[row['app_id']], -1), axis=1) \
                .tolist()

In [96]:
## Now, we update the matrix.

for point in positive_points :
    user_info_matrix[point[0], point[1]] = point[2]

for point in negative_points :
    user_info_matrix[point[0], point[1]] = point[2]

In [97]:
## Just for funsies, make it the same format as our games matrix.
user_info_matrix = user_info_matrix.tocsr()

In [98]:
# Save! That! Matrix!
save_npz('../data/processed/user_info_matrix.npz', user_info_matrix)

# And the dicts, of course.
with open('../data/processed/user_to_full_index.pkl', 'wb') as file :
    pickle.dump(user_to_full_index, file)

with open('../data/processed/game_to_col_index.pkl', 'wb') as file :
    pickle.dump(game_to_col_index, file)

In [99]:
# Now we create a subset of the matrix that contains only users with enough info for prediction.
# Where should we put the threshold?

nonzero_counts = pd.Series(user_info_matrix.getnnz(axis=1))
nonzero_counts.describe()

count    1.887354e+06
mean     4.779766e+00
std      7.041132e+00
min      0.000000e+00
25%      1.000000e+00
50%      3.000000e+00
75%      6.000000e+00
max      5.517000e+03
dtype: float64

In [100]:
# I'll arbitrarily choose 5. Sue me! I dare you.

can_keep = nonzero_counts >= 5
user_info_matrix_reduced = user_info_matrix[can_keep]

## This creates a dict with:
##  KEYS == reduced matrix index
##  VALUES == corresponding full matrix index
user_reduced_index_to_full_index = bidict()
i=0
for index, value in can_keep.items() :
    if value==True :
        user_reduced_index_to_full_index[i]=index
        i += 1

In [101]:
# Save! That! Matrix!
save_npz('../data/processed/user_info_matrix_reduced.npz', user_info_matrix_reduced)

# etc etc
with open('../data/processed/user_reduced_index_to_full_index.pkl', 'wb') as file :
    pickle.dump(user_reduced_index_to_full_index, file)

At time of inference, the top X consine similarity users' rows will be called up, and the most common games not already played by the user will be recommended.