## True hybrid content-based and collaborative filtering model

In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Reader, Dataset, KNNWithZScore
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pd.read_csv('../data/steam_app_metadata_clean.csv')
# gamesinfo_df = pd.read_csv('../data/steam_gameinfo.csv')

### COLLABORATIVE FILTERING FOR USER PLAYTIME DATA

In [3]:
# check datasets appid length
len(users_df['appid'].unique()) == len(games_df['appid'].unique() == len(games_df))

True

#### 3 Scaling methods
#### Test around to see which method or combination of methods is best
1. RobustScaler and StandardScaler
2. Just normalize
3. RobustScaler and Normalize?

In [None]:
# first do robustscaler to minimize outliers
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].array.reshape(-1,1))

In [6]:
# use StandardScaler to scale user playtimes
scaler = StandardScaler()
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [4]:
# Normalize the targets between 0 and 1. Makes it easy to train.
min_playtime = min(users_df['playtime_forever'])
max_playtime = max(users_df['playtime_forever'])
users_df['playtime_forever'] = users_df['playtime_forever'].apply(lambda x: (x - min_playtime) / (max_playtime - min_playtime)).values

In [5]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [6]:
# make a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

#### Collaborative Filtering: KNNWithZScore

GridSearch for KNN

In [25]:
# set up gridsearch param_grid for KNNWithZScore
knn_param_grid = {'k': [1000, 2000, 3000]}

In [26]:
grid_search = GridSearchCV(KNNWithZScore, knn_param_grid, cv=4, n_jobs=3, joblib_verbose=10, verbose=10)

In [27]:
grid_search.fit(data)

Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


In [1]:
# print best params for grid search
print('Best params: ', grid_search.best_params)

NameError: name 'grid_search' is not defined

In [16]:
# print best score for grid search
print('Score: ', grid_search.best_score)

Score:  {'rmse': 0.9886161579841967, 'mae': 0.9872910001510786}


#### Train KNN

In [9]:
k_value = 3000

In [12]:
# make knn model with best params from gridsearch
knn = KNNWithZScore(k=k_value, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [13]:
# fit the training data and test with our test set
predictions = knn.fit(trainset).test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [14]:
# get accuracy
accuracy.rmse(predictions)

RMSE: 0.9887


0.9886885593584188

In [7]:
# build a full trainset now
trainset = data.build_full_trainset()

In [10]:
# instantiate new knn
knn = KNNWithZScore(k=k_value, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [11]:
# fit the knn
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7f1688320520>

#### Recommend KNN

In [12]:
# function that takes game title and returns the knn model inner id
def get_innerid(title):
    # get appid from games_df
    appid = games_df[games_df['name'] == title]['appid'].values[0]
    inner_id = trainset.to_inner_iid(appid)
    return inner_id

In [13]:
# function that takes knn model innerid and returns the game title
def get_title(inner_id):
    steam_id = trainset.to_raw_iid(inner_id)
    title = games_df[games_df['appid'] == steam_id].iloc[0]['name']
    return title

In [14]:
def get_knn_appid(inner_id):
    return trainset.to_raw_iid(inner_id)

### CONTENT-BASED Cosine Similarity for Game Metadata

In [15]:
# instantiate tfidfvectorizer
tfidf = TfidfVectorizer(max_features=1500, lowercase=False, min_df=5, ngram_range=(1,3))

In [16]:
# make description column into type string, otherwise tfidf cries
games_df['description_clean'] = games_df['description_clean'].astype(str)

In [17]:
# fit tfidfvectorizer to description column
tfidf_matrix = tfidf.fit_transform(games_df['description_clean'])

In [18]:
# make matrix into dataframe
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [19]:
# games_df columns 5 to second last are the tags
game_tags = games_df.iloc[:, 5:-1]

In [20]:
# concatentate tfidf_df and game_tags
tfidf_df = pd.concat([tfidf_df, game_tags], axis=1)

In [21]:
# make matrix for cosine sim
matrix = tfidf_df.values

In [22]:
matrix.shape

(19818, 1574)

In [23]:
# make a cosine similarity matrix, maybe linear kernel instead?
cosine_sim = cosine_similarity(matrix, matrix)

In [24]:
cosine_sim.shape

(19818, 19818)

In [43]:
# Construct a reverse map of indices and game titles
indices = pd.Series(games_df.index, index=games_df['name'])

In [45]:
def recommend_content(title, sim_matrix):
    '''Get similar games based on content using the similarity matrix'''
    # get index for our game
    idx = indices[title]
    
    # get pairwise similarity scores of all games w.r.t to our game
    sim_scores = list(enumerate(sim_matrix[idx]))
    
    # sort scores based on similarity
    sorted_sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[0]]: i[1] for i in sorted_sim_scores[1:21]}

    return content_similar_scores

In [27]:
# test cosine similarity matrix
recommend_content('LIMBO')

{'Jump King': 0.9090909090909093,
 '3000th Duel': 0.9090909090909093,
 'Driven Out': 0.9090909090909093,
 'Blasphemous': 0.8800214111431761,
 'Intruders: Hide and Seek': 0.875921833811626,
 'Hollow Knight': 0.8703882797784892,
 'Moonlighter': 0.8703882797784892,
 'Mute Crimson+': 0.8703882797784892,
 'Road 96 🛣️': 0.8703882797784892,
 'Celeste': 0.8703882797784892,
 'Jenny LeClue - Detectivu': 0.8703882797784892,
 'Valfaris': 0.8609166731619822,
 'GRIS': 0.8581163303210333,
 'Over the Alps': 0.8581163303210333,
 'Lost Ruins': 0.8581163303210333,
 'Super Meat Boy Forever': 0.8451443077823318,
 'Slime Rancher': 0.836242010007091,
 'CARRION': 0.836242010007091,
 'ScourgeBringer': 0.836242010007091,
 'Shovel Knight: Specter of Torment': 0.836242010007091}

### Create one weighted similarity matrix from knn and cosine similarity matrices

In [28]:
knn_similarities = knn.compute_similarities()

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [29]:
# make knn_similarities into a numpy array
knn_similarities = np.array(knn_similarities)

In [30]:
# Test
(knn_similarities.transpose() == knn_similarities).all()

True

In [31]:
# function for mapping from knn similarity matrix to cosine similarity indices
def knn_index_to_cos_index(knn_index):
    return indices[get_title(knn_index)]

In [32]:
# given a knn vector, sort it to cosine vector order
def knn_vector_to_cos_vector(knn_vector):
    # make cos_vector that is of length knn_vector
    cos_vector = np.zeros(len(knn_vector))
    for i in range(len(knn_vector)):
        cos_vector[knn_index_to_cos_index(i)] = knn_vector[i]
    return cos_vector

In [34]:
# create a copy of knn_similarities with cosine indices
knn_sim_ordered = np.copy(knn_similarities)

In [35]:
for i in tqdm(range(len(knn_similarities))):
    knn_sim_ordered[knn_index_to_cos_index(i)] = knn_similarities[i]

100%|██████████| 19818/19818 [00:04<00:00, 4297.03it/s]


In [36]:
# test if vectors are in the correct location
knn_sim_ordered[indices['LIMBO']].mean() == knn_similarities[get_innerid('LIMBO')].mean()

True

In [37]:
# do the transpose
knn_sim_ordered = knn_sim_ordered.transpose()

In [38]:
knn_sim_ordered_2 = np.copy(knn_sim_ordered)

In [39]:
for i in tqdm(range(len(knn_similarities))):
    knn_sim_ordered_2[knn_index_to_cos_index(i)] = knn_sim_ordered[i]

100%|██████████| 19818/19818 [00:05<00:00, 3440.45it/s]


In [40]:
# test if transpose is the same as original
(knn_sim_ordered_2.transpose() == knn_sim_ordered_2).all()

True

In [41]:
# test if knn_sim_ordered_2 is proper, should give same results as knn
recommend_content('LIMBO', knn_sim_ordered_2)

{'Mark of the Ninja': 0.8270509213987579,
 'VVVVVV': 0.8235271539319691,
 'World of Goo': 0.8231959832567497,
 "Mirror's Edge™": 0.8184542146833247,
 'Dead Space': 0.7965762606112908,
 'ORION: Prelude': 0.7853443560497188,
 'Magicka': 0.7791670352261109,
 'Trine 2: Complete Story': 0.7686020973785362,
 'INSIDE': 0.7651592572208243,
 'F.E.A.R. 3': 0.7624763911310805,
 'Rock of Ages': 0.7596923464643296,
 'Little Nightmares': 0.7578189386583029,
 'Scribblenauts Unlimited': 0.7487106438148136,
 'McPixel': 0.7445713729424711,
 'To the Moon': 0.7405275249772677,
 'Gone Home': 0.7386569119513423,
 'Tomb Raider': 0.7362503212578412,
 'Dear Esther': 0.7170404371258016,
 'Indie Game: The Movie': 0.7156302502155343,
 'Deadlight': 0.704006593446912}

In [42]:
# change weights if desired
knn_sim_weight = 0.75
cos_sim_weight = 0.25

In [43]:
# make a weighted similarity matrix 0.75 knn, 0.25 cosine
weighted_sim = (knn_sim_ordered_2 * knn_sim_weight) + (cosine_sim * cos_sim_weight)

In [44]:
# test weighted matrix, should give same results as weighted_recommend_content
recommend_content('LIMBO', sim_matrix=weighted_sim)

{'Mark of the Ninja': 0.8021063728672503,
 'VVVVVV': 0.7775459027156847,
 'Dead Space': 0.7733138131288303,
 "Mirror's Edge™": 0.7584975075735776,
 'INSIDE': 0.75378938949706,
 'Tomb Raider': 0.7480251038935409,
 'Trine 2: Complete Story': 0.7409394967738444,
 'World of Goo': 0.740488478421895,
 'Little Nightmares': 0.728264741260435,
 'Magicka': 0.7206125916478496,
 'Deadlight': 0.7186974629343026,
 'Firewatch': 0.717219242194181,
 'Gone Home': 0.7130835930544159,
 'To the Moon': 0.7061513160218327,
 'McPixel': 0.6916789774291098,
 'ORION: Prelude': 0.6901382464067753,
 "Hellblade: Senua's Sacrifice": 0.6872420125138625,
 'METAL GEAR SOLID V: GROUND ZEROES': 0.6753099630586775,
 'F.E.A.R. 3': 0.6723610748742316,
 'Brutal Legend': 0.6696558244998068}

In [45]:
# store weighted similarity matrix in pickle
pickle.dump(weighted_sim, open('../models/weighted_sim.pkl', 'wb'))

### Matrix is big and symmetric, do stuff to save space

In [33]:
# convert float64 to float16 to save space
weighted_sim_compressed = np.array(weighted_sim, dtype=np.float16)

In [110]:
# store upper in pickle 
pickle.dump(weighted_sim_compressed, open('../models/weighted_sim_compressed.pkl', 'wb'))

Upper triangle of symmetric matrix to array WIP

Theoretically even more space could be saved, possibly at the expense of column retrieval overhead

In [94]:
# keep only upper triangle of matrix
compressed_upper_triangle = np.triu(weighted_sim_compressed, k=1)

In [111]:
upper_triangle_indices = np.triu_indices(len(weighted_sim_compressed), k=1)

In [112]:
# make compressed_upper_triangle into an array
compressed_upper_triangle = compressed_upper_triangle[np.nonzero(compressed_upper_triangle)]

In [None]:
column_length=weighted_sim_compressed.shape[0]

In [25]:
def get_similarity_array(i, length=column_length, array=compressed_upper_triangle):
    '''Given the index i, return array equivalent to weighted_sim[i]'''


In [52]:
def recommend_content_upper(title, upper_matrix_array = upper):
    ''' Get similar games based on weighted'''
    idx = indices[title]

    # create pairwise similarity score from upper matrix array
    sim_scores =

    # sort scores based on similarity
    sorted_sim_scores = sorted(sim_scores, key=lambda x: x[0], reverse=True)

    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[1]]: i[0] for i in sorted_sim_scores[1:21]}

    return content_similar_scores


    

In [53]:
# test recommendation retrieval
recommend_content_upper('LIMBO')

IndexError: tuple index out of range

In [103]:
# store upper in pickle
pickle.dump(compressed_upper_triangle, open('../models/compressed_triangle_test.pkl', 'wb'))