## True hybrid content-based and collaborative filtering model

In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Reader, Dataset, KNNWithZScore
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pickle.load(open('../data/steam_games_clean.pkl', 'rb'))

### COLLABORATIVE FILTERING FOR USER PLAYTIME DATA

#### 3 Scaling methods
#### Test around to see which method or combination of methods is best
1. RobustScaler and StandardScaler
2. Just normalize
3. RobustScaler and Normalize?

In [3]:
# first do robustscaler to minimize outliers
# scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
# users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].array.reshape(-1,1))

In [4]:
# use StandardScaler to scale user playtimes
# scaler = StandardScaler()
# users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [5]:
# Normalize the targets between 0 and 1. Makes it easy to train.
min_playtime = min(users_df['playtime_forever'])
max_playtime = max(users_df['playtime_forever'])
users_df['playtime_forever'] = users_df['playtime_forever'].apply(lambda x: (x - min_playtime) / (max_playtime - min_playtime)).values

In [6]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [7]:
# make a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

#### Collaborative Filtering: KNNWithZScore

GridSearch for KNN

In [8]:
# set up gridsearch param_grid for KNNWithZScore
# knn_param_grid = {'k': [1000, 2000, 3000]}

In [9]:
# grid_search = GridSearchCV(KNNWithZScore, knn_param_grid, cv=4, n_jobs=3, joblib_verbose=10, verbose=10)

In [10]:
# grid_search.fit(data)

In [11]:
# print best params for grid search
# print('Best params: ', grid_search.best_params)

In [12]:
# print best score for grid search
# print('Score: ', grid_search.best_score)

#### Train KNN

In [13]:
k_value = 3000

In [14]:
# make knn model with best params from gridsearch
# knn = KNNWithZScore(k=k_value, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [15]:
# fit the training data and test with our test set
# predictions = knn.fit(trainset).test(testset)

In [16]:
# get accuracy
# accuracy.rmse(predictions)

In [17]:
# build a full trainset now
trainset = data.build_full_trainset()

In [18]:
# instantiate new knn
knn = KNNWithZScore(k=k_value, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [19]:
# fit the knn
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7f76fbe09a90>

#### Recommend KNN

In [20]:
# function that takes game title and returns the knn model inner id
def get_innerid(title):
    # get appid from games_df
    appid = games_df[games_df['name'] == title]['appid'].values[0]
    inner_id = trainset.to_inner_iid(appid)
    return inner_id

In [21]:
# function that takes knn model innerid and returns the game title
def get_title(inner_id):
    steam_id = trainset.to_raw_iid(inner_id)
    title = games_df[games_df['appid'] == steam_id].iloc[0]['name']
    return title

In [22]:
def get_knn_appid(inner_id):
    return trainset.to_raw_iid(inner_id)

### CONTENT-BASED Cosine Similarity for Game Metadata

In [23]:
# instantiate tfidfvectorizer
tfidf = TfidfVectorizer(max_features=1500, lowercase=False, min_df=5, ngram_range=(1,3))

In [24]:
# make description column into type string, otherwise tfidf cries
games_df['description_clean'] = games_df['description_clean'].astype(str)

In [25]:
# fit tfidfvectorizer to description column
tfidf_matrix = tfidf.fit_transform(games_df['description_clean'])

In [26]:
# games_df drop columns not needed for model
game_tags = games_df.drop(columns=['appid', 'description_clean', 'name'])

In [27]:
# concatenate game_tags to tfidf_matrix
matrix = np.concatenate((tfidf_matrix.toarray(), game_tags.values), axis=1)


In [28]:
# make a cosine similarity matrix, maybe linear kernel instead?
cosine_sim = cosine_similarity(matrix, matrix)

In [29]:
cosine_sim.shape

(19818, 19818)

In [30]:
# Construct a reverse map of indices and game titles
indices = pd.Series(games_df.index, index=games_df['name'])

In [31]:
def recommend_content(title, sim_matrix):
    '''Get similar games based on content using the similarity matrix'''
    # get index for our game
    idx = indices[title]
    
    # get pairwise similarity scores of all games w.r.t to our game
    sim_scores = list(enumerate(sim_matrix[idx]))
    
    # sort scores based on similarity
    sorted_sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[0]]: i[1] for i in sorted_sim_scores[1:21]}

    return content_similar_scores

In [32]:
# test cosine similarity matrix
recommend_content('DOOM', cosine_sim)

{'Saints Row: The Third': 0.9139263169053028,
 'DOOM Eternal': 0.8141261227327214,
 'Psychonauts': 0.8138345745792516,
 'Toukiden: Kiwami': 0.8123939580502787,
 'Danganronpa Another Episode: Ultra Despair Girls': 0.8107336745847363,
 'Crash Bandicoot™ N. Sane Trilogy': 0.8088163894163151,
 'Monster Hunter: World': 0.7892972248731135,
 'UNLOVED': 0.781898773458957,
 'NieR:Automata™': 0.7784778839759847,
 'Dead Rising® 2': 0.7737850076094995,
 'Sniper Elite V2': 0.7727015695804936,
 'Hybrid Wars': 0.7718737853520514,
 'Dead Rising 2: Off the Record': 0.7706058672550028,
 'Toukiden 2': 0.7661765608767603,
 'God Mode': 0.7660913162292741,
 'Borderlands 2': 0.7649451153087529,
 'DRAGON BALL XENOVERSE': 0.7581827283721905,
 'HITMAN™ 2': 0.7577488278289158,
 'UnEpic': 0.757106503356417,
 'SENRAN KAGURA ESTIVAL VERSUS': 0.755555685396987}

### Create one weighted similarity matrix from knn and cosine similarity matrices

In [33]:
knn_similarities = knn.compute_similarities()

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [34]:
# make knn_similarities into a numpy array
knn_similarities = np.array(knn_similarities)

In [35]:
# Test
(knn_similarities.transpose() == knn_similarities).all()

True

In [36]:
len(knn_similarities)

19818

In [37]:
games_df.head()

Unnamed: 0,appid,name,Captions available,Co-op,Commentary available,Cross-Platform Multiplayer,Downloadable Content,Full controller support,Game demo,In-App Purchases,...,Simulation,Software Training,Sports,Strategy,Tutorial,Utilities,Video Production,Violent,Web Publishing,description_clean
0,300,Day of Defeat: Source,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,day defeat offer intense online action gamepla...
1,320,Half-Life 2: Deathmatch,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fast multiplayer action set halflife universe ...
2,340,Half-Life 2: Lost Coast,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,originally planned section highway chapter hal...
3,10180,Call of Duty®: Modern Warfare® 2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,mostanticipated game year sequel bestselling f...
4,550,Left 4 Dead 2,1,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,set zombie apocalypse left dead ld highly anti...


In [38]:
# function for mapping from knn similarity matrix to cosine similarity indices
def knn_index_to_cos_index(knn_index):
    return indices[get_title(knn_index)]

In [39]:
# given a knn vector, sort it to cosine vector order
def knn_vector_to_cos_vector(knn_vector):
    # make cos_vector that is of length knn_vector
    cos_vector = np.zeros(len(knn_vector))
    for i in range(len(knn_vector)):
        cos_vector[knn_index_to_cos_index(i)] = knn_vector[i]
    return cos_vector

In [40]:
# create a copy of knn_similarities with cosine indices
knn_sim_ordered = np.copy(knn_similarities)

In [41]:
for i in tqdm(range(len(knn_similarities))):
    knn_sim_ordered[knn_index_to_cos_index(i)] = knn_similarities[i]

100%|██████████| 19818/19818 [00:04<00:00, 4053.88it/s]


In [42]:
# test if vectors are in the correct location
knn_sim_ordered[indices['LIMBO']].mean() == knn_similarities[get_innerid('LIMBO')].mean()

True

In [43]:
# do the transpose
knn_sim_ordered = knn_sim_ordered.transpose()

In [44]:
knn_sim_ordered_2 = np.copy(knn_sim_ordered)

In [45]:
for i in tqdm(range(len(knn_similarities))):
    knn_sim_ordered_2[knn_index_to_cos_index(i)] = knn_sim_ordered[i]

100%|██████████| 19818/19818 [00:05<00:00, 3399.88it/s]


In [46]:
# test if transpose is the same as original
(knn_sim_ordered_2.transpose() == knn_sim_ordered_2).all()

True

In [47]:
# test if knn_sim_ordered_2 is proper, should give same results as knn
recommend_content('LIMBO', knn_sim_ordered_2)

{'Mark of the Ninja': 0.8270509213987579,
 'VVVVVV': 0.8235271539319691,
 'World of Goo': 0.8231959832567497,
 "Mirror's Edge™": 0.8184542146833247,
 'Dead Space': 0.7965762606112908,
 'ORION: Prelude': 0.7853443560497188,
 'Magicka': 0.7791670352261109,
 'Trine 2: Complete Story': 0.7686020973785362,
 'INSIDE': 0.7651592572208243,
 'F.E.A.R. 3': 0.7624763911310805,
 'Rock of Ages': 0.7596923464643296,
 'Little Nightmares': 0.7578189386583029,
 'Scribblenauts Unlimited': 0.7487106438148136,
 'McPixel': 0.7445713729424711,
 'To the Moon': 0.7405275249772677,
 'Gone Home': 0.7386569119513423,
 'Tomb Raider': 0.7362503212578412,
 'Dear Esther': 0.7170404371258016,
 'Indie Game: The Movie': 0.7156302502155343,
 'Deadlight': 0.704006593446912}

In [48]:
# change weights if desired
knn_sim_weight = 0.75
cos_sim_weight = 0.25

In [49]:
# make a weighted similarity matrix 0.75 knn, 0.25 cosine
weighted_sim = (knn_sim_ordered_2 * knn_sim_weight) + (cosine_sim * cos_sim_weight)

In [50]:
# test weighted matrix, should give same results as weighted_recommend_content
recommend_content('LIMBO', weighted_sim)

{'Mark of the Ninja': 0.8021063728672503,
 'VVVVVV': 0.7775459027156847,
 'Dead Space': 0.7733138131288303,
 "Mirror's Edge™": 0.7584975075735776,
 'INSIDE': 0.75378938949706,
 'Tomb Raider': 0.7480251038935409,
 'Trine 2: Complete Story': 0.7409394967738444,
 'World of Goo': 0.740488478421895,
 'Little Nightmares': 0.728264741260435,
 'Magicka': 0.7206125916478496,
 'Deadlight': 0.7186974629343026,
 'Firewatch': 0.717219242194181,
 'Gone Home': 0.7130835930544159,
 'To the Moon': 0.7061513160218327,
 'McPixel': 0.6916789774291098,
 'ORION: Prelude': 0.6901382464067753,
 "Hellblade: Senua's Sacrifice": 0.6872420125138625,
 'METAL GEAR SOLID V: GROUND ZEROES': 0.6753099630586775,
 'F.E.A.R. 3': 0.6723610748742316,
 'Brutal Legend': 0.6696558244998068}

In [51]:
# store weighted similarity matrix in pickle
# pickle.dump(weighted_sim, open('../models/weighted_sim.pkl', 'wb'))

Convert float64 to float16 to save space

In [52]:
weighted_sim_compressed = np.array(weighted_sim, dtype=np.float16)

In [53]:
pickle.dump(weighted_sim_compressed, open('../models/weighted_sim_compressed.pkl', 'wb'))

Upper triangle of symmetric matrix to array WIP

Theoretically even more space could be saved, possibly at the expense of column retrieval overhead

In [54]:
# load weighted
weighted_sim_compressed = pickle.load(open('../models/weighted_sim_compressed.pkl', 'rb'))

In [55]:
# keep only upper triangle of matrix
compressed_upper_triangle = np.triu(weighted_sim_compressed, k=1)

In [56]:
upper_triangle_indices = np.triu_indices(len(weighted_sim_compressed), k=1)

In [57]:
# make compressed_upper_triangle into an array
compressed_upper_triangle = compressed_upper_triangle[upper_triangle_indices]

In [58]:
def return_matrix_value(i, j, array=compressed_upper_triangle, upper_indices=upper_triangle_indices):
    '''get matrix[i][j] value from array'''
    if i == j:
        return 1
    elif i > j:
        i, j = j, i
    return array[np.where((upper_indices[0] == i) & (upper_indices[1] == j))][0]

In [59]:
def return_matrix_column (i, array=compressed_upper_triangle, upper_indices=upper_triangle_indices):
    '''reconstruct matrix[i] column from array'''
    return array[np.where(upper_indices[0] == i)]

In [60]:
# test return_matrix_value
return_matrix_value(1, 1) == weighted_sim_compressed[1][1]

True

In [61]:
# test return_matrix_column
len(return_matrix_column(0))

19817

In [62]:
len(weighted_sim_compressed[0])

19818

In [63]:
column_length=weighted_sim_compressed.shape[0]

In [64]:
def get_similarity_array(i, length=column_length, array=compressed_upper_triangle):
    '''Given the index i, return array equivalent to weighted_sim[i]'''


In [65]:
def recommend_content_upper(title, upper_matrix_array = upper):
    ''' Get similar games based on weighted'''
    idx = indices[title]

    # create pairwise similarity score from upper matrix array
    sim_scores =

    # sort scores based on similarity
    sorted_sim_scores = sorted(sim_scores, key=lambda x: x[0], reverse=True)

    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[1]]: i[0] for i in sorted_sim_scores[1:21]}

    return content_similar_scores


    

SyntaxError: invalid syntax (708955166.py, line 6)

In [None]:
# test recommendation retrieval
recommend_content_upper('LIMBO')

IndexError: tuple index out of range

In [None]:
# store upper in pickle
pickle.dump(compressed_upper_triangle, open('../models/compressed_triangle_test.pkl', 'wb'))