## True hybrid content-based and collaborative filtering model

In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Reader, Dataset, KNNWithZScore
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pickle.load(open('../data/steam_games_clean.pkl', 'rb'))

### COLLABORATIVE FILTERING FOR USER PLAYTIME DATA

#### 3 Scaling methods
#### Test around to see which method or combination of methods is best
1. RobustScaler and StandardScaler
2. Just normalize
3. RobustScaler and Normalize?

In [3]:
# first do robustscaler to minimize outliers
# scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
# users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].array.reshape(-1,1))

In [4]:
# use StandardScaler to scale user playtimes
# scaler = StandardScaler()
# users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [3]:
# Normalize the targets between 0 and 1. Makes it easy to train.
min_playtime = min(users_df['playtime_forever'])
max_playtime = max(users_df['playtime_forever'])
users_df['playtime_forever'] = users_df['playtime_forever'].apply(lambda x: (x - min_playtime) / (max_playtime - min_playtime)).values

In [4]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [5]:
# make a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

#### Collaborative Filtering: KNNWithZScore

GridSearch for KNN

In [6]:
# set up gridsearch param_grid for KNNWithZScore
knn_param_grid = { 'k': [2000, 2500, 3000] }

In [7]:
grid_search = GridSearchCV(KNNWithZScore, knn_param_grid, cv=4, n_jobs=3, joblib_verbose=10)

In [8]:
grid_search.fit(data)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  9.5min


Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 28.3min


Computing the msd similarity matrix...
Computing the msd similarity matrix...


[Parallel(n_jobs=3)]: Done   9 out of  12 | elapsed: 29.0min remaining:  9.7min


Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 38.8min finished


In [9]:
# print best params for grid search
print('Best params: ', grid_search.best_params)

Best params:  {'rmse': {'k': 2500}, 'mae': {'k': 2500}}


In [10]:
# print best score for grid search
print('Score: ', grid_search.best_score)

Score:  {'rmse': 0.988578843788102, 'mae': 0.9872457685798821}


#### Train KNN

In [6]:
k_value = 2500

In [7]:
# make knn model with best params from gridsearch
knn = KNNWithZScore(k=k_value, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [8]:
# fit the training data and test with our test set
predictions = knn.fit(trainset).test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [9]:
# get accuracy
accuracy.rmse(predictions)

RMSE: 0.8562


0.8562379785712162

In [10]:
# build a full trainset now
trainset = data.build_full_trainset()

In [11]:
# instantiate new knn
knn = KNNWithZScore(k=k_value, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [12]:
# fit the knn
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7f27fa249f10>

#### Recommend KNN

In [13]:
# function that takes game title and returns the knn model inner id
def get_innerid(title):
    # get appid from games_df
    appid = games_df[games_df['name'] == title]['appid'].values[0]
    inner_id = trainset.to_inner_iid(appid)
    return inner_id

In [14]:
# function that takes knn model innerid and returns the game title
def get_title(inner_id):
    steam_id = trainset.to_raw_iid(inner_id)
    title = games_df[games_df['appid'] == steam_id].iloc[0]['name']
    return title

In [15]:
def get_knn_appid(inner_id):
    return trainset.to_raw_iid(inner_id)

### CONTENT-BASED Cosine Similarity for Game Metadata

In [17]:
# instantiate tfidfvectorizer
tfidf = TfidfVectorizer(max_features=1500, lowercase=False, min_df=5, ngram_range=(1,3))

In [18]:
# make description column into type string, otherwise tfidf cries
games_df['description_clean'] = games_df['description_clean'].astype(str)

In [19]:
# fit tfidfvectorizer to description column
tfidf_matrix = tfidf.fit_transform(games_df['description_clean'])

In [20]:
# games_df drop columns not needed for model
game_tags = games_df.drop(columns=['appid', 'description_clean', 'name'])

In [21]:
# concatenate game_tags to tfidf_matrix
matrix = np.concatenate((tfidf_matrix.toarray(), game_tags.values), axis=1)


In [22]:
from sklearn.metrics.pairwise import pairwise_distances

In [26]:
something = 1 - pairwise_distances(matrix, metric='jaccard')



In [27]:
something.shape

(19985, 19985)

In [22]:
# make a cosine similarity matrix, maybe linear kernel instead?
cosine_sim = cosine_similarity(matrix, matrix)

In [23]:
cosine_sim.shape

(21435, 21435)

In [24]:
# Construct a reverse map of indices and game titles
indices = pd.Series(games_df.index, index=games_df['name'])

In [25]:
def recommend_content(title, sim_matrix):
    '''Get similar games based on content using the similarity matrix'''
    # get index for our game
    idx = indices[title]
    
    # get pairwise similarity scores of all games w.r.t to our game
    sim_scores = list(enumerate(sim_matrix[idx]))
    
    # sort scores based on similarity
    sorted_sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[0]]: i[1] for i in sorted_sim_scores[1:21]}

    return content_similar_scores

In [27]:
# test cosine similarity matrix
recommend_content('DOOM', cosine_sim)

{'Saints Row: The Third': 0.9129149571758234,
 'DOOM Eternal': 0.8219087947313767,
 'Toukiden: Kiwami': 0.8204882667359811,
 'Psychonauts': 0.8147717013707034,
 'Danganronpa Another Episode: Ultra Despair Girls': 0.8124618243885239,
 'Crash Bandicoot™ N. Sane Trilogy': 0.8097782650690324,
 'Monster Hunter: World': 0.7917810035257399,
 'UNLOVED': 0.7879067608216791,
 'NieR:Automata™': 0.779888323405217,
 'Hybrid Wars': 0.7746400183501742,
 'Dead Rising® 2': 0.7742371767686422,
 'Sniper Elite V2': 0.7731644840769599,
 'Dead Rising 2: Off the Record': 0.7714221974288752,
 'God Mode': 0.7696530650204885,
 'Toukiden 2': 0.7684901134454072,
 'Borderlands 2': 0.7657363824423967,
 'DRAGON BALL XENOVERSE': 0.7627429261609946,
 'HITMAN™ 2': 0.7608796778812182,
 'UnEpic': 0.7575569281467783,
 'SENRAN KAGURA SHINOVI VERSUS': 0.7568518168411396}

### Create one weighted similarity matrix from knn and cosine similarity matrices

In [28]:
knn_similarities = knn.compute_similarities()

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [29]:
# make knn_similarities into a numpy array
knn_similarities = np.array(knn_similarities)

In [30]:
# Test
(knn_similarities.transpose() == knn_similarities).all()

True

In [31]:
len(knn_similarities)

21435

In [33]:
# function for mapping from knn similarity matrix to cosine similarity indices
def knn_index_to_cos_index(knn_index):
    return indices[get_title(knn_index)]

In [34]:
# given a knn vector, sort it to cosine vector order
def knn_vector_to_cos_vector(knn_vector):
    # make cos_vector that is of length knn_vector
    cos_vector = np.zeros(len(knn_vector))
    for i in range(len(knn_vector)):
        cos_vector[knn_index_to_cos_index(i)] = knn_vector[i]
    return cos_vector

In [35]:
# create a copy of knn_similarities with cosine indices
knn_sim_ordered = np.copy(knn_similarities)

In [36]:
for i in tqdm(range(len(knn_similarities))):
    knn_sim_ordered[knn_index_to_cos_index(i)] = knn_similarities[i]

100%|██████████| 21435/21435 [00:05<00:00, 4073.91it/s]


In [37]:
# test if vectors are in the correct location
knn_sim_ordered[indices['LIMBO']].mean() == knn_similarities[get_innerid('LIMBO')].mean()

True

In [38]:
# do the transpose
knn_sim_ordered = knn_sim_ordered.transpose()

In [39]:
knn_sim_ordered_2 = np.copy(knn_sim_ordered)

In [40]:
for i in tqdm(range(len(knn_similarities))):
    knn_sim_ordered_2[knn_index_to_cos_index(i)] = knn_sim_ordered[i]

100%|██████████| 21435/21435 [00:06<00:00, 3446.76it/s]


In [41]:
# test if transpose is the same as original
(knn_sim_ordered_2.transpose() == knn_sim_ordered_2).all()

True

In [42]:
# test if knn_sim_ordered_2 is proper, should give same results as knn
recommend_content('LIMBO', knn_sim_ordered_2)

{'I Expect You To Die': 0.27454736630620946,
 'Commander Keen': 0.274106323975587,
 'Headsnatchers': 0.27209856158671086,
 'Nex Machina': 0.269534692735038,
 'Fahrenheit: Indigo Prophecy Remastered': 0.262391855381297,
 'Accounting (Legacy)': 0.2612691703633596,
 'And Yet It Moves': 0.26093998595581996,
 'Panzer Paladin': 0.26008657046697836,
 'The Wonderful End of the World': 0.25888171040719943,
 'OLDTV': 0.25008569384672946,
 '1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)': 0.24525422130661106,
 'Fury Unleashed': 0.2376934345259899,
 'Space Pilgrim Episode III: Delta Pavonis': 0.236696946905187,
 'Overload': 0.23559746525533365,
 'Omerta - City of Gangsters': 0.23480290972712897,
 'Space Pilgrim Episode II: Epsilon Indi': 0.22503780408171592,
 'Splice': 0.22219693551355096,
 'Auditorium': 0.21837039831672872,
 'Agent Origins: Escape': 0.21499978825928656,
 'Darwinia': 0.2146224077241486}

In [43]:
# change weights if desired
knn_sim_weight = 0.75
cos_sim_weight = 0.25

In [44]:
# make a weighted similarity matrix 0.75 knn, 0.25 cosine
weighted_sim = (knn_sim_ordered_2 * knn_sim_weight) + (cosine_sim * cos_sim_weight)

In [46]:
# remove a couple from memory to save space
del knn_sim_ordered_2
del cosine_sim
del knn_similarities

In [47]:
# test weighted matrix, should give same results as weighted_recommend_content in model_ensemble
recommend_content('LIMBO', weighted_sim)

{'Nex Machina': 0.3717511508762706,
 'Panzer Paladin': 0.33808431623707264,
 'Fahrenheit: Indigo Prophecy Remastered': 0.3300443392582293,
 'Headsnatchers': 0.3291294658709518,
 'INSIDE': 0.3243783853340738,
 'Commander Keen': 0.3209487897903679,
 'Space Pilgrim Episode III: Delta Pavonis': 0.31997343075343765,
 'Deadlight': 0.3197456603213798,
 'Until You Fall': 0.31946810760201794,
 'And Yet It Moves': 0.31879648044619774,
 'Fury Unleashed': 0.31661296866770733,
 'Wick': 0.3161343787396188,
 "Wonder Boy: The Dragon's Trap": 0.31196215461093363,
 'Space Pilgrim Episode II: Epsilon Indi': 0.31122907363583435,
 'Stray': 0.3092130447309133,
 'Accounting (Legacy)': 0.30901863198918106,
 'Overload': 0.30725634090827364,
 'I Expect You To Die': 0.3070405040991434,
 '140': 0.30298625972150545,
 'Basingstoke': 0.3029812813065932}

Convert float64 to float16 to save space

In [48]:
weighted_sim_compressed = np.array(weighted_sim, dtype=np.float16)

In [52]:
# test compressed matrix
recommend_content('The Evil Within 2', weighted_sim_compressed)

{'The Evil Within': 0.4124,
 'Prey': 0.3486,
 'SOMA': 0.3486,
 'Alien: Isolation': 0.3416,
 'Life is Strange - Episode 1': 0.3362,
 'DOOM': 0.3352,
 'Rise of the Tomb Raider™': 0.3337,
 'Resident Evil 7 Biohazard': 0.3333,
 'Resident Evil 2': 0.3313,
 'Thief': 0.3223,
 'Resident Evil': 0.3186,
 'Dishonored': 0.3113,
 'Metro: Last Light Redux': 0.308,
 'Alan Wake': 0.3005,
 'Dishonored 2': 0.2996,
 'Shadow Warrior 2': 0.299,
 'Bound By Flame': 0.2954,
 'Prototype 2': 0.2935,
 'Call of Cthulhu®': 0.2932,
 'Wolfenstein II: The New Colossus': 0.2922}

In [49]:
pickle.dump(weighted_sim_compressed, open('../models/weighted_sim_compressed.pkl', 'wb'))

Upper triangle of symmetric matrix to array WIP

Theoretically even more space could be saved, possibly at the expense of column retrieval overhead

In [54]:
# load weighted
weighted_sim_compressed = pickle.load(open('../models/weighted_sim_compressed.pkl', 'rb'))

In [55]:
# keep only upper triangle of matrix
compressed_upper_triangle = np.triu(weighted_sim_compressed, k=1)

In [56]:
upper_triangle_indices = np.triu_indices(len(weighted_sim_compressed), k=1)

In [57]:
# make compressed_upper_triangle into an array
compressed_upper_triangle = compressed_upper_triangle[upper_triangle_indices]

In [58]:
def return_matrix_value(i, j, array=compressed_upper_triangle, upper_indices=upper_triangle_indices):
    '''get matrix[i][j] value from array'''
    if i == j:
        return 1
    elif i > j:
        i, j = j, i
    return array[np.where((upper_indices[0] == i) & (upper_indices[1] == j))][0]

In [60]:
# test return_matrix_value
return_matrix_value(1, 1) == weighted_sim_compressed[1][1]

True

In [59]:
# currently returns an incomplete column
def return_matrix_column (i, array=compressed_upper_triangle, upper_indices=upper_triangle_indices):
    '''reconstruct matrix[i] column from array'''
    return array[np.where(upper_indices[0] == i)]

In [61]:
# test return_matrix_column length
len(return_matrix_column(0)) == len(weighted_sim_compressed[0])

19817

In [65]:
def recommend_content_upper(title, upper_matrix_array = compressed_upper_triangle):
    ''' Get similar games based on weighted'''
    idx = indices[title]

    # create pairwise similarity score from upper matrix array
    sim_scores = return_matrix_column(idx, upper_matrix_array)

    # sort scores based on similarity
    sorted_sim_scores = sorted(sim_scores, key=lambda x: x[0], reverse=True)

    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[1]]: i[0] for i in sorted_sim_scores[1:21]}

    return content_similar_scores

SyntaxError: invalid syntax (708955166.py, line 6)

In [None]:
# test recommendation retrieval
recommend_content_upper('LIMBO')

IndexError: tuple index out of range

In [None]:
# store upper in pickle
pickle.dump(compressed_upper_triangle, open('../models/compressed_triangle_test.pkl', 'wb'))