## True hybrid content-based and collaborative filtering model

In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, KNNWithZScore
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy

In [2]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pickle.load(open('../data/steam_games_clean.pkl', 'rb'))

### COLLABORATIVE FILTERING FOR USER PLAYTIME DATA

#### 3 Scaling methods
#### Test around to see which method or combination of methods is best
1. RobustScaler and StandardScaler
2. Just normalize
3. RobustScaler and Normalize?

In [3]:
# first do robustscaler to minimize outliers
# scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
# users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].array.reshape(-1,1))

In [4]:
# use StandardScaler to scale user playtimes
# scaler = StandardScaler()
# users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [3]:
# Normalize the targets between 0 and 1. Makes it easy to train.
min_playtime = min(users_df['playtime_forever'])
max_playtime = max(users_df['playtime_forever'])
users_df['playtime_forever'] = users_df['playtime_forever'].apply(lambda x: (x - min_playtime) / (max_playtime - min_playtime)).values

In [4]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [5]:
# make a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

#### Collaborative Filtering: KNNWithZScore

GridSearch for KNN

In [26]:
# set up gridsearch param_grid for KNNWithZScore
knn_param_grid = { 'k': [1650, 1675] }

In [27]:
grid_search = GridSearchCV(KNNWithZScore, knn_param_grid, cv=4, n_jobs=3, joblib_verbose=10)

In [28]:
grid_search.fit(data)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  6.7min


Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 19.7min


Computing the msd similarity matrix...
Computing the msd similarity matrix...


[Parallel(n_jobs=3)]: Done   9 out of  12 | elapsed: 19.8min remaining:  6.6min


Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 26.4min finished


In [29]:
# print best params for grid search
print('Best params: ', grid_search.best_params)

Best params:  {'rmse': {'k': 1675}, 'mae': {'k': 1675}}


In [30]:
# print best score for grid search
print('Score: ', grid_search.best_score)

Score:  {'rmse': 0.8562411001447227, 'mae': 0.8297702315139189}


#### Train KNN

In [6]:
k_value = 1675

In [7]:
# make knn model with best params from gridsearch
knn = KNNWithZScore(k=k_value, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [8]:
# fit the training data and test with our test set
predictions = knn.fit(trainset).test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [9]:
# get accuracy
accuracy.rmse(predictions)

RMSE: 0.8563


0.8562545716382787

In [7]:
# build a full trainset now
trainset = data.build_full_trainset()

In [8]:
# instantiate new knn
knn = KNNWithZScore(k=k_value, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [9]:
# fit the knn
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7f0c1c152af0>

#### Recommend KNN

In [10]:
# function that takes game title and returns the knn model inner id
def get_innerid(title):
    # get appid from games_df
    appid = games_df[games_df['name'] == title]['appid'].values[0]
    inner_id = trainset.to_inner_iid(appid)
    return inner_id

In [11]:
# function that takes knn model innerid and returns the game title
def get_title(inner_id):
    steam_id = trainset.to_raw_iid(inner_id)
    title = games_df[games_df['appid'] == steam_id].iloc[0]['name']
    return title

In [12]:
def get_knn_appid(inner_id):
    return trainset.to_raw_iid(inner_id)

### Content-Based Filtering of Game Metadata

In [13]:
# make description column into type string, otherwise tfidf cries
games_df['description_clean'] = games_df['description_clean'].astype(str)

In [14]:
# games_df drop columns not needed for model
game_tags = games_df.drop(columns=['appid', 'description_clean', 'name'])

#### TF-IDF

In [22]:
# instantiate tfidfvectorizer
tfidf = TfidfVectorizer(max_features=1000, min_df=5, ngram_range=(1,1))

In [23]:
# fit tfidfvectorizer to description column
tfidf_matrix = tfidf.fit_transform(games_df['description_clean'])

In [24]:
# concatenate game_tags to tfidf_matrix
matrix = np.concatenate((tfidf_matrix.toarray(), game_tags.values), axis=1)


#### CountVectorizer

In [31]:
# try CountVectorizer
countvec = CountVectorizer(max_features=1000, min_df=5, ngram_range=(1,3))
countvec_matrix = countvec.fit_transform(games_df['description_clean'])

In [32]:
matrix = np.concatenate((countvec_matrix.toarray(), game_tags.values), axis=1)

#### Word2Vec

In [17]:
from gensim.models import Word2Vec

# tokenize description_clean column
games_df['description_clean'] = games_df['description_clean'].apply(lambda x: x.split())

# instantiate word2vec model
w2v = Word2Vec(min_count=20, window=5, sample=1e-5, alpha=0.03, min_alpha=0.0007, negative=20)

In [18]:
games_df['description_clean']

0        [valve, next, installment, world, online, acti...
1        [valve, day, defeat, offer, intense, online, a...
2        [valve, fast, multiplayer, action, set, half, ...
3        [valve, originally, planned, section, highway,...
4        [infinity, ward, anticipated, game, year, sequ...
                               ...                        
21430    [sword, master, hero, fireball, totting, maide...
21431    [devcats, devcats, game, game, zodiacats, jigs...
21432    [cat, play, studio, discord, discord, game, ga...
21433    [amrita, pixel, survivor, roguelike, survival,...
21434    [bloody, pixel, special, offer, game, summary,...
Name: description_clean, Length: 21435, dtype: object

In [19]:
w2v.build_vocab(games_df['description_clean'], progress_per=10000)

In [20]:
w2v.train(games_df['description_clean'], total_examples=w2v.corpus_count, epochs=30, report_delay=1)

(18373058, 71001060)

In [21]:
# process game_descriptions into vectors based on word2vec model
game_desc_vectors = []
for desc in games_df['description_clean']:
    vector = np.zeros(100)
    for word in desc:
        try:
            vector += w2v.wv[word]
        except KeyError:
            pass
    game_desc_vectors.append(vector)

In [22]:
game_desc_vectors = np.array(game_desc_vectors)

In [23]:
matrix = np.concatenate((game_desc_vectors, game_tags.values), axis=1)

#### Cosine Similarity

In [24]:
# make a cosine similarity matrix, maybe linear kernel instead?
cosine_sim = cosine_similarity(matrix, matrix)

In [25]:
# Construct a reverse map of indices and game titles
indices = pd.Series(games_df.index, index=games_df['name'])

In [26]:
def recommend_content(title, sim_matrix):
    '''Get similar games based on content using the similarity matrix'''
    # get index for our game
    idx = indices[title]
    
    # get pairwise similarity scores of all games w.r.t to our game
    sim_scores = list(enumerate(sim_matrix[idx]))
    
    # sort scores based on similarity
    sorted_sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[0]]: i[1] for i in sorted_sim_scores[1:21]}

    return content_similar_scores

In [27]:
# test cosine similarity matrix
recommend_content('DOOM', cosine_sim)

{'Zombie Army Trilogy': 0.9033009717606886,
 'Borderlands Game of the Year': 0.9026415983435533,
 'DOOM II': 0.901135673581355,
 'Incoming Forces': 0.8804752010671191,
 'MADNESS: Project Nexus': 0.878067019940923,
 'Evasion': 0.8743980748950054,
 'Epic Showdown': 0.8730080179760811,
 'The Club™': 0.871285322761595,
 'Quake 4': 0.8710932476873114,
 'Steel Storm: Burning Retribution': 0.8698633064166583,
 'Beyond Flesh and Blood': 0.869120388544816,
 'World War Z: Aftermath': 0.8667028809249501,
 'DUSK': 0.8654299898984441,
 'The Haunted: Hells Reach': 0.8650702535232941,
 'Earthfall': 0.8649746564750768,
 'Aliens vs. Predator™': 0.8626780011230428,
 'Left 4 Dead': 0.8623077764933084,
 'Ghostrunner': 0.8618308382338482,
 'Killbot': 0.8579238238203406,
 'Return to Castle Wolfenstein': 0.8570421567011032}

### Create one weighted similarity matrix from knn and cosine similarity matrices

In [28]:
knn_similarities = knn.compute_similarities()

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [29]:
# make knn_similarities into a numpy array
knn_similarities = np.array(knn_similarities)

In [30]:
# Test
(knn_similarities.transpose() == knn_similarities).all()

True

In [31]:
len(knn_similarities)

21435

In [32]:
# function for mapping from knn similarity matrix to cosine similarity indices
def knn_index_to_cos_index(knn_index):
    return indices[get_title(knn_index)]

In [33]:
# given a knn vector, sort it to cosine vector order
def knn_vector_to_cos_vector(knn_vector):
    # make cos_vector that is of length knn_vector
    cos_vector = np.zeros(len(knn_vector))
    for i in range(len(knn_vector)):
        cos_vector[knn_index_to_cos_index(i)] = knn_vector[i]
    return cos_vector

In [34]:
# create a copy of knn_similarities with cosine indices
knn_sim_ordered = np.copy(knn_similarities)

In [35]:
for i in tqdm(range(len(knn_similarities))):
    knn_sim_ordered[knn_index_to_cos_index(i)] = knn_similarities[i]

100%|██████████| 21435/21435 [00:04<00:00, 4303.19it/s]


In [36]:
# test if vectors are in the correct location
knn_sim_ordered[indices['LIMBO']].mean() == knn_similarities[get_innerid('LIMBO')].mean()

True

In [37]:
# do the transpose
knn_sim_ordered = knn_sim_ordered.transpose()

In [38]:
knn_sim_ordered_2 = np.copy(knn_sim_ordered)

In [39]:
for i in tqdm(range(len(knn_similarities))):
    knn_sim_ordered_2[knn_index_to_cos_index(i)] = knn_sim_ordered[i]

100%|██████████| 21435/21435 [00:06<00:00, 3542.01it/s]


In [40]:
# test if transpose is the same as original
(knn_sim_ordered_2.transpose() == knn_sim_ordered_2).all()

True

In [41]:
# test if knn_sim_ordered_2 is proper, should give same results as knn
recommend_content('LIMBO', knn_sim_ordered_2)

{'I Expect You To Die': 0.27454736630620946,
 'Commander Keen': 0.274106323975587,
 'Headsnatchers': 0.27209856158671086,
 'Nex Machina': 0.269534692735038,
 'Fahrenheit: Indigo Prophecy Remastered': 0.262391855381297,
 'Accounting (Legacy)': 0.2612691703633596,
 'And Yet It Moves': 0.26093998595581996,
 'Panzer Paladin': 0.26008657046697836,
 'The Wonderful End of the World': 0.25888171040719943,
 'OLDTV': 0.25008569384672946,
 '1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)': 0.24525422130661106,
 'Fury Unleashed': 0.2376934345259899,
 'Space Pilgrim Episode III: Delta Pavonis': 0.236696946905187,
 'Overload': 0.23559746525533365,
 'Omerta - City of Gangsters': 0.23480290972712897,
 'Space Pilgrim Episode II: Epsilon Indi': 0.22503780408171592,
 'Splice': 0.22219693551355096,
 'Auditorium': 0.21837039831672872,
 'Agent Origins: Escape': 0.21499978825928656,
 'Darwinia': 0.2146224077241486}

In [42]:
# change weights if desired
knn_sim_weight = 0.5
cos_sim_weight = 0.5

In [43]:
# make a weighted similarity matrix 0.75 knn, 0.25 cosine
weighted_sim = (knn_sim_ordered_2 * knn_sim_weight) + (cosine_sim * cos_sim_weight)

In [44]:
# remove a couple from memory to save space
del knn_sim_ordered_2
del cosine_sim
del knn_similarities

In [46]:
# test weighted matrix, should give same results as weighted_recommend_content in model_ensemble
recommend_content('DOOM', weighted_sim)

{'PWND': 0.523241915384963,
 'Ghostrunner': 0.5089971930374433,
 'DOOM Eternal': 0.49778052984042764,
 'Hunted: The Demon’s Forge™': 0.4857034357398553,
 'World War Z: Aftermath': 0.4800480987388373,
 'Wolfenstein: The New Order': 0.4736872086513055,
 'Dying Light: Bad Blood': 0.4725587733921997,
 'Turok 2: Seeds of Evil': 0.46542521132741976,
 'Left 4 Dead': 0.46363119289216626,
 'Zombie Army Trilogy': 0.4632932124983115,
 'Alien Rage - Unlimited': 0.4624158601236639,
 'Earthfall': 0.4593403031132734,
 'Zombie Shooter 2': 0.45825936572756326,
 'Aliens vs. Predator™': 0.45718237482132207,
 'Nexuiz': 0.45675126742810795,
 'Sniper Elite: Nazi Zombie Army 2': 0.45669399943572886,
 'Rambo The Video Game + Baker Team DLC': 0.4566832054515414,
 'Left 4 Dead 2': 0.456552253717765,
 'Zombie Panic! Source': 0.455505617142948,
 'Painkiller Overdose': 0.4546614163069649}

Convert float64 to float16 to save space

In [47]:
weighted_sim_compressed = np.array(weighted_sim, dtype=np.float16)

In [48]:
# test compressed matrix
recommend_content('SOMA', weighted_sim_compressed)

{'Alien: Isolation': 0.6025,
 'The Evil Within 2': 0.529,
 'Amnesia: Rebirth': 0.5234,
 'Outlast': 0.509,
 'Amnesia: The Dark Descent': 0.5083,
 'Call of Cthulhu®': 0.4949,
 'Mutant Year Zero: Road to Eden': 0.4944,
 '>observer_': 0.4902,
 'Darkwood': 0.4832,
 'Prey': 0.4783,
 'Encased: A Sci-Fi Post-Apocalyptic RPG': 0.4744,
 'Outlast 2': 0.4705,
 'Axiom Verge': 0.4666,
 'Enigmoon': 0.4648,
 'Insincere': 0.4644,
 'The Descendant': 0.4634,
 'Resident Evil Revelations 2': 0.4614,
 'The Evil Within': 0.455,
 'Conarium': 0.4485,
 'Catherine Classic': 0.4473}

In [49]:
pickle.dump(weighted_sim_compressed, open('../models/weighted_sim_compressed.pkl', 'wb'))

Upper triangle of symmetric matrix to array WIP

Theoretically even more space could be saved, possibly at the expense of column retrieval overhead

In [54]:
# load weighted
weighted_sim_compressed = pickle.load(open('../models/weighted_sim_compressed.pkl', 'rb'))

In [55]:
# keep only upper triangle of matrix
compressed_upper_triangle = np.triu(weighted_sim_compressed, k=1)

In [56]:
upper_triangle_indices = np.triu_indices(len(weighted_sim_compressed), k=1)

In [57]:
# make compressed_upper_triangle into an array
compressed_upper_triangle = compressed_upper_triangle[upper_triangle_indices]

In [58]:
def return_matrix_value(i, j, array=compressed_upper_triangle, upper_indices=upper_triangle_indices):
    '''get matrix[i][j] value from array'''
    if i == j:
        return 1
    elif i > j:
        i, j = j, i
    return array[np.where((upper_indices[0] == i) & (upper_indices[1] == j))][0]

In [60]:
# test return_matrix_value
return_matrix_value(1, 1) == weighted_sim_compressed[1][1]

True

In [59]:
# currently returns an incomplete column
def return_matrix_column (i, array=compressed_upper_triangle, upper_indices=upper_triangle_indices):
    '''reconstruct matrix[i] column from array'''
    return array[np.where(upper_indices[0] == i)]

In [61]:
# test return_matrix_column length
len(return_matrix_column(0)) == len(weighted_sim_compressed[0])

19817

In [65]:
def recommend_content_upper(title, upper_matrix_array = compressed_upper_triangle):
    ''' Get similar games based on weighted'''
    idx = indices[title]

    # create pairwise similarity score from upper matrix array
    sim_scores = return_matrix_column(idx, upper_matrix_array)

    # sort scores based on similarity
    sorted_sim_scores = sorted(sim_scores, key=lambda x: x[0], reverse=True)

    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[1]]: i[0] for i in sorted_sim_scores[1:21]}

    return content_similar_scores

SyntaxError: invalid syntax (708955166.py, line 6)

In [None]:
# test recommendation retrieval
recommend_content_upper('LIMBO')

IndexError: tuple index out of range

In [None]:
# store upper in pickle
pickle.dump(compressed_upper_triangle, open('../models/compressed_triangle_test.pkl', 'wb'))