# Modeling (Recommendation System)

## Part 1: Collaborative Filtering

In [1]:
import pickle
import random
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from surprise import Dataset, Reader, KNNBasic, KNNWithMeans, SlopeOne, SVDpp, SVD, accuracy
from surprise.model_selection import train_test_split, KFold

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_review = pd.read_csv("archive/cleaned_reviews_v2.csv")
df_review = df_review.loc[:, ~df_review.columns.str.contains("^Unnamed")]
df_review.head()

Unnamed: 0,recommendationid,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,appid,steamid,num_games_owned,num_reviews,playtime_forever,playtime_last_two_weeks,playtime_at_review,last_played
0,55936147,A fun and quirky stealth-based problem solving...,2019-10-31 02:33:41,2019-10-31 02:33:41,True,2,0,0.55414,0,True,False,False,1018080,76561198051821837,0,12,41,0,41.0,2019-10-31 00:52:26
1,55989797,"Loved the art style, and the game ran very smo...",2019-10-31 11:57:40,2019-10-31 11:57:40,True,1,0,0.52381,0,True,False,False,1018080,76561197993790846,657,8,17,0,15.0,2019-11-03 09:37:50
2,64251252,the game crashed four times for one hour.... i...,2020-02-28 16:21:55,2020-02-28 16:26:49,False,0,0,0.0,0,True,False,False,1018080,76561198095855343,1254,24,76,0,76.0,2020-02-28 16:16:07
3,49140086,While I cannot recommend this Unity asset reli...,2019-02-21 15:52:16,2019-02-21 15:52:16,False,16,0,0.624971,0,True,False,False,1018090,76561198053422627,2384,1225,56,0,56.0,2019-02-23 20:59:13
4,49137406,Is extremely unoptimized and has laggy framera...,2019-02-21 13:10:21,2019-02-21 13:10:21,False,11,0,0.527824,0,True,False,False,1018090,76561198019816374,1351,1674,10,0,10.0,2019-02-21 11:27:35


In [4]:
df_bridge = pd.read_csv("archive/cleaned_bridge.csv", index_col=[0])
df_bridge

Unnamed: 0,appid,num_reviews,review_score,review_score_desc,total_positive,total_negative,total_reviews
0,1020470,2,6,Mostly Positive,360,106,466
1,1018050,0,0,No user reviews,0,0,0
2,1018060,0,0,No user reviews,0,0,0
3,1018080,3,0,3 user reviews,2,1,3
4,1018090,7,0,7 user reviews,1,6,7
...,...,...,...,...,...,...,...
94021,1045590,0,0,No user reviews,0,0,0
94022,1045600,0,0,No user reviews,0,0,0
94023,1045610,2,5,Mixed,38,29,67
94024,1045630,0,0,No user reviews,0,0,0


In [5]:
df_game = pd.read_csv("archive/cleaned_steam_db_v2.csv", index_col=[0])
df_game.head()

Unnamed: 0_level_0,name,steam_appid,required_age,is_free,genres,platform_windows,platform_mac,platform_linux,release_year,release_quarter,...,nsfw,film,developers,publishers,description,release_distance_value,initial_price_usd,final_price_usd,memory_gb,storage_gb
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
demo,Pin Them Demo,1904630,0,True,[],True,False,False,2023.0,2.0,...,False,False,[0],[0],,2,0.0,0.0,,
game,Al-Qadim: The Genie's Curse,1904640,0,False,"[1, 3]",True,False,False,2022.0,1.0,...,False,False,[1],[2],Experience the mysterious Al-Qadim game world ...,2,3.204,3.204,0.5,2.0
game,Dungeons & Dragons - Stronghold: Kingdom Simul...,1904650,0,False,"[28, 2]",True,False,False,2022.0,1.0,...,False,False,[3],[2],Run your own kingdom in the legendary Dungeons...,2,3.204,3.204,0.5,2.0
game,Chapel 3-D: The Ascent,1904680,0,False,"[1, 23]",True,False,False,,,...,False,False,[4],[5],"Chapel 3-D: The Ascent is a break-neck, viole...",0,0.0,0.0,1.0,0.0
game,VTuber Gallery : Anime Pose,1904690,0,True,"[51, 53, 55, 57, 59, 70]",True,False,False,2022.0,1.0,...,False,False,[6],[6],VTuber Gallery is #1 anime pose app that allow...,2,0.0,0.0,8.0,0.0


In [6]:
df_review["timestamp_created"] = pd.to_datetime(df_review["timestamp_created"])
df_review["timestamp_updated"] = pd.to_datetime(df_review["timestamp_updated"])
df_review["last_played"] = pd.to_datetime(df_review["last_played"])

## Simple model for active users

Collaborative filtering only requires user, item and rating features. In this case, the three features are selected below.

In [7]:
rating_table = df_review[["steamid", "appid", "voted_up"]]
rating_table.head()

Unnamed: 0,steamid,appid,voted_up
0,76561198051821837,1018080,True
1,76561197993790846,1018080,True
2,76561198095855343,1018080,False
3,76561198053422627,1018090,False
4,76561198019816374,1018090,False


It's better to have `weighted_vote_score` to check the quality of the ratings.

In [8]:
rating_table["voted_up"] = rating_table["voted_up"].apply(lambda x: 1 if x else -1)
rating_table.head()

Unnamed: 0,steamid,appid,voted_up
0,76561198051821837,1018080,1
1,76561197993790846,1018080,1
2,76561198095855343,1018080,-1
3,76561198053422627,1018090,-1
4,76561198019816374,1018090,-1


Drop all users and items with low number of ratings.

In [9]:
user_activating = rating_table.groupby("steamid")["voted_up"].count().reset_index().rename(columns={ "voted_up": "review_count" })
print("Number of all users: {}".format(user_activating.shape[0]))
print("Number of active users (> 3 number of ratings): {}".format(user_activating[user_activating["review_count"] > 3].shape[0]))

Number of all users: 198497
Number of active users (> 3 number of ratings): 8045


In [10]:
item_activating = rating_table.groupby("appid")["voted_up"].count().reset_index().rename(columns={ "voted_up": "review_count" })
print("Number of all items: {}".format(item_activating.shape[0]))
print("Number of active items (> 3 number of ratings): {}".format(item_activating[item_activating["review_count"] > 3].shape[0]))

Number of all items: 45365
Number of active items (> 3 number of ratings): 21033


In [11]:
active_users = user_activating.loc[user_activating["review_count"] > 3, "steamid"]
active_items = item_activating.loc[item_activating["review_count"] > 3, "appid"]

rating_table = rating_table[rating_table["steamid"].isin(active_users) & rating_table["appid"].isin(active_items)]
rating_table

Unnamed: 0,steamid,appid,voted_up
3,76561198053422627,1018090,-1
4,76561198019816374,1018090,-1
7,76561198041661882,1018090,-1
10,76561198085502382,1018130,1
12,76561198036172352,1018130,1
...,...,...,...
299315,76561197970193418,1045580,1
299322,76561198280616270,1045650,1
299334,76561197990354493,1045650,1
299335,76561198082848099,1045650,1


It's necessary to avoid having sparse matrix for modeling. That's why we need to drop a large amount of inactive users and items.

In [12]:
# Shuffle the table before applying KFold
rating_table.sample(frac=1)

s_reader = Reader(rating_scale=(-1, 1))
s_data = Dataset.load_from_df(rating_table, s_reader)

trainset, testset = train_test_split(s_data, test_size=0.2)

In [13]:
models = [
	KNNBasic(sim_options={ "user_based": False }),
	KNNBasic(sim_options={ "user_based": True }),
	KNNWithMeans(sim_options={ "user_based": False }),
	KNNWithMeans(sim_options={ "user_based": True }),
	SlopeOne(),
	SVDpp()]
predictions = [None for _ in range(len(models))]

In [14]:
for i in range(len(models)):
	models[i].fit(trainset)
	predictions[i] = models[i].test(testset)
	print(accuracy.rmse(predictions[i], verbose=True), '\n')

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8484
0.8484419614083509 

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9141
0.914089892970989 

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8934
0.893421200452595 

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8273
0.8272733096137835 

RMSE: 0.8400
0.8400097697834452 

RMSE: 0.7668
0.7667805263653049 



SVD is the best model so far

In [15]:
def get_top_10(user, prediction_list, top=10):
	
	recommended_list = defaultdict(lambda: -2)
	for predictions in prediction_list:
		for uid, iid, true_r, est, _ in predictions:
			# Find user in the whole predictions
			if uid == user:
				recommended_list[iid] = max(recommended_list[iid], est)
				

	# It's possible to get less result. In this case, use a brute-force method
	if len(recommended_list) < top:
		for predictions in prediction_list:
			for item in active_items:
				recommended_list[item] = max(recommended_list[iid], models[i].predict(user, item).est)

	recommended_list = [(k, v) for k, v in recommended_list.items()]

	recommended_list.sort(key=lambda x: x[1], reverse=True)
	recommended_list = recommended_list[:top]

	return recommended_list



def extract_from_top_10(tuple_list):

	global df_game, df_bridge

	# Change the rating prediction
	recommendation_table = pd.DataFrame(tuple_list, columns=["appid", "likely_to_like"])
	recommendation_table["likely_to_like"] = recommendation_table["likely_to_like"].apply(
		lambda x: "YES" if x > 0.25 else ("NO" if x < -0.25 else "MAYBE"))
	recommendation_table["l"] = recommendation_table["likely_to_like"].apply(lambda x: 3 if (x == "YES") else (1 if (x == "NO") else 2))
	
	recommendation_table = pd.merge(
		recommendation_table, df_game[["name", "steam_appid", "genres", "final_price_usd", "memory_gb", "storage_gb"]],
		how="left", left_on="appid", right_on="steam_appid")
	
	recommendation_table = pd.merge(
		recommendation_table, df_bridge[["appid", "review_score", "review_score_desc", "total_reviews"]],
		how="left", on="appid"
	)

	# Drop all negative rating games
	recommendation_table = recommendation_table[~recommendation_table["review_score_desc"].str.contains("Negative")]

	# Sort recommendations by order
	recommendation_table.sort_values(by=["l", "review_score"], inplace=True, ascending=False)

	return recommendation_table.drop(columns=["l", "steam_appid"])

In [16]:
random_user = random.choice(rating_table["steamid"].unique())
print(random_user)

76561199124363642


In [17]:
played_table = rating_table[rating_table["steamid"] == random_user]
pd.merge(played_table, df_game[["name", "steam_appid", "genres", "final_price_usd", "memory_gb", "storage_gb"]], how="left", left_on="appid", right_on="steam_appid")

Unnamed: 0,steamid,appid,voted_up,name,steam_appid,genres,final_price_usd,memory_gb,storage_gb
0,76561199124363642,1074340,1,Viking Sisters,1074340,"[25, 4, 23]",1.80225,0.5,0.5
1,76561199124363642,1087940,1,Incredible Dracula 3: Family Secret,1087940,"[25, 4, 23]",1.80225,1.5,0.75
2,76561199124363642,301260,1,Gardens Inc. 2: The Road to Fame,301260,"[4, 28, 2]",4.806,,0.5
3,76561199124363642,437060,1,MOAI 3: Trade Mission Collector's Edition,437060,[4],5.6871,0.5,1.0


In [18]:
recommendation = get_top_10(random_user, [predictions[i] for i in range(6)])
recommendation

[(33950, 1),
 (102622, 1),
 (201802, 1),
 (208630, 1),
 (209460, 1),
 (217962, 1),
 (222554, 1),
 (237050, 1),
 (267360, 1),
 (300540, 1)]

In [19]:
extract_from_top_10(recommendation)

Unnamed: 0,appid,likely_to_like,name,genres,final_price_usd,memory_gb,storage_gb,review_score,review_score_desc,total_reviews
4,209460,YES,Rage: The Scorchers™,[1],4.505625,,,8,Very Positive,114
8,267360,YES,MURI,[1],2.8035,0.5,0.0,8,Very Positive,214
0,33950,YES,Fish Fillets 2,"[4, 23]",5.2065,0.5,,7,Positive,48
1,102622,YES,Orcs Must Die! - Artifacts of Power,"[1, 23, 2]",1.2015,,,7,Positive,35
2,201802,YES,Orcs Must Die! 2 - Are We There Yeti?,"[1, 23, 2]",2.8035,,,7,Positive,15
3,208630,YES,Midnight Mysteries 3: Devil on the Mississippi,"[25, 4]",4.806,0.0,,7,Positive,28
5,217962,YES,Dogfight 1942 Russia Under Siege,[1],1.80225,0.0,,7,Positive,11
6,222554,YES,Train Simulator: Western Lines of Scotland Rou...,[28],12.4155,,,7,Positive,38
7,237050,YES,Sanctum 2: Ruins of Brightholme,"[1, 23, 2]",3.504375,,,7,Positive,19
9,300540,YES,Sweet Lily Dreams,"[25, 23, 3]",8.21025,0.0,0.25,7,Positive,13


In [20]:
for i in range(6):
    with open(f"model/collab_pred_{i}.pkl", "wb") as f:
        pickle.dump(predictions[i], f)
    with open(f"model/collab_model_{i}.pkl", "wb") as f:
        pickle.dump(models[i], f)