# Modeling (Recommendation System)

## Part 1: Collaborative Filtering

In [26]:
import random
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from surprise import Dataset, Reader, KNNBasic, KNNWithMeans, SlopeOne, SVDpp, SVD, accuracy
from surprise.model_selection import train_test_split, KFold

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_review = pd.read_csv("archive/cleaned_reviews_v2.csv")
df_review = df_review.loc[:, ~df_review.columns.str.contains("^Unnamed")]
df_review.head()

Unnamed: 0,recommendationid,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,appid,steamid,num_games_owned,num_reviews,playtime_forever,playtime_last_two_weeks,playtime_at_review,last_played
0,55936147,A fun and quirky stealth-based problem solving...,2019-10-31 02:33:41,2019-10-31 02:33:41,True,2,0,0.55414,0,True,False,False,1018080,76561198051821837,0,12,41,0,41.0,2019-10-31 00:52:26
1,55989797,"Loved the art style, and the game ran very smo...",2019-10-31 11:57:40,2019-10-31 11:57:40,True,1,0,0.52381,0,True,False,False,1018080,76561197993790846,657,8,17,0,15.0,2019-11-03 09:37:50
2,64251252,the game crashed four times for one hour.... i...,2020-02-28 16:21:55,2020-02-28 16:26:49,False,0,0,0.0,0,True,False,False,1018080,76561198095855343,1254,24,76,0,76.0,2020-02-28 16:16:07
3,49140086,While I cannot recommend this Unity asset reli...,2019-02-21 15:52:16,2019-02-21 15:52:16,False,16,0,0.624971,0,True,False,False,1018090,76561198053422627,2384,1225,56,0,56.0,2019-02-23 20:59:13
4,49137406,Is extremely unoptimized and has laggy framera...,2019-02-21 13:10:21,2019-02-21 13:10:21,False,11,0,0.527824,0,True,False,False,1018090,76561198019816374,1351,1674,10,0,10.0,2019-02-21 11:27:35


In [4]:
df_bridge = pd.read_csv("archive/cleaned_bridge.csv", index_col=[0])
df_bridge

Unnamed: 0,appid,num_reviews,review_score,review_score_desc,total_positive,total_negative,total_reviews
0,1020470,2,6,Mostly Positive,360,106,466
1,1018050,0,0,No user reviews,0,0,0
2,1018060,0,0,No user reviews,0,0,0
3,1018080,3,0,3 user reviews,2,1,3
4,1018090,7,0,7 user reviews,1,6,7
...,...,...,...,...,...,...,...
94021,1045590,0,0,No user reviews,0,0,0
94022,1045600,0,0,No user reviews,0,0,0
94023,1045610,2,5,Mixed,38,29,67
94024,1045630,0,0,No user reviews,0,0,0


In [5]:
df_game = pd.read_csv("archive/cleaned_steam_db_v2.csv", index_col=[0])
df_game.head()

Unnamed: 0,type,name,steam_appid,required_age,is_free,genres,platform_windows,platform_mac,platform_linux,coming_date,...,lang_ru,lang_ar,tool,nsfw,film,release_date_code,initial_price_usd,final_price_usd,memory_gb,storage_gb
1,game,Al-Qadim: The Genie's Curse,1904640.0,0,False,"[1, 3]",True,False,False,2022-03-29,...,False,False,False,False,,3,3.204,3.204,0.0,2.0
2,game,Dungeons & Dragons - Stronghold: Kingdom Simul...,1904650.0,0,False,"[28, 2]",True,False,False,2022-03-29,...,False,False,False,False,,3,3.204,3.204,0.0,2.0
3,game,Chapel 3-D: The Ascent,1904680.0,0,False,"[1, 23]",True,False,False,,...,False,False,False,False,,0,0.0,0.0,1.0,0.0
4,game,VTuber Gallery : Anime Pose,1904690.0,0,True,"[51, 53, 55, 57, 59, 70]",True,False,False,2022-03-21,...,False,False,True,False,,3,0.0,0.0,8.0,0.0
5,dlc,Evolution - Alone and Unafraid Trait Pack,1904700.0,0,False,"[28, 2]",True,True,False,2023-08-30,...,False,False,False,False,,1,2.943675,2.943675,4.0,2.0


In [6]:
df_review["timestamp_created"] = pd.to_datetime(df_review["timestamp_created"])
df_review["timestamp_updated"] = pd.to_datetime(df_review["timestamp_updated"])
df_review["last_played"] = pd.to_datetime(df_review["last_played"])

## Simple model for active users

Collaborative filtering only requires user, item and rating features. In this case, the three features are selected below.

In [7]:
rating_table = df_review[["steamid", "appid", "voted_up"]]
rating_table.head()

Unnamed: 0,steamid,appid,voted_up
0,76561198051821837,1018080,True
1,76561197993790846,1018080,True
2,76561198095855343,1018080,False
3,76561198053422627,1018090,False
4,76561198019816374,1018090,False


In [8]:
rating_table["voted_up"] = rating_table["voted_up"].apply(lambda x: 1 if x else -1)
rating_table.head()

Unnamed: 0,steamid,appid,voted_up
0,76561198051821837,1018080,1
1,76561197993790846,1018080,1
2,76561198095855343,1018080,-1
3,76561198053422627,1018090,-1
4,76561198019816374,1018090,-1


Drop all users and items with low number of ratings.

In [10]:
user_activating = rating_table.groupby("steamid")["voted_up"].count().reset_index().rename(columns={ "voted_up": "review_count" })
print("Number of all users: {}".format(user_activating.shape[0]))
print("Number of active users (> 3 number of ratings): {}".format(user_activating[user_activating["review_count"] > 3].shape[0]))

Number of all users: 198497
Number of active users (> 3 number of ratings): 8045


In [11]:
item_activating = rating_table.groupby("appid")["voted_up"].count().reset_index().rename(columns={ "voted_up": "review_count" })
print("Number of all items: {}".format(item_activating.shape[0]))
print("Number of active items (> 3 number of ratings): {}".format(item_activating[item_activating["review_count"] > 3].shape[0]))

Number of all items: 45365
Number of active items (> 3 number of ratings): 21033


In [12]:
active_users = user_activating.loc[user_activating["review_count"] > 3, "steamid"]
active_items = item_activating.loc[item_activating["review_count"] > 3, "appid"]

rating_table = rating_table[rating_table["steamid"].isin(active_users) & rating_table["appid"].isin(active_items)]
rating_table

Unnamed: 0,steamid,appid,voted_up
3,76561198053422627,1018090,-1
4,76561198019816374,1018090,-1
7,76561198041661882,1018090,-1
10,76561198085502382,1018130,1
12,76561198036172352,1018130,1
...,...,...,...
299315,76561197970193418,1045580,1
299322,76561198280616270,1045650,1
299334,76561197990354493,1045650,1
299335,76561198082848099,1045650,1


It's necessary to avoid having sparse matrix for modeling. That's why we need to drop a large amount of inactive users and items.

In [13]:
# Shuffle the table before applying KFold
rating_table.sample(frac=1)

s_reader = Reader(rating_scale=(-1, 1))
s_data = Dataset.load_from_df(rating_table, s_reader)

trainset, testset = train_test_split(s_data, test_size=0.2)

In [14]:
models = [
	KNNBasic(sim_options={ "user_based": False }),
	KNNBasic(sim_options={ "user_based": True }),
	KNNWithMeans(sim_options={ "user_based": False }),
	KNNWithMeans(sim_options={ "user_based": True }),
	SlopeOne(),
	SVDpp()]
predictions = [None for _ in range(len(models))]

In [15]:
for i in range(len(models)):
	models[i].fit(trainset)
	predictions[i] = models[i].test(testset)
	print(accuracy.rmse(predictions[i], verbose=True), '\n')

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8479
0.8478935218037286 

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9092
0.9091738212026531 

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8848
0.8848009922594721 

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8156
0.8155967900596753 

RMSE: 0.8336
0.8335973456244792 

RMSE: 0.7640
0.7640333431307863 



SVD is the best model so far

In [61]:
def get_top_10(user, prediction_list, top=10):
	
	recommended_list = defaultdict(lambda: -2)
	for predictions in prediction_list:
		for uid, iid, true_r, est, _ in predictions:
			# Find user in the whole predictions
			if uid == user:
				recommended_list[iid] = max(recommended_list[iid], est)
				

	# It's possible to get less result. In this case, use a brute-force method
	if len(recommended_list) < top:
		for predictions in prediction_list:
			for item in active_items:
				recommended_list[item] = max(recommended_list[iid], models[i].predict(user, item).est)

	recommended_list = [(k, v) for k, v in recommended_list.items()]

	recommended_list.sort(key=lambda x: x[1], reverse=True)
	recommended_list = recommended_list[:top]

	return recommended_list



def extract_from_top_10(tuple_list):

	global df_game, df_bridge

	# Change the rating prediction
	recommendation_table = pd.DataFrame(tuple_list, columns=["appid", "likely_to_like"])
	recommendation_table["likely_to_like"] = recommendation_table["likely_to_like"].apply(
		lambda x: "YES" if x > 0.25 else ("NO" if x < -0.25 else "MAYBE"))
	recommendation_table["l"] = recommendation_table["likely_to_like"].apply(lambda x: 3 if (x == "YES") else (1 if (x == "NO") else 2))
	
	recommendation_table = pd.merge(
		recommendation_table, df_game[["name", "steam_appid", "genres", "final_price_usd", "memory_gb", "storage_gb"]],
		how="left", left_on="appid", right_on="steam_appid")
	
	recommendation_table = pd.merge(
		recommendation_table, df_bridge[["appid", "review_score", "review_score_desc", "total_reviews"]],
		how="left", on="appid"
	)

	# Drop all negative rating games
	recommendation_table = recommendation_table[~recommendation_table["review_score_desc"].str.contains("Negative")]

	# Sort recommendations by order
	recommendation_table.sort_values(by=["l", "review_score"], inplace=True, ascending=False)

	return recommendation_table.drop(columns=["l", "steam_appid"])

In [28]:
random_user = random.choice(rating_table["steamid"].unique())
print(random_user)

76561198423160291


In [45]:
played_table = rating_table[rating_table["steamid"] == random_user]
pd.merge(played_table, df_game[["name", "steam_appid", "genres", "final_price_usd", "memory_gb", "storage_gb"]], how="left", left_on="appid", right_on="steam_appid")

Unnamed: 0,steamid,appid,voted_up,name,steam_appid,genres,final_price_usd,memory_gb,storage_gb
0,76561198423160291,1287830,1,Forbidden Pleasure,1287830.0,"[1, 25, 4, 23, 3, 28]",1.1214,8.0,2.0
1,76561198423160291,1842220,-1,3D Lover,1842220.0,[4],4.806,2.0,4.0
2,76561198423160291,1099830,-1,Welcome to Paradise,1099830.0,"[1, 25, 4, 3, 28, 18]",5.607,8.0,2.0
3,76561198423160291,990500,-1,PANTY SLIDE VR,990500.0,"[4, 23]",2.8035,8.0,0.0


In [57]:
recommendation = get_top_10(random_user, [predictions[i] for i in range(6)])
recommendation

[(1664951, 0.8290471105101759),
 (1502230, 0.7438361796305677),
 (1490114, 0.7313492312292854),
 (892780, 0.7307326265972677),
 (1269840, 0.7204322087438434),
 (208318, 0.7180346969951413),
 (1105630, 0.7154225978014845),
 (2154140, 0.6896763204768381),
 (373740, 0.683872718018972),
 (264320, 0.6784694982640582)]

In [62]:
extract_from_top_10(recommendation)

Unnamed: 0,appid,likely_to_like,name,genres,final_price_usd,memory_gb,storage_gb,review_score,review_score_desc,total_reviews
1,1502230,YES,Tower of Waifus,"[4, 23, 3, 2]",0.84105,1.0,0.0,8,Very Positive,63
8,373740,YES,Pillars of Eternity - The White March Part II,[3],6.60825,4.0,14.0,8,Very Positive,83
0,1664951,YES,Train Sim World® 2: DB G6 Diesel Shunter Add-On,[28],7.5294,8.0,20.0,7,Positive,30
2,1490114,YES,Train Sim World 2: Rush Hour Season Ticket,[28],0.0,8.0,20.0,7,Positive,27
3,892780,YES,Sanguine Sanctum,[23],2.8035,2.0,0.0,7,Positive,42
4,1269840,YES,,,,,,7,Positive,32
5,208318,YES,,,,,,7,Positive,13
6,1105630,YES,,,,,,7,Positive,19
7,2154140,YES,Heroes of Eroticism - New Beginnings,"[4, 23]",4.4055,0.0,4.0,5,Mixed,23
9,264320,YES,Captain Morgane and the Golden Turtle,"[25, 4]",2.8035,0.0,3.0,5,Mixed,52
