# Construct datasets

In [100]:
import csv
import json
import random as rd

import pandas as pd
from surprise import (
    Dataset,
    Reader,
    KNNBasic,
)
from surprise.model_selection import (
    train_test_split,
)
from surprise.accuracy import rmse

In [101]:
def load_reviews(path: str):
    """
    read user-item rating
    :param path: 
    :return: 
    """
    review_dict = {
        'itemID': [],
        'userID': [],
        'rating': []
    }
    with open(path, 'r', encoding='utf-8') as f:
        csv_reader = csv.reader(f)
        for i, row in enumerate(csv_reader):
            if not i:
                continue
            review_dict['itemID'].append(row[0])
            review_dict['userID'].append(row[1])
            is_recommended = row[2]
            review_dict['rating'].append(2 if is_recommended == 'True' else 1)
    df = pd.DataFrame(review_dict)
    return df


def load_games(path: str):
    """
    Read game information
    :param path: 
    :param positive_ratio_threshold: 
    :return: 
    """
    ret = []
    with open(path, 'r', encoding='utf-8') as f:
        csv_reader = csv.reader(f)
        for i, row in enumerate(csv_reader):
            if not i:
                continue
            ret.append(
                {
                    'app_id': row[0],
                    'title': row[1],
                    'rating': row[-1],
                }
            )
    return ret

In [102]:
reviews_df = load_reviews('final_data/user_recommendations_subset_clean.csv')
games = load_games('final_data/game_details_subset_clean.csv')
reviews_df = reviews_df[
    reviews_df['itemID'].isin([game['app_id'] for game in games])
]
reader = Reader(rating_scale=(1, 2))
reviews_dataset = Dataset.load_from_df(
    reviews_df[['userID', 'itemID', 'rating']],
    reader
)
train, test = train_test_split(reviews_dataset, test_size=0.2)

In [103]:
print('[games] len: {}'.format(len(games)))

reviews_df

[games] len: 50


Unnamed: 0,itemID,userID,rating
0,33,12,2
1,29,1,1
2,23,13,2
3,2,8,2
4,23,11,2
...,...,...,...
604,30,5,2
605,23,20,2
606,30,20,2
607,30,4,2


# Build the training model

In [104]:
N_NEIGHBORS = 5

sim_options = {
    "name": "cosine",     # sim_options(dict): A dictionary of options for the similarity measure. Here we choose cosine
    "user_based": False,  # item-based
}

model = KNNBasic(k=N_NEIGHBORS, sim_options=sim_options)
model.fit(train)

# save similiarity matrix to csv file
sim_matrix = model.compute_similarities()
sim_matrix_df = pd.DataFrame(sim_matrix)

raw_iids = [model.trainset.to_raw_iid(inner_id) for inner_id in range(sim_matrix_df.shape[0])]
sim_matrix_df.columns = raw_iids
sim_matrix_df.index = raw_iids
# sim_matrix_df.to_csv('model_output/collaborative_filtering_sim_matrix.csv', index=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [105]:
# Calculate the top N similar items for each item
top_n_neighbors = {}
for i, inner_id in enumerate(model.trainset.all_items()):
    raw_id = model.trainset.to_raw_iid(inner_id)
    neighbors = model.get_neighbors(inner_id, k=N_NEIGHBORS)
    top_n_neighbors[raw_id] = [model.trainset.to_raw_iid(neighbor) for neighbor in neighbors]

# save top_n_neighbors into a Json file, which we'll read from for the recommend function
with open('model_output/collaborative_filtering_top_n_neighbours.json', 'w', encoding='utf-8') as f:
    json.dump(top_n_neighbors, f, ensure_ascii=False, indent=4)

# load json file
top_n_neighbors = json.load(open('model_output/collaborative_filtering_top_n_neighbours.json', 'r', encoding='utf-8'))

In [106]:
def recommend(user_id: str, top_n: int = 10):
    """
    recommend
    :param user_id: 
    :param top_n: 
    :return: 
    """
    game_ids = [game['app_id'] for game in games]
    # Get user rated items
    user_items = reviews_df[reviews_df['userID'] == user_id]['itemID'].tolist()
    if len(user_items) == 0:
        # If user has not rated any item 
        sorted_games = sorted(games, key=lambda x: x['rating'], reverse=True)
        return [game['app_id'] for game in sorted_games[:top_n]]
        # choices = rd.choices(sorted_games, k=top_n)
        # return [game['app_id'] for game in choices]
    # Get the top N similar items to the item rated by the users
    top_n_items = []
    for user_item in user_items:
        neighbor_list = top_n_neighbors.get(user_item, [])
        for neighbor in neighbor_list:
            if neighbor not in user_items and neighbor in game_ids:
                top_n_items.append(neighbor)
    top_n_items = list(set(top_n_items))
    return top_n_items[:top_n]
    # choices = rd.choices(top_n_items, k=top_n)
    # return choices

Right now we recommend top-n games.
We can also use the function random.choices() to randomly pick n recommendations. 


In [107]:
# testing
test_user_id = '1'
recommend_list = recommend(test_user_id)
# print('[Recommended Games]')
# for item in reviews_df[reviews_df['userID'] == test_user_id]['itemID'].tolist():
#     print(item)
print('[Model Recommendation List]')
for item in recommend_list:
    print(item)

[Model Recommendation List]
38
21
8
46
39
4
14
10
12
32
