# Data loading

In [1]:
import pandas as pd

# Load datasets
games_df = pd.read_csv("games_cleaned.csv")
games_pca_df = pd.read_csv("games_cleaned_PCA.csv", usecols=['title', 'app_id', 'user_reviews_log', 'positive_ratio_log', 'price_original_log'] + ['PC{}'.format(i) for i in range(1, 116)])
recommendations_df = pd.read_csv("recommendations_with_score.csv", usecols=['user_id','app_id', 'hours_log', 'is_recommended','recommendation_credibility_normalized_log'])
users_df = pd.read_csv("sample_user_data.csv", usecols=['user_id', 'user_credibility_log','reviews'])
df_with_clusters = pd.read_csv("clustering.csv")

In [2]:
games_df = games_df.drop(columns = ["win","mac","linux","date_release","price_final_log","discount"])

In [3]:
# Merging datasets
# Start by merging the recommendations with the game information
merged_df = pd.merge(recommendations_df, users_df, on="user_id")
# # Then merge the user information
final_df = pd.merge(merged_df, games_df, on="app_id")

In [4]:
final_df = final_df[final_df['rating_encoded'] >= 4]

# Calculating new features

In [5]:
import numpy as np

# Here im creating a new variable called "enthusiasm" to represent how many hours the user spent
# in that game relative to the average hours a user spends on that game

# Calculate the average 'hours_log' for each game across all users
average_hours_per_game = final_df.groupby('app_id')['hours_log'].mean().reset_index(name='avg_hours_log')

# Merge this average back into the original dataframe
final_df = final_df.merge(average_hours_per_game, on='app_id')

# Calculate the 'enthusiasm' feature
final_df['enthusiasm'] = final_df['avg_hours_log'] / final_df['hours_log']

final_df['enthusiasm'].replace([np.inf, -np.inf], np.nan, inplace=True)

# Adding a small constant to the denominator
final_df['enthusiasm'] = final_df['avg_hours_log'] / (final_df['hours_log'] + 1e-9)

final_df = final_df.drop(columns = ['avg_hours_log','hours_log'])

# Scaling the different scores

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler to scale between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit the scaler to data and transform it
final_df['user_reviews_log'] = scaler.fit_transform(final_df[['user_reviews_log']])
final_df['positive_ratio_log'] = scaler.fit_transform(final_df[['positive_ratio_log']])
final_df['enthusiasm'] = scaler.fit_transform(final_df[['enthusiasm']])

# Calculating a new score for target

In [7]:
# A weightage between 0 and 1 for the final score,
# if you want to account for games with less reviews, put in a smaller number
popularity_weightage = .3

goodness_weightage = 1

# Weightage on enthusiasm, do note that popular multiplayer games tend to have more hours
enthusiasm_weightage = 0.3


final_df['is_recommend'] = final_df['is_recommended'] * \
                           ((final_df['user_reviews_log'] * popularity_weightage) + \
                           (final_df['positive_ratio_log'] * goodness_weightage) + \
                           (final_df['enthusiasm'] * enthusiasm_weightage))


final_df = final_df.drop(columns = ['positive_ratio_log','user_reviews_log','enthusiasm'])

# Encoding the user id and app id for model

In [8]:
# Since the model cannot take random user_id and app_id, i need to encode them into a
# list from 0 to number of users

import pandas as pd

# Encode 'user_id' since the model cannot take random user_ids
user_id_encoder = {id: idx for idx, id in enumerate(final_df['user_id'].unique())}
final_df['user_id_encoded'] = final_df['user_id'].map(user_id_encoder)

# Encode 'app_id' since the model cannot take random app_ids
app_id_encoder = {id: idx for idx, id in enumerate(final_df['app_id'].unique())}
final_df['app_id_encoded'] = final_df['app_id'].map(app_id_encoder)

# Decoder 'user_id' for later usage when the functions receive a list of original id
user_id_decoder = {v: k for k, v in user_id_encoder.items()}

In [9]:
# Here im scaling the target of the model to avoid any extraordinarily big or small numbers

from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler to scale between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit the scaler to data and transform it
final_df['is_recommend_normalized'] = scaler.fit_transform(final_df[['is_recommend']])

# Check the first few rows to see the normalization
print(final_df[['is_recommend', 'is_recommend_normalized']].head())


   is_recommend  is_recommend_normalized
0      1.110787                 0.874490
1      1.110787                 0.874490
2      1.110787                 0.874490
3      1.110787                 0.874490
4     -1.110787                 0.084769


# Building the NCF Model

In [10]:
# A NCF model is used here to try to model the non-linear interactions between the user and games
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Concatenate, Dense, Dropout
from tensorflow.keras.regularizers import l2

# Parameters for embedding layers
num_users = final_df['user_id_encoded'].nunique()
num_items = final_df['app_id_encoded'].nunique()
embedding_size = 10

# User and Item Inputs
user_id_input = Input(shape=(1,), name='user_id_input')
item_id_input = Input(shape=(1,), name='item_id_input')

# Embeddings
user_embedding_gmf = Embedding(num_users, embedding_size, embeddings_regularizer=l2(1e-6), name='user_embedding_gmf')(user_id_input)
item_embedding_gmf = Embedding(num_items, embedding_size, name='item_embedding_gmf')(item_id_input)

user_embedding_mlp = Embedding(num_users, embedding_size * 2, name='user_embedding_mlp')(user_id_input)
item_embedding_mlp = Embedding(num_items, embedding_size * 2, name='item_embedding_mlp')(item_id_input)

# Flatten embeddings
user_vector_gmf = Flatten(name='flatten_user_gmf')(user_embedding_gmf)
item_vector_gmf = Flatten(name='flatten_item_gmf')(item_embedding_gmf)

user_vector_mlp = Flatten(name='flatten_user_mlp')(user_embedding_mlp)
item_vector_mlp = Flatten(name='flatten_item_mlp')(item_embedding_mlp)

# GMF part (simple element-wise multiplication)
gmf_vector = Dot(axes=1, normalize=False, name='gmf_dot')([user_vector_gmf, item_vector_gmf])

# MLP part (concatenation followed by dense layers)
mlp_vector = Concatenate(name='concatenate_mlp')([user_vector_mlp, item_vector_mlp])
mlp_vector = Dense(32, activation='relu', kernel_regularizer=l2(1e-6), name='dense_layer_1')(mlp_vector)  # Reduced complexity

# Concatenate GMF and MLP parts remain unchanged
combined_vector = Concatenate(name='concatenate_gmf_mlp')([gmf_vector, mlp_vector])
predictions = Dense(1, activation=None, name='output_layer')(combined_vector)

# Define the model
model = Model(inputs=[user_id_input, item_id_input], outputs=predictions)
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mean_absolute_error', 'mean_squared_logarithmic_error'])


# Model summary
model.summary()

2024-04-15 22:20:23.451077: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-04-15 22:20:23.451104: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-04-15 22:20:23.451109: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-04-15 22:20:23.451263: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-15 22:20:23.451274: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


# Training the model

In [11]:
from sklearn.model_selection import train_test_split

# Splitting the dataset into training and testing sets
X = final_df[['user_id_encoded', 'app_id_encoded']]
y = final_df['is_recommend_normalized']

# Split the data - adjust the test size as needed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model
model.fit([X_train.user_id_encoded, X_train.app_id_encoded], y_train,
          batch_size=128, epochs=10,
          validation_split=0.1)

# Evaluate the model
model.evaluate([X_test.user_id_encoded, X_test.app_id_encoded], y_test)


Epoch 1/10


2024-04-15 22:20:24.184295: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m2974/2974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 19ms/step - loss: 0.0900 - mean_absolute_error: 0.2058 - mean_squared_logarithmic_error: 0.0422 - val_loss: 0.0486 - val_mean_absolute_error: 0.1449 - val_mean_squared_logarithmic_error: 0.0231
Epoch 2/10
[1m2974/2974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 19ms/step - loss: 0.0424 - mean_absolute_error: 0.1287 - mean_squared_logarithmic_error: 0.0201 - val_loss: 0.0500 - val_mean_absolute_error: 0.1402 - val_mean_squared_logarithmic_error: 0.0233
Epoch 3/10
[1m2974/2974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 19ms/step - loss: 0.0289 - mean_absolute_error: 0.1050 - mean_squared_logarithmic_error: 0.0136 - val_loss: 0.0571 - val_mean_absolute_error: 0.1494 - val_mean_squared_logarithmic_error: 0.0259
Epoch 4/10
[1m2974/2974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 19ms/step - loss: 0.0198 - mean_absolute_error: 0.0876 - mean_squared_logarithmic_error: 0.0089 - val_loss: 0

[0.07234605401754379, 0.17318186163902283, 0.03134565427899361]

# Creating sparse matrix for all user-app pair

In [12]:
# Here a COO-format sparse matrix is created for all user-app pair because a dense matrix
# is way too big

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

data = np.ones(len(final_df), dtype=int)

# Create a COO-format sparse matrix from user-item interactions
sparse_matrix = coo_matrix((data, (final_df['user_id_encoded'], final_df['app_id_encoded'])))

sparse_matrix_csr = sparse_matrix.tocsr()


In [13]:
# Here a function is defined to get all the unteracted user-game pair to make predictions
def get_uninteracted_items(user_id, sparse_matrix, n_items):
    """Get a list of item IDs that the given user has not interacted with."""
    interacted_items = sparse_matrix[user_id].nonzero()[1]  # Get the indices of interacted items for the user
    all_items = np.arange(n_items)  # Array of all item IDs
    uninteracted_items = np.setdiff1d(all_items, interacted_items)  # Find items that the user hasn't interacted with
    return uninteracted_items

# Example usage
n_users = final_df['user_id_encoded'].max() + 1  # Assuming IDs start from 0
n_items = final_df['app_id_encoded'].max() + 1  # Assuming IDs start from 0

user_id = 2  # Example user ID
uninteracted_items = get_uninteracted_items(user_id, sparse_matrix_csr, n_items)
print(f"User {user_id} has not interacted with {len(uninteracted_items)} items.")


User 2 has not interacted with 21361 items.


# Generating Recommendation

In [17]:
# Here is a function that generates recommendations given a user_id, note that
# this user_id is the original user id, which will be encoded in the following
# chunk for the model to work.

def generate_recommendations_for_user(user_id, sparse_matrix, model, n_items, top_n=20):

    # Step 0 : Encode the user id
    user_id = user_id_encoder[user_id]

    # Step 1: Identify uninteracted items
    uninteracted_items = get_uninteracted_items(user_id, sparse_matrix, n_items)

    # Prepare user and item IDs for prediction
    user_ids = np.array([user_id] * len(uninteracted_items))
    item_ids = np.array(uninteracted_items)

    # Predict preferences for the uninteracted items
    predictions = model.predict([user_ids, item_ids])


    # Common fix for an extra dimension in predictions
    predictions = predictions.squeeze()

    # Proceed with combining arrays and the rest of the function...
    combined = np.rec.fromarrays([item_ids, predictions], names='item_id,prediction')

    # Step 3: Sort items by predicted preference
    sorted_items = np.sort(combined, order='prediction')[::-1]

    # Step 4: Filter to ensure unique item recommendations
    # This step is crucial to remove duplicates while keeping the highest ranked recommendation for each item.
    _, unique_indices = np.unique(sorted_items['item_id'], return_index=True)

    # Select top unique recommendations based on the unique_indices, up to top_n
    unique_top_indices = np.sort(unique_indices)[:top_n]
    top_item_ids = sorted_items[['item_id','prediction']][unique_top_indices]


    return top_item_ids


# Creating Dictionaries to map back the encoded user and app ids

In [15]:
# Here a function is defined to map the encoded app id into original app id, vice versa.

app_id_to_title = final_df.set_index('app_id_encoded')['title'].to_dict()
app_id_to_real_app_id = final_df.set_index('app_id_encoded')['app_id'].to_dict()


# Single recommendation

In [18]:
chosen_id = 947

recommended_items = pd.DataFrame(generate_recommendations_for_user(chosen_id, sparse_matrix_csr, model, n_items, top_n = 20))

recommended_items['item_id'] = recommended_items['item_id'].replace(app_id_to_real_app_id)

print(f"These are the game recommendations for user {chosen_id}.")
print(recommended_items)


[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
These are the game recommendations for user 947.
    item_id  prediction
0    732430    0.990542
1    383980    0.980681
2      4000    0.961953
3    636480    0.956884
4   1985690    0.953030
5    349040    0.952459
6   1062090    0.943024
7   1419860    0.941769
8    738060    0.941649
9     17470    0.941538
10   411370    0.940249
11  1369320    0.940042
12  1977530    0.939334
13   489830    0.936318
14   350080    0.936012
15   367500    0.934765
16   261570    0.934221
17   368340    0.932323
18  1038740    0.931727
19   447530    0.930789


# Evaluation- Kmeans Clustering


In [19]:
# Here a function is defined to the cluster games of the chosen user
def get_user_cluster(picked_userid):
    user_cluster = df_with_clusters[df_with_clusters['user_id'] == picked_userid]['cluster_label'].values

    if len(user_cluster) == 0:
        # print(f"User {picked_userid} is not found in any cluster.")
        pass
    else:
        # print(f"User {picked_userid} belongs to cluster {user_cluster[0]}.")
        pass

    return user_cluster[0]

In [20]:
# Here is a function to retrieve the top games in the user cluster
def get_top_n_popular_games_per_cluster():
    df_with_clusters_with_information = pd.merge(df_with_clusters, games, on='app_id', how='left')
    cluster_game_counts = df_with_clusters_with_information.groupby(['cluster_label', 'app_id', 'positive_ratio_log']).size().reset_index(name='User_Count')

    cluster_games_with_info = pd.merge(cluster_game_counts, games, on='app_id', how='left')

    cluster_game_counts_sorted = cluster_games_with_info.sort_values(by=['cluster_label', 'User_Count'], ascending=[True, False])

    top_n_popular_games_per_cluster = cluster_game_counts_sorted.groupby('cluster_label').head(500)
    # print(top_n_popular_games_per_cluster)

    return top_n_popular_games_per_cluster


In [21]:
# Here is a function that checks the recommended list of the NCF vs the top cluster games
def check_against_cluster_list(picked_userid, k, user_cluster, top_n_popular_games_per_cluster, ranked_item_score_merged_dataset):
    user_clusters = [int(x) for x in user_cluster[1:-1].split(',')]

    for cluster in user_clusters:
        user_cluster_info = top_n_popular_games_per_cluster[top_n_popular_games_per_cluster.apply(lambda row: str(cluster) in row['cluster_label'], axis=1)]

        is_in_cluster = ranked_item_score_merged_dataset['game_id'].head(k).isin(user_cluster_info['app_id'])
        true_false_counts = is_in_cluster.value_counts()

        # print(is_in_cluster)
        # print("Accuracy result of checking against cluster: " + str(cluster))
        # print(true_false_counts)

        if len(is_in_cluster) > 0 :
            precision_k_picked_user = sum(is_in_cluster)/len(is_in_cluster)
        else:
            precision_k_picked_user = 0
        # print(precision_k_picked_user)

        return precision_k_picked_user


# Wrapper

In [22]:
def recommend_and_evaluate_neural(picked_userid, k= 20):

    recommended_items = pd.DataFrame(generate_recommendations_for_user(picked_userid, sparse_matrix_csr, model, n_items))

    recommended_items['item_id'] = recommended_items['item_id'].replace(app_id_to_real_app_id)

    # Rename the 'item_id' column to 'game_id'
    recommended_items.rename(columns={'item_id': 'game_id'}, inplace=True)

    recommended_items.rename(columns={'prediction': 'game_score'}, inplace=True)

    ranked_item_score = recommended_items.sort_values(by='game_score', ascending=False)
    ranked_item_score_merged_dataset = ranked_item_score.merge(games_df, how='left', left_on='game_id', right_on='app_id')

    print(ranked_item_score_merged_dataset[['game_id', 'game_score', 'title']].head(k))

    user_cluster = get_user_cluster(picked_userid)

    top_n_popular_games_per_cluster = get_top_n_popular_games_per_cluster()

    precision_k_picked_user = check_against_cluster_list(picked_userid, k, user_cluster, top_n_popular_games_per_cluster, ranked_item_score_merged_dataset)

    print("Accuracy for user " + str(picked_userid) + ":")
    print(precision_k_picked_user)

    return precision_k_picked_user

In [23]:
N = 30
total = 0
k= 20

first_n_user_ids = users_df.head(N)['user_id'].tolist()
for user in first_n_user_ids:
    picked_userid = user
    try:
        precision_k_for_picked_user = recommend_and_evaluate_neural(947)(picked_userid, k)
        total += precision_k_for_picked_user
    except Exception as e:
        continue

precision_k_n = total/N

print(f"Precision@K for N users, where K= {k} and N= {N}: ")
print(precision_k_n)

[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
    game_id  game_score                                        title
0    732430    0.990542                                  Superflight
1    383980    0.980681                             Rivals of Aether
2      4000    0.961953                                  Garry's Mod
3    636480    0.956884                                   Ravenfield
4   1985690    0.953030                                   The Looker
5    349040    0.952459     NARUTO SHIPPUDEN: Ultimate Ninja STORM 4
6   1062090    0.943024                                   Timberborn
7   1419860    0.941769                                       Tukoni
8    738060    0.941649          Freddy Fazbear's Pizzeria Simulator
9     17470    0.941538                            Dead Space (2008)
10   411370    0.940249       Melty Blood Actress Again Current Code
11  1369320    0.940042                              Virtual Cottage
12  1977530    0.939334     

[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
    game_id  game_score                                        title
0    732430    0.990542                                  Superflight
1    383980    0.980681                             Rivals of Aether
2      4000    0.961953                                  Garry's Mod
3    636480    0.956884                                   Ravenfield
4   1985690    0.953030                                   The Looker
5    349040    0.952459     NARUTO SHIPPUDEN: Ultimate Ninja STORM 4
6   1062090    0.943024                                   Timberborn
7   1419860    0.941769                                       Tukoni
8    738060    0.941649          Freddy Fazbear's Pizzeria Simulator
9     17470    0.941538                            Dead Space (2008)
10   411370    0.940249       Melty Blood Actress Again Current Code
11  1369320    0.940042                              Virtual Cottage
12  1977530    0.939334     

[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
    game_id  game_score                                        title
0    732430    0.990542                                  Superflight
1    383980    0.980681                             Rivals of Aether
2      4000    0.961953                                  Garry's Mod
3    636480    0.956884                                   Ravenfield
4   1985690    0.953030                                   The Looker
5    349040    0.952459     NARUTO SHIPPUDEN: Ultimate Ninja STORM 4
6   1062090    0.943024                                   Timberborn
7   1419860    0.941769                                       Tukoni
8    738060    0.941649          Freddy Fazbear's Pizzeria Simulator
9     17470    0.941538                            Dead Space (2008)
10   411370    0.940249       Melty Blood Actress Again Current Code
11  1369320    0.940042                              Virtual Cottage
12  1977530    0.939334     

[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
    game_id  game_score                                        title
0    732430    0.990542                                  Superflight
1    383980    0.980681                             Rivals of Aether
2      4000    0.961953                                  Garry's Mod
3    636480    0.956884                                   Ravenfield
4   1985690    0.953030                                   The Looker
5    349040    0.952459     NARUTO SHIPPUDEN: Ultimate Ninja STORM 4
6   1062090    0.943024                                   Timberborn
7   1419860    0.941769                                       Tukoni
8    738060    0.941649          Freddy Fazbear's Pizzeria Simulator
9     17470    0.941538                            Dead Space (2008)
10   411370    0.940249       Melty Blood Actress Again Current Code
11  1369320    0.940042                              Virtual Cottage
12  1977530    0.939334     

[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
    game_id  game_score                                        title
0    732430    0.990542                                  Superflight
1    383980    0.980681                             Rivals of Aether
2      4000    0.961953                                  Garry's Mod
3    636480    0.956884                                   Ravenfield
4   1985690    0.953030                                   The Looker
5    349040    0.952459     NARUTO SHIPPUDEN: Ultimate Ninja STORM 4
6   1062090    0.943024                                   Timberborn
7   1419860    0.941769                                       Tukoni
8    738060    0.941649          Freddy Fazbear's Pizzeria Simulator
9     17470    0.941538                            Dead Space (2008)
10   411370    0.940249       Melty Blood Actress Again Current Code
11  1369320    0.940042                              Virtual Cottage
12  1977530    0.939334     

# Calculate Novelty Score

In [24]:
# Read games data
games_df = pd.read_csv('games_cleaned.csv')

# Create the popularity_score dictionary
popularity_score = {}

# Populate the popularity_score dictionary based on the sample data
for index, row in games_df.iterrows():
    popularity_score[row['app_id']] = row['rating_encoded']/8

# Print the popularity_score dictionary
print(popularity_score)

pop_dict = popularity_score

{13500: 0.875, 22364: 0.75, 113020: 0.875, 226560: 0.5, 249050: 0.875, 250180: 0.875, 253980: 0.625, 271850: 0.5, 282900: 0.875, 19810: 0.625, 15270: 0.875, 21130: 0.875, 22130: 0.875, 29180: 0.875, 32750: 0.875, 241620: 0.625, 408520: 0.875, 244910: 0.625, 245950: 0.875, 250460: 0.625, 278890: 0.875, 305181: 0.875, 312200: 0.625, 321290: 0.875, 329640: 0.875, 367670: 0.875, 380810: 0.875, 392330: 0.875, 437000: 0.625, 445420: 0.875, 458790: 0.75, 462280: 0.75, 1872790: 0.875, 371970: 0.875, 222660: 0.875, 237850: 0.875, 268220: 0.875, 274270: 0.875, 281450: 0.875, 1259750: 0.5, 296870: 0.875, 330180: 0.875, 351920: 0.875, 397210: 0.625, 402130: 0.5, 402710: 0.5, 410970: 0.875, 415200: 0.875, 410770: 0.875, 254820: 0.875, 284180: 0.875, 286380: 0.875, 288220: 0.875, 302690: 0.875, 320388: 0.625, 339350: 1.0, 342310: 0.5, 346500: 0.625, 393831: 0.75, 414870: 0.875, 419990: 0.875, 434041: 0.75, 446600: 0.875, 451640: 0.75, 479010: 0.625, 494150: 1.0, 1198740: 0.875, 35000: 0.875, 37400: 

In [25]:
def novelty_metric(rec_list, pop_dict):
    pop_sum = []  # List to store popularity scores of recommended items
    for item in rec_list:
        if item in pop_dict.keys():  # Check if the item exists in the popularity dictionary
            pop_sum.append(pop_dict[item])  # Add the popularity score of the item to the list
    return np.mean(pop_sum)  # Calculate and return the mean popularity score of recommended items


In [26]:
# Extract the list of recommended app_IDs from the recommended_items DataFrame
rec_list = recommended_items['item_id'].to_list()

# Calculate the novelty score for the recommended items using the novelty_metric function
novelty_metric(rec_list, pop_dict)


0.94375

In [27]:
# Set the number of recommendations and the number of users
n_recommendations = 20
N = 30

# Initialize total novelty score
total_novelty_score = 0

# Get the first N user IDs
first_n_user_ids = users_df.head(N)['user_id'].tolist()

# Iterate over each user
for user in first_n_user_ids:
    # Generate recommended item IDs for the user
    recommended_items = pd.DataFrame(generate_recommendations_for_user(user, sparse_matrix_csr, model, n_items, top_n=n_recommendations))

    # Map app IDs to real app IDs
    recommended_items['item_id'] = recommended_items['item_id'].replace(app_id_to_real_app_id)

    # Extract the list of recommended item IDs
    rec_list = recommended_items['item_id'].to_list()

    # Check if rec_list is empty
    if not rec_list:
        novelty_score = 0  # If there are no recommendations, novelty score is 0
    else:
        # Calculate novelty score for the recommendations
        novelty_score = novelty_metric(rec_list, pop_dict)

    # Print the user ID and its novelty score
    print(user, novelty_score)

    # Accumulate the novelty score
    total_novelty_score += novelty_score

# Calculate the average novelty score
average_novelty_score = total_novelty_score / N

# Print the average novelty score
print("Average novelty score for {N} users is: ", average_novelty_score)


[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
947 0.94375
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
1107 0.8875
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
1345 0.90625
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
1405 0.89375
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
1634 0.90625
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
2206 0.99375
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
2607 0.9
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
3363 0.875
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
3422 0.875
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
3629 0.86875
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
3931 0.85625
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━

In [28]:
def get_novelty_score(user_id):
    # Generate recommended item IDs for the user
    recommended_items = pd.DataFrame(generate_recommendations_for_user(user_id, sparse_matrix_csr, model, n_items, top_n=n_recommendations))

    # Map app IDs to real app IDs
    recommended_items['item_id'] = recommended_items['item_id'].replace(app_id_to_real_app_id)

    # Extract the list of recommended item IDs
    rec_list = recommended_items['item_id'].to_list()

    # Calculate the novelty score for the recommended items
    novelty_score = novelty_metric(rec_list, pop_dict)

    return novelty_score


Calculate Diversity Score

In [29]:
# Import the function get_item_matrix from the item_matrix module
from item_matrix import get_item_matrix

# Call the get_item_matrix function to obtain the item similarity matrix
item_sim_matrix = get_item_matrix()


Index label '10' does not exist.
30000
531429
50872


In [30]:
def ils_metric(rec_list, item_sim_matrix):
    sim_temp = 0  # Initialize a temporary variable to store the similarity sum
    for i in range(0, len(rec_list)):
        for j in range(i + 1, len(rec_list)):
            # Check if item j is in the similarity matrix for item i
            if rec_list[j] in item_sim_matrix[rec_list[i]]:
                # If yes, add the similarity score to sim_temp
                sim_temp += item_sim_matrix[rec_list[i]][rec_list[j]]
    # Calculate the ILS score by subtracting the normalized similarity sum from 1
    return 1 - (sim_temp / (len(rec_list) * (len(rec_list) - 1)))


In [31]:
# Set the number of recommendations and the number of users
n_recommendations = 20
N = 30

# Initialize total diversity score
total_diversity_score = 0

# Get the first N user IDs
first_n_user_ids = users_df.head(N)['user_id'].tolist()

# Iterate over each user
for user in first_n_user_ids:
    # Generate recommended item IDs for the user
    recommended_items = pd.DataFrame(generate_recommendations_for_user(user, sparse_matrix_csr, model, n_items, top_n=n_recommendations))

    # Map app IDs to real app IDs
    recommended_items['item_id'] = recommended_items['item_id'].replace(app_id_to_real_app_id)

    # Extract the list of recommended item IDs
    rec_list = recommended_items['item_id'].to_list()

    # Check if rec_list is empty
    if not rec_list:
        diversity_score = 0  # If there are no recommendations, diversity score is 0
    else:
        # Calculate diversity score for the recommendations using the ILS metric
        diversity_score = ils_metric(rec_list, item_sim_matrix)

    # Print user ID and diversity score
    print(user, diversity_score)

    # Accumulate diversity score
    total_diversity_score += diversity_score

# Calculate average diversity score
average_diversity_score = total_diversity_score / N

# Print the average diversity score
print("The average diversity score for {N} users is: ", average_diversity_score)


[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
947 0.9871641153195596
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
1107 0.9858181723966727
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
1345 0.9831819701361603
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
1405 0.9945059781413939
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
1634 0.9741628950738253
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
2206 0.9988854529452608
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
2607 0.9821042827561842
[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
3363 0.9885123735064285
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
3422 0.985537549819657
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
3629 0.9912186224361424
[1m671/671