<a href="https://colab.research.google.com/github/jhsrojasro/ML-3A-project/blob/main/Game_recommendations_Using_GNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
from collections import defaultdict
from itertools import combinations

import networkx as nx
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# directory used by the rest of the notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install node2vec
from node2vec import Node2Vec



In [None]:
# Base directory (on the mounted drive) for every CSV, pickle and model file
# read or written below. Paths are built by plain string concatenation, so the
# trailing slash is required.
workdir = '/content/drive/Shareddrives/Recommender Systems/dataset/'

# Data Preprocessing

In [None]:
# Load the users table and discard the "reviews" column in a single chained
# expression, then preview the first rows.
users_df = pd.read_csv(workdir + 'users.csv').drop(columns=["reviews"])
users_df.head()

Unnamed: 0,user_id,products
0,7090995,359
1,13511788,156
2,8444727,329
3,4632233,176
4,4968781,98


In [None]:
# Load the games catalogue, keeping only the id -> title columns that are
# needed to translate app ids back into readable names.
games_df = pd.read_csv(workdir + 'games.csv').loc[:, ["app_id", "title"]]
games_df.head()

Unnamed: 0,app_id,title
0,13500,Prince of Persia: Warrior Within™
1,22364,BRINK: Agents of Change
2,113020,Monaco: What's Yours Is Mine
3,226560,Escape Dead Island
4,249050,Dungeon of the ENDLESS™


In [None]:
# Load the reviews and keep only the positive ones: rows are filtered on
# `is_recommended` and projected onto (app_id, user_id) in one .loc step, so the
# flag column never reaches the resulting frame. The original row index is kept.
recommendations_df = (
    pd.read_csv(workdir + 'recommendations.csv')
    .loc[lambda df: df['is_recommended'] == True, ["app_id", "user_id"]]
)
recommendations_df.head()

Unnamed: 0,app_id,user_id
0,975370,49625
2,1085660,243409
3,703080,248701
4,526870,22902
5,306130,43700


In [None]:
n_partitions = 8
pair_dicts = [defaultdict(int) for _ in range(n_partitions)]

# Group over the WHOLE frame so every user's complete list of recommended games
# is seen at once. Slicing the frame by row position before grouping splits a
# user's rows across slices, which silently drops every pair whose two reviews
# land in different slices (and the `len // n_partitions` remainder rows).
games_by_user = recommendations_df.groupby("user_id")["app_id"]

# Users (not rows) are spread round-robin over the partitions, so `pair_dicts`
# keeps its shape of `n_partitions` partial counters for the merge step.
for user_index, (user_id, app_ids) in enumerate(tqdm(games_by_user, desc="Creating pairs")):
  partition = pair_dicts[user_index % n_partitions]
  # sorted(set(...)) gives each unordered pair one canonical key (smaller id
  # first), so (a, b) and (b, a) are never counted as two different pairs, and
  # a game listed twice by the same user cannot pair with itself.
  for pair in combinations(sorted(set(app_ids)), 2):
    partition[pair] += 1

Creating pairs:   0%|          | 0/3029797 [00:00<?, ?it/s]

Creating pairs:   0%|          | 0/2982871 [00:00<?, ?it/s]

Creating pairs:   0%|          | 0/2941818 [00:00<?, ?it/s]

Creating pairs:   0%|          | 0/2329225 [00:00<?, ?it/s]

Creating pairs:   0%|          | 0/2780361 [00:00<?, ?it/s]

Creating pairs:   0%|          | 0/2590293 [00:00<?, ?it/s]

Creating pairs:   0%|          | 0/2252720 [00:00<?, ?it/s]

Creating pairs:   0%|          | 0/2478101 [00:00<?, ?it/s]

In [None]:
# Fold every partial counter into partition 0, which ends up holding the total
# co-recommendation count for each pair.
merged = pair_dicts[0]
for partial in pair_dicts[1:]:
  for pair, n_recommendations in partial.items():
    merged[pair] += n_recommendations
  # Empty the partial once it has been folded in: this frees its memory and
  # makes the cell safe to re-run (a second run finds nothing left to add, so
  # the totals are not double-counted).
  partial.clear()


In [None]:
# The merge cell above accumulates all partitions into pair_dicts[0]; expose
# that combined counter under the shorter name used from here on. This is an
# alias, not a copy.
pairs = pair_dicts[0]

In [None]:
# Persist the merged pair counts to the shared drive, serialised with the
# highest pickle protocol available.
pairs_path = workdir + 'pairs.pickle'
with open(pairs_path, 'wb') as pairs_file:
    pickle.dump(pairs, pairs_file, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Minimum number of users who recommended both games for an edge to be kept.
# We don't consider lower scores because that would create a large graph in
# which connections were less meaningful.
MIN_CO_RECOMMENDATIONS = 20

G = nx.Graph()
for (game_a, game_b), score in pairs.items():
    # The graph is undirected, so the strength of an edge is the count stored
    # under BOTH key orders. Without this, (a, b) and (b, a) are thresholded
    # separately and whichever is added last overwrites the edge weight.
    # `.get` is used so a missing reverse key is not inserted into `pairs`
    # while it is being iterated.
    if game_a != game_b:
        score += pairs.get((game_b, game_a), 0)

    if score >= MIN_CO_RECOMMENDATIONS:
        G.add_edge(game_a, game_b, weight=score)

print(G)

Graph with 5384 nodes and 239958 edges


In [None]:
# Persist the graph. The context manager guarantees the file is flushed and
# closed even if pickling fails; a bare open() inside the call leaves the
# handle open until garbage collection.
with open(workdir+'graph_data.pickle', 'wb') as f:
    pickle.dump(G, f)

# Training GNN

In [None]:
# Reload the graph saved earlier so training can start from this cell.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that this notebook (or another trusted source) produced.
with open(workdir+'graph_data.pickle', 'rb') as f:
    G = pickle.load(f)

In [None]:
# Configure node2vec on the co-recommendation graph. Constructing the object
# already does work: the recorded output shows it computing transition
# probabilities for the graph's nodes.
node2vec = Node2Vec(
    G,
    dimensions=64,    # size of each embedding vector
    walk_length=20,   # presumably nodes visited per random walk - see node2vec docs
    num_walks=200,    # presumably walks started from each node - see node2vec docs
    p=2,
    q=1,
    workers=2,
)

Computing transition probabilities:   0%|          | 0/5384 [00:00<?, ?it/s]

In [None]:
# Persist the configured Node2Vec object. The context manager closes the file
# deterministically instead of leaving the handle open until garbage
# collection.
with open(workdir+'node2vec.pickle', 'wb') as f:
    pickle.dump(node2vec, f)

In [None]:
# Train the embedding model. This is the slow step of the notebook: the
# recorded run below was interrupted while walks were still being generated.
model = node2vec.fit(
    window=10,
    min_count=1,
    batch_words=4,
    epochs=10,
    compute_loss=True,
)

Computing transition probabilities:   0%|          | 0/5384 [00:00<?, ?it/s]

Generating walks (CPU: 1):   8%|▊         | 17/200 [01:52<20:18,  6.66s/it]

KeyboardInterrupt: ignored

In [None]:
# Save the fitted model next to the dataset; the recommendation section below
# reloads it from this same path instead of retraining.
model.save(workdir+'trained_model.model')

# Use trained model to make recommendations

In [None]:
# Reload the model written by model.save(...) above. It is loaded through
# gensim's Word2Vec class, and recommend() below queries it via `.wv`.
from gensim.models import Word2Vec
loaded_model = Word2Vec.load(workdir+'trained_model.model')

In [None]:
def recommend(game_title, model, topn=5):
    """Print the games most similar to ``game_title`` and return the best score.

    Args:
        game_title: Exact title to look up in the notebook-level ``games_df``.
        model: Object exposing ``model.wv.most_similar`` whose keys are app ids
            stored as strings (e.g. the loaded Word2Vec model).
        topn: How many similar games to print. Defaults to 5.

    Returns:
        The similarity score of the single closest game.

    Raises:
        IndexError: If ``game_title`` is not present in ``games_df``, if a
            returned id has no row in ``games_df``, or if the model returns no
            neighbours.
    """
    matching_ids = games_df[games_df.title == game_title].app_id.values
    if len(matching_ids) == 0:
        # Same exception type as the bare `.values[0]` lookup would raise, but
        # with a message that says what went wrong.
        raise IndexError(f'No game titled {game_title!r} in games_df')
    game_id = str(matching_ids[0])

    # Query the model once and ask only for the rows that are shown, instead of
    # fetching the default top-10 twice (once to print, once for the return).
    neighbours = model.wv.most_similar(game_id, topn=topn)
    for neighbour_id, similarity in neighbours:
        title = games_df[games_df.app_id == int(neighbour_id)].title.values[0]
        print(f'{title}: {similarity:.2f}')
    return neighbours[0][1]

In [None]:
# Example query: prints the nearest games with their similarity scores; the
# value displayed for the cell is the score of the closest match.
recommend('Escape Dead Island', loaded_model)

Halo: Spartan Strike: 0.81
The Bureau: XCOM Declassified: 0.80
Homefront: 0.78
Controller Companion: 0.76
The Incredible Adventures of Van Helsing: 0.75


0.8070984482765198