In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

import networkx as nx
import torch
from torch import nn

from model import simple_embedding
from utils import *
import evaluation

### load data

In [3]:
# Read the three raw tables. `low_memory=False` lets pandas infer every column's
# dtype from the whole file rather than chunk by chunk.
# NOTE(review): the stray `exec(code_obj, ...)` line in this cell's saved output
# looks like the tail of a mixed-type DtypeWarning from this read - confirm.
meta = pd.read_csv("../data/movies_metadata.csv", low_memory=False)
ratings = pd.read_csv("../data/ratings.csv")
# ratings = pd.read_csv("../data/ratings_small.csv")
links = pd.read_csv("../data/links.csv", dtype=str)

# Normalise column names to lower case in all three tables.
meta.columns = meta.columns.str.lower()
ratings.columns = ratings.columns.str.lower()
links.columns = links.columns.str.lower()

# `id` in the metadata is renamed so it shares the `tmdbid` name used in `links`.
meta = meta.rename(columns={"id": "tmdbid"})

# Drop link rows with a missing id *before* casting. Assigning
# `links.tmdbid.dropna().astype(int)` back to the column re-aligns on the index,
# so the missing rows came straight back as NaN and the column ended up float.
links = links.dropna(subset=["movieid", "tmdbid"]).astype(
    {"movieid": int, "tmdbid": int}
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Keep only the ratings of users whose id is below 1500; `.copy()` detaches the
# result from the full frame so later edits do not touch a view.
ratings = ratings[ratings["userid"] < 1500].copy()

In [5]:
# Ids that cannot be parsed as numbers become NaN and those rows are discarded.
meta["tmdbid"] = pd.to_numeric(meta["tmdbid"], errors="coerce")
meta = meta.loc[meta["tmdbid"].notna()]

# Left join: every remaining metadata row is kept. No `on=` is given, so pandas
# joins on whatever column names the two frames have in common.
link_ids = links[["movieid", "tmdbid"]]
meta = meta.merge(link_ids, how="left")

#### Map userid and movieid to node indices

In [6]:
# Only movies that occur in `ratings` can be recommended (not every movie in `meta`).
# Each series holds the distinct ids in ascending order; its index is the node index.
userid = ratings["userid"].drop_duplicates().sort_values().reset_index(drop=True)
movieid = ratings["movieid"].drop_duplicates().sort_values().reset_index(drop=True)

# Movie nodes are numbered after the user nodes so the two ranges never overlap.
n_user_nodes = len(userid)
movieid.index = movieid.index + n_user_nodes

In [7]:
# Forward lookups: node index -> original id.
nodeid_userid = userid.to_dict()
nodeid_movieid = movieid.to_dict()

# Reverse lookups: original id -> node index.
userid_nodeid = dict(zip(nodeid_userid.values(), nodeid_userid.keys()))
movieid_nodeid = dict(zip(nodeid_movieid.values(), nodeid_movieid.keys()))

## transform to graph

In [8]:
%%time
ddata = dd.from_pandas(ratings, npartitions=10)

def create_edge(x): 
    return (userid_nodeid[int(x.userid)], movieid_nodeid[int(x.movieid)], x.rating)

edges = ddata.map_partitions(lambda df: df.apply((lambda row: create_edge(row)), axis=1)).compute() 
edges = edges.tolist()

CPU times: user 1.4 s, sys: 16.7 ms, total: 1.42 s
Wall time: 1.41 s


In [9]:
%%time
G = nx.Graph(directed=False)
G.add_weighted_edges_from(edges)

CPU times: user 117 ms, sys: 5.66 ms, total: 122 ms
Wall time: 122 ms


In [10]:
# Basic size statistics of the graph.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()

# Every edge adds one to the degree of both of its endpoints.
cc = 2 * n_edges / n_nodes

print("number of nodes:", n_nodes)
print("number of edges:", n_edges)
print("average node degree:", cc)
print("density of network:", nx.density(G))

number of nodes: 11522
number of edges: 145490
average node degree: 25.254296129144247
density of network: 0.0021920229258870104


# Training

### get edge list

In [11]:
# Edges present in the graph are the positive examples.
pos_edge_list = graph_to_edge_list(G)

# split edges
# Note: from here on `edges` is the result of the split (iterated below with
# `.items()`), no longer the flat list of triples used to build `G`.
split_dict = dict(train=0.75, test=0.25)
edges = transductive_edge_split(pos_edge_list, split_dict, seed=825)

#### create negative samples and labels

In [12]:
# One tensor of positive edges per split name.
pos_edge_index = {key: edge_list_to_tensor(ls) for key, ls in edges.items()}

In [13]:
# `pos_edge_index` is already built, per split, by the preceding cell; it is not
# rebuilt here. This cell adds the negative samples and the 1/0 labels.
neg_edge_index = dict()
pos_label = dict()
neg_label = dict()

for key, ls in edges.items():
    # Request as many negative edges as the split has positive ones.
    # NOTE(review): no seed is passed here - if sample_negative_edges draws
    # randomly the negatives differ between runs; confirm in utils.
    neg_edge_list = sample_negative_edges(G, len(ls))
    neg_edge_index[key] = edge_list_to_tensor(neg_edge_list)

    pos_label[key] = torch.ones(len(ls))
    # Sized from what was actually sampled, so labels and negative edges always
    # line up even if fewer edges than requested come back.
    neg_label[key] = torch.zeros(len(neg_edge_list))

### create embeddings

In [14]:
# Fix torch's RNG state before the embedding table is created.
# NOTE(review): presumably create_node_emb initialises from torch's RNG - confirm.
torch.manual_seed(1)

# One embedding row per graph node (users and movies share the table).
num_nodes = G.number_of_nodes()
emb = create_node_emb(num_node=num_nodes)

In [15]:
# Column-wise minimum / maximum over dim 0 of the training edge tensor, together
# with the positions where they occur.
# NOTE(review): this presumably picks the user / movie endpoint of each edge
# because movie node indices are offset past the user ones - confirm the tensor
# layout produced by edge_list_to_tensor.
train_edge_tensor = pos_edge_index["train"]
users, user_index = torch.min(train_edge_tensor, dim=0)
movies, movie_index = torch.max(train_edge_tensor, dim=0)

### train embeddings

In [16]:
from torch.optim import Adam


def batch_train(
    model, pos_edges, n_batches, epochs=181, learning_rate=0.001, log_every=30
):
    """Fit ``model`` on ``pos_edges`` with Adam, one optimiser step per user batch.

    For every user in a batch the positive and negative edges returned by
    ``get_pos_neg_edges_for_user`` are scored with ``model.forward`` and
    compared with ``brp_loss``; the mean of the per-user losses is
    back-propagated.

    Args:
        model: object exposing ``forward(edges)``. Its own parameters are
            optimised when it exposes any.
        pos_edges: positive training edges, passed unchanged to the helper
            functions.
        n_batches: forwarded to ``user_batch_generator``.
        epochs: number of passes over all user batches.
        learning_rate: Adam learning rate.
        log_every: the loss of the last processed batch is printed whenever
            the epoch number is a multiple of this value.

    Returns:
        None. The model is updated in place and progress is printed.
    """
    # Optimise the model that was passed in. The notebook-level `emb` is only a
    # fallback for model objects that expose no parameters of their own.
    params = list(model.parameters()) if hasattr(model, "parameters") else []
    if not params:
        params = list(emb.parameters())
    optimizer = Adam(params, lr=learning_rate)

    users, unique_users, _ = get_pos_edges_users(pos_edges)
    _, unique_movies, _ = get_pos_edges_movies(pos_edges)

    # Build the movie set once instead of once per user per epoch. Every call
    # below receives its own shallow copy, so a helper that mutates its
    # argument cannot affect later calls.
    movie_pool = set(unique_movies)

    batch_loss = None
    for i in range(epochs):
        user_batches = user_batch_generator(unique_users, n_batches)
        for batch in user_batches:
            optimizer.zero_grad()
            user_losses = []
            for u in batch:

                pos_edges_user, neg_edges_user = get_pos_neg_edges_for_user(
                    edges=pos_edges,
                    users=users,
                    u=u,
                    unique_movies_set=set(movie_pool),
                )

                # make predictions and calculate loss
                f_pos = model.forward(pos_edges_user)
                f_neg = model.forward(neg_edges_user)

                user_losses.append(brp_loss(f_pos, f_neg))

            # A batch without users has nothing to stack or back-propagate.
            if not user_losses:
                continue

            batch_loss = torch.stack(user_losses).mean()

            batch_loss.backward()
            optimizer.step()

        # Reports the loss of the most recent batch, not an epoch average.
        if i % log_every == 0 and batch_loss is not None:
            print(f"epoch {i}: loss is: {batch_loss}")


# Wrap the embedding table in the scoring model and fit it on the training edges.
model = simple_embedding(emb)

train_edge_index = pos_edge_index["train"]
batch_train(model, train_edge_index, n_batches=100)

epoch 0: loss is: 0.6907793879508972
epoch 30: loss is: 0.5676481127738953
epoch 60: loss is: 0.525672197341919
epoch 90: loss is: 0.47521576285362244
epoch 120: loss is: 0.4558046758174896
epoch 150: loss is: 0.37974950671195984
epoch 180: loss is: 0.4816705286502838


### recall@k

In [17]:
# Average recall@100 of the embedding model: candidates are all movie nodes,
# evaluated for all user nodes.
baseline_recall_args = dict(
    train_edges=pos_edge_index["train"],
    test_edges=pos_edge_index["test"],
    model=model,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)
evaluation.avg_recall_at_k(**baseline_recall_args)

0.25526739966388773

## Improve the model using LightGCN

In [18]:
from model import LightGCN

In [19]:
# NOTE(review): the meaning of the positional argument 5 is not visible here -
# presumably the number of propagation layers; confirm in model.LightGCN.
lgcn = LightGCN(5)

In [20]:
# One-off propagation of the trained embedding weights over the training edges.
# `res` is only used to build a new embedding table afterwards, so recording an
# autograd graph for this pass would just hold memory for nothing.
with torch.no_grad():
    res = lgcn.forward(emb.weight, pos_edge_index["train"])

In [21]:
# Build the embedding table directly from the propagated weights instead of
# allocating a randomly initialised table and overwriting its weight.
# `freeze=False` keeps the weight trainable, as a plain `nn.Parameter` would be.
emb2 = nn.Embedding.from_pretrained(res, freeze=False)

m2 = simple_embedding(emb2)

In [22]:
# Same recall@100 evaluation as for the baseline, now scoring with `m2`.
lgcn_recall_args = dict(
    train_edges=pos_edge_index["train"],
    test_edges=pos_edge_index["test"],
    model=m2,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)
evaluation.avg_recall_at_k(**lgcn_recall_args)

0.2746294701406378