In this notebook we implement a simple embedding-based recommender. We then use LightGCN to diffuse the learned embeddings across the edges of the user–movie graph.

In [None]:
# Load the lab_black extension (presumably auto-formats cells with Black — confirm).
%load_ext lab_black
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell runs, so edits to the
# imported modules are picked up without restarting the kernel.
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

import networkx as nx
import torch
from torch import nn

from model import simple_embedding
from utils import *
import evaluation

### load data

In [None]:
# Read the three raw tables; the link ids are read as strings and converted below.
meta = pd.read_csv("../data/movies_metadata.csv")
ratings = pd.read_csv("../data/ratings.csv")
# Smaller alternative for quick experiments: ../data/ratings_small.csv
links = pd.read_csv("../data/links.csv", dtype=str)

# Lower-case every header so columns can be referenced uniformly.
for frame in (meta, ratings, links):
    frame.columns = frame.columns.str.lower()

# The metadata table calls its key "id"; rename it to match the links table.
meta = meta.rename(columns={"id": "tmdbid"})

# Convert the non-missing ids to int. The assignment re-aligns on the index,
# so rows whose id was missing remain NaN in the resulting column.
for column in ("tmdbid", "movieid"):
    links[column] = links[column].dropna().astype(int)

In [None]:
# Keep only ratings from users with id below 1500 (presumably to keep the graph small).
ratings = ratings[ratings["userid"] < 1500].copy()

In [None]:
# Ids that cannot be parsed as numbers become NaN and those rows are dropped.
meta["tmdbid"] = pd.to_numeric(meta["tmdbid"], errors="coerce")
# No `on=` is given, so the left join uses the columns both frames share.
meta = meta.dropna(subset=["tmdbid"]).merge(
    links[["movieid", "tmdbid"]], how="left"
)

#### Map userid and movieid to node index

In [None]:
def _sorted_unique(column):
    """Return the distinct values of ``column`` in ascending order, indexed from 0."""
    return column.sort_values().drop_duplicates().reset_index(drop=True)


# The position of each id in these series serves as its node id.
userid = _sorted_unique(ratings["userid"])
movieid = _sorted_unique(ratings["movieid"])

# Shift the movie positions past the users so both share one node-id range.
n_users = len(userid)
movieid.index = movieid.index + n_users

In [None]:
# Forward lookups: node id -> original user / movie id.
nodeid_userid = userid.to_dict()
nodeid_movieid = movieid.to_dict()

# Reverse lookups: original user / movie id -> node id.
userid_nodeid = dict(zip(nodeid_userid.values(), nodeid_userid.keys()))
movieid_nodeid = dict(zip(nodeid_movieid.values(), nodeid_movieid.keys()))

## transform to graph

In [None]:
%%time
ddata = dd.from_pandas(ratings, npartitions=10)

def create_edge(x): 
    return (userid_nodeid[int(x.userid)], movieid_nodeid[int(x.movieid)], x.rating)

edges = ddata.map_partitions(lambda df: df.apply((lambda row: create_edge(row)), axis=1)).compute() 
edges = edges.tolist()

In [None]:
%%time
G = nx.Graph(directed=False)
G.add_weighted_edges_from(edges)

In [None]:
# Basic size statistics of the rating graph.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print("number of nodes:", n_nodes)
print("number of edges:", n_edges)

# Every edge adds one to the degree of each of its two endpoints.
cc = 2 * n_edges / n_nodes
print("average node degree:", cc)
print("density of network:", nx.density(G))

# Training

### get edge list

In [None]:
# Edges present in the graph are the positive examples.
pos_edge_list = graph_to_edge_list(G)

# Split fractions per subset; note that `edges` is rebound here from the raw
# edge tuples to the result of the split (iterated with `.items()` below).
split_dict = dict(train=0.75, test=0.25)
edges = transductive_edge_split(pos_edge_list, split_dict, seed=825)

#### create negative samples and labels

In [None]:
# One entry per split name in each of these dictionaries.
pos_edge_index, neg_edge_index = {}, {}
pos_label, neg_label = {}, {}

for key, ls in edges.items():
    pos_edge_index[key] = edge_list_to_tensor(ls)

    # Draw as many negative edges as there are positive ones in this split.
    neg_edge_list = sample_negative_edges(G, len(ls))
    neg_edge_index[key] = edge_list_to_tensor(neg_edge_list)

    # One label per column of the edge index: 1 for positives, 0 for negatives.
    pos_label[key] = torch.ones(pos_edge_index[key].size(1))
    neg_label[key] = torch.zeros(neg_edge_index[key].size(1))

### create embeddings

In [None]:
# Seed torch's RNG before building the embeddings (presumably they are
# initialised randomly by create_node_emb — confirm in utils).
torch.manual_seed(1)
node_count = G.number_of_nodes()
emb = create_node_emb(num_node=node_count)

### train embeddings

In [None]:
from torch.optim import Adam


def train(
    model,
    train_label,
    train_edge,
    epochs=2000,
    learning_rate=0.001,
    log_every=500,
):
    """Fit ``model`` on labelled edges with full-batch Adam.

    Args:
        model: A ``torch.nn.Module`` that returns predictions when called on
            ``train_edge`` and exposes ``loss_fn(pred, label)``.
        train_label: Target tensor handed to ``model.loss_fn``.
        train_edge: Edge tensor handed to the model.
        epochs: Number of optimisation steps; each step uses all edges.
        learning_rate: Step size for Adam.
        log_every: Print loss and accuracy every this many epochs; ``0`` or
            ``None`` disables the progress output.
    """
    # Optimise the parameters of the model that was passed in, rather than
    # whatever a global embedding variable happens to point to.
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for i in range(epochs):
        optimizer.zero_grad()

        pred = model(train_edge)
        loss = model.loss_fn(pred, train_label)

        loss.backward()
        optimizer.step()

        if log_every and i % log_every == 0:
            print(
                f"epoch {i}: loss is: {loss}, accuracy is {accuracy(pred, train_label)}"
            )


# Stack positives before negatives; labels are joined along dim 0 and edge
# indices along dim 1 so both keep the same order.
train_label = torch.cat((pos_label["train"], neg_label["train"]))
train_edge = torch.cat((pos_edge_index["train"], neg_edge_index["train"]), dim=1)

# Wrap the embeddings in the model and fit it on the training edges.
model = simple_embedding(emb)

train(model, train_label, train_edge)

### recall@k

In [None]:
# Average recall@100 of the trained embedding model; movies form the
# candidate library and every user node is evaluated.
recall_kwargs = dict(
    train_edges=pos_edge_index["train"],
    test_edges=pos_edge_index["test"],
    model=model,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)
evaluation.avg_recall_at_k(**recall_kwargs)

## Improve the model using LightGCN

In [None]:
from model import LightGCN

In [None]:
# NOTE(review): the positional 5 is presumably the number of propagation
# layers — confirm against model.LightGCN.
lgcn = LightGCN(5)

In [None]:
# Run the trained embedding matrix through LightGCN over the training edges only.
train_pos_edges = pos_edge_index["train"]
res = lgcn.forward(emb.weight, train_pos_edges)

In [None]:
# New embedding table with the same dimensions as `emb`; its initial weight
# is replaced right away by the LightGCN output.
emb2 = nn.Embedding(
    num_embeddings=emb.num_embeddings,
    embedding_dim=emb.embedding_dim,
)
emb2.weight = nn.Parameter(res)

# Same scoring model as before, now backed by the propagated embeddings.
m2 = simple_embedding(emb2)

In [None]:
# Average recall@100 for the LightGCN-propagated embeddings, using the same
# arguments as the baseline evaluation apart from the model.
lgcn_recall_kwargs = dict(
    train_edges=pos_edge_index["train"],
    test_edges=pos_edge_index["test"],
    model=m2,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)
evaluation.avg_recall_at_k(**lgcn_recall_kwargs)