In this notebook we implement a simple embedding-based recommender. We then use LightGCN to diffuse (smooth) the learned embeddings across the edges of the user–movie graph.

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import dask.dataframe as dd

import networkx as nx
import torch
from torch import nn
from torch.optim import Adam

from model import SimpleEmbedding
import evaluation
from utils import *

### load data

In [None]:
# Read the raw ratings table and lower-case every column header so the
# columns can be addressed as `userid`, `movieid`, `rating` below.
ratings = pd.read_csv("../data/ratings.csv").rename(columns=str.lower)

# Restrict the data to rows whose user id is below 1500; the explicit copy
# detaches the subset from the full table.
ratings = ratings[ratings["userid"] < 1500].copy()

#### map userid and movie_id to index

In [None]:
# Build the four lookup tables between raw ids and graph node ids.
# `userid_nodeid` / `movieid_nodeid` are indexed by raw user / movie id when the
# edges are created, and the keys of `nodeid_userid` / `nodeid_movieid` are later
# passed to the evaluation as the set of users / the movie library.
# NOTE(review): `get_mapping` is not defined in this notebook (presumably it comes
# from the `utils` star import) — confirm how node ids are assigned there.
nodeid_userid, nodeid_movieid, userid_nodeid, movieid_nodeid = get_mapping(ratings)

## transform to graph

In [None]:
%%time
# Spread the ratings over 10 dask partitions so the row-wise conversion below
# can be evaluated per partition.
ddata = dd.from_pandas(ratings, npartitions=10)


def create_edge(x):
    """
    Convert one ratings row into a weighted edge.

    Params:
        - x: row of the ratings table with the fields `userid`, `movieid` and `rating`
    Returns:
        tuple (user node id, movie node id, rating)
    """
    # the ids are cast to int before the lookup in the id -> node id mappings
    user_node = userid_nodeid[int(x.userid)]
    movie_node = movieid_nodeid[int(x.movieid)]
    return (user_node, movie_node, x.rating)


# Apply `create_edge` to every row of every partition, collect the result and
# turn it into a plain list of edge tuples.
edges = (
    ddata.map_partitions(lambda partition: partition.apply(create_edge, axis=1))
    .compute()
    .tolist()
)

In [None]:
%%time
# Build the interaction graph from the (source node, target node, weight) triples in `edges`;
# the third tuple element (the rating) becomes the edge weight.
# NOTE(review): nx.Graph is an undirected graph class already; the `directed=False` keyword
# looks like it is only stored as a graph attribute rather than acting as an option —
# confirm against the networkx documentation before relying on it.
G = nx.Graph(directed=False)
G.add_weighted_edges_from(edges)

In [None]:
# Print basic size statistics of the graph.
print("number of nodes:", G.number_of_nodes())
print("number of edges:", G.number_of_edges())
# Average degree: every edge contributes to the degree of its two endpoints,
# hence 2 * |E| / |V|.
cc = 2 * G.number_of_edges() / G.number_of_nodes()
print("average node degree:", cc)
print("density of network:", nx.density(G))

# Training

### get edge list

In [None]:
# Extract the edges of G as an edge list; these are the positive examples.
pos_edge_list = graph_to_edge_list(G)

# Split the positive edges into train / valid / test using the fractions below
# (fixed seed so the split is reproducible).
split_dict = {"train": 0.75, "valid": 0.1, "test": 0.15}
# NOTE: this rebinds `edges`. It previously held the weighted edge triples used to
# build G; from here on it is the result of the split, which is iterated with
# `.items()` in the next cell (one entry per split name).
edges = transductive_edge_split(pos_edge_list, split_dict, seed=825)

#### create negative samples and labels

In [None]:
# One entry per split name in each of these containers.
pos_edge_index, neg_edge_index = {}, {}
pos_label, neg_label = {}, {}

for key, ls in edges.items():
    # positive edges of this split as an edge-index tensor
    pos_edge_index[key] = edge_list_to_tensor(ls)

    # draw as many negative edges as there are positive edges in this split
    neg_edge_list = sample_negative_edges(G, len(ls))
    neg_edge_index[key] = edge_list_to_tensor(neg_edge_list)

    # labels: 1 for every positive edge, 0 for every negative edge
    # (the number of edges is the size of the second tensor dimension)
    pos_label[key] = torch.ones(pos_edge_index[key].size(1))
    neg_label[key] = torch.zeros(neg_edge_index[key].size(1))

### Train Embeddings

#### Training Loop

In [None]:
def train(
    model,
    train_label,
    train_edge,
    valid_label=None,
    valid_edge=None,
    epochs=5000,
    early_stopping=3,
    learning_rate=0.003,
    log_every=500,
):
    """
    Training loop for SimpleEmbedding Model.

    Every epoch is one full-batch step: the model scores all training edges, the
    binary cross entropy against the labels is backpropagated and Adam updates
    the parameters. The model is modified in place; nothing is returned.

    Params:
        - model: SimpleEmbedding Model
        - train_label: torch.Tensor with labels corresponding to train_edges
                        shape: ([num_pos_edges + num_neg_edges])
        - train_edge: torch.Tensor with training edges (should be in same order as train_label)
                        shape: ([2, num_pos_edges + num_neg_edges])
        - valid_label: analogous to train_label
        - valid_edge: analogous to train_edge; validation is only run when this is given
        - epochs: number of maximum epochs to train
        - early_stopping: (int) if this value is greater than 0, training is stopped if the
                            validation accuracy goes down "early_stopping" times in a row.
        - learning_rate: learning rate passed to the Adam optimizer
        - log_every: print loss and accuracy every "log_every" epochs; a value <= 0
                            disables the progress output
    """

    optimizer = Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.BCELoss()

    # number of consecutive epochs in which the validation accuracy dropped
    decreasing = 0
    # validation accuracy of the previous epoch
    valid_accuracy = 0

    for i in range(epochs):
        optimizer.zero_grad()

        pred = model(train_edge)
        loss = loss_fn(pred, train_label)

        loss.backward()
        optimizer.step()

        if valid_edge is not None:
            # validation only needs the predictions, so no autograd graph is built
            with torch.no_grad():
                pred_validation = model(valid_edge)
            valid_accuracy_new = accuracy(pred_validation, valid_label)
            if early_stopping > 0:
                # compare against the previous epoch; any non-decrease resets the streak
                if valid_accuracy_new < valid_accuracy:
                    decreasing += 1
                else:
                    decreasing = 0
                if decreasing == early_stopping:
                    break
            valid_accuracy = valid_accuracy_new

        if log_every > 0 and i % log_every == 0:
            # `pred` was computed before this epoch's optimizer step
            print_message = f"epoch {i}: loss is: {loss:.3f}, accuracy train: {accuracy(pred, train_label)}"
            if valid_edge is not None:
                print_message += f" valid: {valid_accuracy}"
            print(print_message)

#### Initialize Embedding Training

In [None]:
# Fix the torch RNG seed before the embedding is created so the run is reproducible.
torch.manual_seed(1)
# One embedding row per graph node; the embedding is wrapped in the SimpleEmbedding model.
emb = create_node_emb(num_node=G.number_of_nodes())
simple_embedding_model = SimpleEmbedding(emb)

# Positive examples first, then negative examples: labels are concatenated along
# dim 0 and the edge tensors along dim 1, so both stay in the same order.
train_label = torch.cat([pos_label["train"], neg_label["train"]], dim=0)
train_edge = torch.cat([pos_edge_index["train"], neg_edge_index["train"]], dim=1)

# Same layout for the validation split.
valid_label = torch.cat([pos_label["valid"], neg_label["valid"]], dim=0)
valid_edge = torch.cat([pos_edge_index["valid"], neg_edge_index["valid"]], dim=1)

# Train with the default number of epochs and early-stopping setting of `train`.
train(simple_embedding_model, train_label, train_edge, valid_label, valid_edge)

### Recall@100 on Testset

In [None]:
# Average recall@100 of the plain embedding model on the test edges.
# The positive train and valid edges are passed together as `seen_edges`
# (presumably so already-known interactions are not recommended again —
# confirm in evaluation.avg_recall_at_k).
recall100_simple_embedding_model = evaluation.avg_recall_at_k(
    seen_edges=torch.cat([pos_edge_index["train"], pos_edge_index["valid"]], dim=1),
    test_edges=pos_edge_index["test"],
    model=simple_embedding_model,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)

# Bare expression: displays the value as the cell output.
recall100_simple_embedding_model

## Improve the model by using LightGCN

In [None]:
from model import LightGCN

In [None]:
def get_lgcn_embedding_model(emb, message_edges, n_layers, edge_weight=None):
    """
    Builds a SimpleEmbedding model on top of an LGCN-smoothed copy of an embedding.

    The weights of `emb` are passed through a LightGCN with `n_layers` layers; the
    result becomes the weight of a new embedding with the same number of rows and
    the same dimension, which is then wrapped in a SimpleEmbedding model.

    params:
        - emb: Embedding whose weights are smoothed with LGCN
        - message_edges: edges along which LGCN passes the embeddings
        - n_layers: number of LGCN layers
        - edge_weight: optional edge weights, handed to LGCN unchanged
    returns:
        SimpleEmbedding model using the smoothed embedding
    """
    smoothed_weight = LightGCN(n_layers).forward(
        emb.weight, message_edges, edge_weight=edge_weight
    )

    # fresh embedding of identical size whose weights are replaced by the smoothed ones
    smoothed_emb = nn.Embedding(emb.num_embeddings, emb.embedding_dim)
    smoothed_emb.weight = nn.Parameter(smoothed_weight)

    return SimpleEmbedding(smoothed_emb)


def get_best_lgcn_layer(emb, min_i=2, max_i=20, verbose=False):
    """
    Selects the number of LGCN layers by recall@100 on the validation set.

    For every layer count in range(min_i, max_i) the embedding is smoothed along
    the positive training edges and the resulting model is scored on the positive
    validation edges. The chosen layer count is printed and returned.

    params:
        - emb: embedding to be passed along
        - min_i: smallest layer number that is tried
        - max_i: upper bound of the layer numbers that are tried (exclusive)
        - verbose: (boolean) if True the validation recall of every layer number
                    is printed, else only the best layer number
    returns:
        layer number with the highest validation recall (the first one on ties);
        None if no layer number reaches a recall greater than 0
    """

    def validation_recall(n_layers):
        # smooth along the training edges only, score on the validation edges
        candidate_model = get_lgcn_embedding_model(
            emb=emb, message_edges=pos_edge_index["train"], n_layers=n_layers
        )
        return evaluation.avg_recall_at_k(
            seen_edges=pos_edge_index["train"],
            test_edges=pos_edge_index["valid"],
            model=candidate_model,
            library=nodeid_movieid.keys(),
            users=nodeid_userid.keys(),
            k=100,
        )

    best_param, best_recall = None, 0
    for n_layers in range(min_i, max_i):
        recall_validation = validation_recall(n_layers)
        if verbose:
            print(f"n_layer {n_layers} : ", recall_validation)
        # strict comparison: an equally good later candidate does not replace the earlier one
        if recall_validation > best_recall:
            best_param, best_recall = n_layers, recall_validation

    print(f"best param: {best_param}")
    return best_param

In [None]:
# Pick the number of LGCN layers by validation recall@100
# (inside get_best_lgcn_layer the smoothing uses the training edges only).
n1 = get_best_lgcn_layer(simple_embedding_model.emb)

# Build the final model with the selected layer number. Here the positive train
# and valid edges are both used as message-passing edges.
lgcn_simple_embedding_model = get_lgcn_embedding_model(
    emb=simple_embedding_model.emb,
    message_edges=torch.cat([pos_edge_index["train"], pos_edge_index["valid"]], dim=1),
    n_layers=n1,
)

## Testset result

In [None]:
# Average recall@100 of the LGCN-smoothed model on the test edges, computed with
# the same arguments as for the plain embedding model so both numbers are comparable.
recall100_lgcn_simple_embedding_model = evaluation.avg_recall_at_k(
    seen_edges=torch.cat([pos_edge_index["train"], pos_edge_index["valid"]], dim=1),
    test_edges=pos_edge_index["test"],
    model=lgcn_simple_embedding_model,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)

# Bare expression: displays the value as the cell output.
recall100_lgcn_simple_embedding_model