In this notebook we implement a simple embedding recommender. Using LightGCN, we then diffuse the trained embeddings across the edges of the graph.

The difference from notebook 2 is that training is done in batches and the BPR (Bayesian Personalized Ranking) loss is optimized instead of the binary cross-entropy loss.

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import dask.dataframe as dd

import networkx as nx
import torch
from torch import nn
from torch.optim import Adam

from model import SimpleEmbedding
import evaluation
from utils import *

### load data

In [None]:
# Read the ratings table and normalise all column names to lower case.
ratings = pd.read_csv("../data/ratings.csv").rename(columns=str.lower)

# Keep only the users with an id below 1500; copy so that later changes do not
# touch a view of the full table.
ratings = ratings[ratings["userid"] < 1500].copy()

## transform data to graph

#### map userid and movieid to node-index

In [None]:
# Build the lookup tables between raw ids and graph node indices.
# userid_nodeid / movieid_nodeid are indexed by raw user / movie id when the
# edges are created; the keys of nodeid_userid / nodeid_movieid are later
# handed to the evaluation as the sets of user and movie nodes.
nodeid_userid, nodeid_movieid, userid_nodeid, movieid_nodeid = get_mapping(ratings)

#### get edge list

In [None]:
%%time
def create_edge(x):
    """Turn one ratings row into a (user_node, movie_node, rating) triple."""
    user_node = userid_nodeid[int(x.userid)]
    movie_node = movieid_nodeid[int(x.movieid)]
    return (user_node, movie_node, x.rating)


def _edges_for_partition(df):
    """Apply create_edge to every row of one dask partition."""
    return df.apply(create_edge, axis=1)


# Spread the row-wise mapping over 10 partitions and collect the result as a
# plain Python list of edge triples.
ddata = dd.from_pandas(ratings, npartitions=10)
edges = ddata.map_partitions(_edges_for_partition).compute().tolist()

#### create graph 

In [None]:
%%time
# NOTE(review): nx.Graph has no `directed` parameter. Extra keyword arguments
# are stored as graph attributes, so this only sets G.graph["directed"] = False;
# nx.Graph is undirected regardless.
G = nx.Graph(directed=False)
# Every (user_node, movie_node, rating) triple becomes one edge whose "weight"
# attribute holds the rating.
G.add_weighted_edges_from(edges)

In [None]:
# Basic size statistics of the user-movie graph.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()

# Every edge contributes to the degree of two nodes.
cc = 2 * n_edges / n_nodes

print("number of nodes:", n_nodes)
print("number of edges:", n_edges)
print("average node degree:", cc)
print("density of network:", nx.density(G))

# Training

#### get edge list

In [None]:
# Share of the positive edges assigned to each split.
split_dict = {"train": 0.75, "valid": 0.1, "test": 0.15}

# Extract the positive edges from the graph and split them with a fixed seed.
# Note that `edges` is rebound here: from now on it holds the per-split result
# rather than the weighted edge triples used to build the graph.
pos_edge_list = graph_to_edge_list(G)
edges = transductive_edge_split(pos_edge_list, split_dict, seed=825)

#### convert positive edges to tensors (negative samples are drawn on the fly during training)

In [None]:
# One tensor of positive edges per split ("train", "valid", "test").
pos_edge_index = {
    split_name: edge_list_to_tensor(split_edges)
    for split_name, split_edges in edges.items()
}

### Train Embeddings

#### Training Loop

In [None]:
def batch_train(
    model, train_edges, n_batches, valid_edges=None, epochs=181, early_stopping=2
):
    """
    Trains embeddings with the BPR loss (``brp_loss``) using user batches. Negative
    edges for each user are sampled on the fly.

    Validation uses the module-level ``nodeid_movieid`` / ``nodeid_userid``
    mappings as the movie library and the user set.

    Params:
        - model: SimpleEmbedding model. The parameters of ``model.emb`` are optimized.
        - train_edges: torch.Tensor with shape (2, n_positive_edges).
                    Contains positive edges of training set.
        - n_batches: number of user batches.
        - valid_edges: analogous to train_edges. If None, no validation is run.
        - epochs: maximum number of epochs to train
        - early_stopping: (int) if this value is greater than 0, training is stopped
                    once the validation recall@100 has failed to improve on
                    "early_stopping" consecutive checks. Checks happen every
                    10th epoch, not every epoch.

    Returns:
        None. The model is trained in place.
    """

    learning_rate = 0.003
    # Optimize the embedding of the model that was passed in, not whatever
    # global happens to be called `emb`.
    optimizer = Adam(model.emb.parameters(), lr=learning_rate)

    users, unique_users, _ = get_pos_edges_users(train_edges)
    _, unique_movies, _ = get_pos_edges_movies(train_edges)

    # Built once instead of once per user and epoch.
    # NOTE(review): assumes get_pos_neg_edges_for_user only reads this set and
    # does not mutate it - confirm against utils.
    unique_movies_set = set(unique_movies)

    decreasing = 0
    valid_recall_k = 0
    batch_loss = None

    for i in range(epochs):
        user_batches = user_batch_generator(unique_users, n_batches)
        for batch in user_batches:
            optimizer.zero_grad()
            user_losses = []
            for u in batch:

                pos_edges_user, neg_edges_user = get_pos_neg_edges_for_user(
                    edges=train_edges,
                    users=users,
                    u=u,
                    unique_movies_set=unique_movies_set,
                )

                # make predictions and calculate loss
                f_pos = model.forward(pos_edges_user)
                f_neg = model.forward(neg_edges_user)

                ul = brp_loss(f_pos, f_neg)
                user_losses.append(ul)

            if not user_losses:
                # torch.stack cannot handle an empty list; nothing to learn
                # from a batch without users.
                continue

            batch_loss = torch.stack(user_losses).mean()

            batch_loss.backward()
            optimizer.step()

        if i % 10 == 0:

            if valid_edges is not None:
                # Evaluate on the edges handed to this function rather than on
                # the global split dictionary.
                valid_recall_k_new = evaluation.avg_recall_at_k(
                    seen_edges=train_edges,
                    test_edges=valid_edges,
                    model=model,
                    library=nodeid_movieid.keys(),
                    users=nodeid_userid.keys(),
                    k=100,
                )
                if early_stopping > 0:
                    if valid_recall_k_new <= valid_recall_k:
                        decreasing += 1
                    else:
                        decreasing = 0
                    if decreasing == early_stopping:
                        break
                valid_recall_k = valid_recall_k_new

            # Reports the loss of the last batch of this epoch.
            print(
                f"epoch {i}: loss is: {batch_loss}, valid recall@100: {valid_recall_k}"
            )

#### Initialize Embedding Training

In [None]:
# Seed torch before the embedding is created so the run is repeatable.
torch.manual_seed(1)
# Sized by the number of graph nodes (users and movies share one index space).
emb = create_node_emb(num_node=G.number_of_nodes())

embedding_brp_model = SimpleEmbedding(emb)

# Train on the training edges with 100 user batches; validation recall is
# monitored on the validation edges, stopping after 2 checks without improvement.
batch_train(
    embedding_brp_model,
    pos_edge_index["train"],
    n_batches=100,
    valid_edges=pos_edge_index["valid"],
    early_stopping=2,
)

### Recall@100 on Testset

In [None]:
# Test-set recall@100 of the plain embedding model. Training and validation
# edges are both passed as already-seen edges.
seen_edges_train_valid = torch.cat(
    [pos_edge_index["train"], pos_edge_index["valid"]], dim=1
)
recall100_embedding_brp_model = evaluation.avg_recall_at_k(
    seen_edges=seen_edges_train_valid,
    test_edges=pos_edge_index["test"],
    model=embedding_brp_model,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)

recall100_embedding_brp_model

## improve model by using Light GCN

In [None]:
from model import LightGCN

In [None]:
def get_lgcn_embedding_model(emb, message_edges, n_layers, edge_weight=None):
    """
    Builds a SimpleEmbedding model whose weights are the LGCN-smoothed version of `emb`.

    params:
        - emb: embedding whose weight matrix is propagated through LGCN
        - message_edges: edges along which LGCN passes the embeddings
        - n_layers: number of LGCN layers
        - edge_weight: optional edge weights forwarded to LGCN; None means unweighted

    returns:
        SimpleEmbedding wrapping a new nn.Embedding with the same size as `emb`.
    """
    smoothed_weight = LightGCN(n_layers).forward(
        emb.weight, message_edges, edge_weight=edge_weight
    )

    # Fresh embedding of identical size; its initial weights are replaced by
    # the smoothed ones right away.
    smoothed_emb = nn.Embedding(emb.num_embeddings, emb.embedding_dim)
    smoothed_emb.weight = nn.Parameter(smoothed_weight)

    return SimpleEmbedding(smoothed_emb)


def get_best_lgcn_layer(emb, min_i=2, max_i=20, verbose=False):
    """
    Selects the number of LGCN layers with the highest validation recall@100.

    Each candidate smooths `emb` along the training edges and is scored on the
    validation edges (both taken from the module-level `pos_edge_index`).

    params:
        - emb: embedding to be smoothed
        - min_i: smallest layer number tried (inclusive)
        - max_i: upper bound of the layer numbers tried (exclusive)
        - verbose: (boolean) if True, also prints the validation recall of
                    every layer number tried

    returns:
        The best layer number, or None if no candidate reached a recall above 0.
        The best layer number is always printed.
    """
    train_edges = pos_edge_index["train"]
    valid_edges = pos_edge_index["valid"]

    best_param, best_recall = None, 0
    for n_layers in range(min_i, max_i):
        candidate_model = get_lgcn_embedding_model(
            emb=emb, message_edges=train_edges, n_layers=n_layers
        )
        recall_validation = evaluation.avg_recall_at_k(
            seen_edges=train_edges,
            test_edges=valid_edges,
            model=candidate_model,
            library=nodeid_movieid.keys(),
            users=nodeid_userid.keys(),
            k=100,
        )

        if verbose:
            print(f"n_layer {n_layers} : ", recall_validation)

        # Strict comparison: on ties the smaller layer number is kept.
        if recall_validation > best_recall:
            best_param, best_recall = n_layers, recall_validation

    print(f"best param: {best_param}")
    return best_param

In [None]:
# Choose the number of LGCN layers on the validation split.
n1 = get_best_lgcn_layer(embedding_brp_model.emb)

# For the final model, messages are passed along training and validation edges.
train_valid_edges = torch.cat(
    [pos_edge_index["train"], pos_edge_index["valid"]], dim=1
)
lgcn_embedding_brp_model = get_lgcn_embedding_model(
    emb=embedding_brp_model.emb,
    message_edges=train_valid_edges,
    n_layers=n1,
)

## Testset result

In [None]:
# Test-set recall@100 of the LGCN-smoothed model. Training and validation
# edges are both passed as already-seen edges.
seen_edges_train_valid = torch.cat(
    [pos_edge_index["train"], pos_edge_index["valid"]], dim=1
)
recall100_lgcn_embedding_brp_model = evaluation.avg_recall_at_k(
    seen_edges=seen_edges_train_valid,
    test_edges=pos_edge_index["test"],
    model=lgcn_embedding_brp_model,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)

recall100_lgcn_embedding_brp_model

### Improve Embedding by Using Ratings

In [None]:
def get_ratings(edges):
    """
    Looks up the "weight" attribute in the graph G for every edge in `edges`.

    `edges` holds one edge per column; the result is a 1-D tensor with one
    weight per edge, in the same order.
    """
    # Transposing gives one row of node ids per edge.
    weights = [
        G.get_edge_data(*node_pair)["weight"] for node_pair in edges.t().tolist()
    ]
    return torch.tensor(weights)


# Ratings for the training and validation edges, in the same order as the
# concatenated edge tensor.
train_valid_edges = torch.cat(
    [pos_edge_index["train"], pos_edge_index["valid"]], dim=1
)
edge_w_ratings = get_ratings(train_valid_edges)

In [None]:
# Same smoothing as the unweighted LGCN model, but each message-passing edge
# is weighted by its rating.
train_valid_edges = torch.cat(
    [pos_edge_index["train"], pos_edge_index["valid"]], dim=1
)
lgcn_embedding_brp_model_ratings = get_lgcn_embedding_model(
    emb=embedding_brp_model.emb,
    message_edges=train_valid_edges,
    n_layers=n1,
    edge_weight=edge_w_ratings,
)

In [None]:
# Test-set recall@100 of the rating-weighted LGCN model. Training and
# validation edges are both passed as already-seen edges.
seen_edges_train_valid = torch.cat(
    [pos_edge_index["train"], pos_edge_index["valid"]], dim=1
)
recall100_lgcn_embedding_brp_model_ratings = evaluation.avg_recall_at_k(
    seen_edges=seen_edges_train_valid,
    test_edges=pos_edge_index["test"],
    model=lgcn_embedding_brp_model_ratings,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)

recall100_lgcn_embedding_brp_model_ratings