In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

import networkx as nx
import torch
from torch import nn

from model import simple_embedding
from utils import *
import evaluation

### load data

In [None]:
# Read the three source tables; `links` is loaded with every column as str.
meta = pd.read_csv("../data/movies_metadata.csv")
# Alternative ratings file: pd.read_csv("data/ratings.csv")
ratings = pd.read_csv("../data/ratings_small.csv")
links = pd.read_csv("../data/links.csv", dtype=str)

# Normalise all column names to lower case.
for frame in (meta, ratings, links):
    frame.columns = frame.columns.str.lower()

# Align the metadata key name with the `tmdbid` column of `links`.
meta = meta.rename(columns={"id": "tmdbid"})

# NOTE(review): the assignment realigns on the index, so any row removed by
# dropna() reappears as NaN in the stored column.
links["tmdbid"] = links["tmdbid"].dropna().astype(int)
links["movieid"] = links["movieid"].dropna().astype(int)

In [None]:
# Coerce unparsable ids to NaN, drop those rows, then attach `movieid` from
# `links` (left join on the columns the two frames share).
meta = (
    meta.assign(tmdbid=pd.to_numeric(meta.tmdbid, errors="coerce"))
    .dropna(subset=["tmdbid"])
    .merge(links[["movieid", "tmdbid"]], how="left")
)

#### Map userid and movieid to graph node ids (users first, then movies)

In [None]:
# Sorted unique ids; the positional index of each Series is the graph node id.
userid = ratings.userid.drop_duplicates().sort_values().reset_index(drop=True)
movieid = ratings.movieid.drop_duplicates().sort_values().reset_index(drop=True)
# Movie node ids start right after the last user node id.
movieid.index = pd.RangeIndex(start=len(userid), stop=len(userid) + len(movieid))

In [None]:
# node id -> original id
nodeid_userid = userid.to_dict()
nodeid_movieid = movieid.to_dict()

# original id -> node id (inverse of the mappings above)
userid_nodeid = dict(zip(nodeid_userid.values(), nodeid_userid.keys()))
movieid_nodeid = dict(zip(nodeid_movieid.values(), nodeid_movieid.keys()))

## transform to graph

In [None]:
%%time
ddata = dd.from_pandas(ratings, npartitions=10)

def create_edge(x): 
    return (userid_nodeid[int(x.userid)], movieid_nodeid[int(x.movieid)], x.rating)

edges = ddata.map_partitions(lambda df: df.apply((lambda row: create_edge(row)), axis=1)).compute() 
edges = edges.tolist()

In [None]:
%%time
G = nx.Graph(directed=False)
G.add_weighted_edges_from(edges)

In [None]:
# Basic size statistics of the graph.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()

print("number of nodes:", n_nodes)
print("number of edges:", n_edges)
# Every undirected edge adds one to the degree of both of its endpoints.
cc = 2 * n_edges / n_nodes
print("average node degree:", cc)
print("density of network:", nx.density(G))

# Training

### get edge list

In [None]:
# Positive edges are the edges present in the graph.
pos_edge_list = graph_to_edge_list(G, sort=True)

# Split fractions per split name. NOTE: `edges` is rebound here from the
# weighted edge list to the per-split mapping returned by the helper.
split_dict = dict(train=0.8, test=0.2)
edges = transductive_edge_split(pos_edge_list, split_dict, seed=25)

#### create negative samples and labels

In [None]:
# One positive-edge tensor per split name.
pos_edge_index = {key: edge_list_to_tensor(ls) for key, ls in edges.items()}

In [None]:
# Per split: positive / negative edge tensors and their 1 / 0 labels.
pos_edge_index, neg_edge_index = {}, {}
pos_label, neg_label = {}, {}

for key, ls in edges.items():
    n_pos = len(ls)
    pos_edge_index[key] = edge_list_to_tensor(ls)

    # Request as many negative edges as there are positive ones in this split.
    neg_edge_list = sample_negative_edges(G, n_pos)
    neg_edge_index[key] = edge_list_to_tensor(neg_edge_list)

    pos_label[key] = torch.ones(n_pos)
    neg_label[key] = torch.zeros(n_pos)

### create embeddings

In [None]:
# Fix the torch RNG before the embedding is created
# (presumably create_node_emb initialises weights from it - TODO confirm).
torch.manual_seed(1)
n_nodes = G.number_of_nodes()
emb = create_node_emb(num_node=n_nodes)

In [None]:
# Column-wise min / max over the two endpoints of every training edge:
# the smaller node id is taken as the user, the larger one as the movie.
users, user_index = torch.min(pos_edge_index["train"], dim=0)
movies, movie_index = torch.max(pos_edge_index["train"], dim=0)

In [None]:
# Spot check: show the training edges whose user node id is 261.
user_261_mask = users == 261
pos_edge_index["train"][:, user_261_mask]

### train embeddings

In [None]:
def get_pos_edges_users(pos_edge_index):
    """Return the user endpoint of every edge.

    The user of an edge is taken to be the smaller of its two node ids
    (column-wise minimum over dim 0).

    Returns:
        users: tensor with the minimum node id of each edge.
        unique_users: sorted list of the distinct values in ``users``.
        index: tensor with the row (along dim 0) holding each minimum.
    """
    col_min = torch.min(pos_edge_index, dim=0)
    users = col_min.values
    return users, torch.unique(users).tolist(), col_min.indices

In [None]:
def get_pos_edges_movies(pos_edge_index):
    """Return the movie endpoint of every edge.

    The movie of an edge is taken to be the larger of its two node ids
    (column-wise maximum over dim 0).

    Returns:
        movies: tensor with the maximum node id of each edge.
        unique_movies: sorted list of the distinct values in ``movies``.
        index: tensor with the row (along dim 0) holding each maximum.
    """
    col_max = torch.max(pos_edge_index, dim=0)
    movies = col_max.values
    return movies, torch.unique(movies).tolist(), col_max.indices

In [None]:
def user_batch_generator(_unique_users, n_batches):
    """Yield the users in random order, split into at most ``n_batches`` batches.

    The input is copied before shuffling, so the caller's sequence is left
    untouched. Batch sizes differ by at most one; when there are fewer users
    than ``n_batches`` one single-user batch is produced per user, and an
    empty input yields nothing.

    Args:
        _unique_users: iterable of user ids.
        n_batches: maximum number of batches to produce (>= 1).

    Yields:
        list: the user ids of one batch (never empty).

    Raises:
        ValueError: if ``n_batches`` is smaller than 1.
    """
    # Local import: the function must not depend on `random` leaking in
    # through a wildcard import elsewhere in the notebook.
    import random

    if n_batches < 1:
        raise ValueError("n_batches must be a positive integer")

    shuffled = list(_unique_users)
    random.shuffle(shuffled)

    # Spread the remainder over the first batches instead of emitting extra
    # trailing batches (floor division alone produced more than n_batches
    # batches and a zero step when n_batches > len(users)).
    base_size, n_larger = divmod(len(shuffled), n_batches)
    start = 0
    for batch_idx in range(n_batches):
        size = base_size + (1 if batch_idx < n_larger else 0)
        if size == 0:
            break
        yield shuffled[start : start + size]
        start += size

In [None]:
def user_brp_loss(_f_pos, _f_neg):
    """Pairwise ranking loss ``mean(-log(sigmoid(pos - neg)))`` over all pairs.

    Every positive score is compared with every negative score.

    Args:
        _f_pos: tensor of scores for the positive edges.
        _f_neg: tensor of scores for the negative edges.

    Returns:
        0-dim tensor holding the mean loss over all (positive, negative) pairs.
    """
    # Broadcasting builds the (n_pos, n_neg) matrix of score differences.
    margins = _f_pos.reshape(-1, 1) - _f_neg.reshape(1, -1)
    # logsigmoid stays finite where log(sigmoid(x)) underflows to -inf.
    return -nn.functional.logsigmoid(margins).mean()

In [None]:
def user_loss(edges, users, u, unique_movies_set, model, neg_sample_size=10):
    """Ranking loss of one user: observed movies versus sampled unseen movies.

    Args:
        edges: edge tensor; row 0 is read as the user node and row 1 as the
            movie node of each edge.
        users: tensor with the user node of every column of ``edges``.
        u: node id of the user to score.
        unique_movies_set: set of all candidate movie node ids.
        model: object whose ``forward(edge_tensor)`` returns one score per edge.
        neg_sample_size: number of unseen movies to sample; capped at the
            number of movies the user has not interacted with.

    Returns:
        The value of ``user_brp_loss`` for this user's positive and negative
        scores.

    Raises:
        ValueError: if the user has no positive edge or no unseen movie.
    """
    pos_edges = edges[:, users == u]

    watched_movies = set(pos_edges[1].tolist())
    candidates = list(unique_movies_set - watched_movies)
    # Sampling without replacement cannot draw more than what is available.
    n_neg = min(neg_sample_size, len(candidates))
    if pos_edges.shape[1] == 0 or n_neg == 0:
        raise ValueError(f"user {u} has no positive edges or no unseen movies")

    neg_movies = np.random.choice(candidates, n_neg, replace=False)
    neg_movies = torch.as_tensor(
        neg_movies, dtype=pos_edges.dtype, device=pos_edges.device
    )

    # Build the user row explicitly: slicing the positive edges gave fewer than
    # `neg_sample_size` entries for users with few positives and broke the stack.
    neg_users = torch.full(
        (n_neg,), u, dtype=pos_edges.dtype, device=pos_edges.device
    )
    neg_edges = torch.stack([neg_users, neg_movies])

    # Score both edge sets and compare them pairwise.
    f_pos = model.forward(pos_edges)
    f_neg = model.forward(neg_edges)

    return user_brp_loss(f_pos, f_neg)

In [None]:
from torch.optim import Adam


def batch_train(
    model, pos_edges, n_batches, epochs=180, learning_rate=0.001, parameters=None
):
    """Train ``model`` with per-user ranking losses, one optimiser step per user batch.

    Args:
        model: scoring model passed on to ``user_loss``.
        pos_edges: tensor of positive training edges.
        n_batches: number of user batches requested per epoch.
        epochs: number of passes over all users.
        learning_rate: Adam learning rate.
        parameters: iterable of tensors to optimise; defaults to
            ``model.parameters()`` so the function no longer reaches for a
            notebook-level global.
    """
    if parameters is None:
        parameters = model.parameters()
    optimizer = Adam(parameters, lr=learning_rate)

    users, unique_users, _ = get_pos_edges_users(pos_edges)
    _, unique_movies, _ = get_pos_edges_movies(pos_edges)
    # Loop-invariant: build the candidate set once instead of once per user.
    unique_movies_set = set(unique_movies)

    for epoch in range(epochs):
        batch_loss = None
        for batch in user_batch_generator(unique_users, n_batches):
            optimizer.zero_grad()
            user_losses = [
                user_loss(
                    edges=pos_edges,
                    users=users,
                    u=u,
                    unique_movies_set=unique_movies_set,
                    model=model,
                )
                for u in batch
            ]
            batch_loss = torch.stack(user_losses).mean()

            batch_loss.backward()
            optimizer.step()

        # Reports the loss of the last batch of the epoch.
        if epoch % 30 == 0 and batch_loss is not None:
            print(f"epoch {epoch}: loss is: {batch_loss}")


model = simple_embedding(emb)

# Optimise the embedding table handed to `simple_embedding` above.
batch_train(
    model, pos_edge_index["train"], n_batches=100, parameters=emb.parameters()
)

### recall@k

In [None]:
# Evaluation inputs: train/test edges, all movie node ids as the candidate
# library, all user node ids, and the cut-off k.
recall_inputs = dict(
    train_edges=pos_edge_index["train"],
    test_edges=pos_edge_index["test"],
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)
evaluation.avg_recall_at_k(model=model, **recall_inputs)

## Improve the model with LightGCN

In [None]:
from model import LightGCN

In [None]:
# NOTE(review): the meaning of the positional argument 5 is defined in
# model.LightGCN (presumably the number of propagation layers) - TODO confirm.
lgcn = LightGCN(5)

In [None]:
# Inference-only propagation of the trained embedding weights over the
# training edges: no gradients are needed, so skip building an autograd graph.
with torch.no_grad():
    res = lgcn.forward(emb.weight, pos_edge_index["train"])

In [None]:
# Embedding table with the same shape as `emb`, whose weights are then
# replaced by the propagated embeddings in `res`.
emb2 = nn.Embedding(
    num_embeddings=emb.num_embeddings, embedding_dim=emb.embedding_dim
)
emb2.weight = nn.Parameter(res)

# Same scoring wrapper as before, now on top of the propagated embeddings.
m2 = simple_embedding(emb2)

In [None]:
# Same evaluation inputs as for the plain embedding model, scored with `m2`.
lgcn_recall_inputs = {
    "train_edges": pos_edge_index["train"],
    "test_edges": pos_edge_index["test"],
    "library": nodeid_movieid.keys(),
    "users": nodeid_userid.keys(),
    "k": 100,
}
evaluation.avg_recall_at_k(model=m2, **lgcn_recall_inputs)