In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import ast

import networkx as nx
import torch
from torch import nn

from model import SimpleEmbedding
from utils import *
import evaluation

### load data

In [None]:
# Read the three raw CSV files and normalise every column name to lower case.
ratings = pd.read_csv("../data/ratings.csv").rename(columns=str.lower)
links = pd.read_csv("../data/links.csv", dtype=str).rename(columns=str.lower)

# The credits file calls the TMDB identifier "id"; rename it so that it lines
# up with the "tmdbid" column of `links`.
credits = (
    pd.read_csv("../data/credits.csv")
    .rename(columns=str.lower)
    .rename(columns={"id": "tmdbid"})
)

# `links` is read with dtype=str, so the id columns have to be cast before they
# can be joined with the integer ids of `ratings` / `credits`.
# Assigning `column.dropna().astype(int)` back to the column is index-aligned:
# the dropped rows come back as NaN and the column silently becomes float.
# Dropping the incomplete rows from the frame itself keeps both ids as integers.
# Rows without an id could not take part in the later inner merges anyway.
links = links.dropna(subset=["tmdbid", "movieid"]).astype(
    {"tmdbid": int, "movieid": int}
)

In [None]:
# Keep only the ratings of users with an id below 1500 to keep the graph small.
ratings = ratings[ratings["userid"].lt(1500)].copy()

## 1. proximity of movies based on actors

#### process credits

In [None]:
# Convert the stringified lists of dictionaries into real Python objects.
def _parse_literal(value):
    """Parse a Python-literal string; return an already parsed list unchanged.

    `ast.literal_eval` raises on anything that is not a string, so without the
    guard this cell fails when it is executed a second time.
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(value)


credits.cast = credits.cast.apply(_parse_literal)
credits.crew = credits.crew.apply(_parse_literal)

In [None]:
# cdf (cast and crew df) lists, per movie, the people who worked on it
cdf = credits[["tmdbid"]].copy()

# pull the "name" field out of every cast / crew dictionary
for source_col, target_col in (("cast", "cast_list"), ("crew", "crew_list")):
    cdf[target_col] = credits[source_col].map(
        lambda entries: [entry["name"] for entry in entries]
    )

# element-wise list concatenation: cast names followed by crew names
cdf["people"] = cdf["cast_list"] + cdf["crew_list"]

In [None]:
# Restrict the id links to movies that were actually rated, then attach the
# MovieLens movie id to every credits row via the shared TMDB id.
rated_movie_ids = ratings[["movieid"]].drop_duplicates()
relevant_links = links.merge(rated_movie_ids, on="movieid", how="inner")
cdf_relevant = cdf.merge(
    relevant_links[["tmdbid", "movieid"]], on="tmdbid", how="inner"
)

In [None]:
# One row per (cast member, movie) pair.
# `explode` turns an empty cast list into a single NaN row. Left in place, that
# NaN would act as one shared "person" connecting every movie without cast
# information, so those rows are removed before the edges are built.
cdf_edges = (
    cdf_relevant[["cast_list", "movieid"]]
    .explode(column="cast_list")
    .dropna(subset=["cast_list"])
    .drop_duplicates()
    .reset_index(drop=True)
)

## Map to node ids

In [None]:
# Mappings for the people-movie graph. Judging by the names, the first two map
# node id -> person / movie id and the last two are the inverse lookups;
# the exact contract lives in utils.get_mapping (not visible here) - TODO confirm.
nodeid_people, nodeid_movieid2, people_nodeid, movieid_nodeid2 = get_mapping(cdf_edges)

#### map userid and movie_id to index

In [None]:
# Same helper applied to the ratings frame: presumably node id <-> user id and
# node id <-> movie id lookups for the user-movie graph - verify in utils.get_mapping.
nodeid_userid, nodeid_movieid, userid_nodeid, movieid_nodeid = get_mapping(ratings)

### create graph

In [None]:
%%time
def create_edge_credits_movie(x):
    """Translate one `cdf_edges` row into a (person node, movie node) tuple.

    `x` is a row exposing `cast_list` (a single person name after the explode)
    and `movieid`; both are looked up in the notebook-level mapping dicts.
    """
    person_node = people_nodeid[x.cast_list]
    movie_node = movieid_nodeid2[x.movieid]
    return person_node, movie_node

# Row-wise conversion of every (person, movie) pair into a node-id edge tuple.
raw_edges = cdf_edges.apply(create_edge_credits_movie, axis=1).tolist()

In [None]:
%%time
# Undirected people-movie graph.
# NOTE(review): `directed` does not look like an nx.Graph option; it is presumably
# only stored as a graph attribute (nx.Graph is undirected either way) - confirm.
H = nx.Graph(directed=False)
# raw_edges holds the (person node id, movie node id) tuples built in the previous cell
H.add_edges_from(raw_edges)

In [None]:
# Basic size statistics of the people-movie graph.
n_nodes = H.number_of_nodes()
print(f"number of nodes: {n_nodes}")
n_edges = H.number_of_edges()
print(f"number of edges: {n_edges}")

# every undirected edge contributes to the degree of two nodes
cc = 2 * n_edges / n_nodes
print(f"average node degree: {cc}")
print(f"density of network: {nx.density(H)}")

### get edge list

In [None]:
# Positive (existing) edges of the people-movie graph H; the output format is
# defined by utils.graph_to_edge_list (not visible here).
pos_edge_list = graph_to_edge_list(H)

# split edges
# 75 % of the edges go to "train", 25 % to "test"; the fixed seed is presumably
# what makes the split repeatable - verify in utils.transductive_edge_split.
split_dict = {"train": 0.75, "test": 0.25}
edges = transductive_edge_split(pos_edge_list, split_dict, seed=825)

#### create negative samples and labels

In [None]:
%%time
# Per split ("train" / "test"): positive edge tensor, an equally sized sample
# of negative edges, and the matching 1 / 0 label vectors.
pos_edge_index, neg_edge_index = {}, {}
pos_label, neg_label = {}, {}

for key, ls in edges.items():
    pos_tensor = edge_list_to_tensor(ls)

    # as many negative edges as there are positive ones in this split
    neg_edge_list = sample_negative_edges(H, len(ls))
    neg_tensor = edge_list_to_tensor(neg_edge_list)

    pos_edge_index[key], neg_edge_index[key] = pos_tensor, neg_tensor

    # one label per edge; edges are counted along the second tensor dimension
    pos_label[key] = torch.ones(pos_tensor.shape[1])
    neg_label[key] = torch.zeros(neg_tensor.shape[1])

In [None]:
# Rebuild the positive edge tensors per split from the current `edges` dict.
pos_edge_index = {key: edge_list_to_tensor(ls) for key, ls in edges.items()}

### create embeddings

In [None]:
# Seed torch before creating the embedding - assumes create_node_emb initialises
# from torch's global RNG so that runs are repeatable; TODO confirm in utils.
torch.manual_seed(1)
# one embedding row per node of the people-movie graph
emb = create_node_emb(num_node=H.number_of_nodes())

### train embeddings

In [None]:
from torch.optim import Adam


def train(
    model, train_label, train_edge, epochs=1000, learning_rate=0.001, log_every=500
):
    """Fit `model` on labelled edges with full-batch Adam updates.

    Parameters
    ----------
    model : object exposing `forward(edges)` and `loss_fn(pred, label)`.
    train_label : tensor of target labels, one per column of `train_edge`.
    train_edge : edge tensor handed to `model.forward`.
    epochs : number of optimisation steps.
    learning_rate : Adam step size (previously hard-coded to 0.001).
    log_every : print loss and accuracy every `log_every` epochs;
        0 or None disables logging (previously hard-coded to 500).

    Returns
    -------
    None. The model parameters are updated in place.
    """
    # Optimise the parameters of the model that was passed in instead of the
    # notebook-level `emb`, so the function no longer depends on which
    # embedding happens to be bound to that global name. The global is only
    # used as a fallback when the model exposes no parameters of its own.
    params = list(model.parameters()) if hasattr(model, "parameters") else []
    if not params:
        params = list(emb.parameters())

    optimizer = Adam(params, lr=learning_rate)

    for i in range(epochs):
        optimizer.zero_grad()

        pred = model.forward(train_edge)
        loss = model.loss_fn(pred, train_label)

        loss.backward()
        optimizer.step()

        if log_every and i % log_every == 0:
            print(
                f"epoch {i}: loss is: {loss}, accuracy is {accuracy(pred, train_label)}"
            )


# Training set = positive edges followed by the sampled negative edges.
# Labels are concatenated in the same order so that label i belongs to edge i.
train_label = torch.cat([pos_label["train"], neg_label["train"]], dim=0)
train_edge = torch.cat([pos_edge_index["train"], neg_edge_index["train"]], dim=1)

# wrap the node embedding created above in the scoring model from model.py
model = SimpleEmbedding(emb)

train(model, train_label, train_edge)

### rate movie proximity

In [None]:
from itertools import combinations
import seaborn as sns

In [None]:
%%time
# Every unordered pair of movie node ids from the people-movie graph, stored
# as a tensor with one pair per column.
comb = [*combinations(nodeid_movieid2.keys(), 2)]
movie_comb = torch.transpose(torch.tensor(comb), 0, 1)

In [None]:
# Score every movie pair with the trained model, then raise the scores to the
# 100th power.
movie_proximity_weight = model.forward(movie_comb).detach() ** 100

In [None]:
# Distribution of the movie-pair proximity weights.
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation is the documented replacement for its default output.
sns.histplot(movie_proximity_weight.flatten().numpy(), kde=True, stat="density")

In [None]:
%%time

# Translate movie node ids of the people-movie graph into the node ids that the
# same movies have in the user-movie graph (via the shared MovieLens movie id).
nodeid2_nodeid = {
    credits_node: movieid_nodeid[movie_id]
    for credits_node, movie_id in nodeid_movieid2.items()
}

# All unordered movie pairs in user-movie node ids, one pair per column. The
# iteration order is the same as for `comb`, so column j here corresponds to
# entry j of the proximity weights.
movie_movie_edges = torch.transpose(
    torch.tensor(list(combinations(nodeid2_nodeid.values(), 2))), 0, 1
)

## Movie Recommender

## transform to graph

In [None]:
%%time
# Split the ratings frame into 10 dask partitions so the row-wise conversion
# below is applied partition by partition.
ddata = dd.from_pandas(ratings, npartitions=10)

def create_edge_user_movie(x):
    """Translate one ratings row into a (user node, movie node, rating) tuple.

    The user and movie ids are cast to int before they are looked up in the
    notebook-level mapping dicts; the rating is passed through unchanged.
    """
    user_node = userid_nodeid[int(x.userid)]
    movie_node = movieid_nodeid[int(x.movieid)]
    return user_node, movie_node, x.rating

# Build the weighted edge tuples partition by partition and collect them in a
# plain Python list.
edges = (
    ddata.map_partitions(lambda part: part.apply(create_edge_user_movie, axis=1))
    .compute()
    .tolist()
)

In [None]:
%%time
# Undirected user-movie graph.
# NOTE(review): `directed` does not look like an nx.Graph option; it is presumably
# only stored as a graph attribute (nx.Graph is undirected either way) - confirm.
G = nx.Graph(directed=False)
# edges are (user node id, movie node id, rating) tuples; the rating becomes
# the "weight" attribute of the edge
G.add_weighted_edges_from(edges)

In [None]:
# Basic size statistics of the user-movie graph.
n_nodes = G.number_of_nodes()
print(f"number of nodes: {n_nodes}")
n_edges = G.number_of_edges()
print(f"number of edges: {n_edges}")

# every undirected edge contributes to the degree of two nodes
cc = 2 * n_edges / n_nodes
print(f"average node degree: {cc}")
print(f"density of network: {nx.density(G)}")

# Training

### get edge list

In [None]:
# Positive (existing) edges of the user-movie graph G; the output format is
# defined by utils.graph_to_edge_list (not visible here).
pos_edge_list = graph_to_edge_list(G)

# split edges
# 75 % of the edges go to "train", 25 % to "test". Note that `edges` is rebound
# here from the list of weighted tuples to the dict of splits.
split_dict = {"train": 0.75, "test": 0.25}
edges = transductive_edge_split(pos_edge_list, split_dict, seed=825)

#### convert positive edges to tensors (negative edges are sampled per user during training)

In [None]:
# Positive edge tensor per split ("train" / "test") of the user-movie graph.
pos_edge_index = {key: edge_list_to_tensor(ls) for key, ls in edges.items()}

### create embeddings

In [None]:
# Seed torch before creating the embedding - assumes create_node_emb initialises
# from torch's global RNG so that runs are repeatable; TODO confirm in utils.
torch.manual_seed(1)
# `emb` is rebound here: one embedding row per node of the user-movie graph
emb = create_node_emb(num_node=G.number_of_nodes())

In [None]:
# Column-wise min / max over the first dimension of the training edge tensor
# (assumes shape (2, n_edges) - TODO confirm); each call returns (values, indices).
# NOTE(review): treating the smaller endpoint as the user and the larger one as
# the movie assumes user node ids are always below movie node ids - verify
# against utils.get_mapping. None of these four names is used again in the
# visible part of the notebook.
users, user_index = pos_edge_index["train"].min(dim=0)
movies, movie_index = pos_edge_index["train"].max(dim=0)

### train embeddings

In [None]:
from torch.optim import Adam


def batch_train(
    model, pos_edges, n_batches, epochs=181, learning_rate=0.001, log_every=30
):
    """Train `model` with a pairwise ranking loss over batches of users.

    For every user in a batch the positive and negative edges are fetched with
    `get_pos_neg_edges_for_user`, scored with `model.forward` and compared with
    `brp_loss`. The per-user losses of one batch are averaged and a single Adam
    step is taken per batch.

    Parameters
    ----------
    model : object exposing `forward(edges)`.
    pos_edges : tensor of positive user-movie edges used for training.
    n_batches : passed to `user_batch_generator` (presumably the number of user
        batches per epoch - verify in utils).
    epochs : number of passes over all user batches.
    learning_rate : Adam step size (previously hard-coded to 0.001).
    log_every : print the last batch loss every `log_every` epochs;
        0 or None disables logging (previously hard-coded to 30).

    Returns
    -------
    None. The model parameters are updated in place.
    """
    # Optimise the parameters of the model that was passed in instead of the
    # notebook-level `emb`, so the function no longer depends on which
    # embedding happens to be bound to that global name. The global is only
    # used as a fallback when the model exposes no parameters of its own.
    params = list(model.parameters()) if hasattr(model, "parameters") else []
    if not params:
        params = list(emb.parameters())
    optimizer = Adam(params, lr=learning_rate)

    users, unique_users, _ = get_pos_edges_users(pos_edges)
    _, unique_movies, _ = get_pos_edges_movies(pos_edges)

    # stays None until at least one non-empty batch has been processed
    batch_loss = None

    for i in range(epochs):
        for batch in user_batch_generator(unique_users, n_batches):
            optimizer.zero_grad()
            user_losses = []
            for u in batch:
                pos_edges_user, neg_edges_user = get_pos_neg_edges_for_user(
                    edges=pos_edges,
                    users=users,
                    u=u,
                    # NOTE(review): a fresh set is built for every user, as
                    # before. It is loop-invariant and could be hoisted once it
                    # is confirmed that the helper does not mutate its argument.
                    unique_movies_set=set(unique_movies),
                )

                # make predictions and calculate loss
                f_pos = model.forward(pos_edges_user)
                f_neg = model.forward(neg_edges_user)
                user_losses.append(brp_loss(f_pos, f_neg))

            # torch.stack raises on an empty list, so skip batches without users
            if not user_losses:
                continue

            batch_loss = torch.stack(user_losses).mean()
            batch_loss.backward()
            optimizer.step()

        if log_every and i % log_every == 0:
            print(f"epoch {i}: loss is: {batch_loss}")


# `model` is rebound here: it now wraps the user-movie embedding created above
# (the earlier people-movie model is no longer reachable under this name).
model = SimpleEmbedding(emb)

batch_train(model, pos_edge_index["train"], n_batches=100)

### recall@k

In [None]:
# Baseline: recall@100 of the plain embedding model. Training edges are passed
# as "seen" and the held-out edges as the test set; how they are used is
# defined in evaluation.avg_recall_at_k (not visible here).
evaluation.avg_recall_at_k(
    seen_edges=pos_edge_index["train"],
    test_edges=pos_edge_index["test"],
    model=model,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)

## improve model by using Light GCN

In [None]:
from model import LightGCN

In [None]:
# NOTE(review): the meaning of the positional argument 5 is defined in
# model.LightGCN (presumably the number of propagation layers) - confirm.
lgcn = LightGCN(5)

In [None]:
# Run the trained embedding weights through LightGCN over the training edges
# (no edge weights in this variant).
res = lgcn.forward(emb.weight, pos_edge_index["train"])

In [None]:
# New embedding table with the same dimensions as `emb`, whose weights are
# replaced by the LightGCN output, so the scoring model can be reused as is.
emb2 = nn.Embedding(emb.num_embeddings, emb.embedding_dim)
emb2.weight = nn.Parameter(res)

m2 = SimpleEmbedding(emb2)

In [None]:
# recall@100 of the LightGCN-propagated embeddings (m2), same arguments as the
# baseline evaluation apart from the model.
evaluation.avg_recall_at_k(
    seen_edges=pos_edge_index["train"],
    test_edges=pos_edge_index["test"],
    model=m2,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)

### add edge weight

In [None]:
def get_ratings(edges, graph=None):
    """Look up the "weight" attribute of every edge in `edges`.

    Parameters
    ----------
    edges : tensor with one edge per column; each column is unpacked into the
        arguments of `graph.get_edge_data`.
    graph : object exposing `get_edge_data(u, v)`. Defaults to the
        notebook-level user-movie graph `G`, which keeps existing calls
        working while allowing the function to be used with other graphs.

    Returns
    -------
    torch.Tensor with one weight per column of `edges`, in the same order.
    """
    if graph is None:
        graph = G

    # transpose so that iterating yields one edge (column) at a time
    weights = [
        graph.get_edge_data(*endpoints)["weight"]
        for endpoints in edges.transpose(0, 1).tolist()
    ]
    return torch.tensor(weights)

In [None]:
# rating of every training edge, in the same order as the edge tensor columns
edge_w_ratings = get_ratings(pos_edge_index["train"])

In [None]:
# LightGCN propagation again, this time with the ratings as edge weights
res = lgcn.forward(emb.weight, pos_edge_index["train"], edge_weight=edge_w_ratings)

In [None]:
# New embedding table with the same dimensions as `emb`, whose weights are
# replaced by the rating-weighted LightGCN output.
emb3 = nn.Embedding(emb.num_embeddings, emb.embedding_dim)
emb3.weight = nn.Parameter(res)

m3 = SimpleEmbedding(emb3)

In [None]:
# recall@100 of the rating-weighted LightGCN embeddings (m3), same arguments as
# the baseline evaluation apart from the model.
evaluation.avg_recall_at_k(
    seen_edges=pos_edge_index["train"],
    test_edges=pos_edge_index["test"],
    model=m3,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)

### add more edges between movies

In [None]:
# Movie pairs whose proximity weight exceeds 0.90 are added as extra edges.
close_pairs = movie_proximity_weight > 0.90

# user-movie training edges followed by the selected movie-movie edges
edges_combined = torch.cat(
    (pos_edge_index["train"], movie_movie_edges[:, close_pairs]), dim=1
)

# Matching weights in the same order: ratings for the user-movie edges, then
# the proximity weights scaled by 0.001 for the movie-movie edges.
edge_weights_combined = torch.cat(
    (edge_w_ratings, movie_proximity_weight[close_pairs] * 0.001)
)

In [None]:
# LightGCN propagation over the user-movie edges plus the added movie-movie edges
res = lgcn.forward(emb.weight, edges_combined, edge_weight=edge_weights_combined)

In [None]:
# New embedding table with the same dimensions as `emb`, whose weights are
# replaced by the LightGCN output computed on the combined edge set.
emb4 = nn.Embedding(emb.num_embeddings, emb.embedding_dim)
emb4.weight = nn.Parameter(res)

m4 = SimpleEmbedding(emb4)

In [None]:
# recall@100 of the embeddings propagated over the combined edge set (m4), same
# arguments as the baseline evaluation apart from the model.
evaluation.avg_recall_at_k(
    seen_edges=pos_edge_index["train"],
    test_edges=pos_edge_index["test"],
    model=m4,
    library=nodeid_movieid.keys(),
    users=nodeid_userid.keys(),
    k=100,
)