The purpose of this notebook is to 
- Create a data pipline using dgl 
- Train a gnn model for recommendation system 

In [1]:
#%pip install dgl-cu116 dglgo -f https://data.dgl.ai/wheels/repo.html

In [2]:
import dgl 
import os
import torch
import pandas as pd 
import numpy as np
import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F
import tqdm
import logging 

from typing import Iterator 
from collections import defaultdict
from collections import Counter
from dgl import save_graphs, load_graphs
from dgl.data import DGLDataset
from dgl.data.utils import makedirs, save_info, load_info
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler

In [3]:
logging.basicConfig(
    filename='model_no_feat.log', 
    filemode='a',
    force = True,
    format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S',
    level=logging.INFO
)

In [4]:
# Transfromation function 

def tranform_duration(data: pd.DataFrame)->pd.DataFrame:
    # typecasting duration from str to int
    result = data.copy(deep=True)
    tmp = result.Duration.dropna().str.extract(r"((?P<hr>\d*) hr. )?(?P<min>\d*) min.", expand=True)   
    result.loc[result.Duration.notna(), "Duration"] = (tmp["hr"].astype(float).fillna(0)+tmp["min"].astype(float).fillna(0))
    return result

def tranform_aired(data: pd.DataFrame)->pd.DataFrame:
    # typecasting air from str to int
    # add a feature called year 
    result = data.copy(deep=True)
    result["year"] = result.Aired.str.extract(r"(\d{4})").astype(float)
    return result

def create_genres_dummy(data: pd.DataFrame)->pd.DataFrame:
    result = data.copy(deep=True)
    tmp = result.Genres.dropna().apply(
        lambda x: x.split(", ") if isinstance(x, str) else x
    ).tolist()
    all_genres = Counter()
    for lst in tmp:
        all_genres.update(Counter(lst))
    genre2idx = {g:idx for idx, g in enumerate(all_genres.keys())}
    dummy_matrix = np.zeros((len(result),len(genre2idx)), dtype=int)
    for idx, genres_lst in zip(result[result.Genres.notna()].index, tmp):
        for genre in genres_lst:
            dummy_matrix[idx, genre2idx[genre]] = 1
    result[["genre_"+g for g in genre2idx]] = dummy_matrix
    return result

def create_dummy(data: pd.DataFrame)->pd.DataFrame:
    result = data.copy(deep=True)
    tmp = []
    use_col = ['Type', 'Source', 'Rating']
    for col in use_col:
        tmp.append(pd.get_dummies(result[col], prefix=col))
    result = pd.concat([result]+tmp, axis = 1)
    return result 

def fill_numeric_missing(data: pd.DataFrame)->pd.DataFrame:
    result = data.copy(deep=True)
    for col in result.columns:
        if result[col].dtypes == "float":
            result[col] = result[col].fillna(np.median(result[col].dropna()))
    return result 

def remove_irrelevant_feature(data: pd.DataFrame)->pd.DataFrame:
    # drop all categorical feature from result dataframe 
    result = data.copy(deep=True)
    for col in result.columns:
        if result[col].dtypes == "O":
            del result[col]
    return result  

def normalize_feature(data: pd.DataFrame)->pd.DataFrame:
    result = data.copy(deep=True)
    # Apply min-max normlaization to the feature 
    scaler = MinMaxScaler()
    result[result.columns[1:]] = scaler.fit_transform(result.iloc[:, 1:].values)
    return result 

def filter_low_rating(data: pd.DataFrame)->pd.DataFrame:
    result = data.copy(deep=True)
    return result.loc[result.rating>=9, ["user_id", "anime_id"]]

def filter_non_exist_anime(data: pd.DataFrame, anime_id: pd.Series)->pd.DataFrame:
    result = data.copy(deep=True)
    return result[result.anime_id.isin(anime_id)]

def create_id_mapper(data: pd.DataFrame)->tuple[dict]:
    tmp = data.copy(deep=True)
    user_id = sorted(tmp.user_id.unique())
    anime_id = sorted(tmp.anime_id.unique())
    return {old_idx: new_idx for new_idx, old_idx in enumerate(user_id)}, {old_idx: new_idx for new_idx, old_idx in enumerate(anime_id)}
    

In [5]:
class AnimeDataset(DGLDataset):
    def __init__(self,
                save_dir=None,
                force_reload=False,
                verbose=False):
        super().__init__(name='anime_dataset',                                    
                        save_dir=save_dir,
                        force_reload=force_reload,
                        verbose=verbose)
        
    def process(self):
        anime_data = pd.read_csv("./data/anime.csv", na_values = "Unknown")
        rating_data = pd.read_csv("./data/rating_complete.csv.zip")
        # Tranfrom the anime dataframe and rating dataser
        anime_data = (anime_data.pipe(tranform_duration)\
                .pipe(tranform_aired)\
                .pipe(fill_numeric_missing)\
                .pipe(tranform_aired)\
                .pipe(create_genres_dummy)\
                .pipe(create_dummy)\
                .pipe(remove_irrelevant_feature)\
                .pipe(normalize_feature)
        )
        anime_data = anime_data.dropna()
        rating_data = (rating_data.pipe(filter_low_rating)\
            .pipe(filter_non_exist_anime, anime_id = anime_data.MAL_ID)\
        )
        rating_data = rating_data.sample(frac=1, random_state=20221104)
        
        user_id_mapper, anime_id_mapper = create_id_mapper(rating_data)
        torch.save(anime_id_mapper, "data/anime_id_mapper.pt")
        # filter anime does have any rating 
        anime_data = anime_data[anime_data.MAL_ID.isin(anime_id_mapper)].iloc[:, 1:]
        anime_features = torch.from_numpy(anime_data.to_numpy())
        edges_src = torch.from_numpy(rating_data['user_id'].apply(lambda idx: user_id_mapper[idx]).to_numpy())
        edges_dst = torch.from_numpy(rating_data['anime_id'].apply(lambda idx: anime_id_mapper[idx]).to_numpy())
        num_nodes_dict = {'anime': len(anime_id_mapper), 'user': len(user_id_mapper)}
        
        self.graph = dgl.heterograph({
            ('user', 'like', 'anime'): (edges_src, edges_dst),
            ('anime', 'rev_like', 'user'): (edges_dst, edges_src)
        }, num_nodes_dict = num_nodes_dict, idtype=torch.int32)
        self.graph.nodes['anime'].data['feat'] = anime_features

        n_edges = self.graph.number_of_edges(etype=('user', 'like', 'anime'))
        n_train_message = int(n_edges * 0.5)
        n_train_supervise = int(n_edges * 0.3)
        n_val = int(n_edges * 0.1)
        
        train_message_mask = torch.zeros(n_edges, dtype=torch.bool)
        train_supervise_mask = torch.zeros(n_edges, dtype=torch.bool)
        val_mask = torch.zeros(n_edges, dtype=torch.bool)
        test_mask = torch.zeros(n_edges, dtype=torch.bool)
        
        train_message_mask[: n_train_message] = True
        train_supervise_mask[n_train_message: n_train_message + n_train_supervise] = True
        val_mask[n_train_message + n_train_supervise: n_train_message + n_train_supervise + n_val] = True
        test_mask[n_train_message + n_train_supervise + n_val:] = True
        
        self.graph.edges['like'].data['train_message_mask'] = train_message_mask
        self.graph.edges['like'].data['train_supervise_mask'] = train_supervise_mask
        self.graph.edges['like'].data['val_mask'] = val_mask
        self.graph.edges['like'].data['test_mask'] = test_mask
        
    def __getitem__(self, i):
        return self.graph

    def __len__(self):
        return 1

    def save(self):
        # save graphs
        graph_path = os.path.join(self.save_dir, 'anime_dgl_graph.bin')
        save_graphs(graph_path, self.graph)

    def load(self):
        # load processed data from directory `self.save_path`
        graph_path = os.path.join(self.save_dir, 'anime_dgl_graph.bin')
        self.graph = load_graphs(graph_path)[0][0]

    def has_cache(self):
        # check whether there are processed data in `self.save_path`
        graph_path = os.path.join(self.save_dir, 'anime_dgl_graph.bin')
        return os.path.exists(graph_path)

In [6]:
dataset = AnimeDataset(save_dir="./data")
graph = dataset[0]

In [7]:
graph

Graph(num_nodes={'anime': 16343, 'user': 306492},
      num_edges={('anime', 'rev_like', 'user'): 16489589, ('user', 'like', 'anime'): 16489589},
      metagraph=[('anime', 'user', 'rev_like'), ('user', 'anime', 'like')])

#### Build the model 

In [8]:
eids = np.arange(graph['like'].num_edges())
train_edge_size = graph.edges['like'].data['train_message_mask'].sum() + graph.edges['like'].data['train_supervise_mask'].sum()
val_edge_size = train_edge_size + graph.edges['like'].data['val_mask'].sum()
train_g = dgl.remove_edges(graph, eids[train_edge_size:], etype="like")
train_g = dgl.remove_edges(train_g, eids[train_edge_size:], etype="rev_like")
valid_g = dgl.remove_edges(graph, eids[val_edge_size:], etype="like")
valid_g = dgl.remove_edges(valid_g, eids[val_edge_size:], etype="rev_like")

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# hyperparameter 
batch_size = 4096 #1024
hidden_dims = 64
in_dims = graph.nodes['anime'].data['feat'].size(1)
out_dims = 64
# number of rgcn layers 
num_layer = 2
num_neg_sample = 5
# Max recommendation items to consider 
k = 10
n_epochs = 10
weight_decay = 5e-4
lr = 1e-3
etypes = graph.etypes

In [10]:
# average indegree for anime
average_user_count = graph.in_degrees(etype="like").float().median().item()
# average outdegree for user
average_anime_count = graph.out_degrees(etype="like").float().median().item()
average_user_count, average_anime_count

(23.0, 33.0)

In [11]:
def get_dataloader_with_sampling(graph, mask, sampler, device, batch_size=32, shuffle=False):
    idx = torch.nonzero(mask, as_tuple=False).int().squeeze()
    loader = dgl.dataloading.DataLoader(
        graph, {"like": idx}, sampler,
        batch_size=batch_size,
        device=device,
        shuffle=shuffle,
        drop_last=False,
        use_uva=True
    )
    return loader

def generate_history_interaction(u, v):
    result = defaultdict(list)
    for src, dst in zip(u, v):
        result[src.item()].append(dst.item())
    return result 

def generate_user_batch(g, batch_size) -> Iterator:
    user_ids = torch.arange(g.number_of_nodes("user")).type(torch.int32)
    num_batch = g.number_of_nodes("user")//batch_size+1
    for i in range(1, num_batch+1):
        if i == num_batch:
            batch_user_id = user_ids[batch_size*(i-1):]
            yield batch_user_id
            break
        batch_user_id = user_ids[batch_size*(i-1): batch_size*i]
        yield batch_user_id

In [12]:
class StochasticMultipleLayerRGCN(nn.Module):
    def __init__(self, in_feats, hidden_feat, out_feat, rel_names, n_layers):
        super().__init__()
        self.n_layers = n_layers
        self.layers = nn.ModuleList()

        # hidden layers
        for i in range(n_layers - 1):
            if i == 0:
                # input layer
                self.layers.append(dglnn.HeteroGraphConv({
                        rel : dglnn.SAGEConv(in_feats, hidden_feat, aggregator_type='mean')
                        for rel in rel_names
                    })
                )
            else:
                self.layers.append(dglnn.HeteroGraphConv({
                        rel : dglnn.SAGEConv(hidden_feat, hidden_feat, aggregator_type='mean')
                        for rel in rel_names
                    })
                )

        # output layer
        self.layers.append(dglnn.HeteroGraphConv({
                rel : dglnn.SAGEConv(hidden_feat, out_feat, aggregator_type='mean')
                for rel in rel_names
            })
        )
        
    def forward(self, blocks, x):
        for i, layer in enumerate(self.layers):
            x = layer(blocks[i], x)
        return x
    
class ScorePredictor(nn.Module):
    def forward(self, edge_subgraph, x):
        with edge_subgraph.local_scope():
            edge_subgraph.ndata['x'] = x
            for etype in edge_subgraph.canonical_etypes:
                edge_subgraph.apply_edges(
                    dgl.function.u_dot_v('x', 'x', 'score'), etype=etype)
            return edge_subgraph.edata['score']

class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, etypes, num_layer=2, use_feat=False):
        super().__init__()
        self.in_dims = in_features
        self.hidden_dims = hidden_features
        self.out_dims = out_features
        self.use_feat = use_feat
        self.num_layer = num_layer
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies:
        if use_feat:
            self.anime_lin = torch.nn.Linear(in_features, hidden_features)
        self.anime_emb = torch.nn.Embedding(graph.num_nodes("anime"), hidden_features)
        self.user_emb = torch.nn.Embedding(graph.num_nodes("user"), hidden_features)
        self.rgcn = StochasticMultipleLayerRGCN(
            hidden_features, hidden_features, out_features, etypes, num_layer
        )
        self.pred = ScorePredictor()

    def forward(self, input_nodes, positive_graph, negative_graph, blocks, x):
        x_anime = self.anime_lin(x["anime"])+self.anime_emb(input_nodes["anime"]) if self.use_feat else self.anime_emb(input_nodes["anime"]) 
        x_dict = {
            "user": self.user_emb(input_nodes["user"]),
            "anime": x_anime,
        }
            
        x = self.rgcn(blocks, x_dict)
        pos_score = self.pred(positive_graph, x)
        neg_score = self.pred(negative_graph, x)
        return pos_score, neg_score
    
    def inference(self, g, anime_feat, batch_size, device):
        """
        Perform offline inference 
        Params
         - g: The dgl graph without test edge 
         - x_anime: feature tensor for anime
        
        Return
         - Final embedding for all node 
        
        """

        # input embedding/ feature      
        x_anime = self.anime_lin(anime_feat.float().to(device))+self.anime_emb.weight if self.use_feat else self.anime_emb.weight 
        x = {
            "user": self.user_emb.weight,
            "anime": x_anime
        }
        
        for l, layer in enumerate(self.rgcn.layers):
            # initialize the next layer embedding(output)
            y = {
                k: torch.zeros(
                    g.number_of_nodes(k),
                    self.hidden_dims if l != 1 else self.out_dims,
                )
                for k in g.ntypes
            }
            sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)

            # sample all node in graph 
            dataloader = dgl.dataloading.DataLoader(
                g,
                {k: torch.arange(g.number_of_nodes(k), dtype=torch.int32) for k in g.ntypes},
                sampler,
                batch_size=batch_size,
                shuffle=True,
                drop_last=False,
                use_uva=True,
                device=device
            )
            
            for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
                block = blocks[0].to(device)
                input_nodes = {k: v.long() for k, v in input_nodes.items()}
                output_nodes = {k: v.long() for k, v in output_nodes.items()}
                # Select subset of x
                h = {
                    k: x[k][input_nodes[k]].to(device)
                    for k in input_nodes.keys()
                }
                h = layer(block, h)
                # update y
                for k in output_nodes.keys():
                    y[k][output_nodes[k]] = h[k].cpu()
            # update x 
            x = y
        # Save the computation result 
        #torch.save(y, "data/final_embedding_dict.pt")
        return y 
    
    def recommend(self, y, src_index = None, hist_dict = None, k = 1):
        """
        Generate top k recommendation for node with 'src_index'
        Params:
        - y: the final embedding for the user and anime, which is a dict of tensor 
        - src_index: tensor index for the user who we want to generate anime recommendation
        - hist_dict: contain the history interaction for anime and user. use it to filter the recommendation set,
        must contain same src_index
        Return:
        - dict of indices for the recommended anime 
        """
        #y = torch.load("data/final_embedding_dict.pt")
        user_src = y["user"]
        anime_dst = y["anime"]
        top_index = defaultdict(list)
        
        if src_index is not None:
            user_src = user_src[src_index]

        pred = user_src @ anime_dst.t()
        # IndexError: index 1024 is out of bounds for dimension 0 with size 1024
        #print(pred)
        #print(pred.size())
        # IndexError: index -1024 is out of bounds for dimension 0 with size 316
        for new_uid, uid in enumerate(hist_dict):
            # the uid 1024 is exceed the pred index 
            # because when second batch is coming, the index is reset for pred(don't use global index) 
            # we can solve it by tracking the # of batch of batch size
            # another way is to just use the size of hist_dict to construct the index
            _, indices = pred[new_uid].sort(descending=True, dim=-1)
            mask = torch.isin(indices, torch.tensor(hist_dict[uid]), invert=True)
            top_index[uid] = indices[mask][:k].tolist()
            
        return top_index

In [13]:
# Loss function and evaluation metrics 
def compute_loss(pos_score, neg_score):
    # Margin loss
    n_edges = pos_score.shape[0]
    return (1 - pos_score + neg_score.view(n_edges, -1)).clamp(min=0).mean()

def mrr(pos_score, neg_score):
    """
    Calculates mean reciprocal rank (MRR) for given positive sample score and negative sample score
    Used only in validation context
    :param pos_score: BX1 tensor with the logit score
    :param neg_score: (B*k)X1 tensor with the logit score
    :return: Mean reciprocal rank score
    """
    n_edges = pos_score.shape[0]
    neg_score = neg_score.view(n_edges, -1)
    # indice for the sorted score
    indices = torch.cat([pos_score, neg_score], dim=1).argsort(dim=1, descending=True)
    return (1.0 / (indices == 0).nonzero()[:, 1].float().add(1.0)).sum().item()

def hit_at_k(top_index, hist_watch_dict, k):
    """Calculates number of hits@k: 
    Formula: # of user who find the relevant item in top k set/ # of user 
    :param top_index: dict of recommended anime per user 
    :param hist_watch_dict: dict of user watching history in the test set

    :param k: number of top K results to be considered as hits
    :return: Hits@K score
    """
    # AssertionError:
    # different len for test user and recommendation user
    # assert len(top_index) == len(hist_watch_dict)
    num_user = len(hist_watch_dict)
    total_hits = 0
    for uid in hist_watch_dict:
        s1 = set(top_index[uid][:k])
        s2 = set(hist_watch_dict[uid])
        # find the comment element between two set
        s3 = s1.intersection(s2)
        if len(s3)>0: total_hits +=1 
    return total_hits/num_user

def precision_at_k(top_index, hist_watch_dict, k):
    """Calculates number of precision@k: 
    Formula: # of recommended items that is relevant(TP) / # of recommended items(TP+FP) 
    :param top_index: dict of recommended anime per user 
    :param hist_watch_dict: dict of user watching history in the test set

    :param k: number of top K results to be recommended 
    :return: Precision@K score
    """
    num_user = len(hist_watch_dict)
    total_tp = 0
    for uid in hist_watch_dict:
        s1 = set(top_index[uid][:k])
        s2 = set(hist_watch_dict[uid])
        # find the comment element between two set
        s3 = s1.intersection(s2)
        total_tp += len(s3)
    return total_tp/(num_user*k)

In [14]:
# Trainning loop
def Train(model, train_dataloader, val_dataloader, optimizer, n_epochs, best_model_path):
    model = model.to(device)
    best_mrr = best_epoch =  0
    auc_lst = []
    mrr_lst = []
    for epoch in range(1, n_epochs+1):
        model.train()
        total_loss = total_examples = 0
        for input_nodes, positive_graph, negative_graph, blocks in tqdm.tqdm(train_loader):
            input_features = {k: v.float() for k, v in blocks[0].srcdata['feat'].items()}
            pos_score_dict, neg_score_dict = model(input_nodes, positive_graph, negative_graph, blocks, input_features)
            
            pos_score = pos_score_dict["user", "like", "anime"]
            neg_score = neg_score_dict["user", "like", "anime"]
            loss = compute_loss(pos_score, neg_score)
            opt.zero_grad()
            loss.backward()
            opt.step()
            
            total_loss += float(loss) * pos_score.shape[0]
            total_examples += pos_score.shape[0]
        auc, mrr = Evaluate(model, val_dataloader)
        auc_lst.append(auc)
        mrr_lst.append(mrr)
        if mrr > best_mrr:
            best_mrr = mrr
            best_epoch = epoch
            torch.save(model.state_dict(), PATH)
        logging.info(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}, AUC: {auc:.4f}, MRR: {mrr:.4f}")
    logging.info(f"The best epoch is {best_epoch} with mrr score {best_mrr:.4f}")
    return auc_lst, mrr_lst

In [15]:
def Evaluate(model, eval_dataloader):
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        preds = []
        ground_truths = []
        mrr_sum = 0
        num_pos_edge = 0
        for input_nodes, positive_graph, negative_graph, blocks in tqdm.tqdm(eval_dataloader):
            input_features = {k: v.float() for k, v in blocks[0].srcdata['feat'].items()}
            pos_score_dict, neg_score_dict = model(input_nodes, positive_graph, negative_graph, blocks, input_features)

            pos_score = pos_score_dict["user", "like", "anime"]
            neg_score = neg_score_dict["user", "like", "anime"]
            
            pred = torch.cat([pos_score, neg_score])
            label = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            mrr_sum += mrr(pos_score, neg_score)
            num_pos_edge += len(pos_score)
            preds.append(pred)
            ground_truths.append(label)
        preds = torch.cat(preds, dim=0).cpu().numpy()
        ground_truths = torch.cat(ground_truths, dim=0).cpu().numpy()
        auc = roc_auc_score(ground_truths, preds)
    return auc, mrr_sum/ num_pos_edge

In [16]:
def Test(model, max_k, val_g, full_g, batch_size, device, path):
    """
    Get the final embedding with the validation subgraph 
    and generate recommendation then evaluate the performance with hit rate 
    Params:
    - model: pytorch model
    - max_k: max # of recommendation results 
    - val_g: subgraph 
    - full_g: fullgraph
    - path: where to save the recommendation list for each user 
    return:
    - dict of user id and topk recommendation items 
    - test hit@k dict k={1, 2, ..., max_k}
    """
    test_hr_dict={}
    test_precision_dict={}
    model = model.to(device)
    # Generate final embedding and save the result
    y = model.inference(val_g, graph.ndata["feat"]["anime"], batch_size*3, device)
    results = defaultdict(list)
    user_batch = generate_user_batch(full_g, batch_size=batch_size*3)
    for uids in tqdm.tqdm(user_batch):
        # DGLError: Expect argument "u" to have data type torch.int32. But got torch.int64.
        u, v = val_g.out_edges(uids, etype='like')
        hist_interact_dict = generate_history_interaction(u, v)
        #print(uids)
        topk_rec = model.recommend(y, src_index = uids.long(), k=max_k, hist_dict = hist_interact_dict)
        results.update(topk_rec)
    # Save the result recommendation
    torch.save(results, path)
    # get the testing edge 
    test_eid = torch.nonzero(full_g.edges['like'].data['test_mask'], as_tuple=False).int().squeeze()
    test_interact_dict = generate_history_interaction(*full_g.find_edges(test_eid, etype="like"))
    # evaluate the result
    for k in range(1, max_k+1):
        hr = hit_at_k(results, test_interact_dict, k)
        precision = precision_at_k(results, test_interact_dict, k)
        logging.info(f"hit@{k}: {hr:.4f}")
        logging.info(f"precision@{k}: {precision:.4f}")
        test_hr_dict[k] = hr
        test_precision_dict[k] = precision
        
    return results, test_hr_dict, test_precision_dict

In [17]:
# define data loader 
train_sampler = dgl.dataloading.NeighborSampler(
    [{
        ("user", "like", "anime"): average_user_count,
        ("anime", "rev_like", "user"): average_anime_count
    }]*num_layer)
train_sampler = dgl.dataloading.as_edge_prediction_sampler(
    train_sampler, 
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(num_neg_sample),
    exclude='reverse_types',
    reverse_etypes={'like': 'rev_like', 'rev_like': 'like'}
)
train_supervise_mask = graph.edges['like'].data['train_supervise_mask']
train_loader = get_dataloader_with_sampling(
    graph = train_g,
    mask = train_supervise_mask,
    sampler = train_sampler,
    batch_size = batch_size,
    shuffle = True,
    device = device
)

In [18]:
val_sampler = dgl.dataloading.NeighborSampler(
    [{
        ("user", "like", "anime"): average_user_count,
        ("anime", "rev_like", "user"): average_anime_count
    }]*num_layer)
val_sampler = dgl.dataloading.as_edge_prediction_sampler(
    val_sampler, 
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(num_neg_sample),
    exclude='reverse_types',
    reverse_etypes={'like': 'rev_like', 'rev_like': 'like'}
)
val_mask = graph.edges['like'].data['val_mask']
val_loader = get_dataloader_with_sampling(
    graph = valid_g,
    mask = val_mask,
    sampler = val_sampler,
    batch_size = batch_size*3,
    shuffle = False,
    device = device
)

#### Result without feature

In [19]:
# Specify a path
PATH = "data/state_dict_model.pt"
# Run trainning and save the model with highest auc
model = Model(in_dims, hidden_dims, out_dims, etypes, num_layer)
model = model.to(device)
model = model.float()
opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay = weight_decay)
auc_lst, mrr_lst = Train(model, train_loader, val_loader, opt, n_epochs, PATH)

100%|██████████| 1208/1208 [01:07<00:00, 17.96it/s]
100%|██████████| 135/135 [00:06<00:00, 21.88it/s]
100%|██████████| 1208/1208 [01:06<00:00, 18.22it/s]
100%|██████████| 135/135 [00:06<00:00, 21.79it/s]
100%|██████████| 1208/1208 [01:07<00:00, 17.97it/s]
100%|██████████| 135/135 [00:06<00:00, 21.80it/s]
100%|██████████| 1208/1208 [01:06<00:00, 18.13it/s]
100%|██████████| 135/135 [00:06<00:00, 21.85it/s]
100%|██████████| 1208/1208 [01:07<00:00, 17.91it/s]
100%|██████████| 135/135 [00:06<00:00, 21.13it/s]
100%|██████████| 1208/1208 [01:06<00:00, 18.06it/s]
100%|██████████| 135/135 [00:06<00:00, 21.65it/s]
100%|██████████| 1208/1208 [01:06<00:00, 18.09it/s]
100%|██████████| 135/135 [00:06<00:00, 21.60it/s]
100%|██████████| 1208/1208 [01:06<00:00, 18.13it/s]
100%|██████████| 135/135 [00:06<00:00, 21.65it/s]
100%|██████████| 1208/1208 [01:07<00:00, 17.94it/s]
100%|██████████| 135/135 [00:06<00:00, 21.73it/s]
100%|██████████| 1208/1208 [01:06<00:00, 18.17it/s]
100%|██████████| 135/135 [00:0

In [20]:
# Specify a path
PATH = "data/state_dict_model.pt"
REC_PATH = "data/topk_recommendation.pt"
# Load the model and run testing 
model = Model(in_dims, hidden_dims, out_dims, etypes, num_layer)
model.load_state_dict(torch.load(PATH, map_location=device))
model.eval()
rec_result, hr_dict, precision_dict = Test(model, k, valid_g, graph, batch_size, device, REC_PATH)
# test: running time 24 mins. 300it

100%|██████████| 27/27 [00:00<00:00, 85.76it/s]
100%|██████████| 27/27 [00:01<00:00, 21.13it/s]
25it [17:29, 41.98s/it]


#### Result with feature

In [21]:
logging.basicConfig(
    filename='model_with_feat.log', 
    filemode='a',
    force = True,
    format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S',
    level=logging.INFO
)

In [22]:
# Specify a path
PATH = "data/state_dict_model_feat.pt"
# Run trainning and save the model with highest auc
model = Model(in_dims, hidden_dims, out_dims, etypes, num_layer, use_feat = True)
model = model.to(device)
model = model.float()
opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay = weight_decay)
auc_lst1, mrr_lst1 = Train(model, train_loader, val_loader, opt, n_epochs, PATH)

100%|██████████| 1208/1208 [01:07<00:00, 17.99it/s]
100%|██████████| 135/135 [00:06<00:00, 21.79it/s]
100%|██████████| 1208/1208 [01:07<00:00, 17.94it/s]
100%|██████████| 135/135 [00:06<00:00, 21.59it/s]
100%|██████████| 1208/1208 [01:07<00:00, 18.01it/s]
100%|██████████| 135/135 [00:06<00:00, 21.83it/s]
100%|██████████| 1208/1208 [01:07<00:00, 17.93it/s]
100%|██████████| 135/135 [00:06<00:00, 21.76it/s]
100%|██████████| 1208/1208 [01:07<00:00, 17.90it/s]
100%|██████████| 135/135 [00:06<00:00, 21.70it/s]
100%|██████████| 1208/1208 [01:06<00:00, 18.04it/s]
100%|██████████| 135/135 [00:06<00:00, 21.73it/s]
100%|██████████| 1208/1208 [01:07<00:00, 17.78it/s]
100%|██████████| 135/135 [00:06<00:00, 21.68it/s]
100%|██████████| 1208/1208 [01:07<00:00, 17.94it/s]
100%|██████████| 135/135 [00:06<00:00, 21.65it/s]
100%|██████████| 1208/1208 [01:07<00:00, 17.86it/s]
100%|██████████| 135/135 [00:06<00:00, 21.70it/s]
100%|██████████| 1208/1208 [01:07<00:00, 17.97it/s]
100%|██████████| 135/135 [00:0

In [23]:
# Specify a path
PATH = "data/state_dict_model_feat.pt"
REC_PATH = "data/topk_recommendation_feat.pt"
# Load the model and run testing 
model = Model(in_dims, hidden_dims, out_dims, etypes, num_layer, use_feat=True)
model.load_state_dict(torch.load(PATH, map_location=device))
model.eval()
rec_result1, hr_dict1, precision_dict = Test(model, k, valid_g, graph, batch_size, device, REC_PATH)
# test: running time 24 mins. 300it

100%|██████████| 27/27 [00:00<00:00, 92.65it/s]
100%|██████████| 27/27 [00:00<00:00, 32.47it/s]
25it [17:23, 41.75s/it]


Observation:
- With feature the trainning score is better 
- However the testing score is lower (Overfitting), Not generalize to the test example

#### Random Walk Approch 
In the following section, I will try to explore using random walk sampling to make recommendation and answer the following questions
- If we sample the random walk for each user without the test edge (only validation subgraph), what is the top 10 most visited anime?
- What is the length of the random walk? Are there optimal number of length?
- Should we include restart probability? Are there optimal number of it?
- How can we evaluate the perfomance of the model and tune the parameter?
- What should be the metapath from the user? user->anime->user->anime ....
- What if we can find other attribute connect different anime together (leverage feature of the anine e.g. genre/ yr)



In [19]:
from dgl.sampling import random_walk
from itertools import product

In [20]:
logging.basicConfig(
    filename='model_random_walk.log', 
    filemode='a',
    force = True,
    format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S',
    level=logging.INFO
)

In [76]:
def recommend_with_random_walk(g, num_random_walks: int = 1000, num_repeated: int = 2, restart_prob: float = .5, k: int = 10):
    
    seed = torch.arange(g.num_nodes("user")).type(torch.int32)
    tmp = []
    for _ in tqdm.tqdm(range(num_random_walks)):
        result = random_walk(g, seed.to(device), metapath=['like', 'rev_like'] * num_repeated,
            restart_prob=torch.FloatTensor([0, restart_prob, 0, restart_prob]).to(device)
        )
        # result[0] return the path, result[1] return the type of node for that path
        # 0-> anime, 1-> user
        # -1 mean the trace is terminated 
        sample_anime = result[0][:, result[1] == 0]
        tmp.append(sample_anime)
    final_sample_anime = torch.concat(tmp, axis=1)
    topk_visited_anime = {}
    for i in tqdm.tqdm(range(g.num_nodes("user"))):
        c = Counter()
        u, v = g.out_edges([i], etype='like')
        c.update(final_sample_anime[i, :].tolist())
        for dst in v.tolist():
            del c[dst]
        del c[-1]
        topk_visited_anime[i] = [anime_idx for anime_idx,_ in c.most_common(k)]
    return topk_visited_anime
    

In [33]:
def fit_random_walk(train_g, full_g, params: dict, k: int=10):
    assert list(params.keys()) == ['num_repeated', 'restart_prob']
    best_precision = 0 
    best_params = {}
    params_comb = product(params["num_repeated"], params["restart_prob"])
    # get the validation edge 
    val_eid = torch.nonzero(full_g.edges['like'].data['val_mask'], as_tuple=False).int().squeeze()
    val_interact_dict = generate_history_interaction(*full_g.find_edges(val_eid, etype="like"))
    for num_repeated, restart_prob in params_comb:
        topk_visited_anime = recommend_with_random_walk(train_g, num_repeated = num_repeated, restart_prob = restart_prob)
        # evaluate the result
        hr = hit_at_k(topk_visited_anime, val_interact_dict, k)
        precision = precision_at_k(topk_visited_anime, val_interact_dict, k)
        logging.info(f"({num_repeated}, {restart_prob}) :: hit@{k}: {hr:.4f}, precision@{k}: {precision:.4f}")
        if precision > best_precision:
            best_precision = precision
            best_params = {"num_repeated": num_repeated, "restart_prob": restart_prob}
    logging.info(f"The best params is {best_params} with precision {best_precision:.4f}")
    return best_params

In [34]:
def test_random_walk(val_g, full_g, params: dict, max_k: int=10):
    assert list(params.keys()) == ['num_repeated', 'restart_prob']
    test_hr_dict = {}
    test_precision_dict = {}
    # get the testing edge 
    test_eid = torch.nonzero(full_g.edges['like'].data['test_mask'], as_tuple=False).int().squeeze()
    test_interact_dict = generate_history_interaction(*full_g.find_edges(test_eid, etype="like"))
    topk_visited_anime = recommend_with_random_walk(val_g, **params)
    # evaluate the result
    for k in range(1, max_k+1):
        hr = hit_at_k(topk_visited_anime, test_interact_dict, k)
        precision = precision_at_k(topk_visited_anime, test_interact_dict, k)
        logging.info(f"hit@{k}: {hr:.4f}")
        logging.info(f"precision@{k}: {precision:.4f}")
        test_hr_dict[k] = hr
        test_precision_dict[k] = precision
        
    return topk_visited_anime, test_hr_dict, test_precision_dict

In [24]:
# num_repeated = 1 is meaningless as it means recommend the previous watched anime 
params = {
    "num_repeated" : [i for i in range(2, 5)],
    "restart_prob" : [i/10 for i in range(0, 10, 2)]
}

In [25]:
best_params = fit_random_walk(train_g, graph, params = params, k = k )
topk_visited_anime, test_hr_dict, test_precision_dict = test_random_walk(valid_g, graph, best_params, max_k = k)

100%|██████████| 1000/1000 [00:31<00:00, 31.56it/s]
100%|██████████| 306492/306492 [02:38<00:00, 1928.26it/s]
100%|██████████| 1000/1000 [00:24<00:00, 41.08it/s]
100%|██████████| 306492/306492 [02:36<00:00, 1962.15it/s]
100%|██████████| 1000/1000 [00:21<00:00, 47.17it/s]
100%|██████████| 306492/306492 [02:33<00:00, 2001.94it/s]
100%|██████████| 1000/1000 [00:18<00:00, 55.41it/s]
100%|██████████| 306492/306492 [02:26<00:00, 2085.69it/s]
100%|██████████| 1000/1000 [00:14<00:00, 66.71it/s]
100%|██████████| 306492/306492 [02:18<00:00, 2210.46it/s]
100%|██████████| 1000/1000 [00:43<00:00, 22.96it/s]
100%|██████████| 306492/306492 [03:20<00:00, 1525.57it/s]
100%|██████████| 1000/1000 [00:34<00:00, 28.98it/s]
100%|██████████| 306492/306492 [03:13<00:00, 1581.81it/s]
100%|██████████| 1000/1000 [00:26<00:00, 37.32it/s]
100%|██████████| 306492/306492 [03:04<00:00, 1664.57it/s]
100%|██████████| 1000/1000 [00:20<00:00, 48.62it/s]
100%|██████████| 306492/306492 [02:53<00:00, 1763.44it/s]
100%|█████

#### Why random walk perform better
- Simpler approach
- Different neighbour will have different importance, rather than assumming they are same 
- Take information from more than 3 hop neighbour (which is better than 2 hop neighbour)

#### Generate Recommendation
- show the result recommendation for some user and what they have watch before

In [36]:
recommended_anime = recommend_with_random_walk(graph.to(device), **best_params)

100%|██████████| 1000/1000 [00:04<00:00, 203.04it/s]
100%|██████████| 306492/306492 [04:27<00:00, 1147.48it/s]


In [49]:
anime_data = pd.read_csv("./data/anime.csv", na_values = "Unknown")
anime_mapper = torch.load("data/anime_id_mapper.pt")
anime_mapper = {v:k for k, v in anime_mapper.items()}

In [58]:
def idx2anime(anime_data: pd.DataFrame, anime_mapper: dict, recommendation_idx: list) -> pd.DataFrame:
    result = pd.DataFrame({"anime_id": [anime_mapper[idx] for idx in recommendation_idx]})
    result = result.merge(
        anime_data,
        left_on='anime_id', right_on="MAL_ID"
    )
    return result

In [81]:
random_user_idx = np.random.randint(0, graph.num_nodes("user"))
recommendation_df = idx2anime(anime_data, anime_mapper, recommended_anime[random_user_idx])
recommendation_df.sample(10)

Unnamed: 0,anime_id,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
4,1575,1575,Code Geass: Hangyaku no Lelouch,8.72,"Action, Military, Sci-Fi, Super Power, Drama, ...",Code Geass:Lelouch of the Rebellion,コードギアス 反逆のルルーシュ,TV,25.0,"Oct 6, 2006 to Jul 29, 2007",...,326710.0,309688.0,213516.0,93305.0,31697.0,14686.0,7065.0,3100.0,1630.0,2621.0
0,19815,19815,No Game No Life,8.2,"Game, Adventure, Comedy, Supernatural, Ecchi, ...","No Game, No Life",ノーゲーム・ノーライフ,TV,12.0,"Apr 9, 2014 to Jun 25, 2014",...,227827.0,285623.0,309230.0,186864.0,70141.0,30284.0,14345.0,6228.0,3313.0,3521.0
8,4224,4224,Toradora!,8.24,"Slice of Life, Comedy, Romance, School",Toradora!,とらドラ！,TV,25.0,"Oct 2, 2008 to Mar 26, 2009",...,185286.0,255249.0,273494.0,160797.0,58749.0,25479.0,10810.0,4372.0,2456.0,2890.0
3,5114,5114,Fullmetal Alchemist: Brotherhood,9.19,"Action, Military, Adventure, Comedy, Drama, Ma...",Fullmetal Alchemist:Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,TV,64.0,"Apr 5, 2009 to Jul 4, 2010",...,714811.0,401507.0,199160.0,70045.0,20210.0,9308.0,3222.0,1536.0,2162.0,16806.0
1,1535,1535,Death Note,8.63,"Mystery, Police, Psychological, Supernatural, ...",Death Note,デスノート,TV,37.0,"Oct 4, 2006 to Jun 27, 2007",...,557406.0,535252.0,415890.0,201522.0,68577.0,28048.0,10462.0,3692.0,2256.0,3586.0
2,2904,2904,Code Geass: Hangyaku no Lelouch R2,8.91,"Action, Military, Sci-Fi, Super Power, Drama, ...",Code Geass:Lelouch of the Rebellion R2,コードギアス 反逆のルルーシュ 続編,TV,25.0,"Apr 6, 2008 to Sep 28, 2008",...,358705.0,247100.0,148191.0,65065.0,22685.0,9786.0,5306.0,2505.0,1322.0,2250.0
6,16498,16498,Shingeki no Kyojin,8.48,"Action, Military, Mystery, Super Power, Drama,...",Attack on Titan,進撃の巨人,TV,25.0,"Apr 7, 2013 to Sep 29, 2013",...,470882.0,514879.0,459113.0,220228.0,70768.0,31141.0,11805.0,4637.0,2707.0,4939.0
5,23273,23273,Shigatsu wa Kimi no Uso,8.74,"Drama, Music, Romance, School, Shounen",Your Lie in April,四月は君の嘘,TV,22.0,"Oct 10, 2014 to Mar 20, 2015",...,307670.0,250337.0,177967.0,82986.0,30877.0,14233.0,7001.0,2911.0,1562.0,2645.0
7,33486,33486,Boku no Hero Academia 2nd Season,8.33,"Action, Comedy, Super Power, School, Shounen",My Hero Academia 2,僕のヒーローアカデミア,TV,25.0,"Apr 1, 2017 to Sep 30, 2017",...,188165.0,322936.0,347893.0,166839.0,46700.0,16556.0,4760.0,1849.0,948.0,2065.0
9,25777,25777,Shingeki no Kyojin Season 2,8.45,"Action, Military, Mystery, Super Power, Drama,...",Attack on Titan Season 2,進撃の巨人 Season2,TV,12.0,"Apr 1, 2017 to Jun 17, 2017",...,239823.0,308956.0,304020.0,135241.0,38504.0,12841.0,5160.0,1716.0,839.0,1804.0


In [82]:
_, v = graph.out_edges([random_user_idx], etype='like')
watched_df = idx2anime(anime_data, anime_mapper, v.tolist())
watched_df.sample(10, replace=True)

Unnamed: 0,anime_id,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
35,11577,11577,Steins;Gate Movie: Fuka Ryouiki no Déjà vu,8.5,"Sci-Fi, Drama",Steins;Gate:The Movie − Load Region of Déjà Vu,劇場版 シュタインズゲート 負荷領域のデジャヴ,Movie,1.0,"Apr 20, 2013",...,63370.0,83098.0,69984.0,32171.0,10369.0,3911.0,1604.0,628.0,310.0,690.0
25,6547,6547,Angel Beats!,8.15,"Action, Comedy, Drama, School, Supernatural",Angel Beats!,Angel Beats!（エンジェルビーツ）,TV,13.0,"Apr 3, 2010 to Jun 26, 2010",...,200757.0,242398.0,264330.0,169476.0,68296.0,31152.0,14127.0,5810.0,2758.0,2569.0
32,20517,20517,Little Busters!: EX,7.74,"Slice of Life, Comedy, Supernatural, Drama, Ro...",Little Busters! EX,リトルバスターズ！EX,Special,8.0,"Jan 29, 2014 to Jul 30, 2014",...,2493.0,3678.0,6886.0,5233.0,1963.0,818.0,293.0,113.0,53.0,60.0
29,27775,27775,Plastic Memories,7.94,"Sci-Fi, Drama, Romance",Plastic Memories,プラスティック・メモリーズ,TV,13.0,"Apr 5, 2015 to Jun 28, 2015",...,51548.0,66207.0,94793.0,65999.0,26175.0,11927.0,5412.0,2090.0,945.0,739.0
0,28851,28851,Koe no Katachi,9.0,"Drama, School, Shounen",A Silent Voice,聲の形,Movie,1.0,"Sep 17, 2016",...,393684.0,295492.0,156604.0,61581.0,19228.0,7135.0,3108.0,1242.0,698.0,2071.0
6,33731,33731,Gabriel DropOut,7.49,"Comedy, Demons, Supernatural, School, Shounen",Gabriel DropOut,ガヴリールドロップアウト,TV,12.0,"Jan 9, 2017 to Mar 27, 2017",...,12557.0,22652.0,55211.0,55489.0,21305.0,8645.0,2852.0,1055.0,462.0,400.0
32,20517,20517,Little Busters!: EX,7.74,"Slice of Life, Comedy, Supernatural, Drama, Ro...",Little Busters! EX,リトルバスターズ！EX,Special,8.0,"Jan 29, 2014 to Jul 30, 2014",...,2493.0,3678.0,6886.0,5233.0,1963.0,818.0,293.0,113.0,53.0,60.0
28,13125,13125,Shinsekai yori,8.35,"Drama, Horror, Mystery, Psychological, Sci-Fi,...",From the New World,新世界より,TV,25.0,"Sep 29, 2012 to Mar 23, 2013",...,56953.0,64171.0,53956.0,29407.0,12975.0,6878.0,3978.0,1772.0,1021.0,1257.0
26,32281,32281,Kimi no Na wa.,8.96,"Romance, Supernatural, School, Drama",Your Name.,君の名は。,Movie,1.0,"Aug 26, 2016",...,516874.0,333022.0,200239.0,86314.0,29641.0,12257.0,5199.0,2131.0,1116.0,3966.0
13,18195,18195,Little Busters!: Refrain,8.23,"Slice of Life, Comedy, Supernatural, Drama, Ro...",Little Busters! ~Refrain~,リトルバスターズ！～Refrain～,TV,13.0,"Oct 5, 2013 to Dec 28, 2013",...,13328.0,16946.0,17388.0,10182.0,4043.0,1767.0,773.0,298.0,155.0,171.0
