In [None]:
!pip install -qq dgl-cu110 dglgo -f https://data.dgl.ai/wheels/repo.html &>/dev/null

In [None]:
import pandas as pd
import numpy as np
import cudf
from pandas import Timedelta
import os
from tqdm.notebook import tqdm
import pickle
from torch.utils.data import TensorDataset, DataLoader
import math
from operator import itemgetter
import dgl
pd.set_option('display.max_rows', 10000)

import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.nn.pytorch as dglnn
import dgl.function as FN
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

### Dataset
[Last-FM](https://www.kaggle.com/datasets/japarra27/lastfm-dataset)

[Heterogeneous Global Graph Neural Networks for Personalized
Session-based Recommendation](https://arxiv.org/pdf/2107.03813.pdf)

In [None]:
data = cudf.read_parquet('/data/items.parquet')
len(data)

In [None]:
data.head()

In [None]:
print(f" # Unique User : {data['user_id'].nunique()}, # Unique Artist : {data['artist_id'].nunique()}, # Unique Track : {data['track_name'].nunique()}")

## Create sessions using a 6-hour inactivity gap

Utility functions

In [None]:
def get_session_id(df, session):
    """Compute a zero-based running session id for every row of ``df``.

    A row starts a new session when its ``user_id`` differs from the row
    directly above it, or when its ``timestamp`` lies more than ``session``
    after that row's timestamp. The frame is expected to be ordered by user
    and time already; no sorting is done here.

    Args:
        df: frame with ``user_id`` and ``timestamp`` columns.
        session: maximum allowed gap between consecutive rows of one session.

    Returns:
        A series aligned with ``df`` holding the session id of each row.
    """
    previous = df.shift(1)
    user_changed = df['user_id'] != previous['user_id']
    gap_exceeded = (df['timestamp'] - previous['timestamp']) > session
    # Each boundary increments the running count; subtract 1 so ids start at 0.
    return (user_changed | gap_exceeded).cumsum() - 1

def group_session(df, session):
    """Attach a ``session_id`` column to ``df`` and return the same frame.

    The ids come from ``get_session_id``; the column is written onto the
    frame that was passed in.
    """
    session_ids = get_session_id(df, session)
    df['session_id'] = session_ids
    return df

def filter_short_session(df, min_session_length=2):
    """Drop every row whose session has fewer than ``min_session_length`` rows."""
    rows_per_session = df.groupby('session_id').size()
    is_long_enough = rows_per_session >= min_session_length
    kept_sessions = rows_per_session[is_long_enough].index
    return df[df['session_id'].isin(kept_sessions)]

def filter_infrequent_item(df, min_item_support=5):
    """Drop every row whose item occurs fewer than ``min_item_support`` times."""
    rows_per_item = df.groupby('item_id').size()
    is_supported = rows_per_item >= min_item_support
    kept_items = rows_per_item[is_supported].index
    return df[df['item_id'].isin(kept_items)]

def filter_until_ok(df, min_session_length=2, min_item_support=5):
    """Alternate the session-length and item-support filters until stable.

    Removing rare items can shorten sessions and vice versa, so both filters
    are re-applied until a full pass leaves the row count unchanged.
    """
    previous_size = None
    while previous_size != len(df):
        previous_size = len(df)
        df = filter_short_session(df, min_session_length)
        df = filter_infrequent_item(df, min_item_support)
    return df

def trucate_session(df, session_length=20, is_sorted=True):
    """Keep only the first ``session_length`` rows of every session.

    Args:
        df: frame with a ``session_id`` column (and ``timestamp`` when it
            still has to be sorted).
        session_length: maximum number of rows retained per session.
        is_sorted: set to False to sort by session and timestamp first.
    """
    ordered = df if is_sorted else df.sort_values(['session_id', 'timestamp'])
    position_in_session = ordered.groupby('session_id').cumcount()
    within_limit = position_in_session < session_length
    return ordered[within_limit]

def update_id(df, field):
    """Return a copy of ``df`` with column ``field`` replaced by factorized codes.

    The codes are the first element of what ``cudf.factorize`` returns for
    that column.
    """
    codes, _ = cudf.factorize(df[field])
    return df.assign(**{field: codes})


def remove_immediate_repeats(df):
    """Drop rows that repeat the previous row's item within the same session.

    A row is kept when either its ``session_id`` or its ``item_id`` differs
    from the row directly above it.
    """
    previous = df.shift()
    session_changed = df['session_id'] != previous['session_id']
    item_changed = df['item_id'] != previous['item_id']
    return df[session_changed | item_changed]
    
def reorder_sessions(df):
    """Renumber sessions by end time and sort rows by (session, timestamp).

    The session whose last event is earliest becomes session 0, the next one
    session 1, and so on. A new frame is returned; the input is not modified.

    Args:
        df: frame with ``session_id`` and ``timestamp`` columns.

    Returns:
        A frame with remapped ``session_id`` values, sorted by
        ``session_id`` and then ``timestamp``.
    """
    # After reset_index() the positional index 0..n-1 is the end-time rank.
    ranked = (
        df.groupby('session_id')['timestamp']
        .max()
        .sort_values()
        .reset_index()
    )
    old_to_new = ranked.assign(new_id=ranked.index).set_index('session_id')['new_id']
    remapped = df.assign(session_id=df['session_id'].map(old_to_new))
    return remapped.sort_values(['session_id', 'timestamp'])

def keep_top_n_items(df, n=40000):
    """Keep only rows whose item is among the ``n`` most frequent items."""
    occurrences = df.groupby('item_id').size()
    most_frequent = occurrences.nlargest(n).index
    is_frequent = df['item_id'].isin(most_frequent)
    return df[is_frequent]

def train_test_split(df, test_size=0.2):
    """Split sessions chronologically into train and test frames.

    Sessions are ordered by the timestamp of their last event; the latest
    ``int(n_sessions * test_size)`` sessions form the test split and all
    remaining sessions form the training split.

    Args:
        df: frame with ``session_id`` and ``timestamp`` columns.
        test_size: fraction of sessions assigned to the test split.

    Returns:
        ``(df_train, df_test)``.
    """
    endtime = df.groupby('session_id')['timestamp'].max().sort_values()
    num_test = int(len(endtime) * test_size)
    # Slice from an explicit start offset: ``index[-num_test:]`` would select
    # every session when num_test is 0, because ``-0`` is just ``0``.
    test_sessions = endtime.index[len(endtime) - num_test:]
    in_test = df['session_id'].isin(test_sessions)
    df_train = df[~in_test]
    df_test = df[in_test]
    return df_train, df_test
    
def save_sessions(df, filepath='data/sessions.csv'):
    """Write one line per session containing its comma-separated item ids.

    Sessions are first renumbered and sorted via ``reorder_sessions``. The
    output has no header and no index column.

    Args:
        df: frame with ``session_id``, ``item_id`` and ``timestamp`` columns.
        filepath: destination file.
    """
    df = reorder_sessions(df)
    # The item column is named ``item_id`` throughout this notebook.
    sessions = df.groupby('session_id')['item_id'].apply(lambda x: ','.join(map(str, x)))
    sessions.to_csv(filepath, sep='\t', header=False, index=False)
    

In [None]:
# Maximum gap between two consecutive events of a user before a new session starts.
interval = Timedelta(hours=6)
# Number of most frequent items retained by keep_top_n_items below.
n = 40000

# Keep the three columns used downstream; the artist plays the role of the "item".
data = data[['user_id', 'artist_id', 'timestamp']]
data.columns = ['user_id', 'item_id', 'timestamp']

# Drop incomplete rows, then replace the raw ids by factorized integer codes.
data.dropna(inplace=True)
data = update_id(data, 'user_id')
data = update_id(data, 'item_id')

# get_session_id compares each row with the one above it, so the frame must be
# ordered by user and time before sessions are assigned.
data.sort_values(['user_id', 'timestamp'], inplace=True)
data = group_session(data, interval)

# Collapse consecutive plays of the same item, then cap every session at 20 rows.
data = remove_immediate_repeats(data)
data = trucate_session(data, 20)

# Restrict to the n most frequent items, then repeatedly drop short sessions
# and rare items until neither filter removes anything.
data = keep_top_n_items(data, n)
data = filter_until_ok(data)

In [None]:
data.head(10)

In [None]:
data.tail(10)

In [None]:
print(f"#Users : {data['user_id'].nunique()} #Items : {data['item_id'].nunique()} #Sessions : {data['session_id'].nunique()} ")

In [None]:
# Persist the preprocessed interactions as a header-less TSV. Column order at
# this point: user_id, item_id, timestamp, session_id.
data.to_csv('data/data.csv', sep='\t', header=False, index=False)

In [None]:
def _aggregate_session(df):
    res = []
    for uid, group in df.groupby('user_id'):
        res += group.groupby('session_id')['item_id'].agg(list).tolist()
    return res

def _aggregate_df(df):
    res = dict()
    for uid, group in df.groupby('user_id'):
        res[uid] = group.groupby('session_id')['item_id'].agg(list).tolist()
        
def split_data(val_ratio = 0.2, test_ratio = 0.2):
    """Split ``data/data.csv`` into train / val / test and pickle each part.

    Steps:
      1. Read the header-less TSV written by the preprocessing cell.
      2. Split sessions chronologically with ``train_test_split``.
      3. Remove test rows whose item never occurs in training, then drop
         test sessions that became too short.
      4. Re-index items on the training vocabulary and shift user and item
         ids by +1, so ids start at 1 (``common_seq`` pads with 0).
      5. Carve a validation part out of the test rows and write
         ``train.pkl``, ``val.pkl``, ``test.pkl`` and ``all_test.pkl`` under
         ``data/`` in the ``{user_id: [session, ...]}`` layout.

    Args:
        val_ratio: fraction of test rows moved to the validation part.
        test_ratio: fraction of sessions assigned to the test split.
    """
    # data.csv has four columns; naming only three would make pandas treat
    # the first one as the index and leave no session_id column to split on.
    data = pd.read_csv(
        'data/data.csv',
        sep='\t',
        names=['user_id', 'item_id', 'timestamp', 'session_id'],
    )
    data['timestamp'] = pd.to_datetime(data['timestamp'])

    df_train, df_test = train_test_split(data, test_size=test_ratio)

    df_test = df_test[df_test['item_id'].isin(df_train['item_id'].unique())]
    df_test = filter_short_session(df_test)

    print(f'No. of Clicks: {len(df_train) + len(df_test)}')
    # Double quotes outside: nesting the same quote type inside an f-string
    # is a SyntaxError before Python 3.12.
    print(f"No. of Items: {df_train['item_id'].nunique()}")

    # Re-index items on the training vocabulary; every remaining test item is
    # part of it because of the isin() filter above.
    train_codes, uniques = pd.factorize(df_train['item_id'])
    oid2nid = {oid: i for i, oid in enumerate(uniques)}

    # assign() builds new frames rather than writing into filtered slices.
    df_train = df_train.assign(
        item_id=train_codes + 1,
        user_id=df_train['user_id'] + 1,
    )
    df_test = df_test.assign(
        item_id=df_test['item_id'].map(oid2nid) + 1,
        user_id=df_test['user_id'] + 1,
    )

    df_test = df_test.reset_index(drop=True)
    # NOTE(review): this samples individual rows, not whole sessions, so a
    # session can be split between val and test. Confirm this is intended.
    df_val = df_test.sample(frac=val_ratio, random_state=42)
    part_test = df_test[~df_test.index.isin(df_val.index)]

    outputs = {
        'data/train.pkl': df_train,
        'data/val.pkl': df_val,
        'data/test.pkl': part_test,
        'data/all_test.pkl': df_test,
    }
    for path, frame in outputs.items():
        with open(path, 'wb') as f:
            pickle.dump(_aggregate_df(frame), f)
    

In [None]:
%%time
split_data()

## Create Label and DataLoader

In [None]:
SZ = 12  # passed as the `sample_size` argument of uui_graph further down
SEQ_LEN = 10  # window size: number of preceding items kept (and padded to) per training sample
BATCH_SIZE = 512  # batch size used for both the train and the test DataLoader

In [None]:
def common_seq(data_list):
    """Expand user sessions into fixed-width next-item prediction samples.

    For every session and every target position (each item except the first),
    one sample is produced from the up-to-``SEQ_LEN`` items directly before
    the target, right-padded with 0 to length ``SEQ_LEN``.

    Args:
        data_list: mapping ``{user_id: [session, ...]}`` where each session
            is a list of item ids.

    Returns:
        ``(final_seq, tensors)`` where ``final_seq`` is a list of
        ``(user_id, full_prefix, [target])`` tuples and ``tensors`` is the
        tuple ``(uid, browsed_ids, masks, seq_lens, pos_idx, labels)``:

        * ``uid``         - long, one ``[user_id]`` row per sample
        * ``browsed_ids`` - long, padded item window of width ``SEQ_LEN``
        * ``masks``       - bool, True on real items and False on padding
        * ``seq_lens``    - long, number of real items in each window
        * ``pos_idx``     - long, reverse position (latest item is 0), 0 on padding
        * ``labels``      - long, one ``[target_item]`` row per sample
    """
    uid = []
    masks = []
    labels = []
    browsed_ids = []
    pos_idxs = []  # accumulator; kept apart from the per-sample position list
    seq_lens = []

    final_seq = []

    for u in tqdm(data_list):
        for seq in data_list[u]:
            for i in range(1, len(seq)):
                # Items right before target seq[-i]; a start index that falls
                # before the beginning of the list is clamped by slicing.
                temp_seq = seq[-i-SEQ_LEN:-i]
                len_seq = len(temp_seq)
                padding = SEQ_LEN - len_seq

                mask = [1] * len_seq + [0] * padding
                pos_idx = list(range(len_seq - 1, -1, -1)) + [0] * padding
                browsed_id = list(temp_seq) + [0] * padding

                masks.append(mask)
                pos_idxs.append(pos_idx)
                browsed_ids.append(browsed_id)
                labels.append([int(seq[-i])])
                uid.append([int(u)])
                seq_lens.append(len_seq)

                final_seq.append((u, seq[:-i], [seq[-i]]))

    labels = torch.tensor(labels, dtype=torch.long)
    uid = torch.tensor(uid, dtype=torch.long)
    masks = torch.tensor(masks, dtype=torch.bool)
    browsed_ids = torch.tensor(browsed_ids, dtype=torch.long)
    pos_idx = torch.tensor(pos_idxs, dtype=torch.long)
    seq_lens = torch.tensor(seq_lens, dtype=torch.long)

    return final_seq, (uid, browsed_ids, masks, seq_lens, pos_idx, labels)

In [None]:
# Load the {user_id: [session, ...]} dictionaries written by split_data.
with open('data/train.pkl', 'rb') as f:
    train = pickle.load(f)

with open('data/test.pkl', 'rb') as f:
    test = pickle.load(f)
    
# Expand every session into (window, next item) samples plus the raw prefix tuples.
test_seq, test_data = common_seq(test)
train_seq, train_data = common_seq(train)

# with open('data/train_seq.pkl', 'wb') as f:
#     pickle.dump(train_seq, f)
# with open('data/test_seq.pkl', 'wb') as f:
#     pickle.dump(test_seq, f)

In [None]:
# Wrap the tensor tuples returned by common_seq; every batch yields the
# tensors in the same order: uid, browsed_ids, masks, seq_lens, pos_idx, labels.
train_iter = DataLoader(
    dataset=TensorDataset(*train_data),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

# NOTE(review): the test loader shuffles as well; confirm that evaluation
# does not rely on sample order.
test_iter = DataLoader(
    dataset=TensorDataset(*test_data),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

print(f" Length of Training DataLoader is {len(train_iter)} & Test DataLoader {len(test_iter)} ")

### Check for an iteration

In [None]:
eg = next(iter(train_iter))
# Batch fields follow the order returned by common_seq:
# (uid, browsed_ids, masks, seq_lens, pos_idx, labels).
eg_uid, eg_items, eg_masks, eg_lens, eg_pos, eg_labels = eg
print(f"Number of elements in the example tuple {len(eg)} - corresponds to uid | browsed_ids | mask | seq_len | pos_idx | label ")

print("Size of - ")
print(f" User ID Tensor - {eg_uid.size()} ")
print(f" Sequence of Items - {eg_items.size()} ")
print(f" Sequence of Masks - {eg_masks.size()} ")
print(f" Actual Sequence Length (before padding) - {eg_lens.size()} ")
print(f" Position Index (Latest Item = 0, Oldest Item can be up to 9, Padded Index = 0) - {eg_pos.size()} ")
print(f" Labels - {eg_labels.size()} ")

In [None]:
# Take the first sample of every tensor in the batch. Batch fields follow the
# order returned by common_seq: (uid, browsed_ids, masks, seq_lens, pos_idx, labels).
sample_uid, sample_items, sample_mask, sample_len, sample_pos, sample_label = (field[0] for field in eg)

print("Actual tensor of - ")
print(f" User ID - {sample_uid} ")
print(f" Sequence of Items - {sample_items} ")
print(f" Sequence of Masks - {sample_mask} ")
print(f" Actual Sequence Length (before padding) - {sample_len} ")
print(f" Position Index (Latest Item = 0, Oldest Item can be up to 9, Padded Index = 0) - {sample_pos} ")
print(f" Labels - {sample_label} ")

## Create Heterogeneous Global Graph

### First we have to create 4 types of edges
* In and out connections of an item within a session (the previous item and the next item) — these count as two edge types
* User similarity
* Item similarity (based on sessions)

In [None]:
def sample_relation(num=None, sample_size=20):
    """Build and pickle the most frequent in/out neighbours of every item.

    Consecutive item pairs in the training sessions are counted. For every
    item id ``t >= 1``, ``adj_out[t]`` lists the items that most often follow
    ``t`` and ``adj_in[t]`` the items that most often precede it, most
    frequent first and truncated to ``sample_size`` entries. Both lists are
    written to ``data/adj_out.pkl`` and ``data/adj_in.pkl``.

    Args:
        num: size of the adjacency tables (largest item id + 1). When omitted
            it is derived from the training data.
        sample_size: maximum number of neighbours kept per item and direction.
    """
    with open('data/train.pkl', 'rb') as f:
        graph = pickle.load(f)

    if num is None:
        # Item ids are used as list indices, so the tables need max_id + 1 slots.
        num = 1 + max(
            (item for seqs in graph.values() for seq in seqs for item in seq),
            default=0,
        )

    adj1 = [dict() for _ in range(num)]  # adj1[a][b]: how often b directly follows a
    adj2 = [dict() for _ in range(num)]  # adj2[b][a]: how often a directly precedes b

    for u in tqdm(graph):
        for seq in graph[u]:
            for prev_item, next_item in zip(seq, seq[1:]):
                adj1[prev_item][next_item] = adj1[prev_item].get(next_item, 0) + 1
                adj2[next_item][prev_item] = adj2[next_item].get(prev_item, 0) + 1

    adj_in = [[] for _ in range(num)]
    adj_out = [[] for _ in range(num)]

    # Index 0 is left empty: the loops start at item id 1.
    for t in range(1, num):
        by_count_out = sorted(adj1[t].items(), reverse=True, key=lambda x: x[1])
        by_count_in = sorted(adj2[t].items(), reverse=True, key=lambda x: x[1])
        # Edge sampling: keep only the `sample_size` most frequent neighbours.
        adj_out[t] = [item for item, _ in by_count_out][:sample_size]
        adj_in[t] = [item for item, _ in by_count_in][:sample_size]

    if num > 1:
        print(f"Items which most frequently lies to the left (previous time step) of Item 1 : {adj_in[1]} ")
        print(f"Items which most frequently lies to the right (next time step) of Item 1 : {adj_out[1]} ")

    with open('data/adj_in.pkl', 'wb') as f:
        pickle.dump(adj_in, f)

    with open('data/adj_out.pkl', 'wb') as f:
        pickle.dump(adj_out, f)

### User similarity

In [None]:
def userCF(K=100):
    """Compute user-user similarity from shared items and pickle the top-K.

    Two users are related when they interacted with a common item. Each
    shared item contributes ``1 / (number of users of that item)``; the sum
    is then divided by ``sqrt(|items(u)| * |items(v)|)``. For every user the
    ``K`` most similar users are stored as ``(user, score)`` pairs in
    ``data/user_sim_matrix.pkl``.

    Args:
        K: number of neighbours kept per user.
    """
    vid_user = {}         # item id -> set of users who interacted with it
    user_sim_matrix = {}  # user -> {other user -> score}
    uid_vcount = {}       # user -> set of all items the user interacted with

    with open('data/train.pkl', 'rb') as f:
        session_data = pickle.load(f)

    for uid in tqdm(session_data):
        # Initialise once per user: resetting it for every session would keep
        # only the items of the user's last session.
        uid_vcount[uid] = set()
        for seq in session_data[uid]:
            for vid in seq:
                vid_user.setdefault(vid, set()).add(uid)
                uid_vcount[uid].add(vid)

    for vid, users in tqdm(vid_user.items()):
        for u in users:
            row = user_sim_matrix.setdefault(u, dict())
            for v in users:
                if u == v:
                    continue
                row[v] = row.get(v, 0) + 1/len(users)

    for u, related_users in tqdm(user_sim_matrix.items()):
        for v, count in related_users.items():
            user_sim_matrix[u][v] = count / math.sqrt(len(uid_vcount[u]) * len(uid_vcount[v]))

    user_topK = {}
    for user in user_sim_matrix:
        user_topK[user] = sorted(user_sim_matrix[user].items(), key=itemgetter(1), reverse=True)[:K]

    with open('data/user_sim_matrix.pkl', 'wb') as f:
        pickle.dump(user_topK, f)

In [None]:
def itemCF_by_Session(K=200):
    """
        Build an item-item similarity table from session co-occurrence and
        pickle the K best neighbours of every item.

        Every session that contains both items adds ``1 / (distinct items in
        the session)`` to the pair; the sum is then divided by the square root
        of the product of the two items' session counts. The result maps each
        item to a list of ``(item, score)`` pairs, best first, and is written
        to ``data/item_sim_matrix.pkl``.
    """
    with open('data/train.pkl', 'rb') as f:
        session_data = pickle.load(f)

    sess_item = {}   # running session number -> set of items in that session
    vid_ucount = {}  # item -> set of session numbers containing it
    sess_cnt = 0
    for uid in tqdm(session_data):
        for seq in session_data[uid]:
            sess_cnt += 1
            members = sess_item.setdefault(sess_cnt, set())
            for vid in seq:
                members.add(vid)
                vid_ucount.setdefault(vid, set()).add(sess_cnt)

    item_sim_matrix = {}
    for sess, items in tqdm(sess_item.items()):
        for u in items:
            row = item_sim_matrix.setdefault(u, dict())
            for v in items:
                if u != v:
                    row[v] = row.get(v, 0) + 1/len(items)

    # Normalise in place; only values of existing keys are replaced.
    for u, related_items in tqdm(item_sim_matrix.items()):
        for v in related_items:
            related_items[v] = related_items[v] / math.sqrt(len(vid_ucount[u]) * len(vid_ucount[v]))

    item_topK = {
        item: sorted(neighbours.items(), key=itemgetter(1), reverse=True)[:K]
        for item, neighbours in item_sim_matrix.items()
    }

    with open('data/item_sim_matrix.pkl', 'wb') as f:
        pickle.dump(item_topK, f)

In [None]:
sample_relation()
userCF()
itemCF_by_Session()

### Create Heterogeneous Global Graph

In [None]:
def uui_graph(sample_size, topK, add_u=True, add_v=True):
    """Assemble the global user/item graph from the pickled relation files.

    Items keep their own ids as node ids; user node ids are shifted by
    ``item_num`` so both kinds of node share one id space. Every edge is
    added in both directions and a self loop is added to each node.

    Edge sources:
      * item-item transitions: the sampled ``adj_in`` neighbours plus every
        consecutive pair in the training sessions;
      * user-item interactions from the training sessions;
      * user-user similarity (``add_u``): top ``topK`` neighbours per user;
      * item-item similarity (``add_v``): similar items that are not already
        in/out neighbours of the item.

    Args:
        sample_size: accepted for interface compatibility; not used here.
        topK: number of similarity neighbours considered per node.
        add_u: whether to add user-user similarity edges.
        add_v: whether to add item-item similarity edges.

    Returns:
        ``(G, item_num, pre, nxt, dst_u, src_v, u_src, u_dst, topv_src,
        topv_dst)``; ``dst_u``, ``u_src`` and ``u_dst`` are already shifted
        by ``item_num``.
    """
    with open('data/train.pkl', 'rb') as f:
        graph = pickle.load(f)
    with open('data/adj_in.pkl', 'rb') as f:
        adj_in = pickle.load(f)
    with open('data/adj_out.pkl', 'rb') as f:
        adj_out = pickle.load(f)
    with open('data/user_sim_matrix.pkl', 'rb') as f:
        user_sim_matrix = pickle.load(f)
    with open('data/item_sim_matrix.pkl', 'rb') as f:
        item_sim_matrix = pickle.load(f)

    # Item-item transition edges from the sampled in-neighbours.
    pre = []
    nxt = []
    for i in range(1, len(adj_in)):
        for j in adj_in[i]:
            pre.append(i)
            nxt.append(j)

    # Transitions and user-item interactions taken from the raw sessions.
    src_v = []
    dst_u = []
    for u in tqdm(graph):
        for seq in graph[u]:
            pre += seq[:-1]
            nxt += seq[1:]
            dst_u += [u for _ in seq]
            src_v += seq

    # Item similarity edges, skipping pairs already linked by a transition.
    topv_src = []
    topv_dst = []
    for v in tqdm(item_sim_matrix):
        exclusion = adj_in[v] + adj_out[v]
        excluded = set(exclusion)
        # No more candidates are considered than the item has transition neighbours.
        for vid, value in item_sim_matrix[v][:topK][:int(len(exclusion))]:
            if vid not in excluded:
                topv_src.append(v)
                topv_dst.append(vid)

    # User similarity edges.
    u_src = []
    u_dst = []
    for u in tqdm(user_sim_matrix):
        for uid, value in user_sim_matrix[u][:topK]:
            u_src.append(u)
            u_dst.append(uid)

    item_num = max(max(pre), max(nxt)) + 1

    # Move user ids behind the item id range.
    u_src = [i+item_num for i in u_src]
    u_dst = [i+item_num for i in u_dst]
    dst_u = [i+item_num for i in dst_u]

    G = dgl.graph((pre, nxt))
    G.add_edges(nxt, pre)
    G.add_edges(dst_u, src_v)
    G.add_edges(src_v, dst_u)

    if add_u:
        G.add_edges(u_src, u_dst)
        G.add_edges(u_dst, u_src)

    if add_v:
        G.add_edges(topv_src, topv_dst)
        G.add_edges(topv_dst, topv_src)

    G = dgl.add_self_loop(G)

    return G, item_num, pre, nxt, dst_u, src_v, u_src, u_dst, topv_src, topv_dst
            

In [None]:
G,item_num, pre, nxt, dst_u, src_v, u_src, u_dst, topv_src, topv_dst = uui_graph(SZ, topK = 20, add_u = True, add_v = True)

## HG_GNN