In [1]:
%pip install -qq dgl-cu118 dglgo -f https://data.dgl.ai/wheels/cu118/repo.html &>/dev/null

Note: you may need to restart the kernel to use updated packages.


The syntax of the command is incorrect.


In [2]:
import pandas as pd
import numpy as np
from pandas import Timedelta
import os
from tqdm.notebook import tqdm
import pickle
from torch.utils.data import TensorDataset, DataLoader
import math
from operator import itemgetter
import dgl
pd.set_option('display.max_rows', 10000)

import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.nn.pytorch as dglnn
import dgl.function as FN
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
DIRECTORY ='content/data'

### Dataset
[Last-FM](https://www.kaggle.com/datasets/japarra27/lastfm-dataset)

[Heterogeneous Global Graph Neural Networks for Personalized
Session-based Recommendation](https://arxiv.org/pdf/2107.03813.pdf)

[Github](https://github.com/0215Arthur/HG-GNN)

In [4]:
data = pd.read_parquet(f'{DIRECTORY}/lastfm_union.parquet')
len(data)

19098852

In [5]:
data.head()

Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name,gender,age,country,registered
0,user_000001,2009-05-04 23:08:57+00:00,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007,M,,JAPAN,2006-08-13
1,user_000001,2009-05-04 13:54:10+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,åæ¬é¾ä¸,,Composition 0919 (Live_2009_4_15),M,,JAPAN,2006-08-13
2,user_000001,2009-05-04 13:52:04+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,åæ¬é¾ä¸,,Mc2 (Live_2009_4_15),M,,JAPAN,2006-08-13
3,user_000001,2009-05-04 13:42:52+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,åæ¬é¾ä¸,,Hibari (Live_2009_4_15),M,,JAPAN,2006-08-13
4,user_000001,2009-05-04 13:42:11+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,åæ¬é¾ä¸,,Mc1 (Live_2009_4_15),M,,JAPAN,2006-08-13


In [6]:
print(f" # Unique User : {data['user_id'].nunique()}, # Unique Artist : {data['artist_id'].nunique()}, # Unique Track : {data['track_name'].nunique()}")

 # Unique User : 992, # Unique Artist : 107295, # Unique Track : 1083471


## Create session with interval of 6 hours

Utilities function

In [7]:
def get_session_id(df, session):
    # If the next row have different user_id or the time difference is greater than session, then it is a new session
    df_prev = df.shift(1)
    is_new_session = (df['user_id'] != df_prev['user_id']) | (df['timestamp'] - df_prev['timestamp'] > session)
    session_id = is_new_session.cumsum()-1
    return session_id

def group_session(df, session):
    df['session_id'] = get_session_id(df, session)
    return df

def filter_short_session(df, min_session_length=2):
    session_length = df.groupby('session_id').size()
    session_length = session_length[session_length >= min_session_length]
    return df[df['session_id'].isin(session_length.index)]

def filter_infrequent_item(df, min_item_support=5):
    item_support = df.groupby('item_id').size()
    item_support = item_support[item_support >= min_item_support]
    return df[df['item_id'].isin(item_support.index)]

def filter_until_ok(df, min_session_length=2, min_item_support=5):
    while True:
        before = len(df)
        df = filter_short_session(df, min_session_length)
        df = filter_infrequent_item(df, min_item_support)
        after = len(df)
        if before == after:
            break
    return df

def trucate_session(df, session_length=20, is_sorted=True):
    if not is_sorted:
        df = df.sort_values(['session_id', 'timestamp'])
    item_idx = df.groupby('session_id').cumcount()
    return df[item_idx < session_length]

def update_id(df, field):
    labels = pd.factorize(df[field])[0]
    kwargs = {field: labels}
    df = df.assign(**kwargs)
    return df


def remove_immediate_repeats(df):
    df_prev = df.shift()
    is_not_repeat = (df['session_id'] != df_prev['session_id']) | (df['item_id'] != df_prev['item_id'])
    return df[is_not_repeat]
    
def reorder_sessions(df):
    df_endtime  = df.groupby('session_id')['timestamp'].max().sort_value().reset_index()
    oid2nid = dict(zip(df_endtime['session_id'], df_endtime.index))
    df['session_id'].map(oid2nid, inplace=True)
    df.sort_values(['session_id', 'timestamp'], inplace=True)
    return df

def keep_top_n_items(df, n=40000):
    item_support = df.groupby('item_id').size()
    top_n_items = item_support.nlargest(n).index
    return df[df['item_id'].isin(top_n_items)]

def train_test_split(df, test_size=0.2):
    endtime  = df.groupby('session_id')['timestamp'].max().sort_values()
    num_test = int(len(endtime) * test_size)
    test_sessions = endtime.index[-num_test:]
    df_train = df[~df['session_id'].isin(test_sessions)]
    df_test = df[df['session_id'].isin(test_sessions)]
    return df_train, df_test
    

In [8]:
interval = Timedelta(hours=6)
n = 40000

data = data[['user_id', 'artist_id', 'timestamp']]
data.columns = ['user_id', 'item_id', 'timestamp']

data.dropna(inplace=True)
data = update_id(data, 'user_id')
data = update_id(data, 'item_id')

data.sort_values(['user_id', 'timestamp'], inplace=True)
data = group_session(data, interval)

data = remove_immediate_repeats(data)
data = trucate_session(data, 20)

data = keep_top_n_items(data, n)
data = filter_until_ok(data)

In [9]:
data.head(10)

Unnamed: 0,user_id,item_id,timestamp,session_id
16684,0,186,2006-08-13 13:59:20+00:00,0
16681,0,550,2006-08-13 14:17:40+00:00,0
16680,0,551,2006-08-13 14:19:06+00:00,0
16679,0,440,2006-08-13 14:23:03+00:00,0
16677,0,274,2006-08-13 14:55:14+00:00,0
16676,0,72,2006-08-13 14:59:59+00:00,0
16675,0,250,2006-08-13 15:05:20+00:00,0
16673,0,577,2006-08-13 15:12:12+00:00,0
16672,0,525,2006-08-13 15:17:35+00:00,0
16671,0,280,2006-08-13 15:23:08+00:00,0


In [10]:
data.tail(10)

Unnamed: 0,user_id,item_id,timestamp,session_id
19080583,991,64472,2009-05-02 00:03:08+00:00,427539
19080569,991,2486,2009-05-02 01:07:00+00:00,427539
19080560,991,12700,2009-05-02 04:00:17+00:00,427539
19080550,991,829,2009-05-02 04:34:38+00:00,427539
19080542,991,3331,2009-05-02 05:14:20+00:00,427539
19080520,991,38,2009-05-02 17:33:03+00:00,427540
19080499,991,17642,2009-05-02 19:51:15+00:00,427540
19080498,991,17805,2009-05-02 19:57:17+00:00,427540
19080485,991,38713,2009-05-03 20:28:10+00:00,427542
19080484,991,202,2009-05-04 00:27:31+00:00,427542


In [11]:
print(f"#Users : {data['user_id'].nunique()} #Items : {data['item_id'].nunique()} #Sessions : {data['session_id'].nunique()} ")

#Users : 989 #Items : 39835 #Sessions : 354770 


In [12]:
data.to_csv(f'{DIRECTORY}/data.csv', sep='\t', header=False, index=False)

In [13]:
def _aggregate_session(df):
    res = []
    for uid, group in df.groupby('user_id'):
        res += group.groupby('session_id')['item_id'].agg(list).tolist()
    return res

def _aggregate_df(df):
    res = dict()
    for uid, group in df.groupby('user_id'):
        res[uid] = group.groupby('session_id')['item_id'].agg(list).tolist()
    
    return res
        
def split_data(val_ratio = 0.2, test_ratio = 0.2):
    data = pd.read_csv('data/data.csv', sep='\t', names=['user_id', 'item_id', 'timestamp','session_id'])
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    
    df_train, df_test = train_test_split(data, test_size=test_ratio)
    
    df_test = df_test[df_test['item_id'].isin(df_train['item_id'].unique())]
    df_test = filter_short_session(df_test)


    # update itemId
    train_itemId_new, uniques = pd.factorize(df_train['item_id'])
    df_train = df_train.assign(item_id=train_itemId_new)
    oid2nid = {oid: i for i, oid in enumerate(uniques)}
    test_itemId_new = df_test['item_id'].map(oid2nid)
    df_test = df_test.assign(item_id=test_itemId_new)
    
    df_train['user_id']+=1
    df_train['item_id']+=1
    df_test['user_id']+=1
    df_test['item_id']+=1
    
    df_test = df_test.reset_index(drop=True)
    df_val = df_test.sample(frac=val_ratio, random_state=42)
    part_test = df_test[~df_test.index.isin(df_val.index)]
    
    with open(f'{DIRECTORY}/train.pkl', 'wb') as f:
        pickle.dump(_aggregate_df(df_train), f)
        
    with open(f'{DIRECTORY}/val.pkl', 'wb') as f:
        pickle.dump(_aggregate_df(df_val), f)
        
    with open(f'{DIRECTORY}/test.pkl', 'wb') as f:
        pickle.dump(_aggregate_df(part_test), f)
        
    with open(f'{DIRECTORY}/all_test.pkl', 'wb') as f:
        pickle.dump(_aggregate_df(df_test), f)
    

In [14]:
%%time
split_data()

CPU times: total: 2.17 s
Wall time: 11.9 s


## Create Label and DataLoader

In [15]:
SZ = 12
SEQ_LEN = 10 ## Window Size to create a training Sequence
BATCH_SIZE = 512

In [16]:
def common_seq(data_list):
    # final_seqs = [(user_id, seq, [next_item])]
    uid = []

    masks = []
    labels =[]
    browsed_ids = []
    temp_browsed_id = [0 for _ in range(SEQ_LEN)]
    pos_idx = []
    seq_lens = []
    
    final_seq = []
    
    for u in tqdm(data_list):
        u_seqs = data_list[u]
        for seq in u_seqs:
            for i in range(1, len(seq)):
                
                temp_seq = seq[-i-SEQ_LEN:-i]
                len_seq = len(temp_seq)
                mask = [1]*len_seq + [0]*(SEQ_LEN-len_seq)
                pos_id = [len_seq-1-i for i in range(len_seq)]+ [0]*(SEQ_LEN-len_seq)
                browsed_id = temp_browsed_id.copy()
                browsed_id[:len_seq] = temp_seq
                
                masks.append(mask)
                pos_idx.append(pos_id)
                browsed_ids.append(browsed_id)
                labels.append([int(seq[-i])])
                uid.append([int(u)])
                seq_lens.append(len_seq)
                
                final_seq.append((u, seq[:-i], [seq[-i]]))
        
    labels = torch.tensor(labels, dtype=torch.long)
    uid = torch.tensor(uid, dtype=torch.long)
    masks = torch.tensor(masks, dtype=torch.bool)  
    browsed_ids = torch.tensor(browsed_ids, dtype=torch.long)
    pos_idx = torch.tensor(pos_idx, dtype=torch.long)
    seq_lens = torch.tensor(seq_lens, dtype=torch.long)
        
    return final_seq, (uid, browsed_ids, masks,seq_lens, pos_idx, labels)

In [17]:
with open(f'{DIRECTORY}/train.pkl', 'rb') as f:
    train = pickle.load(f)

with open(f'{DIRECTORY}/test.pkl', 'rb') as f:
    test = pickle.load(f)
    
test_seq, test_data = common_seq(test)
train_seq, train_data = common_seq(train)

# with open('data/train_seq.pkl', 'wb') as f:
#     pickle.dump(train_seq, f)
# with open('data/test_seq.pkl', 'wb') as f:
#     pickle.dump(test_seq, f)

  0%|          | 0/796 [00:00<?, ?it/s]

  0%|          | 0/897 [00:00<?, ?it/s]

In [18]:
train_dataloader = DataLoader(
    dataset=TensorDataset(*train_data),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

test_dataloader = DataLoader(
    dataset=TensorDataset(*test_data),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

print(f" Length of Training DataLoader is {len(train_dataloader)} & Test DataLoader {len(test_dataloader)} ")

 Length of Training DataLoader is 5839 & Test DataLoader 1084 


### Check for an iteration

In [19]:
eg = next(iter(train_dataloader))
print(f"Number of elements in the example tuple {len(eg)} - corresponds to uid | browsed_ids | mask | seq_len | label | pos_idx ")

print("Size of - ")
print(f" User ID Tensor - {eg[0].size()} ") 
print(f" Sequence of Items - {eg[1].size()} ") 
print(f" Sequence of Masks - {eg[2].size()} ") 
print(f" Actual Sequence Length (before padding) - {eg[3].size()} ") 
print(f" Labels - {eg[5].size()} ") 
print(f" Position Index (Oldest Index = 0, Latest Index can be till 9, Padded Index = 0) - {eg[4].size()} ")

Number of elements in the example tuple 6 - corresponds to uid | browsed_ids | mask | seq_len | label | pos_idx 
Size of - 
 User ID Tensor - torch.Size([512, 1]) 
 Sequence of Items - torch.Size([512, 10]) 
 Sequence of Masks - torch.Size([512, 10]) 
 Actual Sequence Length (before padding) - torch.Size([512]) 
 Labels - torch.Size([512, 1]) 
 Position Index (Oldest Index = 0, Latest Index can be till 9, Padded Index = 0) - torch.Size([512, 10]) 


In [20]:
print("Actual tensor of - ")
print(f" User ID - {eg[0][0]} ") 
print(f" Sequence of Items - {eg[0][1]} ") 
print(f" Sequence of Masks - {eg[0][2]} ") 
print(f" Actual Sequence Length (before padding) - {eg[0][3]} ") 
print(f" Labels - {eg[0][5]} ") 
print(f" Position Index (Oldest Index = 0, Latest Index can be till 9, Padded Index = 0) - {eg[0][4]} ")

Actual tensor of - 
 User ID - tensor([725]) 
 Sequence of Items - tensor([89]) 
 Sequence of Masks - tensor([273]) 
 Actual Sequence Length (before padding) - tensor([367]) 
 Labels - tensor([913]) 
 Position Index (Oldest Index = 0, Latest Index can be till 9, Padded Index = 0) - tensor([773]) 


## Create Heterogenous Global Graph

### First we have to create 4 type of edges
* In and out connection (the prev node and the forward node) of a item in session
* User similarity
* Item similarity (based on session)

In [21]:
def sample_relation(num, sample_size=20):
    
    adj1 = [dict() for _ in range(num)]
    adj2 = [dict() for _ in range(num)]
    adj_in = [[] for _ in range(num)]
    adj_out = [[] for _ in range(num)]

    
    with open('data/train.pkl', 'rb') as f:
        graph = pickle.load(f)
    
    for u in tqdm(graph):
        u_seqs = graph[u]
        for seq in u_seqs:
            for i in range(1, len(seq)):
                if seq[i] not in adj1[seq[i-1]]:
                    adj1[seq[i-1]][seq[i]] = 1
                else:
                    adj1[seq[i-1]][seq[i]] += 1

                if seq[i-1] not in adj2[seq[i]]:
                    adj2[seq[i]][seq[i-1]] = 1
                else:
                    adj2[seq[i]][seq[i-1]] += 1
                    
    weights = [[] for _ in range(num)]
    
    for t in range(1, num):
        x = [v for v in sorted(adj1[t].items(), reverse=True, key=lambda x: x[1])]
        adj_out[t] = [v[0] for v in x]

    for t in range(1, num):
        x = [v for v in sorted(adj2[t].items(), reverse=True, key=lambda x: x[1])]
        adj_in[t] = [v[0] for v in x]

    # edge sampling 
    for i in range(1, num):
        adj_in[i] = adj_in[i][:sample_size]
    for i in range(1, num):
        adj_out[i] = adj_out[i][:sample_size]
        
    print(f"Items which most frequently lies to the left (previous time step) of Item 1 : {adj_in[1]} ")
    print(f"Items which most frequently lies to the right (next time step) of Item 1 : {adj_out[1]} ")
        
    with open(f'{DIRECTORY}/adj_in.pkl', 'wb') as f:
        pickle.dump(adj_in, f)
    
    with open(f'{DIRECTORY}/adj_out.pkl', 'wb') as f:
        pickle.dump(adj_out, f)

### User similarity

In [22]:
def userCF(K=100):
    
    vid_user = {}
    user_sim_matrix ={}
    uid_vcount = {}
    
    with open('data/train.pkl', 'rb') as f:
        session_data = pickle.load(f)
        
    for uid in tqdm(session_data):
        for seq in session_data[uid]:
            uid_vcount[uid] = set()
            for vid in seq:
                if vid not in vid_user:
                    vid_user[vid] = set()
                vid_user[vid].add(uid)
                uid_vcount[uid].add(vid)
                
    for vid, users in tqdm(vid_user.items()):
        for u in users:
            if u not in user_sim_matrix:
                user_sim_matrix[u] = dict()
            for v in users:
                if u == v:
                    continue
                if v not in user_sim_matrix[u]:
                    user_sim_matrix[u][v] = 0
                user_sim_matrix[u][v] += 1/len(users)
                
    for u, related_users in tqdm(user_sim_matrix.items()):
        for v, count in related_users.items():
            user_sim_matrix[u][v] = count / math.sqrt(len(uid_vcount[u]) * len(uid_vcount[v]))
    
    user_topK = {}     
            
    for user in user_sim_matrix:
        user_topK[user] = sorted(user_sim_matrix[user].items(), key=itemgetter(1), reverse=True)[:K]
        
    with open(f'{DIRECTORY}/user_sim_matrix.pkl', 'wb') as f:
        pickle.dump(user_topK, f)

In [23]:
def itemCF_by_Session(K=200):
    """
        calculate item similarity matrix by session (kinda like TF-IDF)
    """
    
    sess_item = {}
    item_sim_matrix = {}
    vid_ucount = {}
    sess_cnt = 0
    
    with open('data/train.pkl', 'rb') as f:
        session_data = pickle.load(f)
        
    for uid in tqdm(session_data):
        for seq in session_data[uid]:
            sess_cnt += 1
            sess_item[sess_cnt] = set()
            for vid in seq:
                sess_item[sess_cnt].add(vid)
                if vid not in vid_ucount:
                    vid_ucount[vid] = set()
                vid_ucount[vid].add(sess_cnt)
    
    for sess, items in tqdm(sess_item.items()):
        for u in items:
            if u not in item_sim_matrix:
                item_sim_matrix[u] = dict()
            for v in items:
                if u == v:
                    continue
                if v not in item_sim_matrix[u]:
                    item_sim_matrix[u][v] = 0
                item_sim_matrix[u][v] += 1/len(items)
                
    for u, related_items in tqdm(item_sim_matrix.items()):
        for v, count in related_items.items():
            item_sim_matrix[u][v] = count / math.sqrt(len(vid_ucount[u]) * len(vid_ucount[v]))
            
    item_topK = {}
    for item in item_sim_matrix:
        item_topK[item] = sorted(item_sim_matrix[item].items(), key=itemgetter(1), reverse=True)[:K]
        
    with open(f'{DIRECTORY}/item_sim_matrix.pkl', 'wb') as f:
        pickle.dump(item_topK, f)

In [24]:
sample_relation(num=n,sample_size=SZ)
userCF()
itemCF_by_Session()

  0%|          | 0/897 [00:00<?, ?it/s]

Items which most frequently lies to the left (previous time step) of Item 1 : [88, 55, 68, 85, 13, 80, 107, 90] 
Items which most frequently lies to the right (next time step) of Item 1 : [7, 2, 38, 86, 99, 120, 125] 


  0%|          | 0/897 [00:00<?, ?it/s]

  0%|          | 0/38692 [00:00<?, ?it/s]

  0%|          | 0/897 [00:00<?, ?it/s]

  0%|          | 0/897 [00:00<?, ?it/s]

  0%|          | 0/283816 [00:00<?, ?it/s]

  0%|          | 0/38692 [00:00<?, ?it/s]

### Create Heterogenous Global Graph

In [45]:
def uui_graph( topK,add_u=True, add_v=True):
    
    with open(f'{DIRECTORY}/train.pkl', 'rb') as f:
        graph = pickle.load(f)
    with open(f'{DIRECTORY}/adj_in.pkl', 'rb') as f:
        adj_in = pickle.load(f)
    with open(f'{DIRECTORY}/adj_out.pkl', 'rb') as f:
        adj_out = pickle.load(f)
    with open(f'{DIRECTORY}/user_sim_matrix.pkl', 'rb') as f:
        user_sim_matrix = pickle.load(f)
    with open(f'{DIRECTORY}/item_sim_matrix.pkl', 'rb') as f:
        item_sim_matrix = pickle.load(f)
        
    pre = []
    nxt = []
    src_v = []
    dst_u = []
    
    for i in range(1, len(adj_in)):
        _pre = []
        _nxt = []
        
        for j in adj_in[i]:
            _pre.append(i)
            _nxt.append(j)
        
        pre += _pre
        nxt += _nxt
    
    o_pre = []
    o_nxt = []
    
    for i in range(1, len(adj_out)):
        _pre = []
        _nxt = []
        
        for j in adj_out[i]:
            _pre.append(i)
            _nxt.append(j)
        
        o_pre += _pre
        o_nxt += _nxt
        
    for u in tqdm(graph):
        for seq in graph[u]:
            pre += seq[:-1]
            nxt += seq[1:]
            dst_u += [u for _ in seq]
            src_v += seq
            
    topv_src = []
    topv_dst = []
    
    for v in tqdm(item_sim_matrix):
        tmp_src =[]
        tmp_dst =[]
        
        exclusion = adj_in[v] + adj_out[v]
        for vid, value in item_sim_matrix[v][:topK][:int(len(exclusion))]:
            if vid not in exclusion:
                tmp_src.append(v)
                tmp_dst.append(vid)
                
        topv_src += tmp_src
        topv_dst += tmp_dst
        
    u_src = []
    u_dst = []
    
    for u in tqdm(user_sim_matrix):
        tmp_src =[]
        tmp_dst =[]
        
        for uid, value in user_sim_matrix[u][:topK]:
            tmp_src.append(u)
            tmp_dst.append(uid)
                
        u_src += tmp_src
        u_dst += tmp_dst
        
    item_num = max(max(pre), max(nxt)) + 1
    user_num = max(max(u_src), max(u_dst)) 
    
    u_src = [i+item_num for i in u_src]
    u_dst = [i+item_num for i in u_dst]
    dst_u = [i+item_num for i in dst_u]
    
    pre = torch.tensor(pre, dtype=torch.long, device=device)
    nxt = torch.tensor(nxt, dtype=torch.long, device=device)
    dst_u = torch.tensor(dst_u, dtype=torch.long, device=device)
    src_v = torch.tensor(src_v, dtype=torch.long, device=device)
    u_src = torch.tensor(u_src, dtype=torch.long, device=device)
    u_dst = torch.tensor(u_dst, dtype=torch.long, device=device)
    topv_src = torch.tensor(topv_src, dtype=torch.long, device=device)
    topv_dst = torch.tensor(topv_dst, dtype=torch.long, device=device)
    
    G = dgl.graph((pre,nxt))
    G.add_edges(nxt, pre)
    G.add_edges(dst_u, src_v)
    G.add_edges(src_v, dst_u)
    
    if add_u:
        G.add_edges(u_src, u_dst)
        G.add_edges(u_dst, u_src)
        
    if add_v:
        G.add_edges(topv_src, topv_dst)
        G.add_edges(topv_dst, topv_src)
        
    G = dgl.add_self_loop(G)
    
    return G, item_num,pre, nxt, dst_u, src_v, u_src, u_dst, topv_src, topv_dst   

In [46]:
G,item_num, pre, nxt, dst_u, src_v, u_src, u_dst, topv_src, topv_dst = uui_graph(topK = 20, add_u = True, add_v = True)

  0%|          | 0/897 [00:00<?, ?it/s]

  0%|          | 0/38692 [00:00<?, ?it/s]

  0%|          | 0/897 [00:00<?, ?it/s]

DGLError: [01:13:22] C:\Users\peizhou\workspace\DGL_scripts\release\win-64\dgl\src\array\array.cc:50: Operator Range does not support cuda device.

## HG_GNN

![ss](image/hg_gnn.png)

In [47]:
class HG_GNN(nn.Module):
    def __init__(self, G, config, item_num, max_seq_len=10, max_sess=10):
        super(HG_GNN, self).__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.G = G.to(self.device)
        self.item_num = item_num
        self.max_seq_len = max_seq_len
        self.max_sess = max_sess
        self.config = config
        self.hidden_size = config['hidden_size']
        self.em_size = config['embed_size']
        
        self.pos_embedding = nn.Embedding(200, self.em_size)
        self.v2e = nn.Embedding(G.number_of_nodes(), self.em_size)
        
        self.conv1 =dglnn.SAGEConv(self.em_size, self.em_size, 'mean')
        
        dropout = config['dropout']
        
        self.emb_dropout = nn.Dropout(dropout)

        self.sigmoid_concat = nn.Sequential(
            nn.Linear(self.em_size*2, 1),
            nn.Sigmoid()
        )
        
        self.w_1 = nn.Parameter(torch.Tensor(2*self.em_size, self.em_size))
        self.w_2 = nn.Parameter(torch.Tensor(self.em_size, 1))
        
        self.glu1 = nn.Linear(self.em_size, self.em_size)
        self.glu2 = nn.Linear(self.em_size, self.em_size, bias=False)
        
        self.w_3 = nn.Parameter(torch.Tensor(self.em_size, self.em_size))
        self.w_4 = nn.Parameter(torch.Tensor(self.em_size, 1))
        
        self.glu3 = nn.Linear(self.em_size, self.em_size)
        self.glu4 = nn.Linear(self.em_size, self.em_size, bias=False)
        
        self.reset_parameters()
                
    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.em_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)
            
    def compute_hidden_vector(self, hidden, mask, pos_idx):
        
        mask = mask.unsqueeze(-1)
        seq_len = hidden.size(1)
        pos_embedding = self.pos_embedding(pos_idx)
        tmp = torch.sum(hidden*mask, 1)/torch.sum(mask, 1)
        
        hs = tmp.unsqueeze(1).repeat(1, seq_len, 1)
        nh = torch.matmul(torch.cat([pos_embedding,hidden],-1),self.w_1)
        nh = torch.tanh(nh)
        nh = torch.sigmoid(self.glu1(nh)+self.glu2(hs))
        
        beta = torch.matmul(nh, self.w_2) * mask
        
        select = torch.sum(beta*hidden, 1)
        
        return select, tmp
        
    def sess_user_vector(self, user_vec, note_embeds, mask):
        
        mask = mask.unsqueeze(-1)
        
        hs = user_vec.repeat(1, self.mask.size(1), 1)
        nh = torch.matmul(note_embeds,self.w_3)
        nh = torch.tanh(nh)
        nh = torch.sigmoid(self.glu3(nh)+self.glu4(hs))
        
        beta = torch.matmul(nh, self.w_4) * mask
        select = torch.sum(beta*note_embeds, 1)
        
        return select
        
    def forward(self,user, seq, mask, seq_len, pos_idx):
        """
        seq(bs*L)
        seq: bs*L
        his_ids: bs * M
        mask:
        seq_len(bs)
        """
        
        user = user + self.item_num
        v2e_all = self.v2e(torch.arange(0, self.G.number_of_nodes()).long().to(self.device))
        h1 = self.conv1(self.G,self.emb_dropout(v2e_all))
        h1 = F.relu(h1)
        
        bs = seq.size(0)
        L = seq.size(1)
        
        node_list = seq
        nodes_embeds = (h1[node_list] + self.v2e(node_list))/2
        seq_embeds = (h1[user] + self.v2e(user))/2

        nodes_embeds = nodes_embeds.view(bs, L, -1)
        
        sess_vec, avg_sess = self.compute_hidden_vector(nodes_embeds, mask, pos_idx)
        sess_user = self.sess_user_vector(seq_embeds, nodes_embeds, mask)
        
        alpha = self.sigmoid_concat(torch.cat([sess_user, sess_vec], 1))
        
        seq_embeds = alpha * sess_user + (1-alpha) * sess_vec
        scores = torch.matmul(seq_embeds, v2e_all.T)
        
        return scores

In [48]:
config = {
'embed_size' : 128,
'learning_rate' : 0.001,
'hidden_size' : 64,
'batch_size' : 512,
'epoch' : 10,
'gnn_layer_size' : 2,
'patience' : 5,
'save_flag' : 0,
'dropout' : 0.5, 
'comment' : "",
'lr_dc' : 0.1
}

In [29]:
def metrics(res,labels):
    # res: (bs, seq_len)
    res = torch.concat(res)
    acc_ar = (res == labels)
    acc = acc_ar.sum()
    
    rank = torch.argmax(acc_ar, -1) +1
    mrr = acc/rank.mean()
    ndcg = (1/torch.log2(rank+1)).mean()
    
    return acc.mean(), mrr, ndcg

In [30]:
def evaluate_topk(config, model, test_dataset):
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    res50 = []
    res20 = []
    res10 = []
    res5  = []
    label = []
    
    with torch.no_grad():
        for (uid, seq, mask, seq_len, pos_idx, labels) in tqdm(test_dataset):
            
            user = user.to(device)
            seq = seq.to(device)
            mask = mask.to(device)
            seq_len = seq_len.to(device)
            pos_idx = pos_idx.to(device)
            labels = labels.to(device)
            
            scores = model(uid, seq, mask, seq_len, pos_idx)
            
            res50.append(torch.topk(scores, 50)[1])
            res20.append(torch.topk(scores, 20)[1])
            res10.append(torch.topk(scores, 10)[1])
            res5.append(torch.topk(scores, 5)[1])
            label.append(labels)
        
        label = torch.concat(label)
        
        acc50, mrr50, ndcg50 = metrics(res50, labels)
        acc20, mrr20, ndcg20 = metrics(res20, labels)
        acc10, mrr10, ndcg10 = metrics(res10, labels)
        acc5,  mrr5,  ndcg5 = metrics(res5, labels)
        
        print("Top50 : acc {} , mrr {}, ndcg {}".format(acc50, mrr50, ndcg50))
        print("Top20 : acc {} , mrr {}, ndcg {}".format(acc20, mrr20, ndcg20))
        print("Top10 : acc {} , mrr {}, ndcg {}".format(acc10, mrr10, ndcg10))
        print("Top5 : acc {} , mrr {}, ndcg {}".format(acc5, mrr5, ndcg5))

            
            
            

In [36]:
def train(config, model, train_dataset,test_dataset):
    opimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(opimizer, step_size=config['epoch'], gamma=config['lr_dc'])
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print_epoch_step = 2
    criteria = nn.CrossEntropyLoss()
    

    for epoch in range(config['epoch']):
        t_loss = float(0)
        model.train()
        for i, data in tqdm(enumerate(train_dataset), desc='Epoch '+str(epoch)):
            opimizer.zero_grad()
            user, seq, mask, seq_len, pos_idx, labels = data
            
            user = user.to(device)
            seq = seq.to(device)
            mask = mask.to(device)
            seq_len = seq_len.to(device)
            pos_idx = pos_idx.to(device)
            labels = labels.to(device)
            
            scores = model(user, seq, mask, seq_len, pos_idx)
            loss = criteria(scores, (labels-1).squeeze())
            loss.backward()
            opimizer.step()
            t_loss += loss.item()
        scheduler.step()
        
        if epoch % print_epoch_step == 0:
            print(f"Epoch {epoch} - Loss : {t_loss/len(train_dataset)} ")
            evaluate_topk(config, model, test_data)
        

In [34]:
model = HG_GNN(G, config, item_num, max_seq_len=10, max_sess=10)
model = model.to(device)

In [37]:
train(config, model, train_dataloader, test_dataloader)

Epoch 0: 0it [00:00, ?it/s]

DGLError: Cannot assign node feature "h" on device cuda:0 to a graph on device cpu. Call DGLGraph.to() to copy the graph to the same device.