In [None]:
# Install the `dgl-cu110` wheel (a CUDA 11.0 build, going by the package name) and `dglgo`
# from the DGL wheel index; `-qq` and the `&>/dev/null` redirect silence all output.
!pip install -qq dgl-cu110 dglgo -f https://data.dgl.ai/wheels/repo.html &>/dev/null

In [None]:
import pandas as pd
import numpy as np
import cudf
from pandas import Timedelta
import os
from tqdm.notebook import tqdm
import pickle
from torch.utils.data import Dataset, DataLoader
import math
from operator import itemgetter
import dgl
# Raise the pandas display limit so up to 10,000 rows are shown before output is truncated.
pd.set_option('display.max_rows', 10000)

### Dataset
[Last-FM](https://www.kaggle.com/datasets/japarra27/lastfm-dataset)

In [None]:
# Load the dataset from Parquet into a cuDF DataFrame and show how many rows it has.
data = cudf.read_parquet('/data/items.parquet')
len(data)

In [None]:
# Preview the first few rows to check the column layout.
data.head()

In [None]:
# Report how many distinct users and artists appear in the data.
print(f" # Unique User : {data['user_id'].nunique()}, # Unique Artist : {data['artist_id'].nunique()}")

## Create sessions with a 6-hour interval

Utility functions

In [None]:
def get_session_id(df, session):
    """Assign a zero-based session id to every row of ``df``.

    A row opens a new session when its ``user_id`` differs from the row
    directly above it, or when its ``timestamp`` is more than ``session``
    later than that row's. The first row always opens a session.

    Rows are compared positionally with their predecessor, so this assumes
    ``df`` is already ordered by user and then by time -- TODO confirm
    against the caller.

    Args:
        df: Frame with ``user_id`` and ``timestamp`` columns.
        session: Largest gap allowed between two consecutive rows of the same
            session (presumably a ``Timedelta``; verify against the caller).

    Returns:
        A Series aligned with ``df`` holding the session id of each row.
    """
    df_prev = df.shift(1)
    user_changed = df['user_id'] != df_prev['user_id']
    gap_too_long = (df['timestamp'] - df_prev['timestamp']) > session
    # The first row has no predecessor, so both comparisons see a missing
    # value. pandas resolves that to a plain boolean, but a backend that
    # propagates nulls through comparisons (cuDF does, per its missing-data
    # guide -- worth confirming on the installed version) yields a null flag,
    # which would leave the first row without an id and start numbering at -1.
    # An unknown flag is therefore treated as "new session".
    is_new_session = (user_changed | gap_too_long).fillna(True)
    return is_new_session.cumsum() - 1

def group_session(df, session):
    """Add a ``session_id`` column to ``df`` and return the same frame.

    The ids come from ``get_session_id(df, session)``. The column is written
    onto the frame that was passed in, so the caller's object is modified.
    """
    session_ids = get_session_id(df, session)
    df['session_id'] = session_ids
    return df

def filter_short_session(df, min_session_length=2):
    """Drop every row whose session has fewer than ``min_session_length`` rows.

    Args:
        df: Frame with a ``session_id`` column.
        min_session_length: Smallest number of rows a session needs to survive.

    Returns:
        The rows of ``df`` that belong to sufficiently long sessions.
    """
    rows_per_session = df.groupby('session_id').size()
    long_enough = rows_per_session[rows_per_session >= min_session_length]
    keep_mask = df['session_id'].isin(long_enough.index)
    return df[keep_mask]

def filter_infrequent_item(df, min_item_support=5):
    """Drop every row whose item occurs fewer than ``min_item_support`` times.

    Args:
        df: Frame with an ``itemId`` column.
        min_item_support: Smallest number of occurrences an item needs to survive.

    Returns:
        The rows of ``df`` whose item is frequent enough.
    """
    occurrences = df.groupby('itemId').size()
    supported = occurrences[occurrences >= min_item_support]
    keep_mask = df['itemId'].isin(supported.index)
    return df[keep_mask]

def filter_until_ok(df, min_session_length=2, min_item_support=5):
    """Apply the session-length and item-support filters until nothing changes.

    Removing rare items can shorten sessions, and removing short sessions can
    lower item counts, so both filters are repeated until a full pass leaves
    the row count untouched.

    Args:
        df: Frame with ``session_id`` and ``itemId`` columns.
        min_session_length: Passed to ``filter_short_session``.
        min_item_support: Passed to ``filter_infrequent_item``.

    Returns:
        The filtered frame.
    """
    row_count = len(df)
    while True:
        df = filter_infrequent_item(
            filter_short_session(df, min_session_length),
            min_item_support,
        )
        if len(df) == row_count:
            return df
        row_count = len(df)

def trucate_session(df, session_length=20):
    """Keep only the first ``session_length`` rows of every session.

    Rows are ordered by ``session_id`` and then ``timestamp`` before counting,
    so the earliest rows of each session are the ones kept.

    Args:
        df: Frame with ``session_id`` and ``timestamp`` columns.
        session_length: Maximum number of rows to keep per session.

    Returns:
        The sorted frame with the overflow rows removed.
    """
    ordered = df.sort_values(['session_id', 'timestamp'])
    # Zero-based position of each row inside its own session.
    position = ordered.groupby('session_id').cumcount()
    within_limit = position < session_length
    return ordered[within_limit]

def update_id(df, field):
    """Replace the values of column ``field`` with integer codes.

    The codes are the first element returned by ``cudf.factorize`` for that
    column. A new frame is returned through ``assign``.

    Args:
        df: Frame containing the column ``field``.
        field: Name of the column to re-encode.

    Returns:
        A frame in which ``field`` holds the factorized codes.
    """
    codes = cudf.factorize(df[field])[0]
    return df.assign(**{field: codes})


def remove_immediate_repeats(df):
    """Drop rows that repeat the item of the row directly above in the same session.

    A row is kept when its ``session_id`` or its ``itemId`` differs from the
    preceding row. The first row, which has no predecessor, is always kept.

    Rows are compared positionally, so this assumes the rows of each session
    are contiguous and in play order -- TODO confirm against the caller.

    Args:
        df: Frame with ``session_id`` and ``itemId`` columns.

    Returns:
        The rows of ``df`` that are not immediate repeats.
    """
    df_prev = df.shift()
    session_changed = df['session_id'] != df_prev['session_id']
    item_changed = df['itemId'] != df_prev['itemId']
    # The first row is compared with a missing value. pandas evaluates that to
    # True, but a null-propagating backend (cuDF, per its missing-data guide --
    # worth confirming on the installed version) yields a null mask entry,
    # which would risk dropping the first row. Unknown means "not a repeat".
    is_not_repeat = (session_changed | item_changed).fillna(True)
    return df[is_not_repeat]
    
def reorder_sessions(df):
    """Renumber sessions by end time and sort rows by session, then time.

    The session whose last ``timestamp`` is earliest gets id 0, the next one
    gets 1, and so on. The result is ordered by the new ``session_id`` and
    then by ``timestamp``.

    The previous implementation could not run: it called a non-existent
    ``sort_value`` method and passed ``inplace=True`` to ``Series.map``. This
    version returns a new frame and leaves the input untouched.

    Args:
        df: Frame with ``session_id`` and ``timestamp`` columns.

    Returns:
        A new frame with renumbered ``session_id`` values, sorted by
        ``session_id`` and ``timestamp``.
    """
    # Old session ids, ordered by when each session ended.
    end_times = df.groupby('session_id')['timestamp'].max().sort_values()
    # First reset_index turns the old ids into a column; the second one adds
    # an 'index' column holding each session's rank in end-time order.
    ranked = end_times.reset_index().reset_index()
    # Lookup Series: old session id -> new session id. A Series is used rather
    # than a dict so no Python-level iteration over the columns is needed.
    old_to_new = ranked.set_index('session_id')['index']
    renumbered = df.assign(session_id=df['session_id'].map(old_to_new))
    return renumbered.sort_values(['session_id', 'timestamp'])

def keep_top_n_items(df, n=40000):
    """Keep only the rows whose item is among the ``n`` most frequent items.

    Args:
        df: Frame with an ``itemId`` column.
        n: Number of most frequent items to retain.

    Returns:
        The rows of ``df`` whose ``itemId`` is one of the top ``n`` by row count.
    """
    occurrences = df.groupby('itemId').size()
    most_frequent = occurrences.nlargest(n).index
    keep_mask = df['itemId'].isin(most_frequent)
    return df[keep_mask]

def train_test_split(df, test_size=0.2):
    """Split sessions chronologically into a train and a test frame.

    Sessions are ranked by their last ``timestamp``. The most recent
    ``int(n_sessions * test_size)`` sessions form the test set and the rest
    form the train set. Each session stays whole in one of the two frames.

    Args:
        df: Frame with ``session_id`` and ``timestamp`` columns.
        test_size: Fraction of sessions to put in the test set.

    Returns:
        A ``(df_train, df_test)`` tuple.
    """
    endtime = df.groupby('session_id')['timestamp'].max().sort_values()
    num_test = int(len(endtime) * test_size)
    # Slice from an explicit start position. ``index[-num_test:]`` with
    # num_test == 0 is ``index[0:]``, which would select every session and
    # leave the train set empty. The max() keeps an oversized test_size
    # selecting all sessions, as the negative slice did.
    first_test_pos = max(len(endtime) - num_test, 0)
    test_sessions = endtime.index[first_test_pos:]
    in_test = df['session_id'].isin(test_sessions)
    df_train = df[~in_test]
    df_test = df[in_test]
    return df_train, df_test
    
def save_sessions(df, filepath='data/sessions.txt'):
    """Write one line per session, listing its item ids separated by commas.

    Sessions are first passed through ``reorder_sessions``. Each session's
    ``itemId`` values are then joined into a single comma-separated string,
    and the strings are written to ``filepath`` without header or index.

    Args:
        df: Frame with ``session_id``, ``timestamp`` and ``itemId`` columns.
        filepath: Destination file.
    """
    def _join_items(items):
        # Render one session's item ids as "id1,id2,...".
        return ','.join(map(str, items))

    ordered = reorder_sessions(df)
    lines = ordered.groupby('session_id').itemId.apply(_join_items)
    lines.to_csv(filepath, sep='\t', header=False, index=False)