In [4]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

In [5]:
DATASET = 'LastFM'  # dataset directory name; u.data and u.item are read from it
RAW_PATH = os.path.join('./', DATASET)  # directory the processed CSV files are written to

RANDOM_SEED = 0  # seed handed to np.random.seed before the dev/test split is built
NEG_ITEMS = 99  # number of negative item ids drawn for every dev/test row

# Load Data

1. Load interaction data and item metadata
2. Filter out users and items with fewer than 5 interactions
3. Calculate basic statistics

In [6]:
# Raw interaction log. The path is built from RAW_PATH so that reading and
# writing (see the save cells) resolve against the same directory.
data_df = pd.read_csv(os.path.join(RAW_PATH, 'u.data'))

In [26]:
# Item metadata table. The path is built from RAW_PATH so that reading and
# writing (see the save cells) resolve against the same directory.
meta_df = pd.read_csv(os.path.join(RAW_PATH, 'u.item'))

In [7]:
# Keep a random 5% of every user's interactions. The draw is seeded with
# RANDOM_SEED so that a fresh "Restart & Run All" reproduces the same subset;
# without a seed every run would produce a different dataset.
data_df = data_df.groupby('user_id').sample(frac=0.05, random_state=RANDOM_SEED)

In [8]:
# Peek at the first rows of the per-user sample.
data_df.head()

Unnamed: 0,user_id,item_id,time
3609,user_000001,268baf7b-f29b-4c31-b2a4-60f9398c1f0b,1228660775
9906,user_000001,1acc9160-91dd-429f-a483-69eeb1b76118,1166001500
3359,user_000001,614fe1f9-b635-4ddc-a4cb-03116750cbc0,1229767563
1160,user_000001,ed6b0da1-b781-4c22-be7a-6ddfeec28042,1237641058
7563,user_000001,f59bfff4-1784-4ef6-84e5-75da053ba39a,1206452129


In [9]:
# Iterative pruning: drop every user and every item that occurs in fewer than
# 5 rows. Removing rows for one key can push the other key below the
# threshold, so the two passes are repeated until the row count stops changing.

print('Filter before:', len(data_df))
prev_len = -1
while prev_len != len(data_df):
    prev_len = len(data_df)
    for key in ('user_id', 'item_id'):
        # Start each pass from a fresh 0..n-1 index before filtering.
        data_df = data_df.reset_index(drop=True)
        # Per-row occurrence count of that row's key value.
        occurrences = data_df[key].map(data_df[key].value_counts())
        data_df = data_df[occurrences >= 5]
print('Filter after:', len(data_df))

Filter before: 833550
Filter after: 457554


### Statistics

In [10]:
# Headline numbers for the filtered interaction table.
n_users = data_df['user_id'].nunique()
n_items = data_df['item_id'].nunique()
n_clicks = data_df.shape[0]
time_col = data_df['time']
min_time = time_col.min()
max_time = time_col.max()

In [11]:
# Report the dataset size and the covered time span.
# The 'time' values are treated as epoch seconds and rendered as UTC dates.
# pd.to_datetime(..., unit='s') is used instead of datetime.utcfromtimestamp,
# which is deprecated since Python 3.12; the printed dates are the same.
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    pd.to_datetime(min_time, unit='s').strftime(time_format),
    pd.to_datetime(max_time, unit='s').strftime(time_format))
)

# Users: 932
# Items: 38441
# Interactions: 457554
Time Span: 2005-02-14/2009-06-19


# Build Dataset

### Interaction data

In [12]:
# Seed NumPy's global RNG so the np.random.randint negative sampling used
# when building the dev/test splits is repeatable.
np.random.seed(RANDOM_SEED)

In [13]:
# Build the interaction table: keep the three core columns, drop exact
# duplicate (user, item, time) rows, order chronologically (ties broken by
# user_id) and renumber the rows from 0.
out_df = (
    data_df[['user_id', 'item_id', 'time']]
    .drop_duplicates(['user_id', 'item_id', 'time'])
    .sort_values(by=['time', 'user_id'], kind='mergesort')
    .reset_index(drop=True)
)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,user_000142,bc862198-8883-4253-b1b6-51f2d3048e85,1108339681
1,user_000174,c08da71c-e876-43d2-8756-3d3674fae876,1108341519
2,user_000896,c0980230-a5f2-454a-9fdc-fbee776830d3,1108349399
3,user_000525,e7c33739-abde-44c4-844f-a185abb025f6,1108359501
4,user_000525,1d69ccec-231f-48d9-bc53-ac195ef7854e,1108360234


In [14]:
# Re-encode raw user/item identifiers as consecutive integers starting at 1.
# Ids are assigned in the sorted order of the raw identifiers.

uids = sorted(out_df['user_id'].unique())
user2id = {raw_uid: new_uid for new_uid, raw_uid in enumerate(uids, start=1)}
iids = sorted(out_df['item_id'].unique())
item2id = {raw_iid: new_iid for new_iid, raw_iid in enumerate(iids, start=1)}

# Every value in the column is a key of the mapping, so .map never yields NaN.
out_df['user_id'] = out_df['user_id'].map(user2id)
out_df['item_id'] = out_df['item_id'].map(item2id)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,131,28388,1108339681
1,161,29023,1108341519
2,837,29029,1108349399
3,493,34789,1108359501
4,493,4388,1108360234


In [15]:
# Leave-one-out splitting.
# First collect, for every user, the set of all items that user interacted
# with; it is consulted when sampling negatives so none of them is a known
# positive.

clicked_item_set = {
    uid: set(item_ids.values.tolist())
    for uid, item_ids in out_df.groupby('user_id')['item_id']
}
    
def generate_dev_test(data_df, n_items=None):
    """Peel off the last two interactions of every user and attach negatives.

    Two passes are made over ``data_df``. Each pass takes the final row of
    every user (in the frame's current row order), removes those rows from the
    frame, and draws ``NEG_ITEMS`` random item ids per row that are not in the
    user's entry of the module-level ``clicked_item_set``.

    Args:
        data_df: interaction frame with ``user_id`` and ``item_id`` columns.
            Item ids are expected to be positive integers (this notebook
            assigns them as 1..N).
        n_items: largest item id that may be drawn as a negative; candidates
            are ``1..n_items`` inclusive. Defaults to the largest item id in
            ``data_df``. The previous default (number of distinct items in
            ``data_df``) is smaller than the largest id whenever some item
            does not occur in ``data_df``, which silently excluded the highest
            ids from sampling.

    Returns:
        ``(result_dfs, data_df)`` where ``result_dfs`` is a two-element list
        (rows of the first pass, then rows of the second pass, each with an
        added ``neg_items`` column) and ``data_df`` is what remains after both
        passes.

    Raises:
        ValueError: if some user has already interacted with every candidate
            id, in which case no negative exists (the original code looped
            forever in that situation).
    """
    if n_items is None:
        n_items = int(data_df['item_id'].max())

    result_dfs = []
    for _ in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), NEG_ITEMS))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            # Guard against an endless resampling loop: at least one id in
            # 1..n_items must be absent from the user's history.
            n_seen = sum(1 for item in user_clicked if 1 <= item <= n_items)
            if n_seen >= n_items:
                raise ValueError(
                    'user {} has interacted with every candidate item; '
                    'cannot sample negatives'.format(uid)
                )
            for j in range(neg_items.shape[1]):
                # Redraw until the candidate is not a known positive.
                while neg_items[i, j] in user_clicked:
                    neg_items[i, j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [16]:
# Hold back each user's first row of out_df (out_df is sorted by time, so this
# is the earliest interaction); these rows always go to the training set.
leave_df = out_df.groupby('user_id').head(1)
# NOTE: this rebinds data_df, which until now held the filtered raw table.
data_df = out_df.drop(leave_df.index)

# generate_dev_test returns the first-pass rows (each user's last interaction)
# followed by the second-pass rows (the one before it): test first, then dev.
[test_df, dev_df], data_df = generate_dev_test(data_df)
# Training data = held-back first rows + everything not taken for dev/test,
# restored to the original (chronological) row order.
train_df = pd.concat([leave_df, data_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(455690, 932, 932)

In [17]:
# Display the training split.
train_df

Unnamed: 0,user_id,item_id,time
0,131,28388,1108339681
1,161,29023,1108341519
2,837,29029,1108349399
3,493,34789,1108359501
4,493,4388,1108360234
...,...,...,...
457543,572,20177,1245402592
457545,896,38316,1245407701
457547,896,34752,1245411903
457548,896,3009,1245416949


In [18]:
# Peek at the test split; each row carries its sampled neg_items list.
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
3070,389,20829,1114450290,"[2733, 21244, 30404, 32104, 20758, 14936, 1543..."
9086,129,22903,1122313187,"[15042, 23307, 36951, 13161, 26636, 33076, 133..."
18112,166,19741,1130607605,"[5209, 6178, 4864, 8390, 895, 30143, 32905, 26..."
19100,565,20640,1131333963,"[11591, 11318, 9009, 31839, 18492, 14673, 2626..."
40001,802,23084,1141534830,"[18356, 11466, 13750, 34440, 33375, 18505, 158..."


In [19]:
# Write the three splits to RAW_PATH as tab-separated files without the index.

for file_name, split_df in (
    ('train.csv', train_df),
    ('dev.csv', dev_df),
    ('test.csv', test_df),
):
    split_df.to_csv(os.path.join(RAW_PATH, file_name), sep='\t', index=False)

# Metadata

In [27]:
# Keep only metadata rows whose raw item id survived filtering. The explicit
# .copy() makes the result an independent frame, so adding a column below does
# not trigger pandas' SettingWithCopyWarning.
meta_df = meta_df[meta_df['item_id'].isin(item2id.keys())].copy()
# 'item_id_m' holds the re-indexed integer id; the raw 'item_id' is kept.
meta_df['item_id_m'] = meta_df['item_id'].map(item2id)

In [29]:
# Write the filtered item metadata next to the split files.
# NOTE(review): this file uses the default comma separator, whereas
# train/dev/test are written with sep='\t' — confirm the consumer expects that.
meta_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), index=False)