In [11]:
import os
import re
import zipfile
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [12]:
# Dataset name; it also names the output directory below.
DATASET = 'SVRec'
# Directory, relative to the working directory, that receives the generated files.
RAW_PATH = os.path.join('./', DATASET)

# Number of negative items sampled for every dev/test row.
NEG_ITEMS = 99
# Seed for NumPy's global random generator.
RANDOM_SEED = 0

# Load Data

1. Load interaction data and item metadata
2. Filter out items with fewer than 5 interactions
3. Calculate basic statistics

In [13]:
# Create the output directory if needed. os.makedirs avoids spawning a shell,
# works on every platform, copes with spaces in the path, and is a no-op when
# the directory already exists.
os.makedirs(RAW_PATH, exist_ok=True)

# Tab-separated source tables: interactions, item metadata and user metadata.
# NOTE(review): these are absolute, machine-specific paths; consider moving
# them to a configurable data directory.
inter_df = pd.read_csv('/work/hzy/douyin/work_202306/immerse_rec/data_organize/SVRec.inter',sep='\t')
item_df = pd.read_csv('/work/hzy/douyin/work_202306/immerse_rec/data_organize/SVRec.item',sep='\t')
user_df = pd.read_csv('/work/hzy/douyin/work_202306/immerse_rec/data_organize/SVRec.user',sep='\t')

In [14]:
# Print the column names, then render the interaction table as the cell output.
print(inter_df.columns)
inter_df

Index(['user_id:token', 'item_id:token', 'start_ts:float', 'end_ts:float',
       'session_id:float', 'video_order:float', 'rating_immersion:float',
       'view_duration:float', 'view_ratio:float', 'rating_like:float'],
      dtype='object')


Unnamed: 0,user_id:token,item_id:token,start_ts:float,end_ts:float,session_id:float,video_order:float,rating_immersion:float,view_duration:float,view_ratio:float,rating_like:float
0,0,0,1667721879363,1667721894899,1,1,1,15.536,0.267862,0
1,0,1,1667721894938,1667721907040,1,2,2,12.102,0.403400,0
2,0,2,1667721907073,1667721937179,1,3,2,30.106,0.614408,0
3,0,3,1667721937188,1667721948699,1,4,1,11.511,0.239812,0
4,0,4,1667721948710,1667721955060,1,5,1,6.350,0.204839,0
...,...,...,...,...,...,...,...,...,...,...
3652,29,2679,1678614530767,1678614551372,5,90,4,20.605,0.069847,0
3653,29,2680,1678614574605,1678614592937,5,91,4,18.332,0.241211,0
3654,29,2681,1678614592954,1678614612140,5,92,1,19.186,0.504895,0
3655,29,2682,1678614612155,1678614626363,5,93,3,14.208,1.092923,0


In [15]:
# TODO: decide which column should serve as the optimization target
# (`rating_like:float` is currently used as the label).
data_df = inter_df[
    ['user_id:token', 'item_id:token', 'rating_like:float', 'start_ts:float']
].rename(columns={
    'user_id:token': 'user_id',
    'item_id:token': 'item_id',
    'rating_like:float': 'label',
    'start_ts:float': 'time',
})
data_df

Unnamed: 0,user_id,item_id,label,time
0,0,0,0,1667721879363
1,0,1,0,1667721894938
2,0,2,0,1667721907073
3,0,3,0,1667721937188
4,0,4,0,1667721948710
...,...,...,...,...
3652,29,2679,0,1678614530767
3653,29,2680,0,1678614574605
3654,29,2681,0,1678614592954
3655,29,2682,0,1678614612155


In [16]:
# Print the column names, then render the item table as the cell output.
print(item_df.columns)
item_df

Index(['item_id:token', 'author_id:token', 'item_label:token',
       'item_duration:float', 'vv_all:float', 'like_cnt:float'],
      dtype='object')


Unnamed: 0,item_id:token,author_id:token,item_label:token,item_duration:float,vv_all:float,like_cnt:float
0,0,2,The police station,58,2739780,28361
1,1,3,A little game _ king glory,30,24236,83
2,2,4,The dog,49,3866245,82631
3,3,5,Stone sea shells,48,15475,43
4,4,6,News interview,31,136148,1898
...,...,...,...,...,...,...
2679,2679,3900,speech,295,5186939,104918
2680,2680,3719,crosstalk,76,5964301,48019
2681,2681,3901,The film and television play,38,4743061,282844
2682,2682,3902,News interview,13,2299271,12634


In [17]:
# Select the item attributes and give them short names.
# Feature columns must be prefixed with `i_`, and their values are expected
# to be discrete and finite.
meta_df = item_df[
    ['item_id:token', 'author_id:token', 'item_label:token',
     'item_duration:float', 'vv_all:float', 'like_cnt:float']
].rename(columns={
    'item_id:token': 'item_id',
    'author_id:token': 'i_authorid',
    'item_label:token': 'i_label',
    'item_duration:float': 'i_duration',
    'vv_all:float': 'i_vvall',
    'like_cnt:float': 'i_likecnt',
})
meta_df

Unnamed: 0,item_id,i_authorid,i_label,i_duration,i_vvall,i_likecnt
0,0,2,The police station,58,2739780,28361
1,1,3,A little game _ king glory,30,24236,83
2,2,4,The dog,49,3866245,82631
3,3,5,Stone sea shells,48,15475,43
4,4,6,News interview,31,136148,1898
...,...,...,...,...,...,...
2679,2679,3900,speech,295,5186939,104918
2680,2680,3719,crosstalk,76,5964301,48019
2681,2681,3901,The film and television play,38,4743061,282844
2682,2682,3902,News interview,13,2299271,12634


### Statistics

In [18]:
# Earliest and latest interaction timestamps (raw values of the `time` column).
min_time = data_df['time'].min()
max_time = data_df['time'].max()
# Dataset size: distinct users, distinct items and number of interaction rows.
n_users = data_df['user_id'].nunique()
n_items = data_df['item_id'].nunique()
n_clicks = data_df.shape[0]

In [19]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
# `time` holds epoch milliseconds. pd.to_datetime(..., unit='ms') converts them
# to a (naive, UTC-based) timestamp directly, replacing
# datetime.utcfromtimestamp, which is deprecated since Python 3.12.
print('Time Span: {}/{}'.format(
    pd.to_datetime(min_time, unit='ms').strftime(time_format),
    pd.to_datetime(max_time, unit='ms').strftime(time_format))
)

# Users: 30
# Items: 2684
# Interactions: 3657
Time Span: 2022-11-06/2023-03-12


# Build Dataset

### Interaction data

In [20]:
# Seed NumPy's global random generator so the negative sampling done with
# np.random further down is repeatable.
np.random.seed(RANDOM_SEED)

In [21]:
# Interaction triples without exact duplicates, ordered chronologically
# (ties broken by user id) and re-indexed from 0.
out_df = (
    data_df[['user_id', 'item_id', 'time']]
    .drop_duplicates(['user_id', 'item_id', 'time'])
    .sort_values(by=['time', 'user_id'], kind='mergesort')
    .reset_index(drop=True)
)
out_df

Unnamed: 0,user_id,item_id,time
0,0,0,1667721879363
1,0,1,1667721894938
2,0,2,1667721907073
3,0,3,1667721937188
4,0,4,1667721948710
...,...,...,...
3652,29,2679,1678614530767
3653,29,2680,1678614574605
3654,29,2681,1678614592954
3655,29,2682,1678614612155


In [22]:
# Re-number users and items with consecutive ids starting at 1, assigned in
# ascending order of the original ids.
uids = sorted(out_df['user_id'].unique())
iids = sorted(out_df['item_id'].unique())
user2id = {raw_id: new_id for new_id, raw_id in enumerate(uids, start=1)}
item2id = {raw_id: new_id for new_id, raw_id in enumerate(iids, start=1)}

# Both mappings were built from these very columns, so every value has an entry.
out_df['user_id'] = out_df['user_id'].map(user2id)
out_df['item_id'] = out_df['item_id'].map(item2id)
out_df

Unnamed: 0,user_id,item_id,time
0,1,1,1667721879363
1,1,2,1667721894938
2,1,3,1667721907073
3,1,4,1667721937188
4,1,5,1667721948710
...,...,...,...
3652,30,2680,1678614530767
3653,30,2681,1678614574605
3654,30,2682,1678614592954
3655,30,2683,1678614612155


In [23]:
# Hold-out splitting: the most recent interactions of every user are split off
# into the dev and test sets.
# N is the number of interactions held out per user for each of the two sets.
N = 10

# Every item each user has interacted with; used to reject sampled negatives.
clicked_item_set = {
    user_id: set(seq_df['item_id'].tolist())
    for user_id, seq_df in out_df.groupby('user_id')
}
    
def generate_dev_test(data_df, n_tail=None, n_neg=None, clicked=None):
    """Cut the last interactions of every user into two held-out sets.

    Two rounds are performed. Each round takes the last ``n_tail`` remaining
    rows of every user (in the row order of ``data_df``), removes them from
    the remaining data and attaches ``n_neg`` sampled negative item ids to
    every held-out row in a new ``neg_items`` column.

    Negatives are drawn uniformly from ``1..n_items`` with NumPy's global
    random generator, where ``n_items`` is the number of distinct item ids in
    the ``data_df`` passed in. A draw is repeated until it is not in the
    user's clicked set.

    Args:
        data_df: DataFrame with at least ``user_id`` and ``item_id`` columns.
        n_tail: rows held out per user and round; defaults to the global ``N``.
        n_neg: negatives per held-out row; defaults to the global ``NEG_ITEMS``.
        clicked: mapping user id -> set of item ids that must not be sampled;
            defaults to the global ``clicked_item_set``.

    Returns:
        ``(result_dfs, remaining_df)`` where ``result_dfs`` is a list of two
        DataFrames (first round, then second round) and ``remaining_df`` is
        ``data_df`` without the held-out rows.

    Raises:
        ValueError: if negatives are requested for a user whose clicked set
            covers every candidate id, which would otherwise loop forever.
        KeyError: if a user id is missing from ``clicked``.
    """
    # Resolve the defaults at call time so the notebook-level settings are used
    # unless the caller overrides them.
    n_tail = N if n_tail is None else n_tail
    n_neg = NEG_ITEMS if n_neg is None else n_neg
    clicked = clicked_item_set if clicked is None else clicked

    # NOTE(review): candidates are 1..n_items, with n_items the count of
    # distinct ids in data_df. If the ids present in data_df are not exactly
    # 1..n_items, some ids can never be drawn - confirm this is intended.
    n_items = data_df['item_id'].nunique()

    can_sample = {}  # user id -> True if at least one candidate id is not clicked
    result_dfs = []
    for _ in range(2):
        result_df = data_df.groupby('user_id').tail(n_tail).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), n_neg))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked[uid]
            if uid not in can_sample:
                n_blocked = sum(1 for item in user_clicked if 1 <= item <= n_items)
                can_sample[uid] = n_blocked < n_items
            if neg_items.shape[1] > 0 and not can_sample[uid]:
                raise ValueError(
                    'user {} has interacted with all {} candidate items; '
                    'no negative item can be sampled'.format(uid, n_items))
            for j in range(neg_items.shape[1]):
                # Rejection sampling: redraw until the item is not clicked.
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [24]:
# The first interaction of every user is set aside and always goes to train.
leave_df = out_df.groupby('user_id').head(1)
# Use a dedicated name instead of re-binding `data_df`: that name holds the raw
# interaction frame read by earlier cells, and overwriting it would make those
# cells give different results when re-run.
remain_df = out_df.drop(leave_df.index)

# The first frame returned (the most recent rows) is used as test, the second as dev.
[test_df, dev_df], remain_df = generate_dev_test(remain_df)
train_df = pd.concat([leave_df, remain_df]).sort_index()

# Every row of out_df must end up in exactly one split.
assert len(train_df) + len(dev_df) + len(test_df) == len(out_df)

len(train_df), len(dev_df), len(test_df)

(3057, 300, 300)

In [25]:
# Render the training split as the cell output.
train_df

Unnamed: 0,user_id,item_id,time
0,1,1,1667721879363
1,1,2,1667721894938
2,1,3,1667721907073
3,1,4,1667721937188
4,1,5,1667721948710
...,...,...,...
3632,30,2661,1678613579561
3633,30,2555,1678613592745
3634,30,2662,1678613594930
3635,30,2663,1678613622251


In [26]:
# Render the dev split (including its `neg_items` column) as the cell output.
dev_df

Unnamed: 0,user_id,item_id,time,neg_items
129,1,130,1667727092545,"[707, 811, 320, 2064, 1543, 1765, 1494, 1228, ..."
130,1,131,1667727118957,"[1823, 241, 1761, 1149, 552, 492, 1982, 1890, ..."
131,1,132,1667727122983,"[1487, 893, 711, 2008, 896, 794, 1821, 1734, 5..."
132,1,133,1667727126666,"[1113, 2218, 2654, 2313, 447, 1418, 164, 155, ..."
133,1,134,1667727132540,"[153, 547, 1728, 1752, 213, 1378, 722, 247, 21..."
...,...,...,...,...
3642,30,2670,1678614301897,"[1348, 591, 274, 1532, 147, 1116, 1128, 1765, ..."
3643,30,2671,1678614405019,"[1696, 788, 1353, 2009, 1202, 1795, 6, 1287, 1..."
3644,30,2672,1678614415228,"[1107, 479, 1819, 954, 347, 970, 162, 242, 140..."
3645,30,2673,1678614420983,"[58, 1798, 1899, 1560, 1446, 1591, 1089, 1312,..."


In [27]:
# Render the test split (including its `neg_items` column) as the cell output.
test_df

Unnamed: 0,user_id,item_id,time,neg_items
139,1,140,1667727432174,"[2608, 1654, 836, 764, 1732, 1034, 278, 1779, ..."
140,1,141,1667727466594,"[2419, 556, 955, 2072, 1724, 171, 2426, 2147, ..."
141,1,142,1667727473928,"[624, 771, 1564, 1242, 822, 308, 1199, 1173, 1..."
142,1,143,1667727493878,"[585, 2504, 1112, 1618, 1145, 1909, 733, 933, ..."
143,1,144,1667727505668,"[2503, 1290, 604, 1444, 2388, 1869, 1299, 2275..."
...,...,...,...,...
3652,30,2680,1678614530767,"[1875, 622, 972, 1318, 721, 2284, 1309, 771, 2..."
3653,30,2681,1678614574605,"[2451, 2519, 1936, 824, 463, 637, 2132, 990, 2..."
3654,30,2682,1678614592954,"[581, 1492, 1818, 1341, 1549, 1075, 1860, 1604..."
3655,30,2683,1678614612155,"[928, 2469, 306, 2495, 1771, 2086, 1650, 481, ..."


In [28]:
# Write the three splits to RAW_PATH as tab-separated files without the index.
for file_name, split_df in (
    ('train.csv', train_df),
    ('dev.csv', dev_df),
    ('test.csv', test_df),
):
    split_df.to_csv(os.path.join(RAW_PATH, file_name), sep='\t', index=False)

### Item Metadata

In [30]:
# Keep only metadata rows whose item occurs in the interaction data: an item
# without interactions has no entry in item2id and would otherwise raise a
# KeyError during the remapping below.
meta_df = meta_df[meta_df['item_id'].isin(list(item2id))].copy()
# Translate the raw item ids into the 1-based ids used for the interactions.
# NOTE: this rewrites meta_df['item_id'], so run this cell only once per
# freshly built meta_df.
meta_df['item_id'] = meta_df['item_id'].map(item2id)

# Distinct category labels in order of first appearance; they are encoded as
# integers in the next cell.
seps = meta_df['i_label'].unique()
seps

array(['The police station', 'A little game _ king glory', 'The dog',
       'Stone sea shells', 'News interview', 'Food production',
       'The flood', 'Beauty microphone singing', 'Fire control knowledge',
       'Spread the', 'The film and television play', 'The concert',
       'Domestic politics news', "Men's and women's indoor plot",
       'Office of the plot', 'pawnbrokers',
       'A little game _ after tomorrow', 'Car is introduced', 'The cat',
       "Men's and women's outdoor drama", 'Milk tea',
       'A little game _ through the wire', 'The legal system',
       'Psychological knowledge', 'KTV', 'Weapons and equipment',
       'Military information', 'childbirth', 'Real estate knowledge',
       'Before preparing', 'The constellation chart',
       'cooked wheaten food', 'The pet hospital', 'The station',
       'Sand bottle drawing', 'Baby take and he shot', 'The doctor',
       'Street food', 'Soldiers everyday', 'Rural housing', 'variety',
       'The panda', 'Childre

In [None]:
# Give every category label a 1-based integer code, following the order of `seps`.
label_dict = {label: code for code, label in enumerate(seps, start=1)}
print(label_dict)
# Replace the label strings by their codes; a label missing from label_dict
# raises KeyError.
meta_df['i_label'] = meta_df['i_label'].map(label_dict.__getitem__)

meta_df

{'The police station': 1, 'A little game _ king glory': 2, 'The dog': 3, 'Stone sea shells': 4, 'News interview': 5, 'Food production': 6, 'The flood': 7, 'Beauty microphone singing': 8, 'Fire control knowledge': 9, 'Spread the': 10, 'The film and television play': 11, 'The concert': 12, 'Domestic politics news': 13, "Men's and women's indoor plot": 14, 'Office of the plot': 15, 'pawnbrokers': 16, 'A little game _ after tomorrow': 17, 'Car is introduced': 18, 'The cat': 19, "Men's and women's outdoor drama": 20, 'Milk tea': 21, 'A little game _ through the wire': 22, 'The legal system': 23, 'Psychological knowledge': 24, 'KTV': 25, 'Weapons and equipment': 26, 'Military information': 27, 'childbirth': 28, 'Real estate knowledge': 29, 'Before preparing': 30, 'The constellation chart': 31, 'cooked wheaten food': 32, 'The pet hospital': 33, 'The station': 34, 'Sand bottle drawing': 35, 'Baby take and he shot': 36, 'The doctor': 37, 'Street food': 38, 'Soldiers everyday': 39, 'Rural housin

Unnamed: 0,item_id,i_authorid,i_label,i_duration,i_vvall,i_likecnt
0,1,2,1,58,2739780,28361
1,2,3,2,30,24236,83
2,3,4,3,49,3866245,82631
3,4,5,4,48,15475,43
4,5,6,5,31,136148,1898
...,...,...,...,...,...,...
2679,2680,4861,73,295,5186939,104918
2680,2681,4862,160,76,5964301,48019
2681,2682,4863,11,38,4743061,282844
2682,2683,4864,5,13,2299271,12634


In [None]:
# save results

# Write the processed metadata (`meta_df`: remapped item ids, `i_`-prefixed
# headers, integer-encoded labels). The raw `item_df` still carries the original
# ids and `:token` / `:float` headers, so it does not match the saved interactions.
meta_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)