In [1]:
import os
import re
import zipfile
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Seed used wherever the notebook seeds a random generator.
RANDOM_SEED = 0

# Name of the processed dataset; also used as the output folder name.
DATASET = 'KuaiRand_history_testall'
# Output folder, located next to this notebook.
RAW_PATH = os.path.join('./', DATASET)

# Load Data

1. Load interaction data and item metadata
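2. Filter out items with fewer than 5 interactions (note: no such filter appears in the cells below, which instead drop session-start rows and rows with an empty behavior history)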
3. Calculate basic statistics

In [3]:
# Create the output folder for the processed dataset.
# os.makedirs needs no shell, works on every platform, raises on real
# failures, and exist_ok=True keeps the cell safe to re-run.
os.makedirs(RAW_PATH, exist_ok=True)

# Raw inputs: the interaction log and the per-video feature table.
# The id columns are pinned to int so they are not inferred as another dtype.
inter_df = pd.read_csv(
    '../KuaiRand_noImmers_history.csv',
    sep=',',
    dtype={'user_id': int, 'video_id': int},
)
item_df = pd.read_csv(
    '../KuaiRand_ItemFeature.csv',
    sep=',',
    dtype={'video_id': int},
)

In [4]:
# Report how many raw interaction rows were loaded.
print(len(inter_df))
# Optional (disabled): subsample 500,000 rows with a fixed seed for quicker experiments.
# inter_df = inter_df.sample(n=500000, random_state=2022)
# Show the frame as the cell output (pandas truncates the middle rows).
inter_df

5055984


Unnamed: 0,user_id,video_id,time_ms,is_like,play_time_ms,duration_ms,session_id,session_order,behavior_like,behavior_view,session_length
0,0,4354972,1649467982289,0,0,70100,1,0,[],[],0
1,0,1329429,1649467982289,0,0,51422,1,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0....",10
2,0,346081,1649467982289,0,0,11696,1,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.465...",10
3,0,2058916,1649467982289,0,0,66433,1,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
4,0,2528540,1649467982289,0,5332,11450,1,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
...,...,...,...,...,...,...,...,...,...,...,...
5055979,999,694615,1650552207405,0,3573,35040,873310,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1,...",5
5055980,999,1240364,1650552207405,0,927,63566,873310,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.13...",5
5055981,999,2514654,1650552207405,0,65048,99100,873310,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.1313, 0...",5
5055982,999,4214495,1650552339920,0,1822,0,873311,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.1313, 0.102,...",1


In [5]:
# Clean the interaction log and derive the binary `view_ratio` label.
print("number of interaction:", len(inter_df))

# The history columns come from the CSV as text, so an empty history shows up
# as the two-character string '[]' (see the first row of the preview above);
# a length of 2 therefore marks an empty list.
empty_list_count1 = inter_df['behavior_like'].map(len).eq(2).sum()
empty_list_count2 = inter_df['behavior_view'].map(len).eq(2).sum()
print("number of behavior list is empty:", empty_list_count1, empty_list_count2)

# Drop the rows flagged as session starts (session_order == 0).
is_session_start = inter_df['session_order'] == 0
print('number of start session:', is_session_start.sum())
inter_df = inter_df[~is_session_start]

# Drop rows whose like-history is still empty. The .copy() detaches the result
# from the original frame so the column assignments below write to an
# independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
condition = inter_df['behavior_like'].map(len).eq(2)
inter_df = inter_df[~condition].copy()
empty_list_count1 = inter_df['behavior_like'].map(len).eq(2).sum()
empty_list_count2 = inter_df['behavior_view'].map(len).eq(2).sum()
print("number of behavior list is empty:", empty_list_count1, empty_list_count2)

print("number of remaining interaction:", len(inter_df))

print('number of like:', (inter_df['is_like'] == 1).sum())

print('number of duration is 0', (inter_df['duration_ms'] == 0).sum())

# Replace a zero duration with 1 ms so the ratio below never divides by zero.
# NOTE(review): for those rows any play_time_ms >= 1 gives a ratio above the
# threshold and therefore label 1 - confirm this is the intended labelling.
inter_df['duration_ms'] = inter_df['duration_ms'].replace(0, 1)

# A view counts as positive when more than `threshold` of the video was played.
threshold = 0.95
is_long_view = (inter_df['play_time_ms'] / inter_df['duration_ms']) > threshold
print('number_viewratio', is_long_view.sum())

# Vectorized 0/1 label; same values as the former row-by-row apply, computed
# in a single pass over the columns.
inter_df['view_ratio'] = is_long_view.astype(int)


number of interaction: 5055984
number of behavior list is empty: 1036 1036
number of start session: 863292
number of behavior list is empty: 0 0
number of remaining interaction: 4192604
number of like: 61363
number of duration is 0 340612
number_viewratio 920904


In [6]:
# List the columns present after cleaning (now including `view_ratio`).
print(inter_df.columns)
# Show the cleaned frame as the cell output.
inter_df

Index(['user_id', 'video_id', 'time_ms', 'is_like', 'play_time_ms',
       'duration_ms', 'session_id', 'session_order', 'behavior_like',
       'behavior_view', 'session_length', 'view_ratio'],
      dtype='object')


Unnamed: 0,user_id,video_id,time_ms,is_like,play_time_ms,duration_ms,session_id,session_order,behavior_like,behavior_view,session_length,view_ratio
1,0,1329429,1649467982289,0,0,51422,1,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0....",10,0
2,0,346081,1649467982289,0,0,11696,1,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.465...",10,0
3,0,2058916,1649467982289,0,0,66433,1,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10,0
4,0,2528540,1649467982289,0,5332,11450,1,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10,0
5,0,1966868,1649467982289,0,0,22016,1,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0.0, 0....",10,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5055978,999,411299,1650552207405,0,14895,13166,873310,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.040...",5,1
5055979,999,694615,1650552207405,0,3573,35040,873310,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1,...",5,0
5055980,999,1240364,1650552207405,0,927,63566,873310,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.13...",5,0
5055981,999,2514654,1650552207405,0,65048,99100,873310,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.1313, 0...",5,0


In [7]:
# TODO: decide which column to use as the optimization target
# (the label is currently taken from `view_ratio`).

# Source column -> output column. An explicit mapping keeps every rename tied
# to its source column, independent of column order.
column_renames = {
    'user_id': 'user_id',
    'video_id': 'item_id',
    'time_ms': 'time',
    'session_id': 'session_id',
    'session_order': 'c_session_order',
    'behavior_like': 'c_behavior_like',
    'behavior_view': 'c_behavior_view',
    'session_length': 'session_length',
    'view_ratio': 'label',
}
# rename() returns a new frame, so data_df is independent of inter_df and
# later cells can modify it without touching the cleaned interaction log.
data_df = inter_df[list(column_renames)].rename(columns=column_renames)

# Columns written to the final interaction file, in output order.
choose_column = ['user_id', 'item_id', 'label', 'time', 'c_session_order', 'c_behavior_like', 'c_behavior_view']
data_df

Unnamed: 0,user_id,item_id,time,session_id,c_session_order,c_behavior_like,c_behavior_view,session_length,label
1,0,1329429,1649467982289,1,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0....",10,0
2,0,346081,1649467982289,1,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.465...",10,0
3,0,2058916,1649467982289,1,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10,0
4,0,2528540,1649467982289,1,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10,0
5,0,1966868,1649467982289,1,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0.0, 0....",10,0
...,...,...,...,...,...,...,...,...,...
5055978,999,411299,1650552207405,873310,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.040...",5,1
5055979,999,694615,1650552207405,873310,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1,...",5,0
5055980,999,1240364,1650552207405,873310,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.13...",5,0
5055981,999,2514654,1650552207405,873310,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.1313, 0...",5,0


In [8]:
# List the columns of the raw item feature table.
print(item_df.columns)
# Show the raw item feature table as the cell output.
item_df

Index(['video_id', 'video_duration', 'play_cnt', 'like_cnt'], dtype='object')


Unnamed: 0,video_id,video_duration,play_cnt,like_cnt
0,0,87433.0,816.882353,6.470588
1,1,218066.0,2116.250000,38.161765
2,2,9233.0,425.681319,6.626374
3,3,16433.0,940.027778,5.402778
4,4,38766.0,4304.314607,457.235955
...,...,...,...,...
4371863,4371895,8800.0,360.416667,20.604167
4371864,4371896,,386.238806,23.940299
4371865,4371897,6800.0,4662.918367,75.551020
4371866,4371898,8680.0,52551.327586,658.672414


In [9]:
# Item feature headers start with `i_`.
# NOTE(review): the previous comment asked for discrete, finite values, but
# these columns carry the `:numeric` suffix and hold continuous values -
# confirm what the downstream reader expects.
meta_column_renames = {
    'video_id': 'item_id',
    'video_duration': 'i_duration:numeric',
    'play_cnt': 'i_vvall:numeric',
    'like_cnt': 'i_likecnt:numeric',
}
# rename() returns a new frame, so later in-place edits of meta_df do not
# operate on a slice of item_df.
meta_df = item_df[list(meta_column_renames)].rename(columns=meta_column_renames)
meta_df

Unnamed: 0,item_id,i_duration:numeric,i_vvall:numeric,i_likecnt:numeric
0,0,87433.0,816.882353,6.470588
1,1,218066.0,2116.250000,38.161765
2,2,9233.0,425.681319,6.626374
3,3,16433.0,940.027778,5.402778
4,4,38766.0,4304.314607,457.235955
...,...,...,...,...
4371863,4371895,8800.0,360.416667,20.604167
4371864,4371896,,386.238806,23.940299
4371865,4371897,6800.0,4662.918367,75.551020
4371866,4371898,8680.0,52551.327586,658.672414


In [10]:
from sklearn.preprocessing import MinMaxScaler

# Fill NaN with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on a
# selected column: that chained form is deprecated and does not modify the
# frame when pandas Copy-on-Write is enabled.
columns_to_normalize = ['i_duration:numeric', 'i_vvall:numeric', 'i_likecnt:numeric']
column_means = meta_df[columns_to_normalize].mean()
meta_df[columns_to_normalize] = meta_df[columns_to_normalize].fillna(column_means)

# Min-max normalize each feature column to the [0, 1] range.
scaler = MinMaxScaler()
meta_df[columns_to_normalize] = scaler.fit_transform(meta_df[columns_to_normalize])
meta_df


Unnamed: 0,item_id,i_duration:numeric,i_vvall:numeric,i_likecnt:numeric
0,0,0.005292,0.000152,0.000030
1,1,0.013203,0.000395,0.000174
2,2,0.000557,0.000079,0.000030
3,3,0.000993,0.000175,0.000025
4,4,0.002345,0.000803,0.002090
...,...,...,...,...
4371863,4371895,0.000530,0.000067,0.000094
4371864,4371896,0.004582,0.000072,0.000109
4371865,4371897,0.000409,0.000870,0.000345
4371866,4371898,0.000523,0.009803,0.003011


### Statistics

In [11]:
# Basic dataset statistics: distinct users/items, row count, and time range.
n_users = data_df['user_id'].nunique()
n_items = data_df['item_id'].nunique()
n_clicks = data_df.shape[0]
time_column = data_df['time']
min_time = time_column.min()
max_time = time_column.max()

In [12]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)

# `time` holds epoch milliseconds. pd.to_datetime(unit='ms') yields the UTC
# calendar date without datetime.utcfromtimestamp, which is deprecated since
# Python 3.12.
first_day = pd.to_datetime(int(min_time), unit='ms').strftime(time_format)
last_day = pd.to_datetime(int(max_time), unit='ms').strftime(time_format)
print('Time Span: {}/{}'.format(first_day, last_day))

# Users: 983
# Items: 1875490
# Interactions: 4192604
Time Span: 2022-04-07/2022-04-21


# Build Dataset

### Interaction data

In [13]:
# Seed NumPy's global random generator with RANDOM_SEED so any later NumPy-based randomness is repeatable.
np.random.seed(RANDOM_SEED)

In [None]:
# Work on an independent copy: a plain `out_df = data_df` only aliases the
# frame, so the id remapping below would silently rewrite data_df as well.
out_df = data_df.copy()

# reindex (start from 1)
# Raw ids are sorted first so the new ids are deterministic.
uids = sorted(out_df['user_id'].unique())
user2id = {raw_id: new_id for new_id, raw_id in enumerate(uids, start=1)}
iids = sorted(out_df['item_id'].unique())
item2id = {raw_id: new_id for new_id, raw_id in enumerate(iids, start=1)}

# Every id in the frame is a key of its mapping, so map() leaves no gaps.
out_df['user_id'] = out_df['user_id'].map(user2id)
out_df['item_id'] = out_df['item_id'].map(item2id)

# Remove exact repeats of the same (user, item, label, time) record.
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'label', 'time'])
out_df

In [15]:
# Check for duplicates on two keys: (user, item) and (user, item, time).
# Each check prints a message naming the key it actually tested; the second
# check previously reused the "(user_id, item_id) pairs" wording.
duplicate_checks = [
    (['user_id', 'item_id'], "(user_id, item_id) pairs"),
    (['user_id', 'item_id', 'time'], "(user_id, item_id, time) triples"),
]
for key_columns, key_label in duplicate_checks:
    # keep=False marks every member of a duplicated group, not just the repeats.
    duplicate_rows = out_df[out_df.duplicated(subset=key_columns, keep=False)]
    if duplicate_rows.empty:
        print("No duplicate {}.".format(key_label))
    else:
        print("Duplicate {} found.".format(key_label))


Duplicate (user_id, item_id) pairs found.
Duplicate (user_id, item_id) pairs found.


In [16]:
# Keep only the columns named in choose_column, in that order.
choose_df = out_df.loc[:, choose_column]

choose_df

Unnamed: 0,user_id,item_id,label,time,c_session_order,c_behavior_like,c_behavior_view
1,1,571390,0,1649467982289,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0...."
2,1,150081,0,1649467982289,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.465..."
3,1,883496,0,1649467982289,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,1084990,0,1649467982289,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,1,844542,0,1649467982289,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0.0, 0...."
...,...,...,...,...,...,...,...
5055978,983,178084,1,1650552207405,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.040..."
5055979,983,299367,0,1650552207405,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1,..."
5055980,983,533300,0,1650552207405,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.13..."
5055981,983,1079070,0,1650552207405,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.1313, 0..."


In [17]:
# Write both interaction tables as tab-separated files under RAW_PATH.
full_table_path = os.path.join(RAW_PATH, 'all_original.csv')
selected_table_path = os.path.join(RAW_PATH, 'all.csv')
out_df.to_csv(full_table_path, sep='\t', index=False)
choose_df.to_csv(selected_table_path, sep='\t', index=False)

In [18]:
# Show the normalized item metadata (still keyed by the raw item ids) as the cell output.
meta_df

Unnamed: 0,item_id,i_duration:numeric,i_vvall:numeric,i_likecnt:numeric
0,0,0.005292,0.000152,0.000030
1,1,0.013203,0.000395,0.000174
2,2,0.000557,0.000079,0.000030
3,3,0.000993,0.000175,0.000025
4,4,0.002345,0.000803,0.002090
...,...,...,...,...
4371863,4371895,0.000530,0.000067,0.000094
4371864,4371896,0.004582,0.000072,0.000109
4371865,4371897,0.000409,0.000870,0.000345
4371866,4371898,0.000523,0.009803,0.003011


### Item Metadata

In [None]:
# Keep only the items that occur in the interaction data, then switch to the
# re-indexed ids. isin()/map() replace the per-element lambdas, and .copy()
# makes the filtered frame independent so the assignment does not write into
# a slice of meta_df.
# NOTE(review): in the saved run the interactions hold 1,875,490 distinct
# items but this table ends up with 1,875,489 rows, so at least one item
# appears to have no metadata row - confirm downstream code tolerates that.
has_interactions = meta_df['item_id'].isin(list(item2id))
meta_df_new = meta_df.loc[has_interactions].copy()
meta_df_new['item_id'] = meta_df_new['item_id'].map(item2id)


In [20]:
# Renumber the rows from 0 after the filtering step; drop=True discards the old index.
meta_df_new = meta_df_new.reset_index(drop=True)
# Show the re-indexed item metadata as the cell output.
meta_df_new

Unnamed: 0,item_id,i_duration:numeric,i_vvall:numeric,i_likecnt:numeric
0,1,0.013203,0.000395,0.000174
1,2,0.002345,0.000803,0.002090
2,3,0.009103,0.003816,0.007823
3,4,0.003748,0.000247,0.000084
4,5,0.001221,0.000250,0.000068
...,...,...,...,...
1875485,1875486,0.006175,0.000051,0.000037
1875486,1875487,0.008219,0.000010,0.000006
1875487,1875488,0.004582,0.000072,0.000109
1875488,1875489,0.000523,0.009803,0.003011


In [21]:
# Fill any remaining NaN in the feature columns with the column mean.
# `item_id` is the key column and is excluded from mean imputation.
# The result is assigned back instead of calling fillna(inplace=True) on a
# selected column, which is deprecated and has no effect under pandas
# Copy-on-Write.
feature_columns = [column for column in meta_df_new.columns if column != 'item_id']
feature_means = meta_df_new[feature_columns].mean()
meta_df_new[feature_columns] = meta_df_new[feature_columns].fillna(feature_means)

In [22]:
# Show the item metadata after the NaN fill as the cell output.
meta_df_new

Unnamed: 0,item_id,i_duration:numeric,i_vvall:numeric,i_likecnt:numeric
0,1,0.013203,0.000395,0.000174
1,2,0.002345,0.000803,0.002090
2,3,0.009103,0.003816,0.007823
3,4,0.003748,0.000247,0.000084
4,5,0.001221,0.000250,0.000068
...,...,...,...,...
1875485,1875486,0.006175,0.000051,0.000037
1875486,1875487,0.008219,0.000010,0.000006
1875487,1875488,0.004582,0.000072,0.000109
1875488,1875489,0.000523,0.009803,0.003011


In [23]:
# Write the re-indexed item metadata as a tab-separated file under RAW_PATH.
item_meta_path = os.path.join(RAW_PATH, 'item_meta.csv')
meta_df_new.to_csv(item_meta_path, sep='\t', index=False)

### User Metadata