In [30]:
import os
import re
import zipfile
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [31]:
DATASET = 'KuaiRand_history' 
RAW_PATH = os.path.join('./', DATASET)

RANDOM_SEED = 0
NEG_ITEMS = 990

# Load Data

1. Load interaction data and item metadata
2. Filter out items with less than 5 interactions
3. Calculate basic statistics

In [32]:
if not os.path.exists(RAW_PATH):
    subprocess.call('mkdir ' + RAW_PATH, shell=True)

inter_df = pd.read_csv('../KuaiRand_noImmers_history.csv',sep=',',dtype={'user_id':int,'video_id':int})
item_df = pd.read_csv('../KuaiRand_ItemFeature.csv',sep=',',dtype={'video_id':int})

In [33]:
print(len(inter_df))
# inter_df = inter_df.sample(n=500000, random_state=2022)
inter_df

5055984


Unnamed: 0,user_id,video_id,time_ms,is_like,play_time_ms,duration_ms,session_id,session_order,behavior_like,behavior_view,session_length
0,0,4354972,1649467982289,0,0,70100,1,0,[],[],0
1,0,1329429,1649467982289,0,0,51422,1,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0....",10
2,0,346081,1649467982289,0,0,11696,1,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.465...",10
3,0,2058916,1649467982289,0,0,66433,1,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
4,0,2528540,1649467982289,0,5332,11450,1,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
...,...,...,...,...,...,...,...,...,...,...,...
5055979,999,694615,1650552207405,0,3573,35040,873310,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1,...",5
5055980,999,1240364,1650552207405,0,927,63566,873310,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.13...",5
5055981,999,2514654,1650552207405,0,65048,99100,873310,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.1313, 0...",5
5055982,999,4214495,1650552339920,0,1822,0,873311,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.1313, 0.102,...",1


In [34]:
print("number of interaction:", len(inter_df))

empty_list_count1 = inter_df['behavior_like'].apply(lambda x: len(x) == 2).sum()
empty_list_count2 = inter_df['behavior_view'].apply(lambda x: len(x) == 2).sum()
print("number of behavior list is empty:", empty_list_count1, empty_list_count2)
      
print('number of start session:', (inter_df['session_order'] == 0).sum())
inter_df = inter_df[inter_df['session_order'] != 0]

condition = inter_df['behavior_like'].apply(lambda x: len(x) == 2)
inter_df = inter_df[~condition]
empty_list_count1 = inter_df['behavior_like'].apply(lambda x: len(x) == 2).sum()
empty_list_count2 = inter_df['behavior_view'].apply(lambda x: len(x) == 2).sum()
print("number of behavior list is empty:", empty_list_count1, empty_list_count2)

print("number of remaining interaction:", len(inter_df))

print('number of like:', (inter_df['is_like'] == 1).sum())

print('number of duration is 0', (inter_df['duration_ms'] == 0).sum())

inter_df['duration_ms'] = inter_df['duration_ms'].replace(0, 1)

threshold = 0.95
print('number_viewratio', (inter_df['play_time_ms']/inter_df['duration_ms']>threshold).sum())

inter_df['view_ratio'] = inter_df.apply(lambda row: 1 if row['play_time_ms'] / row['duration_ms'] > threshold else 0, axis=1)


number of interaction: 5055984


number of behavior list is empty: 1036 1036
number of start session: 863292
number of behavior list is empty: 0 0
number of remaining interaction: 4192604
number of like: 61363
number of duration is 0 340612
number_viewratio 920904


In [35]:
print(inter_df.columns)
inter_df

Index(['user_id', 'video_id', 'time_ms', 'is_like', 'play_time_ms',
       'duration_ms', 'session_id', 'session_order', 'behavior_like',
       'behavior_view', 'session_length', 'view_ratio'],
      dtype='object')


Unnamed: 0,user_id,video_id,time_ms,is_like,play_time_ms,duration_ms,session_id,session_order,behavior_like,behavior_view,session_length,view_ratio
1,0,1329429,1649467982289,0,0,51422,1,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0....",10,0
2,0,346081,1649467982289,0,0,11696,1,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.465...",10,0
3,0,2058916,1649467982289,0,0,66433,1,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10,0
4,0,2528540,1649467982289,0,5332,11450,1,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10,0
5,0,1966868,1649467982289,0,0,22016,1,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0.0, 0....",10,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5055978,999,411299,1650552207405,0,14895,13166,873310,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.040...",5,1
5055979,999,694615,1650552207405,0,3573,35040,873310,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1,...",5,0
5055980,999,1240364,1650552207405,0,927,63566,873310,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.13...",5,0
5055981,999,2514654,1650552207405,0,65048,99100,873310,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.1313, 0...",5,0


In [36]:
# TODO：确定以哪列作为优化目标
data_df = inter_df[['user_id', 'video_id', 'time_ms','session_id', 'session_order','behavior_like','behavior_view','session_length','view_ratio']]
data_df.columns = ['user_id', 'item_id', 'time', 'session_id','c_session_order','c_behavior_like','c_behavior_view','session_length','label']
choose_column = ['user_id', 'item_id', 'label', 'time', 'c_session_order', 'c_behavior_like','c_behavior_view']
data_df

Unnamed: 0,user_id,item_id,time,session_id,c_session_order,c_behavior_like,c_behavior_view,session_length,label
1,0,1329429,1649467982289,1,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0....",10,0
2,0,346081,1649467982289,1,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.465...",10,0
3,0,2058916,1649467982289,1,7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10,0
4,0,2528540,1649467982289,1,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10,0
5,0,1966868,1649467982289,1,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4657, 0.0, 0....",10,0
...,...,...,...,...,...,...,...,...,...
5055978,999,411299,1650552207405,873310,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.040...",5,1
5055979,999,694615,1650552207405,873310,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1,...",5,0
5055980,999,1240364,1650552207405,873310,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.13...",5,0
5055981,999,2514654,1650552207405,873310,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0407, 1, 1.1313, 0...",5,0


In [37]:
print(item_df.columns)
item_df

Index(['video_id', 'video_duration', 'play_cnt', 'like_cnt'], dtype='object')


Unnamed: 0,video_id,video_duration,play_cnt,like_cnt
0,0,87433.0,816.882353,6.470588
1,1,218066.0,2116.250000,38.161765
2,2,9233.0,425.681319,6.626374
3,3,16433.0,940.027778,5.402778
4,4,38766.0,4304.314607,457.235955
...,...,...,...,...
4371863,4371895,8800.0,360.416667,20.604167
4371864,4371896,,386.238806,23.940299
4371865,4371897,6800.0,4662.918367,75.551020
4371866,4371898,8680.0,52551.327586,658.672414


In [38]:
meta_df = item_df[['video_id', 'video_duration', 'play_cnt', 'like_cnt']]

# The header should start with i_ and the values need to be discrete and finite.
meta_df.columns = ['item_id', 'i_duration:numeric', 'i_vvall:numeric','i_likecnt:numeric']
meta_df

Unnamed: 0,item_id,i_duration:numeric,i_vvall:numeric,i_likecnt:numeric
0,0,87433.0,816.882353,6.470588
1,1,218066.0,2116.250000,38.161765
2,2,9233.0,425.681319,6.626374
3,3,16433.0,940.027778,5.402778
4,4,38766.0,4304.314607,457.235955
...,...,...,...,...
4371863,4371895,8800.0,360.416667,20.604167
4371864,4371896,,386.238806,23.940299
4371865,4371897,6800.0,4662.918367,75.551020
4371866,4371898,8680.0,52551.327586,658.672414


In [39]:
# 用均值填充 NaN
columns_to_normalize = ['i_duration:numeric', 'i_vvall:numeric', 'i_likecnt:numeric']
for column in columns_to_normalize:
    mean_value = meta_df[column].mean()
    meta_df[column].fillna(mean_value, inplace=True)
# 归一化
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
meta_df[columns_to_normalize] = scaler.fit_transform(meta_df[columns_to_normalize])
meta_df


Unnamed: 0,item_id,i_duration:numeric,i_vvall:numeric,i_likecnt:numeric
0,0,0.005292,0.000152,0.000030
1,1,0.013203,0.000395,0.000174
2,2,0.000557,0.000079,0.000030
3,3,0.000993,0.000175,0.000025
4,4,0.002345,0.000803,0.002090
...,...,...,...,...
4371863,4371895,0.000530,0.000067,0.000094
4371864,4371896,0.004582,0.000072,0.000109
4371865,4371897,0.000409,0.000870,0.000345
4371866,4371898,0.000523,0.009803,0.003011


### Statistics

In [40]:
n_users = data_df['user_id'].value_counts().size
n_items = data_df['item_id'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['time'].min()
max_time = data_df['time'].max()

In [41]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(int(min_time/1000)).strftime(time_format),
    datetime.utcfromtimestamp(int(max_time/1000)).strftime(time_format))
)

# Users: 983
# Items: 1875490
# Interactions: 4192604
Time Span: 2022-04-07/2022-04-21


# Build Dataset

### Interaction data

In [42]:
np.random.seed(RANDOM_SEED)

In [None]:
out_df = data_df


# reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'label', 'time'])
out_df

In [None]:
like_df = out_df[out_df['label']==1]
like_df.sort_values(by=['time', 'user_id'], kind='mergesort', inplace=True)
like_df = like_df.reset_index(drop=True)

out_df=like_df

like_df

In [45]:
# leave one out spliting

liked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    liked_item_set[user_id] = set(seq_df['item_id'].values.tolist())

out_df.sort_values(by=['user_id','time'], kind='mergesort', inplace=True)
out_df

Unnamed: 0,user_id,item_id,time,session_id,c_session_order,c_behavior_like,c_behavior_view,session_length,label
91710,1,1072017,1649477382190,3,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0225, 0.1503, 0.05...",6,1
258102,1,1526303,1649673423739,6,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10,1
258425,1,585,1649673704264,15,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0891, 0....",6,1
258426,1,449082,1649673704264,15,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0891, 0.0308, 0.15...",6,1
258709,1,640368,1649673930311,17,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6,1
...,...,...,...,...,...,...,...,...,...
914343,983,754698,1650552181794,873309,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.1901, 1....",4,1
914344,983,1732614,1650552181794,873309,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.1901, 1.0326,...",4,1
914345,983,1005263,1650552181794,873309,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4,1
914373,983,242026,1650552207405,873310,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5,1


In [46]:
session_last = {}
def split_dataset():
    dev_ui_list=[]
    test_ui_list=[]
    for i in range(len(out_df) ):
        # if out_df['session_length'][i]<7:
        #     continue
        session_id = out_df['session_id'][i]
        if session_id not in session_last:
            session_last[session_id]={}
        session_last[session_id][out_df['c_session_order'][i]] = [out_df['user_id'][i],out_df['item_id'][i]]
    for session_id in session_last:
        my_dict = session_last[session_id]
        if len(my_dict)<4:
            continue
        # elif len(my_dict)==3:
        #     if np.random.random()<0.8:
        #         continue
        max_order = max(my_dict, key=my_dict.get)
        test_ui_list.append((my_dict[max_order][0],my_dict[max_order][1]))
        del my_dict[max_order]
        second_max_order = max(my_dict, key=my_dict.get)
        dev_ui_list.append((my_dict[second_max_order][0],my_dict[second_max_order][1]))
    return dev_ui_list, test_ui_list

dev_ui_list, test_ui_list = split_dataset()
print(len(dev_ui_list),len(test_ui_list))

39985 39985


In [47]:
out_df = out_df[choose_column]

out_df

Unnamed: 0,user_id,item_id,label,time,c_session_order,c_behavior_like,c_behavior_view
91710,1,1072017,1,1649477382190,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0225, 0.1503, 0.05..."
258102,1,1526303,1,1649673423739,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
258425,1,585,1,1649673704264,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0891, 0...."
258426,1,449082,1,1649673704264,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0891, 0.0308, 0.15..."
258709,1,640368,1,1649673930311,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...
914343,983,754698,1,1650552181794,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.1901, 1...."
914344,983,1732614,1,1650552181794,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.1901, 1.0326,..."
914345,983,1005263,1,1650552181794,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
914373,983,242026,1,1650552207405,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [48]:
# 全负采样
def generate_dev_test(data_df, dev_list, test_list):
    dev_df = pd.DataFrame(columns=data_df.columns)
    test_df = pd.DataFrame(columns=data_df.columns)
    n_items = data_df['item_id'].value_counts().size
    print('n_items',n_items)
    for index, row in data_df.iterrows():
        uid = data_df['user_id'][index]
        iid = data_df['item_id'][index]
        if (uid,iid) in dev_list:
            dev_df = dev_df.append(row)
        elif (uid,iid) in test_list:
            test_df = test_df.append(row)
    
    neg_items_dev = np.random.randint(1, n_items + 1, (len(dev_df), NEG_ITEMS))
    for i, uid in enumerate(dev_df['user_id'].values):
        user_liked = liked_item_set[uid]
        mask = np.isin(neg_items_dev[i], user_liked)
        while np.any(mask):
            neg_items_dev[i][mask] = np.random.randint(1, n_items + 1, mask.sum())
            mask = np.isin(neg_items_dev[i], user_liked)
    dev_df['neg_items'] = neg_items_dev.tolist()
    
    neg_items_test = np.random.randint(1, n_items + 1, (len(test_df), NEG_ITEMS))
    for i, uid in enumerate(test_df['user_id'].values):
        user_liked = liked_item_set[uid]
        mask = np.isin(neg_items_test[i], user_liked)
        while np.any(mask):
            neg_items_test[i][mask] = np.random.randint(1, n_items + 1, mask.sum())
            mask = np.isin(neg_items_test[i], user_liked)
    test_df['neg_items'] = neg_items_test.tolist()
    
    data_df = data_df.drop(dev_df.index)
    data_df = data_df.drop(test_df.index)
    
    return dev_df, test_df, data_df

dev_df, test_df, train_df = generate_dev_test(out_df, dev_ui_list, test_ui_list)
len(train_df), len(dev_df), len(test_df)

n_items 500310


(837887, 39987, 39986)

In [49]:
train_df

Unnamed: 0,user_id,item_id,label,time,c_session_order,c_behavior_like,c_behavior_view
91710,1,1072017,1,1649477382190,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0225, 0.1503, 0.05..."
258102,1,1526303,1,1649673423739,9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
258425,1,585,1,1649673704264,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0891, 0...."
258426,1,449082,1,1649673704264,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0891, 0.0308, 0.15..."
258709,1,640368,1,1649673930311,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...
914343,983,754698,1,1650552181794,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.1901, 1...."
914344,983,1732614,1,1650552181794,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.1901, 1.0326,..."
914345,983,1005263,1,1650552181794,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
914373,983,242026,1,1650552207405,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [50]:
for column in train_df.columns:
    dev_df[column] = dev_df[column].astype(train_df[column].dtype)
dev_df

Unnamed: 0,user_id,item_id,label,time,c_session_order,c_behavior_like,c_behavior_view,neg_items
754420,1,1250601,1,1650340124136,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0687, 1.9189,...","[461485, 305712, 435830, 117953, 439108, 15231..."
805698,1,868552,1,1650413609180,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[345136, 62080, 210933, 234628, 259021, 138085..."
276251,2,1371705,1,1649688035288,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.318...","[461844, 438880, 3913, 181659, 465347, 21753, ..."
284612,2,1062152,1,1649706375845,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[279727, 291989, 84918, 38430, 343620, 338468,..."
284679,2,1679954,1,1649706789917,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.011, 1, ...","[418002, 449130, 63419, 429953, 352215, 491072..."
...,...,...,...,...,...,...,...,...
805843,983,1704823,1,1650413834787,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.6072, 1, 0.0175, 0...","[164819, 254771, 440834, 29216, 10936, 263130,..."
821648,983,1626125,1,1650433992716,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0427, 0....","[229758, 67222, 170260, 247469, 122305, 459267..."
849784,983,1047674,1,1650462230996,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5799, 1....","[144274, 138529, 7862, 46990, 480408, 192598, ..."
882811,983,1658222,1,1650519935384,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.8356, 1.4947, 1.08...","[259076, 37947, 451990, 49264, 177689, 130784,..."


In [51]:
for column in train_df.columns:
    test_df[column] = test_df[column].astype(train_df[column].dtype)
test_df

Unnamed: 0,user_id,item_id,label,time,c_session_order,c_behavior_like,c_behavior_view,neg_items
754418,1,1431797,1,1650340124136,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[390689, 448558, 130226, 344950, 220447, 13816..."
805695,1,1316634,1,1650413609180,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0608, 1.8014, 3.14...","[157767, 346503, 6562, 77978, 483588, 306277, ..."
276252,2,1701825,1,1649688035288,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[296112, 408564, 215408, 35895, 87081, 438218,..."
284609,2,1313137,1,1649706375845,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 2.8764, 1.1066, 1.49...","[237481, 108000, 103387, 371445, 210908, 11453..."
284681,2,1771735,1,1649706789917,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.011, 1, 0.0702, 1.0069,...","[123346, 91661, 380173, 282062, 234885, 476801..."
...,...,...,...,...,...,...,...,...
805844,983,1779539,1,1650413834787,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6072, 1,...","[251240, 338379, 414247, 265027, 105852, 11464..."
821649,983,1638405,1,1650433992716,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0427, 0.0183, 1.08...","[494087, 91520, 165081, 185932, 55948, 297082,..."
849782,983,1486510,1,1650462230996,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5799, 1.0975,...","[455933, 167082, 298273, 230618, 8679, 298752,..."
882809,983,1833797,1,1650519935384,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8356, 1.4947,...","[445258, 7357, 34858, 155536, 68857, 321601, 6..."


In [52]:
# save results

train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)

In [53]:
meta_df

Unnamed: 0,item_id,i_duration:numeric,i_vvall:numeric,i_likecnt:numeric
0,0,0.005292,0.000152,0.000030
1,1,0.013203,0.000395,0.000174
2,2,0.000557,0.000079,0.000030
3,3,0.000993,0.000175,0.000025
4,4,0.002345,0.000803,0.002090
...,...,...,...,...
4371863,4371895,0.000530,0.000067,0.000094
4371864,4371896,0.004582,0.000072,0.000109
4371865,4371897,0.000409,0.000870,0.000345
4371866,4371898,0.000523,0.009803,0.003011


### Item Metadata

In [None]:
meta_df_new = meta_df[meta_df['item_id'].apply(lambda x: x in item2id)] 
meta_df_new['item_id'] = meta_df_new['item_id'].apply(lambda x: item2id[x])


In [55]:
meta_df_new = meta_df_new.reset_index(drop=True)
meta_df_new

Unnamed: 0,item_id,i_duration:numeric,i_vvall:numeric,i_likecnt:numeric
0,1,0.013203,0.000395,0.000174
1,2,0.002345,0.000803,0.002090
2,3,0.009103,0.003816,0.007823
3,4,0.003748,0.000247,0.000084
4,5,0.001221,0.000250,0.000068
...,...,...,...,...
1875485,1875486,0.006175,0.000051,0.000037
1875486,1875487,0.008219,0.000010,0.000006
1875487,1875488,0.004582,0.000072,0.000109
1875488,1875489,0.000523,0.009803,0.003011


In [56]:
for column in meta_df_new.columns:
	mean_value = meta_df_new[column].mean()
	meta_df_new[column].fillna(mean_value, inplace=True)

In [57]:
meta_df_new

Unnamed: 0,item_id,i_duration:numeric,i_vvall:numeric,i_likecnt:numeric
0,1,0.013203,0.000395,0.000174
1,2,0.002345,0.000803,0.002090
2,3,0.009103,0.003816,0.007823
3,4,0.003748,0.000247,0.000084
4,5,0.001221,0.000250,0.000068
...,...,...,...,...
1875485,1875486,0.006175,0.000051,0.000037
1875486,1875487,0.008219,0.000010,0.000006
1875487,1875488,0.004582,0.000072,0.000109
1875488,1875489,0.000523,0.009803,0.003011


In [58]:
# save results

meta_df_new.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)

### User Metadata