In [101]:
import os
import re
import zipfile
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [102]:
DATASET = 'MicroVideo_history' 
RAW_PATH = os.path.join('./', DATASET)

RANDOM_SEED = 0
NEG_ITEMS = 999

# Load Data

1. Load interaction data and item metadata
2. Filter out items with less than 5 interactions
3. Calculate basic statistics

In [103]:
if not os.path.exists(RAW_PATH):
    subprocess.call('mkdir ' + RAW_PATH, shell=True)

inter_df = pd.read_csv('../inter_df.csv',sep=',',dtype={'user_id':int,'item_id':int,'start_time':int})
item_df = pd.read_csv('../item_df.csv',sep=',',dtype={'item_id':int})

In [None]:
print(len(inter_df))
# inter_df = inter_df.sample(n=300000, random_state=2022)
inter_df

In [105]:
print("number of interaction:", len(inter_df))
      
print('number of start session:', (inter_df['session_order'] == 0).sum())
inter_df = inter_df[inter_df['session_order'] != 0]

condition = inter_df['session_order']==0
inter_df = inter_df[~condition]
print("number of remaining interaction:", len(inter_df))

print('number of like:', (inter_df['like'] == 1).sum())

threshold = 0.95
print('number of play ratio', (inter_df['play_ratio']>threshold).sum())

print('#user', len(inter_df['user_id'].value_counts()))
print('#items', len(inter_df['item_id'].value_counts()))

inter_df['view_ratio'] = inter_df.apply(lambda row: 1 if row['play_ratio'] > threshold else 0, axis=1)
inter_df.drop(['Unnamed: 0.1','end_time'], axis=1, inplace=True)


number of interaction: 1556288
number of start session: 190721
number of remaining interaction: 1365567
number of like: 25111
number of play ratio 319894
#user 3082
#items 708210


In [None]:
print(inter_df.columns)
inter_df = inter_df.reset_index(drop=True)
inter_df

In [None]:
data_df = inter_df[['user_id', 'item_id', 'start_time','session_id', 'session_order','behavior_like','behavior_view','view_ratio','date']]
data_df.columns = ['user_id', 'item_id', 'time', 'session_id','c_session_order','c_behavior_like','c_behavior_view','label','date']
choose_column = ['user_id', 'item_id', 'label', 'time', 'c_session_order', 'c_behavior_like','c_behavior_view']
data_df

In [None]:
print(item_df.columns)
item_df

In [None]:
meta_df = item_df[['item_id', 'item_duration', 'vv_all', 'like_cnt_all']]
meta_df.columns = ['item_id', 'i_duration:numeric', 'i_vvall:numeric','i_likecnt:numeric']
meta_df

In [None]:
# 用均值填充 NaN
columns_to_normalize = ['i_duration:numeric', 'i_vvall:numeric', 'i_likecnt:numeric']
for column in columns_to_normalize:
    mean_value = meta_df[column].mean()
    meta_df[column].fillna(mean_value, inplace=True)
# 归一化
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
meta_df[columns_to_normalize] = scaler.fit_transform(meta_df[columns_to_normalize])
meta_df


### Statistics

In [111]:
n_users = data_df['user_id'].value_counts().size
n_items = data_df['item_id'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['time'].min()
max_time = data_df['time'].max()

time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(int(min_time/1000)).strftime(time_format),
    datetime.utcfromtimestamp(int(max_time/1000)).strftime(time_format))
)

# Users: 3082
# Items: 708210
# Interactions: 1365567
Time Span: 1970-01-20/1970-01-20


# Build Dataset

### Interaction data

In [112]:
np.random.seed(RANDOM_SEED)

In [None]:
out_df = data_df


# reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'label', 'time'])
out_df

In [None]:
like_df = out_df[out_df['label']==1]
like_df.sort_values(by=['time', 'user_id'], kind='mergesort', inplace=True)
like_df = like_df.reset_index(drop=True)

out_df=like_df

like_df

In [None]:
# leave one out spliting

liked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    liked_item_set[user_id] = set(seq_df['item_id'].values.tolist())

out_df.sort_values(by=['user_id','time'], kind='mergesort', inplace=True)
out_df

In [116]:
session_last = {}
def split_dataset():
    dev_ui_list=[]
    test_ui_list=[]
    for i in range(len(out_df) ):
        user_id = out_df['user_id'][i]
        session_id = out_df['session_id'][i]
        date = out_df['date'][i]
        if user_id not in session_last:
            session_last[user_id]={}
        if date not in session_last[user_id]:
            session_last[user_id][date]={}
        if session_id not in session_last[user_id][date]:
            session_last[user_id][date][session_id]={}
        session_last[user_id][date][session_id][out_df['c_session_order'][i]] = [out_df['user_id'][i],out_df['item_id'][i]]
    for user_id in session_last:
        for date in session_last[user_id]:
            for session_id in session_last[user_id][date]:
                my_dict = session_last[user_id][date][session_id]
                if len(my_dict)<4:
                    continue
                max_order = max(my_dict, key=my_dict.get)
                test_ui_list.append((my_dict[max_order][0],my_dict[max_order][1]))
                del my_dict[max_order]
                second_max_order = max(my_dict, key=my_dict.get)
                dev_ui_list.append((my_dict[second_max_order][0],my_dict[second_max_order][1]))
    return dev_ui_list, test_ui_list

dev_ui_list, test_ui_list = split_dataset()
print(len(dev_ui_list),len(test_ui_list))

28606 28606


In [117]:
out_df = out_df[choose_column]

out_df

Unnamed: 0,user_id,item_id,label,time,c_session_order,c_behavior_like,c_behavior_view
41550,1,318114,1,1677660420,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
42391,1,317260,1,1677661134,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005..."
43100,1,186283,1,1677661739,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0052, 0...."
43313,1,155566,1,1677661932,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0052, 0.0, 4...."
43999,1,124718,1,1677662513,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...
312381,3082,244988,1,1678022744,13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1.228, 0.4154, 1.0922, 4.1639, 0.1193, 0.0, 1..."
313118,3082,700225,1,1678023491,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5742, 0...."
313471,3082,252209,1,1678023870,8,"[0.0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.5742, 0.0066, 0.0905, 1.735, 0.0,..."
313553,3082,367520,1,1678023952,9,"[0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.5742, 0.0066, 0.0905, 1.735, 0.0, 0.14..."


In [118]:
def generate_dev_test(data_df, dev_list, test_list):
    dev_df = pd.DataFrame(columns=data_df.columns)
    test_df = pd.DataFrame(columns=data_df.columns)
    n_items = data_df['item_id'].value_counts().size
    print('n_items',n_items)
    for index, row in data_df.iterrows():
        uid = data_df['user_id'][index]
        iid = data_df['item_id'][index]
        if (uid,iid) in dev_list:
            dev_df = dev_df.append(row)
        elif (uid,iid) in test_list:
            test_df = test_df.append(row)
    
    neg_items_dev = np.random.randint(1, n_items + 1, (len(dev_df), NEG_ITEMS))
    for i, uid in enumerate(dev_df['user_id'].values):
        user_liked = liked_item_set[uid]
        mask = np.isin(neg_items_dev[i], user_liked)
        while np.any(mask):
            neg_items_dev[i][mask] = np.random.randint(1, n_items + 1, mask.sum())
            mask = np.isin(neg_items_dev[i], user_liked)
    dev_df['neg_items'] = neg_items_dev.tolist()
    
    neg_items_test = np.random.randint(1, n_items + 1, (len(test_df), NEG_ITEMS))
    for i, uid in enumerate(test_df['user_id'].values):
        user_liked = liked_item_set[uid]
        mask = np.isin(neg_items_test[i], user_liked)
        while np.any(mask):
            neg_items_test[i][mask] = np.random.randint(1, n_items + 1, mask.sum())
            mask = np.isin(neg_items_test[i], user_liked)
    test_df['neg_items'] = neg_items_test.tolist()
    
    data_df = data_df.drop(dev_df.index)
    data_df = data_df.drop(test_df.index)
    
    return dev_df, test_df, data_df

dev_df, test_df, train_df = generate_dev_test(out_df, dev_ui_list, test_ui_list)
len(train_df), len(dev_df), len(test_df)

n_items 170595


(261533, 29353, 29008)

In [119]:
train_df

Unnamed: 0,user_id,item_id,label,time,c_session_order,c_behavior_like,c_behavior_view
41550,1,318114,1,1677660420,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
42391,1,317260,1,1677661134,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005..."
43100,1,186283,1,1677661739,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0052, 0...."
43313,1,155566,1,1677661932,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0052, 0.0, 4...."
43999,1,124718,1,1677662513,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...
312022,3082,272568,1,1678022386,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.6215, 1.0689, 0.0,..."
312264,3082,686159,1,1678022641,12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 1.228, 0.4154, 1.0922, 4.1639, 0.1193, 0..."
312381,3082,244988,1,1678022744,13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1.228, 0.4154, 1.0922, 4.1639, 0.1193, 0.0, 1..."
313471,3082,252209,1,1678023870,8,"[0.0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.5742, 0.0066, 0.0905, 1.735, 0.0,..."


In [120]:
for column in train_df.columns:
    dev_df[column] = dev_df[column].astype(train_df[column].dtype)
dev_df

Unnamed: 0,user_id,item_id,label,time,c_session_order,c_behavior_like,c_behavior_view,neg_items
81506,1,475971,1,1677720122,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.403, 0.17, 0.014, ...","[43568, 117953, 152316, 95940, 97640, 41994, 1..."
241666,1,646250,1,1677926132,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[139962, 85399, 40720, 140176, 68936, 79132, 1..."
39012,3,425728,1,1677658216,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1282, 0....","[820, 138273, 156056, 87802, 58204, 70407, 952..."
62063,3,457823,1,1677679360,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0385, 1.327, 1.439...","[34220, 4506, 32897, 71804, 100776, 65923, 148..."
92434,3,489051,1,1677732684,7,"[0.0, 0.0, 0.0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.2547, 2.2432, 0.0102, 0...","[153484, 146516, 68987, 26520, 34227, 135802, ..."
...,...,...,...,...,...,...,...,...
305137,3082,646746,1,1678015152,8,"[0.0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.145, 0.0333, 1.3149, 0.5072, 0.03...","[154603, 1005, 65503, 72804, 53094, 114011, 15..."
306821,3082,702064,1,1678016917,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.402...","[62940, 131487, 76548, 59546, 14596, 58350, 12..."
308069,3082,691599,1,1678018158,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[86917, 130994, 153061, 148244, 111730, 38183,..."
312117,3082,699059,1,1678022500,9,"[0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.6215, 1.0689, 0.0, 1.228, 0.4154, 1.09...","[114042, 36020, 24295, 114454, 163242, 120824,..."


In [121]:
for column in train_df.columns:
    test_df[column] = test_df[column].astype(train_df[column].dtype)
test_df

Unnamed: 0,user_id,item_id,label,time,c_session_order,c_behavior_like,c_behavior_view,neg_items
82128,1,481985,1,1677720984,22,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.001, 0.0135, 1.0, 0.81, 0.1323, 0.0502, 0.0...","[86772, 69347, 69253, 141580, 52912, 115385, 7..."
242434,1,650034,1,1677926943,15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.1672, 0.0551, 0.0528, 1.1205, 0.0858, 0.458...","[51602, 122329, 34428, 78528, 134561, 80098, 7..."
39749,3,427155,1,1677658843,13,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1.8634, 0.0332, 0.1079, 0.138, 0.117, 0.0405,...","[161970, 102482, 90818, 148937, 85704, 158934,..."
61553,3,467095,1,1677678834,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[54298, 30466, 135735, 111454, 26851, 163092, ..."
92358,3,493148,1,1677732613,6,"[0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.2547, 2.2432, 0.01...","[141310, 11181, 83425, 156576, 105742, 136925,..."
...,...,...,...,...,...,...,...,...
305117,3082,697342,1,1678015128,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.145, 0.0333, 1.314...","[148976, 29759, 75818, 169556, 37055, 796, 116..."
306753,3082,702133,1,1678016844,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[30628, 52817, 121606, 118440, 296, 154534, 53..."
308306,3082,700890,1,1678018406,6,"[0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 1.0, 7.4435, 4.7278, 0.37...","[135890, 7423, 100754, 77758, 56094, 142201, 4..."
312041,3082,703537,1,1678022407,6,"[0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.6215, 1.0689, 0.0, 1.22...","[117943, 53748, 115320, 167864, 39065, 72505, ..."


In [122]:
# save results

train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)

In [None]:
meta_df

### Item Metadata

In [None]:
meta_df_new = meta_df[meta_df['item_id'].apply(lambda x: x in item2id)] 
meta_df_new['item_id'] = meta_df_new['item_id'].apply(lambda x: item2id[x])


In [None]:
meta_df_new = meta_df_new.reset_index(drop=True)
meta_df_new

In [126]:
for column in meta_df_new.columns:
	mean_value = meta_df_new[column].mean()
	meta_df_new[column].fillna(mean_value, inplace=True)

In [None]:
meta_df_new

In [128]:
# save results

meta_df_new.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)

### User Metadata