In [1]:
import pandas as pd
import numpy as np

# Load the tab-separated interaction table into a DataFrame.
# The header names are used verbatim as column labels by the cells below
# (e.g. "user_id:token", "item_id:token", "rating:float").
INTERACTIONS_PATH = "ml-20m.inter"
raw_data = pd.read_csv(INTERACTIONS_PATH, sep="\t")

In [5]:
# Plain-text dump of the interaction table.
print(raw_data)

# Dataset size: number of interactions, distinct users and distinct items.
n_interactions = len(raw_data)
n_users = len(raw_data["user_id:token"].unique())
n_items = len(raw_data["item_id:token"].unique())
print("inter: ", n_interactions, "U: ", n_users, "I: ", n_items)

          user_id:token  item_id:token  rating:float  timestamp:float
0                     1              2           3.5       1112486027
1                     1             29           3.5       1112484676
2                     1             32           3.5       1112484819
3                     1             47           3.5       1112484727
4                     1             50           3.5       1112484580
...                 ...            ...           ...              ...
20000258         138493          68954           4.5       1258126920
20000259         138493          69526           4.5       1259865108
20000260         138493          69644           3.0       1260209457
20000261         138493          70286           5.0       1258126944
20000262         138493          71619           2.5       1255811136

[20000263 rows x 4 columns]
inter:  20000263 U:  138493 I:  26744


In [6]:
from collections import Counter


def _report_size(frame):
    """Print the interaction count and the number of distinct users and items in `frame`."""
    print("inter: ", len(frame), "U: ", len(frame["user_id:token"].unique()), "I: ", len(frame["item_id:token"].unique()))


def _drop_infrequent(frame, column, min_count=5):
    """Return `frame` without the rows whose `column` value occurs fewer than `min_count` times."""
    occurrences = Counter(frame[column].tolist())
    too_rare = [value for value, seen in occurrences.items() if seen < min_count]
    return frame[~frame[column].isin(too_rare)]


# Keep only interactions rated strictly above 4.
raw_data = raw_data.loc[raw_data["rating:float"]>4]
_report_size(raw_data)

# 5-Core Filter
# Items are pruned first, then users, each exactly once.
# NOTE(review): because this is a single pass, removing users afterwards can
# push some items back below 5 interactions; a strict 5-core would repeat both
# steps until nothing changes. Confirm whether the single pass is intentional.
raw_data = _drop_infrequent(raw_data, "item_id:token")
raw_data = _drop_infrequent(raw_data, "user_id:token")

_report_size(raw_data)

inter:  4433484 U:  136472 I:  17218
inter:  4378719 U:  120685 I:  10810


In [5]:
# Collapse the interaction table to one row per user. Each row holds two
# parallel Python lists: the user's item ids and the matching timestamps,
# in the order the rows appear in `raw_data`.
raw_data_grouped = (
    raw_data
    .groupby("user_id:token")
    .agg(
        item_seq=("item_id:token", list),
        time_seq=("timestamp:float", list),
    )
)

In [6]:
def _dedupe_then_sort(items, times):
    """Drop repeated items, then order the survivors chronologically.

    Parameters
    ----------
    items : sequence of item ids (list or 1-D array).
    times : sequence of timestamps, parallel to `items`.

    Returns
    -------
    (item_array, time_array) : two numpy arrays of equal length. For every
    distinct item only its first occurrence in `items` is kept; the kept
    entries are then sorted by ascending timestamp.
    """
    # A set gives O(1) membership checks; the list only records positions.
    seen = set()
    first_positions = []
    for pos, item in enumerate(items):
        if item not in seen:
            seen.add(item)
            first_positions.append(pos)

    kept_items = np.asarray(items)[first_positions]
    kept_times = np.asarray(times)[first_positions]

    # Many interactions share a timestamp. argsort's default algorithm is not
    # stable, so the relative order of such ties is not guaranteed; a stable
    # sort keeps tied items in their incoming order, making the result
    # reproducible.
    order = np.argsort(kept_times, kind="stable")
    return kept_items[order], kept_times[order]


# Rewrite every user's sequences in place with the de-duplicated,
# time-ordered arrays.
for row in raw_data_grouped.index:
    items, times = _dedupe_then_sort(
        raw_data_grouped.loc[row, "item_seq"],
        raw_data_grouped.loc[row, "time_seq"],
    )
    raw_data_grouped.at[row, "item_seq"] = items
    raw_data_grouped.at[row, "time_seq"] = times

In [7]:
# Discard users whose item sequence has fewer than 3 or more than 51 entries.
seq_lengths = raw_data_grouped["item_seq"].map(len)
out_of_range = seq_lengths[(seq_lengths < 3) | (seq_lengths > 51)].index

raw_data_grouped.drop(index=out_of_range, inplace=True)
# Turn the current index into a regular column and renumber the rows from 0.
raw_data_grouped.reset_index(inplace=True)

In [8]:
# Display the per-user sequence table as the cell output.
raw_data_grouped

Unnamed: 0,user_id:token,item_seq,time_seq
0,1,"[8507, 5952, 1198, 7153, 4993, 1196, 8636]","[1094786027, 1112484619, 1112484624, 111248463..."
1,2,"[62, 1974, 1356, 1210, 589, 3513, 2948, 1259, ...","[974820598, 974820598, 974820598, 974820598, 9..."
2,5,"[62, 141, 780, 736, 671, 832, 1393, 590, 150, ...","[851526935, 851526935, 851526935, 851526935, 8..."
3,6,"[1, 17, 62, 141, 648, 7, 52]","[858275452, 858275452, 858275452, 858275452, 8..."
4,7,"[1196, 912, 1210, 4963, 4306, 1256, 4799, 2028...","[1011204572, 1011204596, 1011204654, 101120477..."
...,...,...,...
96721,138485,"[79132, 2571, 48780, 858, 1221, 58559, 2959, 2...","[1346728359, 1346728363, 1346728381, 134672840..."
96722,138487,"[866, 1172, 3094, 920, 2067, 265, 446, 25, 129...","[965499200, 965499200, 965499234, 965499258, 9..."
96723,138489,"[318, 858, 50, 2019, 912, 1221, 1193, 1212, 29...","[1352989275, 1352989278, 1352989283, 135298928..."
96724,138490,"[111, 3217, 1535, 593, 1041, 3006, 34, 314, 32...","[975542655, 975542655, 975542860, 975543071, 9..."


In [9]:
# Re-index the surviving item ids to the contiguous range 1..N.
#
# The distinct ids are sorted before numbering so that the mapping does not
# depend on set iteration order: the same data always yields the same ids, and
# smaller original ids get smaller new ids.
# `np.asarray(...).tolist()` accepts both numpy arrays and plain lists, so the
# cell no longer fails if it is executed again after `item_seq` has already
# been converted to lists (a second run is then an identity remap).
all_items = sorted({
    item
    for seq in raw_data_grouped["item_seq"]
    for item in np.asarray(seq).tolist()
})
item_ids = list(range(1, len(all_items)+1))
item_remap = dict(zip(all_items, item_ids))

# Each sequence becomes a plain Python list of the new ids.
raw_data_grouped["item_seq"] = raw_data_grouped["item_seq"].apply(
    lambda seq: [item_remap[item] for item in np.asarray(seq).tolist()]
)

In [10]:
# Number of distinct items that received a new id.
len(all_items)

10154

In [14]:
MAX_SEQ_LEN = 51  # longer sequences are cut into consecutive chunks of this size
MIN_SEQ_LEN = 3   # a chunk needs 3 items so the train row still has a non-empty history
HEADER = "session_id:token\titem_id_list:token_seq\titem_id:token\n"


def _format_row(session_id, items):
    """Render one output line: session id, space-joined history, then the target.

    The last element of `items` is the target; everything before it is the history.
    """
    return f"{session_id}\t" + " ".join(items[:-1]) + f"\t{items[-1]}\n"


# Leave-one-out split per chunk:
#   test  row -> history = all but the last item,      target = last item
#   train row -> history = all but the last two items, target = second-to-last item
# The `with` block guarantees both files are flushed and closed even if
# writing a row raises part-way through.
with open("ml-20m.train.inter", "w") as train_file, open("ml-20m.test.inter", "w") as test_file:
    train_file.write(HEADER)
    test_file.write(HEADER)

    for idx in raw_data_grouped.index:
        # NOTE(review): the session id written out is the DataFrame index label,
        # not the value of the "user_id:token" column — confirm this is intended.
        uid = idx
        item_seq = [str(item) for item in raw_data_grouped.loc[idx, "item_seq"]]
        for start in range(0, len(item_seq), MAX_SEQ_LEN):
            chunk = item_seq[start:start + MAX_SEQ_LEN]
            if len(chunk) < MIN_SEQ_LEN:
                # Only a trailing remainder can be this short; it is dropped.
                continue
            test_file.write(_format_row(uid, chunk))
            train_file.write(_format_row(uid, chunk[:-1]))