In [2]:
import scipy.sparse

"""
用留一法（leave-one-out）处理mat类型的原始数据集，训练集只有正样本，测试集包括每个用户的1个正样本和99个负样本。
raw data `.mat` -> `train.mat` 和 `test.mat`
"""

'\n用留一法（leave-one-out）处理mat类型的原始数据集，训练集只有正样本，测试集包括每个用户的1个正样本和99个负样本。\nraw data `.mat` -> `train.mat` 和 `test.mat`\n'

In [3]:
import os
from tqdm import tqdm
import scipy.io as spio
from scipy.sparse import csr_matrix, coo_matrix
import numpy as np
import random
import pandas as pd

seed = 2022

random.seed(seed)
np.random.seed(seed)
dataset_name = "citeulike-a"
target_data_path = "/data/tshuang/Projects/sample_sauc/datasets/"
raw_data_path = os.path.join(target_data_path, dataset_name, "raw")

In [4]:
data = pd.read_csv(os.path.join(raw_data_path, "citeulike.csv"), names=["uid", "iid"])
data

Unnamed: 0,uid,iid
0,0,11009
1,0,11650
2,0,12802
3,0,3332
4,0,12803
5,0,12804
6,0,3591
7,0,4871
8,0,12805
9,0,13187


In [5]:
print(data.uid.nunique())
print(data.iid.nunique())

5139
16968


In [6]:
# 去掉为空的行，即未交互的用户，再重置user_index
idx = -1
new_uid = []
for i in range(data.shape[0]):
    if i == 0 or data.iloc[i,0] != data.iloc[i-1, 0]:
        idx += 1
    new_uid.append(idx)


In [7]:
mat = coo_matrix(([1]*data.shape[0], (new_uid, data.iid)))
mat.A

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
# mat = spio.loadmat(os.path.join(raw_data_path, dataset_name + "data.mat"))['data']
# mat

# 交互数据的划分

In [9]:
# 按比例划分

# def split_test_matrix(mat, ratio=0.8):
#     mat = mat.tocsr()  #按行读取，即每一行为一个用户
#     m,n = mat.shape
#     train_data_indices = []
#     train_indptr = [0] * (m+1)
#     test_data_indices = []
#     test_indptr = [0] * (m+1)
#     with tqdm(range(m)) as temp:
#         for i in temp:
#             row = [(mat.indices[j], mat.data[j]) for j in range(mat.indptr[i], mat.indptr[i+1])]
#             train_idx = random.sample(range(len(row)), round(ratio * len(row)))
#             train_binary_idx = np.full(len(row), False)
#             train_binary_idx[train_idx] = True
#             test_idx = (~train_binary_idx).nonzero()[0]
#             for idx in train_idx:
#                 train_data_indices.append(row[idx])
#             train_indptr[i+1] = len(train_data_indices)
#             for idx in test_idx:
#                 test_data_indices.append(row[idx])
#             test_indptr[i+1] = len(test_data_indices)
#
#     [train_indices, train_data] = zip(*train_data_indices)
#     [test_indices, test_data] = zip(*test_data_indices)
#
#     train_mat = csr_matrix((train_data, train_indices, train_indptr), (m,n))
#     test_mat = csr_matrix((test_data, test_indices, test_indptr), (m,n))
#     # save
#     spio.savemat(os.path.join(target_data_path, dataset_name + "_train.mat"), {"train_mat": train_mat})
#     spio.savemat(os.path.join(target_data_path, dataset_name + "_test.mat"), {"test_mat": test_mat})

# mat = spio.loadmat(os.path.join(raw_data_path, dataset_name + "data.mat"))['data']
# split_test_matrix(mat, ratio=0.8)

In [10]:
import numpy as np
import pandas as pd
import scipy
# 留一法划分

def split_test_matrix_loo(mat: scipy.sparse.csr_matrix):
    mat = mat.tocsr()  #按行读取，即每一行为一个用户
    m,n = mat.shape
    # m = 5
    train_indices = []
    train_indptr = [0] * (m + 1)  # 每一行的在train_indices的start和end
    test_data = []
    with tqdm(range(m)) as temp:
        for u in temp:
            u_pos_list = [(mat.indices[j]) for j in range(mat.indptr[u], mat.indptr[u+1])]
            # u_pos_list = list(zip(u_pos_list))
            # print(u_pos_list)
            # 采一个正样本用于验证
            test_idx = random.sample(range(len(u_pos_list)), 1)
            u_test_data = [u_pos_list[test_idx[0]]]
            for _ in range(99):  # 99个负样本
                while True:
                    neg_idx = random.randint(0, n-1)
                    if neg_idx not in u_pos_list:
                        u_test_data.append(neg_idx)
                        break
            test_data.append(u_test_data)

            test_binary_idx = np.full(len(u_pos_list), False)
            test_binary_idx[test_idx] = True
            train_idx = (~test_binary_idx).nonzero()[0]  # 这一行啥意思？
            # 拼接成csr
            for idx in train_idx:
                train_indices.append(u_pos_list[idx])
            train_indptr[u + 1] = len(train_indices)
    print(train_indices[0], train_indptr[0])
    train_mat = csr_matrix(([1]*len(train_indices), train_indices, train_indptr), (m, n))
    test_data = pd.DataFrame(np.array(test_data))
    return train_mat, test_data

train_mat, test_data = split_test_matrix_loo(mat)


100%|██████████| 5139/5139 [00:44<00:00, 115.86it/s]

495 0





In [11]:
test_data  # 没有保留未交互的物品

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,16163,9467,14503,10159,1991,16979,13512,10264,401,14135,...,2875,7883,8455,8578,6101,9540,2069,9397,6024,4456
1,8990,14952,9635,6413,89,4995,10354,548,9162,13129,...,8589,1813,10088,14962,13343,16366,3600,981,3047,15248
2,5129,11987,14612,11662,1575,10902,6805,14948,4772,9766,...,12730,1421,14812,8846,11602,14333,9130,11391,4327,296
3,892,4139,15507,11203,5594,10624,6622,3957,15620,12547,...,631,2774,986,10758,1601,14356,1148,8232,6448,8556
4,11877,3528,8712,4145,1937,10024,2523,3681,8224,5161,...,9974,3051,15241,10728,6909,8516,114,12066,16918,13613
5,3033,7569,2509,6500,8898,15231,4014,4849,6313,10046,...,2112,7881,7579,8141,4136,3252,4743,5420,7661,13894
6,7495,14106,709,10542,11035,15960,16786,6212,11868,7382,...,14073,2014,15815,2470,16613,10201,6424,10628,3303,2894
7,3082,7231,8354,7614,16452,20,13194,10103,7434,14520,...,7626,16975,9250,2671,962,6133,456,2176,9335,6514
8,11900,296,12661,16289,6929,6658,4900,2285,10768,4098,...,11598,4964,3600,4623,2460,5815,5246,2557,12752,874
9,1128,1565,7502,1103,9138,8800,6464,4328,14253,7326,...,8376,14845,3978,9059,15680,15086,2781,9470,15038,12793


In [12]:
train_mat   # 保留了未交互的物品

<5139x16980 sparse matrix of type '<class 'numpy.int64'>'
	with 195727 stored elements in Compressed Sparse Row format>

In [13]:
# save
spio.savemat(os.path.join(target_data_path, dataset_name, "train.mat"), {"train_mat": train_mat})
test_data.to_csv(os.path.join(target_data_path, dataset_name, "test.csv"), index=False)
# print(train_mat)
# test_data

# 测试

In [14]:
train_mat_for_test = spio.loadmat(os.path.join(target_data_path, dataset_name, "train.mat"))["train_mat"]
test_mat_for_test = pd.read_csv(os.path.join(target_data_path, dataset_name, "test.csv"), names=None)

In [15]:
test_mat_for_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,16163,9467,14503,10159,1991,16979,13512,10264,401,14135,...,2875,7883,8455,8578,6101,9540,2069,9397,6024,4456
1,8990,14952,9635,6413,89,4995,10354,548,9162,13129,...,8589,1813,10088,14962,13343,16366,3600,981,3047,15248
2,5129,11987,14612,11662,1575,10902,6805,14948,4772,9766,...,12730,1421,14812,8846,11602,14333,9130,11391,4327,296
3,892,4139,15507,11203,5594,10624,6622,3957,15620,12547,...,631,2774,986,10758,1601,14356,1148,8232,6448,8556
4,11877,3528,8712,4145,1937,10024,2523,3681,8224,5161,...,9974,3051,15241,10728,6909,8516,114,12066,16918,13613
5,3033,7569,2509,6500,8898,15231,4014,4849,6313,10046,...,2112,7881,7579,8141,4136,3252,4743,5420,7661,13894
6,7495,14106,709,10542,11035,15960,16786,6212,11868,7382,...,14073,2014,15815,2470,16613,10201,6424,10628,3303,2894
7,3082,7231,8354,7614,16452,20,13194,10103,7434,14520,...,7626,16975,9250,2671,962,6133,456,2176,9335,6514
8,11900,296,12661,16289,6929,6658,4900,2285,10768,4098,...,11598,4964,3600,4623,2460,5815,5246,2557,12752,874
9,1128,1565,7502,1103,9138,8800,6464,4328,14253,7326,...,8376,14845,3978,9059,15680,15086,2781,9470,15038,12793


In [21]:
train_mat_for_test = train_mat_for_test.tocoo()
row = train_mat_for_test.row
col = train_mat_for_test.col
for i in range(len(row)):
    assert mat[row[i], col[i]] == 1

In [18]:
m, n = test_mat_for_test.shape
for i in range(m):
    for j in range(n):
        if j == 0:
            assert mat[i,test_mat_for_test.iloc[i, j]] == 1
        else:
            assert mat[i,test_mat_for_test.iloc[i, j]] == 0