In [1]:
import torch
import torch.utils.data as Data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, KBinsDiscretizer
def AmazonBookPreprocess(dataframe, seq_len=40):
    """
    Turn the raw Amazon-Books frame into fixed-length, integer-encoded
    category sequences.

    The input frame is not modified.

    :param dataframe: raw frame; the columns 'hist_cate_list' ('|'-separated
        category names), 'cateID' (category of the candidate item) and
        'label' are read. Other columns are ignored.
    :param seq_len: number of history positions kept per row. Longer
        histories keep their LAST ``seq_len`` entries; shorter ones are
        right-padded.
    :return data: frame with columns hist_cate_0 .. hist_cate_{seq_len-1},
        'cateID' and 'label', sharing the input's index. Categories are
        encoded as integers: 0 is always the padding id, real categories get
        1..N in sorted order of their names.
    """
    # '0' is the padding category. A real category literally named '0' is
    # indistinguishable from padding (same as before this rewrite).
    pad_token = '0'

    # 1. Split every history on '|'. A missing history (NaN/None, e.g. an empty
    #    CSV field) is treated as "no history" instead of crashing on .split().
    histories = [
        hist.split('|') if isinstance(hist, str) else []
        for hist in dataframe['hist_cate_list']
    ]
    targets = list(dataframe['cateID'])

    # 2. Collect every category seen either as a target or inside a history.
    vocab = set(targets)
    for hist in histories:
        vocab.update(hist)
    vocab.discard(pad_token)

    # 3. Build the category -> id table. Padding is pinned to 0 explicitly
    #    rather than relying on '0' happening to sort before every category
    #    name (names starting with '#', '&', '(' or a space sort before '0').
    cate_to_id = {pad_token: 0}
    for idx, cate in enumerate(sorted(vocab), start=1):
        cate_to_id[cate] = idx

    # 4. One column per history position, followed by the target category.
    cols = ['hist_cate_{}'.format(i) for i in range(seq_len)]

    # 5. Encode row by row into plain lists and build the frame once; creating
    #    a pd.Series per row is orders of magnitude slower on large inputs.
    rows = []
    for hist, target in zip(histories, targets):
        # Keep the most recent seq_len entries (guard: hist[-0:] is the whole list).
        recent = hist[-seq_len:] if seq_len > 0 else []
        encoded_hist = [cate_to_id[cate] for cate in recent]
        padding = [0] * (seq_len - len(encoded_hist))
        rows.append(encoded_hist + padding + [cate_to_id[target]])

    data = pd.DataFrame(rows, columns=cols + ['cateID'], index=dataframe.index)
    data = data.astype('int64')

    # 6. Append the prediction target. Assigned by position so a non-unique
    #    index cannot duplicate rows the way an index join would.
    data['label'] = dataframe['label'].to_numpy()
    return data

In [2]:
# Wildcard import from the project config; presumably this is what supplies
# DataSet_Root used below — TODO confirm and import it by name.
from conf.config import *
# Load the raw CSV. DataSet_Root is concatenated directly with the file name,
# so it presumably ends with a path separator — TODO confirm.
data = pd.read_csv(DataSet_Root + 'amazon-books-100k.txt')

In [3]:
# Display the frame loaded by the previous cell.
data

Unnamed: 0,label,userID,itemID,cateID,hist_item_list,hist_cate_list
0,0,AZPJ9LUT0FEPY,B00AMNNTIA,Literature & Fiction,0307744434|0062248391|0470530707|0978924622|15...,Books|Books|Books|Books|Books
1,1,AZPJ9LUT0FEPY,0800731603,Books,0307744434|0062248391|0470530707|0978924622|15...,Books|Books|Books|Books|Books
2,0,A2NRV79GKAU726,B003NNV10O,Russian,0814472869|0071462074|1583942300|0812538366|B0...,Books|Books|Books|Books|Baking|Books|Books
3,1,A2NRV79GKAU726,B000UWJ91O,Books,0814472869|0071462074|1583942300|0812538366|B0...,Books|Books|Books|Books|Baking|Books|Books
4,0,A2GEQVDX2LL4V3,0321334094,Books,0743596870|0374280991|1439140634|0976475731,Books|Books|Books|Books
...,...,...,...,...,...,...
99995,1,A3I7LS4H993CXB,1481872060,Books,1936826135|1250014409|1480219851|1484823664|14...,Books|Books|Books|Books|Books|Literature & Fic...
99996,0,AP00RAQ20KM12,1414334095,Books,0312328796|0758207182|0739470140|1601621450|18...,Books|Books|Books|Books|Books|Books|Books|Book...
99997,1,AP00RAQ20KM12,B0063LINHW,Historical,0312328796|0758207182|0739470140|1601621450|18...,Books|Books|Books|Books|Books|Books|Books|Book...
99998,0,A1ZVJYANTLTLVP,0762419229,Books,0743470117|0395851580|1451661215|0312342020,Books|Books|Books|Books


In [4]:
# Encode the loaded frame with the default seq_len.
# NOTE: this rebinds `data`, so re-running this cell without re-running the
# load cell passes the already-encoded frame back into AmazonBookPreprocess.
data = AmazonBookPreprocess(data)
# Display the encoded result.
data

Unnamed: 0,hist_cate_0,hist_cate_1,hist_cate_2,hist_cate_3,hist_cate_4,hist_cate_5,hist_cate_6,hist_cate_7,hist_cate_8,hist_cate_9,...,hist_cate_32,hist_cate_33,hist_cate_34,hist_cate_35,hist_cate_36,hist_cate_37,hist_cate_38,hist_cate_39,cateID,label
0,142,142,142,142,142,0,0,0,0,0,...,0,0,0,0,0,0,0,0,751,0
1,142,142,142,142,142,0,0,0,0,0,...,0,0,0,0,0,0,0,0,142,1
2,142,142,142,142,97,142,142,0,0,0,...,0,0,0,0,0,0,0,0,1094,0
3,142,142,142,142,97,142,142,0,0,0,...,0,0,0,0,0,0,0,0,142,1
4,142,142,142,142,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,142,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,142,142,142,142,142,751,0,0,0,0,...,0,0,0,0,0,0,0,0,142,1
99996,142,142,142,142,142,142,142,142,142,142,...,0,0,0,0,0,0,0,0,142,0
99997,142,142,142,142,142,142,142,142,142,142,...,0,0,0,0,0,0,0,0,607,1
99998,142,142,142,142,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,142,0
