In [1]:
import pandas as pd

# 加载数据
data = pd.read_csv('../autodl-tmp/DataBook/Data_all_filter.csv')

# 准备交互数据
interactions = data[['group', 'parent_asin_encode', 'unix_timestamp']].copy()
interactions['rating'] = 1  # 假设所有交互都有评分1
interactions['group'] = interactions['group'].astype(int)
interactions['parent_asin_encode'] = interactions['parent_asin_encode'].astype(int)
interactions['unix_timestamp'] = interactions['unix_timestamp'].astype(float)
interactions['rating'] = interactions['rating'].astype(float)
interactions['group'] = interactions['group'].astype('category').cat.codes
interactions['parent_asin_encode'] = interactions['parent_asin_encode'].astype('category').cat.codes
interactions.columns = ['user_id:token', 'item_id:token', 'timestamp:float', 'rating:float']

# 按 user_id:token 分组，并按时间排序
interactions = interactions.sort_values(by=['user_id:token', 'timestamp:float'])
grouped = interactions.groupby('user_id:token')

# 初始化数据
train_data = []
valid_data = []
test_data = []

# 定义填充函数
def pad_sequence(seq, max_len=50):
    return [0] * (max_len - len(seq)) + seq if len(seq) < max_len else seq[:max_len]

# 遍历每个用户的交互数据
for user_id, group in grouped:
    if len(group) < 3:  # 如果用户交互少于 3 次，跳过
        continue
    # 提取用户的 item_id 列
    item_list = group['item_id:token'].tolist()

    # 分割数据
    train_items = item_list[:-3]  # 除去最后3项作为训练集
    valid_items = item_list[:-2]  # 除去最后2项作为验证集
    test_items = item_list[:-1]   # 1 作为测试集

    # 填充序列
    train_items = pad_sequence(train_items)
    valid_items = pad_sequence(valid_items)
    test_items = pad_sequence(test_items)

    # 添加训练集
    if len(train_items) > 0:
        train_data.append({
            'user_id:token': user_id,
            'item_id_list:token_seq': train_items,
            'item_id:token': item_list[-3]  # 训练集目标物品是倒数第二个
        })

    # 添加验证集
    valid_data.append({
        'user_id:token': user_id,
        'item_id_list:token_seq': valid_items,
        'item_id:token': item_list[-2]  # 验证集目标物品是最后一个
    })

    # 添加测试集
    test_data.append({
        'user_id:token': user_id,
        'item_id_list:token_seq': test_items,
        'item_id:token': item_list[-1]  # 测试集目标物品也是最后一个
    })

# 转换为 DataFrame
train_df = pd.DataFrame(train_data)
valid_df = pd.DataFrame(valid_data)
test_df = pd.DataFrame(test_data)

print("Train, valid, and test splits saved successfully!")


Train, valid, and test splits saved successfully!


In [2]:
valid_df

Unnamed: 0,user_id:token,item_id_list:token_seq,item_id:token
0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",24
1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",50
2,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",78
3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",106
4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",138
...,...,...,...
54705,54705,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2265
54706,54706,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",269039
54707,54707,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",262963
54708,54708,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",120992


In [3]:
interactions['user_id:token'].value_counts()

user_id:token
284      51
51891    50
19199    50
5702     50
2609     50
         ..
45310    10
45311    10
25298    10
45313    10
12825    10
Name: count, Length: 54710, dtype: int64

In [4]:
interactions.rename(columns={'rating:float': 'label:float'}, inplace=True)
interactions

Unnamed: 0,user_id:token,item_id:token,timestamp:float,label:float
0,0,0,1.416445e+09,1.0
1,0,1,1.417316e+09,1.0
2,0,2,1.417744e+09,1.0
3,0,3,1.417744e+09,1.0
4,0,4,1.417744e+09,1.0
...,...,...,...,...
842270,54709,32514,1.454371e+09,1.0
842271,54709,157114,1.454372e+09,1.0
842272,54709,22916,1.454372e+09,1.0
842273,54709,88817,1.454372e+09,1.0


In [5]:
import numpy as np
# 全部商品集合
all_items = set(interactions['item_id:token'].unique())

# 函数：为每个用户采样一个未交互的商品
def sample_negative_for_user(user_df):
    user_id = user_df['user_id:token'].iloc[0]
    interacted_items = set(user_df['item_id:token'])
    non_interacted_items = list(all_items - interacted_items)
    if non_interacted_items:
        sampled_item = np.random.choice(non_interacted_items)
    else:
        # 如果没有未交互商品（理论上不会发生），随机选择一个已有商品
        sampled_item = np.random.choice(list(all_items))
    return pd.DataFrame({
        'user_id:token': [user_id],
        'item_id:token': [sampled_item],
        'timestamp:float': [user_df['timestamp:float'].max() + 1],  # 时间戳设置为最大时间戳 + 1
        'label:float': [0.0]  # 未交互的商品标记为 0
    })

# 为每个用户采样一个未交互商品
negative_samples = interactions.groupby('user_id:token').apply(sample_negative_for_user).reset_index(drop=True)

result = pd.concat([interactions, negative_samples], ignore_index=True)
result = result.sort_values(by=['user_id:token', 'timestamp:float'])

KeyboardInterrupt: 

In [6]:
items = data[['parent_asin', 'parent_asin_encode', 'topic']].drop_duplicates()

In [7]:
items

Unnamed: 0,parent_asin,parent_asin_encode,topic
0,0446618462,1,12.0
1,1942002033,2,35.0
2,0373658419,3,4.0
3,194200205X,4,35.0
4,1476779724,5,4.0
...,...,...,...
842243,0240812182,269046,19.0
842244,B008W3GSJY,269047,34.0
842249,0240521609,269048,19.0
842257,0060087463,269049,19.0


In [8]:
items.nunique()

parent_asin           269050
parent_asin_encode    269050
topic                     50
dtype: int64

In [9]:
data.duplicated(subset=['user_id', 'parent_asin_encode', 'unix_timestamp']).sum()

0

In [10]:
interactions.groupby(['user_id:token']).count()

Unnamed: 0_level_0,item_id:token,timestamp:float,label:float
user_id:token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,26,26,26
1,26,26,26
2,28,28,28
3,28,28,28
4,32,32,32
...,...,...,...
54705,19,19,19
54706,13,13,13
54707,31,31,31
54708,12,12,12


In [11]:
interactions.to_csv('./dataset/mydata/mydata.inter', sep='\t', index=False)

In [10]:
valid_df.to_csv('./dataset/mydata/mydata.train.inter', sep='\t', index=False)
test_df.to_csv('./dataset/mydata/mydata.test.inter', sep='\t', index=False)

In [11]:
metadata = pd.read_csv('../autodl-tmp/DataMovie/Meta_Filtered.csv')
metadata = metadata.drop(columns=['Unnamed: 0'])

metadata.head()

  metadata = pd.read_csv('../autodl-tmp/DataMovie/Meta_Filtered.csv')


Unnamed: 0,main_category,title,subtitle,average_rating,rating_number,price,store,categories,details,parent_asin
0,Movies & TV,Pink Cadillac [DVD],,4.6,972.0,,"Clint Eastwood (Actor), Bernadette Peters...","['Movies & TV', 'Studio Specials', 'Warner Hom...","{'Genre': 'Action & Adventure', 'Format': 'Ana...",B00009N83U
1,Movies & TV,The Returned- Complete First Season,,4.5,488.0,14.97,"Anne Consigny (Actor), Clotilde Hesme (A...","['Movies & TV', 'Genre for Featured Categories...","{'Aspect Ratio': '1.77:1', 'Is Discontinued By...",B00GAZ1H1U
2,Prime Video,Cry-Baby,,4.7,11923.0,16.79,,"['Comedy', 'Biting', 'Campy', 'Nostalgic']","{'Content advisory': ['Violence', 'smoking', '...",B009CG06FW
3,Prime Video,Little White Lies,,4.3,850.0,,,"['Comedy', 'Drama', 'Cerebral', 'Emotional']","{'Content advisory': ['Violence', 'substance u...",B00BB8XEJY
4,Movies & TV,"People, Places, Things",,4.2,144.0,,"Jemaine Clement (Actor), Regina Hall (Ac...","['Movies & TV', 'Independently Distributed', '...","{'Genre': 'Comedy', 'Format': 'Multiple Format...",B00ZGPMB2E


In [15]:
meta_filter = pd.merge(items, metadata, on='parent_asin', how='left')
meta_filter.drop_duplicates(inplace=True)
meta_filter

Unnamed: 0,parent_asin,parent_asin_encode,topic,main_category,title,subtitle,average_rating,rating_number,price,store,categories,details
0,B00465I1BA,1,45.0,Movies & TV,Rocky & Bullwinkle & Friends: The Complete Series,,4.7,455.0,182.5,"Various (Actor), - (Director) Rated: ...","['Movies & TV', 'Studio Specials', 'Universal ...","{'Format': 'Multiple Formats, Animated, Box se..."
1,B005FQ1ONG,2,4.0,Movies & TV,"Mr. Magoo: The Television Collection, 1960-1977",,4.8,208.0,48.0,"Jim Backus (Actor), Mel Blanc (Actor), ...","['Movies & TV', 'Featured Categories', 'DVD', ...","{'Genre': 'Kids & Family, Animation', 'Format'..."
2,B0051GLDQS,3,34.0,Movies & TV,Quarantine 2: Terminal,,4.3,1931.0,9.99,"Mercedes Masohn (Actor), Josh Cooke (Act...","['Movies & TV', 'Studio Specials', 'Sony Pictu...","{'Genre': 'Horror, Thriller', 'Format': 'Multi..."
3,B0000E32V0,4,10.0,Movies & TV,Buffalo Soldiers,,4.2,331.0,14.99,"Joaquin Phoenix (Actor), Anna Paquin (Ac...","['Movies & TV', 'Studio Specials', 'Miramax Ho...","{'Format': 'Multiple Formats, Anamorphic, Clos..."
4,B0019L21GA,5,9.0,Movies & TV,The Godfather Part II - The Coppola Restoration,,4.8,496.0,7.59,"Al Pacino (Actor), Robert De Niro (Actor...","['Movies & TV', 'Paramount Home Entertainment'...","{'Genre': 'Action & Adventure', 'Format': 'Mul..."
...,...,...,...,...,...,...,...,...,...,...,...,...
72587,B07W7GVTGS,72588,15.0,Movies & TV,It's a Wonderful Life (4K UHD + Blu-ray + Digi...,,4.9,18668.0,19.19,"James Stewart (Actor), Donna Reed (Actor...","['Movies & TV', 'Featured Categories', 'Blu-ra...","{'Genre': 'Drama', 'Format': 'Subtitled, 4K, W..."
72588,B07GQ59F7R,72589,35.0,Movies & TV,HALLOWEEN III: Season of the Witch - Collector...,,4.8,1125.0,27.11,"Tom Atkins (Actor), Stacy Nelkin (Actor)...","['Movies & TV', 'Blu-ray', 'TV']","{'Genre': 'Horror', 'Format': '4K', 'Contribut..."
72589,B08N3X6751,72590,35.0,Movies & TV,"Curse of Frankenstein, The [Blu-ray]",,4.7,1671.0,17.49,"Peter Cushing (Actor), Christopher Lee (...","['Movies & TV', 'Blu-ray', 'Movies']","{'Genre': 'Horror', 'Format': 'NTSC, Subtitled..."
72590,B00BX8A918,72591,17.0,Movies & TV,Roustabout (1964),,4.6,79.0,22.99,"Various (Actor, Director) Format: DVD","['Movies & TV', 'Featured Categories', 'DVD', ...","{'Format': 'Multiple Formats, Color, AC-3, NTS..."


In [16]:
meta_filter = meta_filter[['parent_asin_encode','title' , 'categories']]
meta_filter['categories'] = (
    meta_filter['categories']
    .str.replace('</span>', '', regex=False)  # 删除 </span>
    .str.replace('[', '', regex=False)         # 删除 [
    .str.replace(']', '', regex=False)         # 删除 ]
    .str.replace("'", '', regex=False)         # 删除 ''
    .str.replace(",", '', regex=False)
    .str.strip()                               # 去掉前后空格
)
meta_filter.columns = ['item_id:token', 'title:token_seq', 'class:token_seq']
meta_filter

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_filter['categories'] = (


Unnamed: 0,item_id:token,title:token_seq,class:token_seq
0,1,Rocky & Bullwinkle & Friends: The Complete Series,Movies & TV Studio Specials Universal Studios ...
1,2,"Mr. Magoo: The Television Collection, 1960-1977",Movies & TV Featured Categories DVD Kids & Family
2,3,Quarantine 2: Terminal,Movies & TV Studio Specials Sony Pictures Home...
3,4,Buffalo Soldiers,Movies & TV Studio Specials Miramax Home Enter...
4,5,The Godfather Part II - The Coppola Restoration,Movies & TV Paramount Home Entertainment Classics
...,...,...,...
72587,72588,It's a Wonderful Life (4K UHD + Blu-ray + Digi...,Movies & TV Featured Categories Blu-ray Drama
72588,72589,HALLOWEEN III: Season of the Witch - Collector...,Movies & TV Blu-ray TV
72589,72590,"Curse of Frankenstein, The [Blu-ray]",Movies & TV Blu-ray Movies
72590,72591,Roustabout (1964),Movies & TV Featured Categories DVD Comedy


In [17]:
meta_filter.to_csv('./dataset/mydata/mydata.item', sep='\t', index=False)