In [None]:
import pandas as pd
import os

# ================= Configuration =================
# Directory holding the raw rating file, relative to the working directory.
RAW_PATH = "ml-1m"
# Directory that receives the generated CSV files (current directory).
CTR_PATH = "./"

# Full path of the raw ratings file.
DATA_FILE = os.path.join(RAW_PATH, "ratings.dat")

# Echo the resolved input path and the working directory so that a wrong
# launch location is easy to spot.
print(f"Reading data from: {DATA_FILE}")
print(f"Current Working Directory: {os.getcwd()}")

# ================= Core processing =================
# 2. Load the raw ratings. The file has no header row and uses '::' as the
#    field separator; a multi-character separator needs the Python parser
#    engine.
try:
    df = pd.read_csv(
        DATA_FILE,
        sep='::',
        header=None,
        names=['user_id', 'item_id', 'rating', 'timestamp'],
        engine='python',
    )
except FileNotFoundError:
    # Print a hint for the user, then re-raise so execution stops here.
    print(f"❌ 错误：找不到文件 {DATA_FILE}")
    print("请检查：是否已将 ml-1m 文件夹放入 data/MovieLens_1M/ 目录下？")
    raise
else:
    print("✅ 数据读取成功！")

# 3. Data cleaning: keep only interactions with rating >= 4.
#    .copy() detaches the result from the original frame so the column
#    assignments below operate on an independent DataFrame.
df = df.loc[df['rating'] >= 4].copy()
print(f"有效数据量 (Rating >= 4): {len(df)}")

# 4. ID remapping: assign new 1-based IDs following the ascending order of
#    the original IDs that remain after the rating filter.
user2id = {
    raw_uid: new_uid
    for new_uid, raw_uid in enumerate(sorted(df['user_id'].unique()), start=1)
}
item2id = {
    raw_iid: new_iid
    for new_iid, raw_iid in enumerate(sorted(df['item_id'].unique()), start=1)
}
df['user_id'] = df['user_id'].map(user2id)
df['item_id'] = df['item_id'].map(item2id)

# 5. Time-based split
# Interpret the timestamps as epoch seconds and count the whole days elapsed
# since the earliest timestamp in df.
df['date'] = pd.to_datetime(df['timestamp'], unit='s')
min_date = df['date'].min()
df['day'] = df['date'].sub(min_date).dt.days

# Cut points at 80% and 90% of the largest day offset, truncated to int.
last_day = df['day'].max()
split_day_1 = int(last_day * 0.8)
split_day_2 = int(last_day * 0.9)

# train: day <= split_day_1
# dev:   split_day_1 < day <= split_day_2
# test:  day > split_day_2
past_first_cut = df['day'] > split_day_1
past_second_cut = df['day'] > split_day_2
train = df[df['day'] <= split_day_1].copy()
dev = df[past_first_cut & (df['day'] <= split_day_2)].copy()
test = df[past_second_cut].copy()

# 6. Cold-start filtering: a dev/test row is kept only when both its user
#    and its item also occur in the training split.
train_users = set(train['user_id'].unique())
train_items = set(train['item_id'].unique())

dev_known = dev['user_id'].isin(train_users) & dev['item_id'].isin(train_items)
dev = dev[dev_known]

test_known = test['user_id'].isin(train_users) & test['item_id'].isin(train_items)
test = test[test_known]

# 7. Save the splits
# Rename the timestamp column to 'time' to match ReChorus. The rename is done
# in place so train/dev/test keep the new column name after this section.
for split_df in (train, dev, test):
    split_df.rename(columns={'timestamp': 'time'}, inplace=True)

# Write each split as a tab-separated file with exactly three columns.
print("正在保存 CSV 文件...")
# Create the output directory when it is missing, so pointing CTR_PATH at a
# new folder does not make to_csv fail. An empty CTR_PATH means the current
# directory, and os.makedirs('') would raise, hence the guard.
if CTR_PATH:
    os.makedirs(CTR_PATH, exist_ok=True)
for split_name, split_df in (('train', train), ('dev', dev), ('test', test)):
    split_df[['user_id', 'item_id', 'time']].to_csv(
        os.path.join(CTR_PATH, f'{split_name}.csv'), sep='\t', index=False)

# NOTE(review): only user_id/item_id/time are written. ReChorus presumably
# expects an extra `neg_items` column in dev/test unless all-item evaluation
# is enabled -- confirm against the ReChorus data format before training.
print("-" * 30)
print("✅ SUCCESS! 成功生成 train.csv, dev.csv, test.csv")
print("✅ 格式已确认为 Tab 分隔，可直接用于 ReChorus 训练。")

Reading data from: ml-1m\ratings.dat
Current Working Directory: d:\jianq\robot_learn\project\ReChorus\data\MovieLens_1M
✅ 数据读取成功！
有效数据量 (Rating >= 4): 575281
正在保存 CSV 文件...
------------------------------
✅ SUCCESS! 成功生成 train.csv, dev.csv, test.csv
✅ 格式已确认为 Tab 分隔，可直接用于 ReChorus 训练。
