# History Data生成

In [1]:
import pandas as pd
import numpy as np

# 设置随机种子以保证结果可复现
np.random.seed(42)

# 定义用户数和物品数
num_users = 400
num_items = 40

# 生成用户ID和物品ID
user_ids = np.arange(1, num_users + 1)
item_ids = np.arange(1, num_items + 1)

# 生成模拟评分数据
# 假设并非所有用户对所有物品都有评分，我们设置大约10%的用户-物品对有评分
num_ratings = int(num_users * num_items * 0.1)
user_ratings_ids = np.random.choice(user_ids, size=num_ratings)
item_ratings_ids = np.random.choice(item_ids, size=num_ratings)
ratings = np.random.randint(1, 6, size=num_ratings)  # 评分范围从1到5

# 生成时间戳
# 假设所有评分都在2023年内，这里使用pd.date_range来生成随机日期
timestamps = pd.date_range(start='2023-01-01', end='2023-12-31', periods=num_ratings).sort_values()
timestamps = np.random.choice(timestamps, size=num_ratings, replace=False)

# 创建DataFrame
data = {
    'user_id': user_ratings_ids,
    'item_id': item_ratings_ids,
    'rating': ratings,
    'timestamp': timestamps
}
ratings_df = pd.DataFrame(data)

# 显示前几行数据以检查
print(ratings_df.head())

# 可选：保存到CSV文件
ratings_df.to_csv('simulated_ratings.csv', index=False)


   user_id  item_id  rating                     timestamp
0      103        9       5 2023-01-29 00:00:00.000000000
1      349        8       1 2023-03-01 04:29:16.097560975
2      271       23       3 2023-08-17 18:43:54.146341464
3      107       29       3 2023-06-10 17:10:14.634146340
4       72       18       5 2023-10-14 14:26:20.487804876


# 用户画像构建

In [2]:
# 基本特征提取
ratings_df = pd.read_csv("./simulated_ratings.csv")
user_profiles = ratings_df.groupby('user_id').agg(
    average_rating=('rating', 'mean'),
    rating_count=('rating', 'count'),
    rating_std=('rating', 'std')
).reset_index()

# 填充评分标准差的缺失值（对于只有单一评分的用户）
user_profiles['rating_std'] = user_profiles['rating_std'].fillna(0)

# 示例：高级特征提取（如果有物品类别信息）
# 假设每个item_id对应一个类别，这里我们随机生成类别信息作为示例
np.random.seed(42)
item_categories = {item_id: np.random.choice(['A', 'B', 'C', 'D', 'E']) for item_id in item_ids}
ratings_df['category'] = ratings_df['item_id'].map(item_categories)

# 用户偏好类别的提取
user_favorite_category = ratings_df.groupby('user_id')['category'].agg(lambda x: x.mode()[0]).reset_index()
user_favorite_category.rename(columns={'category': 'favorite_category'}, inplace=True)

# 合并基本特征和高级特征
user_profiles = user_profiles.merge(user_favorite_category, on='user_id', how='left')

print(user_profiles.head())
user_profiles.to_csv("user_profiles.csv")


   user_id  average_rating  rating_count  rating_std favorite_category
0        1        3.750000             8    1.281740                 B
1        2        3.750000             4    1.892969                 D
2        3        4.000000             1    0.000000                 C
3        4        3.200000             5    2.049390                 D
4        5        3.714286             7    1.496026                 E


# 计算相似矩阵

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

# 加载用户画像
user_profiles = pd.read_csv("./user_profiles.csv")
encoder = LabelEncoder()

# 假设favorite_category是需要编码的分类特征
user_profiles["favorite_category_encoded"] = encoder.fit_transform(user_profiles["favorite_category"])
features = user_profiles.drop(['user_id', 'favorite_category'], axis=1)

# 计算相似度矩阵
similarity_matrix = cosine_similarity(features)
similarity_df = pd.DataFrame(similarity_matrix, index=user_profiles['user_id'], columns=user_profiles['user_id'])
similarity_df.to_csv("similarity_df.csv")

# 获取推荐物品

In [6]:
# 为指定用户推荐物品
def recommend_for_user(target_user_id, ratings_df, top_n=10):
    user_similarities = similarity_df[target_user_id].sort_values(ascending=False)
    similar_users = user_similarities.iloc[1:].index.tolist()  # 排除自身
    
    # 选取相似用户评分较高的物品
    similar_users_ratings = ratings_df[ratings_df['user_id'].isin(similar_users)]
    top_rated_items_by_similar_users = similar_users_ratings.groupby('item_id')['rating'].mean().sort_values(ascending=False).index.tolist()
    
    # 筛选目标用户未评分的物品
    target_user_rated_items = ratings_df[ratings_df['user_id'] == target_user_id]['item_id'].tolist()
    recommended_items = [item for item in top_rated_items_by_similar_users if item not in target_user_rated_items][:top_n]
    
    return recommended_items

# 为用户1推荐物品
target_user_id = 1
ratings_df = pd.read_csv("./simulated_ratings.csv")
recommended_items = recommend_for_user(target_user_id, ratings_df)
print(f"推荐给用户{target_user_id}的物品：", recommended_items)

推荐给用户1的物品： [39, 29, 22, 9, 32, 5, 25, 6, 8, 28]


In [7]:
similarity_df

user_id,1,2,3,4,5,6,7,8,9,10,...,391,392,393,394,395,396,397,398,399,400
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.867541,0.556569,0.846381,0.857077,0.672375,0.613187,0.576244,0.474957,0.487099,...,0.019640,0.010861,0.017493,0.013854,0.008556,0.018143,0.014404,0.017307,0.007757,0.014312
2,0.867541,1.000000,0.817377,0.956248,0.933658,0.881873,0.693206,0.755155,0.651234,0.630189,...,0.171110,0.161812,0.166049,0.164111,0.163765,0.167757,0.167455,0.169652,0.158812,0.167952
3,0.556569,0.817377,1.000000,0.786392,0.768627,0.843178,0.676982,0.755654,0.684868,0.705095,...,0.414881,0.407783,0.408941,0.410318,0.413905,0.410928,0.412285,0.414593,0.407144,0.413204
4,0.846381,0.956248,0.786392,1.000000,0.990555,0.959933,0.864012,0.897697,0.823693,0.809378,...,0.414307,0.405478,0.410193,0.407574,0.406016,0.411568,0.410620,0.412648,0.402432,0.411005
5,0.857077,0.933658,0.768627,0.990555,1.000000,0.949466,0.866257,0.890933,0.842847,0.813454,...,0.424690,0.415220,0.420432,0.417614,0.415874,0.421807,0.420519,0.422828,0.412578,0.421154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,0.018143,0.167757,0.410928,0.411568,0.421807,0.598203,0.792276,0.766372,0.814363,0.866462,...,0.999982,0.999967,0.999996,0.999981,0.999888,1.000000,0.999973,0.999979,0.999937,0.999957
397,0.014404,0.167455,0.412285,0.410620,0.420519,0.598576,0.790550,0.766151,0.814487,0.865713,...,0.999981,0.999967,0.999955,0.999967,0.999953,0.999973,1.000000,0.999992,0.999946,0.999995
398,0.017307,0.169652,0.414593,0.412648,0.422828,0.600181,0.792030,0.767404,0.815684,0.866977,...,0.999996,0.999949,0.999958,0.999967,0.999944,0.999979,0.999992,1.000000,0.999932,0.999991
399,0.007757,0.158812,0.407144,0.402432,0.412578,0.591056,0.785762,0.760108,0.808913,0.861471,...,0.999907,0.999986,0.999938,0.999980,0.999953,0.999937,0.999946,0.999932,1.000000,0.999931


In [24]:
df = pd.read_csv("./similarity_df.csv")
df.iloc[:, 0]

0      1.000000
1      0.867541
2      0.556569
3      0.846381
4      0.857077
         ...   
388    0.018143
389    0.014404
390    0.017307
391    0.007757
392    0.014312
Name: 1, Length: 393, dtype: float64