In [16]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def user_based_recommender(ratings_file, movies_file, user_id, top_n=10, similar_k=5):
    """
    用基于用户的协同过滤方法为指定用户推荐电影。

    参数：
    - ratings_file: str, 评分数据路径（如 'ratings_small.csv'）
    - movies_file: str, 电影元数据路径（如 'movies_metadata.csv'）
    - user_id: int, 要推荐的用户 ID
    - top_n: int, 返回推荐的电影数量
    - similar_k: int, 使用最相似的 k 个用户做加权推荐

    返回：
    - List of recommended movie titles
    """
    # 读取评分和电影信息
    ratings = pd.read_csv(ratings_file)
    movies = pd.read_csv(movies_file, low_memory=False)
    ratings = ratings.dropna()

    # 创建用户-电影评分矩阵
    user_movie_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')
    matrix_filled = user_movie_matrix.fillna(0)

    # 用户之间的余弦相似度
    user_similarity = cosine_similarity(matrix_filled)
    sim_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)

    # 找出最相似的 K 个用户（排除自己）
    similar_users = sim_df[user_id].sort_values(ascending=False)[1:similar_k+1]
    
    # 获取这些相似用户的评分数据
    similar_ratings = user_movie_matrix.loc[similar_users.index]
    
    # 加权评分（相似度 * 评分）
    weighted_ratings = similar_ratings.T.dot(similar_users)
    weighted_avg = weighted_ratings / similar_users.sum()

    # 去掉该用户已经评分过的电影
    watched = user_movie_matrix.loc[user_id].dropna().index
    recommendations = weighted_avg.drop(watched, errors='ignore').sort_values(ascending=False).head(top_n)

    # 匹配电影名称（注意要做类型转换）
    movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
    recommended_titles = movies[movies['id'].isin(recommendations.index)]['title'].dropna().tolist()

    return recommended_titles


recs = user_based_recommender(
    ratings_file='movielens_dataset/ratings_small.csv',
    movies_file='movielens_dataset/movies_metadata.csv',
    user_id=1,
    top_n=5
)

print("为用户 1 推荐的电影：")
for i, title in enumerate(recs, 1):
    print(f"{i}. {title}")


为用户 1 推荐的电影：
1. Four Rooms
2. Ariel
3. Shadows in Paradise


TF-IDF

In [20]:
import pandas as pd
import ast  # 用于解析字符串形式的 JSON

# 读取 CSV 文件
df = pd.read_csv('movielens_dataset/movies_metadata.csv', low_memory=False)

# 只保留有用的列：title、overview 和 genres
df = df[['title', 'overview', 'genres']]

# 去除缺失的文本
df = df.dropna(subset=['overview'])

# genres 是一个字符串化的 JSON，我们把它变成关键词列表
def parse_genres(genre_str):
    try:
        genres = ast.literal_eval(genre_str)
        return " ".join([g['name'] for g in genres])
    except:
        return ""

df['genres'] = df['genres'].apply(parse_genres)

# 合并 overview 和 genres 成一个文本字段用于文本建模
df['content'] = df['overview'] + " " + df['genres']

from sklearn.feature_extraction.text import TfidfVectorizer

# 使用 TF-IDF 处理文本，限制最大词汇数量
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['content'])

print(tfidf_matrix.shape)  # (电影数量, 特征维度)

from sklearn.metrics.pairwise import cosine_similarity

# 计算电影之间的余弦相似度
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 构建电影索引：标题 -> 索引
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

# 推荐函数
def recommend(title, top_n=5):
    matches = indices[indices.index == title]
    if matches.empty:
        print("电影没找到")
        return
    idx = matches.iloc[0]  # 取第一个匹配项
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

# 示例：输入一部电影名称
print(recommend("The Dark Knight"))


(44512, 5000)
4375     Everybody's All-American
35664       See You in Montevideo
13036                 The Express
11209                  Invincible
40724    I'm a Standard Supporter
Name: title, dtype: object
