In [1]:
import pandas as pd
# 加载数据集
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)
# 创建基于内容的推荐器
def content_based_recommender(title, metadata):
    # 选择特征
    features = ['title', 'genres', 'cast', 'director']
    
    # 用选定的特征创建 DataFrame
    content = metadata[features]
    
    # 删除缺失值的行
    content = content.dropna()
    
    # 字符串并删除空格
    content['title'] = content['title'].str.lower()
    content['genres'] = content['genres'].str.lower()
    content['cast'] = content['cast'].str.lower()
    content['director'] = content['director'].str.lower()
    
    # 将特征合并为一个字符串
    content['combined'] = content['genres'] + ' ' + content['cast'] + ' ' + content['director']
    
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    # 定义 TF-IDF 向量器
    tfidf = TfidfVectorizer(stop_words='english')
    
    # 构建 TF-IDF 矩阵
    tfidf_matrix = tfidf.fit_transform(content['combined'])
    
    from sklearn.metrics.pairwise import linear_kernel
    
    # 计算余弦相似性矩阵
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    
    # 获取与标题匹配的电影索引
    indices = pd.Series(content.index, index=content['title']).drop_duplicates()
    idx = indices[title]
    
    # 获取所有电影与该电影的成对相似度得分
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # 根据相似度得分对电影进行排序
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # 获得 10 部最相似电影的分数
    sim_scores = sim_scores[1:11]
    
    # 获取电影指数
    movie_indices = [i[0] for i in sim_scores]
    
    # 返回最相似的前 10 部电影
    return content['title'].iloc[movie_indices]
# 获取电影推荐
recommended_movies = content_based_recommender('avatar', metadata)
print(recommended_movies)

KeyError: "['cast', 'director'] not in index"

In [9]:
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)
print(metadata.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


In [None]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import cross_validate

# 加载 MovieLens 数据集
data = Dataset.load_builtin('ml-100k')

# 创建基于用户的协同过滤模型
sim_options = {
    'name': 'cosine',
    'user_based': True  # 计算用户相似度
}

# 初始化 KNNBasic 算法
model = KNNBasic(sim_options=sim_options)

# 进行交叉验证
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
from surprise import accuracy
from surprise.model_selection import train_test_spli

data = Dataset.load_builtin('ml-100k')
# 将数据分成训练集和测试集
trainset, testset = train_test_split(data, test_size=0.25)
# 初始化并拟合模型
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)
# 进行预测
predictions = model.test(testset)
# 计算 RMSE
rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse}')