In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer # genre變化度不大
from sklearn.feature_extraction.text import TfidfVectorizer # tags稀有但有意義的詞權重重
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the data.
# The dataset directory is kept in a single constant so the notebook can be
# pointed at another location by editing one line instead of every read_csv call.
DATA_DIR = "/Users/famil/Documents/csh/nschool/MovieRecommenderSystem/data"

# movies: one row per movie (later cells use its movieId, title and genres columns).
movies = pd.read_csv(f"{DATA_DIR}/movies.csv")
# tags: one row per (userId, movieId, tag, timestamp) tagging event.
tags = pd.read_csv(f"{DATA_DIR}/tags.csv")
print(tags.head())

   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200


In [3]:
# Genre features
# Turn the '|'-delimited genre string into space-separated text so it can be
# vectorized as a bag of words.
# regex=False forces '|' to be treated as a literal separator on every pandas
# version: as a regular expression '|' is the alternation operator, and the
# default value of `regex` differs between pandas 1.x and 2.x.
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)

# Vectorize the genre text into raw term counts (CountVectorizer).
# NOTE(review): with the default token pattern a hyphenated genre such as
# "Sci-Fi" is presumably split into two tokens and therefore counted twice -
# confirm whether that weighting is intended.
genre_vectorizer = CountVectorizer()
genre_matrix = genre_vectorizer.fit_transform(movies['genres'])  # fit + transform in one step

# Pairwise cosine similarity between the genre vectors of all movies.
genres_sim = cosine_similarity(genre_matrix, genre_matrix)

In [4]:
# Tag features
# Collapse all tags of a movie into a single space-separated string.
# Missing tags are dropped and the remaining values cast to str first, because
# str.join raises TypeError on NaN or other non-string items.
tags_grouped = (
    tags.dropna(subset=['tag'])
    .assign(tag=lambda frame: frame['tag'].astype(str))
    .groupby('movieId')['tag']
    .apply(' '.join)
    .reset_index()
)

# Attach the merged tags to movies.
# A 'tag' column left behind by a previous run of this cell is dropped first;
# otherwise the merge would produce tag_x / tag_y and the next line would raise
# KeyError when the cell is executed a second time.
movies = movies.drop(columns='tag', errors='ignore').merge(
    tags_grouped, on='movieId', how='left'
)
movies['tag'] = movies['tag'].fillna('')  # movies without any tag get an empty string

# Vectorize the tag text with TF-IDF, ignoring English stop words.
tags_vectorizer = TfidfVectorizer(stop_words='english')
tags_matrix = tags_vectorizer.fit_transform(movies['tag'])

# Pairwise cosine similarity between the tag vectors of all movies.
tags_sim = cosine_similarity(tags_matrix, tags_matrix)

In [5]:
# Blend the two similarity matrices into one combined score.
# The weights a and b are tunable.
a = 0.3  # weight given to genre similarity
b = 0.7  # weight given to tag similarity
cosine_sim = genres_sim * a + tags_sim * b

In [22]:
# Recommendation function
def recommend_movies(title, n=5):
    """Return the titles of the movies most similar to ``title``.

    Similarity scores are read from the module-level ``cosine_sim`` matrix,
    which is indexed by row position of the module-level ``movies`` frame.

    Args:
        title: Exact title to look up in ``movies['title']``. When several
            rows share the title, the first one is used.
        n: Maximum number of recommendations. Values <= 0 yield an empty list.

    Returns:
        A list of up to ``n`` titles ordered from most to least similar, never
        containing the queried row itself. If ``title`` is not present, a
        message string is returned instead (kept for backward compatibility).
    """
    is_match = (movies['title'] == title).to_numpy()
    if not is_match.any():
        return f"找不到電影：{title}"

    # Use the row *position* of the first match rather than its index label:
    # both cosine_sim[...] and .iloc below are positional.
    idx = int(is_match.argmax())

    scores = cosine_sim[idx]

    # Exclude the queried movie explicitly. Dropping "the first sorted entry"
    # is wrong when other movies tie with its self-similarity, because the
    # stable sort may place one of them first and leave the query in the list.
    candidates = (pos for pos in range(len(scores)) if pos != idx)
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)

    # Clamp n so a negative value cannot turn into a from-the-end slice.
    top_positions = ranked[:max(n, 0)]
    return movies['title'].iloc[top_positions].tolist()

In [23]:
# 5. Smoke test: print the top-5 recommendations for a single title.
print("=== 推薦結果 ===")
toy_story_picks = recommend_movies("Toy Story (1995)", n=5)
print(toy_story_picks)

=== 推薦結果 ===
["Bug's Life, A (1998)", 'Toy Story 2 (1999)', 'The Lego Movie (2014)', 'Guardians of the Galaxy 2 (2017)', 'Up (2009)']
