In [None]:
import pandas as pd
import numpy as np
import ast
import scipy.sparse

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing       import MultiLabelBinarizer, StandardScaler
from sklearn.metrics.pairwise    import cosine_similarity

# ── 1. データ読み込み ─────────────────────────────────────────────
ratings = pd.read_csv("ratings.csv")
movies  = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")   # NEW: キャスト・クルー情報

# ── 2. credits の列名チェック＆リネーム ────────────────────────────
print("credits columns before rename:", credits.columns)

if "movie_id" in credits.columns:
    credits.rename(columns={"movie_id": "movieId"}, inplace=True)
elif "id" in credits.columns:
    credits.rename(columns={"id": "movieId"}, inplace=True)

print("credits columns after rename:", credits.columns)

# ── 3. movies 側も同様に id→movieId 整備 ─────────────────────────
if "id" in movies.columns:
    movies.rename(columns={"id": "movieId"}, inplace=True)

# 数値化
movies["movieId"]  = pd.to_numeric(movies["movieId"], errors="coerce")
credits["movieId"] = pd.to_numeric(credits["movieId"], errors="coerce")

# ── 4. マージ ───────────────────────────────────────────────────
movies = movies.merge(
    credits[["movieId", "cast", "crew"]],
    on="movieId", how="left"
)

# ── 5. 欠損値補完 ─────────────────────────────────────────────────
for col in ["genres", "keywords", "overview", "original_language",
            "cast", "crew", "production_countries", "release_date"]:
    if col in movies.columns:
        movies[col] = movies[col].fillna("[]" if col in ["genres","keywords","cast","crew","production_countries"] else "")

# ── 6. 列ごとにリスト化 ────────────────────────────────────────────
# (a) ジャンル
movies["genres_list"] = (
    movies["genres"]
    .apply(ast.literal_eval)
    .apply(lambda lst: [d["name"] for d in lst])
)

# (b) キーワード
movies["keywords_list"] = (
    movies["keywords"]
    .apply(ast.literal_eval)
    .apply(lambda lst: [d["name"] for d in lst])
)

# (c) キャスト（上位5名だけ）
movies["cast_list"] = (
    movies["cast"]
    .apply(ast.literal_eval)
    .apply(lambda lst: [d["name"] for d in lst][:5])
)

# (d) 監督（crewの中からJob=="Director"を抽出）
movies["director_list"] = (
    movies["crew"]
    .apply(ast.literal_eval)
    .apply(lambda lst: [d["name"] for d in lst if d.get("job")=="Director"])
)

# (e) 制作国
movies["country_list"] = (
    movies["production_countries"]
    .apply(ast.literal_eval)
    .apply(lambda lst: [d["name"] for d in lst])
)

# (f) 公開年
movies["release_year"] = (
    pd.to_datetime(movies["release_date"], errors="coerce")
      .dt.year
      .fillna(0)
      .astype(int)
)

# ── 7. 各特徴量のエンコーディング／ベクトル化 ───────────────────────
# — TF–IDF（あらすじ）
tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=5000,       # ←後で調整
    ngram_range=(1,3)        # ←後で調整
)
tfidf_mat = tfidf.fit_transform(movies["overview"])

# — One-hot 相当（genres/keywords/cast/director/country）
mlb_genre    = MultiLabelBinarizer()
mlb_keyword  = MultiLabelBinarizer()
mlb_cast     = MultiLabelBinarizer()
mlb_dir      = MultiLabelBinarizer()
mlb_country  = MultiLabelBinarizer()

genre_mat   = mlb_genre.fit_transform(movies["genres_list"])
keyword_mat = mlb_keyword.fit_transform(movies["keywords_list"])
cast_mat    = mlb_cast.fit_transform(movies["cast_list"])
dir_mat     = mlb_dir.fit_transform(movies["director_list"])
country_mat = mlb_country.fit_transform(movies["country_list"])

# — 数値特徴（公開年）の標準化
scaler     = StandardScaler()
year_scaled = scaler.fit_transform(movies[["release_year"]])
year_mat    = scipy.sparse.csr_matrix(year_scaled)

# ── 8. 重みづけ（任意で調整）───────────────────────────────────────
w_overview  = 1.0
w_genre     = 1.0 # genre重視にしたいなら値を大きく
w_keyword   = 1.0
w_cast      = 1.0
w_director  = 2.0
w_country   = 1.0
w_year      = 0.5   # 年を少し弱めにする例

tfidf_mat  *= w_overview
genre_mat  = scipy.sparse.csr_matrix(genre_mat * w_genre)
keyword_mat= scipy.sparse.csr_matrix(keyword_mat * w_keyword)
cast_mat   = scipy.sparse.csr_matrix(cast_mat * w_cast)
dir_mat    = scipy.sparse.csr_matrix(dir_mat * w_director)
country_mat= scipy.sparse.csr_matrix(country_mat * w_country)
year_mat   *= w_year

# ── 9. 特徴行列結合 ───────────────────────────────────────────────
features = scipy.sparse.hstack([
    tfidf_mat,
    genre_mat,
    keyword_mat,
    cast_mat,
    dir_mat,
    country_mat,
    year_mat
]).tocsr()

print("feature matrix shape:", features.shape)

# ── 10. ユーザーベクトル生成 & 推薦 ─────────────────────────────────
# (前回と同様)
liked       = ratings[ratings["rating"] >= 4]["movieId"].tolist()
liked_idx   = movies[movies["movieId"].isin(liked)].index

user_vec    = features[liked_idx].mean(axis=0)
user_vec    = np.asarray(user_vec).flatten()

sims = cosine_similarity(user_vec.reshape(1,-1), features).flatten()
sims[liked_idx] = -1

top5_idx = np.argsort(sims)[::-1][:5]
recs = movies.iloc[top5_idx][["movieId", "title", "genres_list"]]
recs.reset_index(drop=True, inplace=True)

from IPython.display import display
display(recs)


In [46]:
# ユーザーが既に見た（✔︎）映画リストを Python のリストとして定義
true_items = [
    "wicked",
    "apprentice",
    "Captain America: Brave new world",
    "野生の島のロズ",
    "ヘンゼル&グレーテル 魔女ハンター",
    "Dracula",
    "John wick",
    "Haywire",
    "Shatter Island",
    "Gladiator",
    "the order",
    "Jurassic world : fallen kingdom",
    "jack the giant killer",
    "doctor sleep",
    "Jurassic world",
    "The shinning",
    "triple frontier",
    "killers anonymous 本当に意味がわからなかった",
    "ボーダーランズ",
    "The Unbearable Weight of Massive Talent",
    "Repeated",
    "Sherlock Holmes",
    "SuperNova",
    "Central Intelligence",
    "Aquaman and The Lost Kingdom",
    "Kingsman: Golden Circle",
    "Kingsman",
    "Les Miserables",
    "Venom the last dance",
    "DEADPOOL AND WOLVERINE",
    "Speak no evil",
    "Venom the last dance",  # （重複している場合は後で set() で整理してもOK）
    "Alian",
    "テネット",
    "Imitation Game",
    "Trance",
    "It",
    "Mr.Glass",
    "The Greatest Show",
    "Adam",
    "Ghost busters",
    "Fight club",
    "RED ONE",
    "Lord of The Rings(all)",
    "赤ずきん",
    "The pop’s exsocist",
    "Atonement",
    "トランス",
    "モアナ",
    "落下の解剖学",
    "",
    "ミスターガラス",
    "フリーガイ",
    "dune1",
    "Batman",
    "Superman",
    "Aquaman",
    "The killer",
    "wanted",
    "mcu(30)",
    "X-MEN",
    "Sherlock",
    "Inception",
    "メメント",
]

# （オプション）重複を除去したい場合
true_titles = ground_truth["user_1"]               # ← 追加

# タイトルにマッチする movieId を取得
liked_ids = movies[movies["title"].isin(true_titles)]["movieId"].tolist()

# DataFrame の行インデックスに変換
liked_indices = movies[movies["movieId"].isin(liked_ids)].index.tolist()
print("liked_indices:", liked_indices)             # ← 動作確認用に一度出力すると安心
sims = cosine_similarity(user_vec.reshape(1, -1), features).flatten()

# ← ここを必ず liked_indices で指定してください
sims[liked_indices] = -1  

# ── 3. 上位 K 件を取り出す
topk_idx   = np.argsort(sims)[::-1][:K]
pred_items = movies.iloc[topk_idx]["title"].tolist()

# ── 8. ユーザーベクトル作成（以降は変更なし）─────────────────────────
user_vec = features[liked_indices].mean(axis=0)
user_vec = np.asarray(user_vec).flatten()


liked_indices: [96, 274, 356, 813, 914, 1167, 1590, 2002, 2495, 3879]


In [47]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 1. 評価指標関数の定義
def precision_at_k(true_items, pred_items, k):
    return len(set(true_items) & set(pred_items[:k])) / k

def recall_at_k(true_items, pred_items, k):
    return len(set(true_items) & set(pred_items[:k])) / len(true_items) if true_items else 0

def dcg_at_k(true_items, pred_items, k):
    dcg = 0.0
    for i, p in enumerate(pred_items[:k]):
        if p in true_items:
            dcg += 1.0 / np.log2(i + 2)
    return dcg

def idcg_at_k(true_items, k):
    ideal_rels = [1] * min(len(true_items), k)
    return sum(rel / np.log2(idx + 2) for idx, rel in enumerate(ideal_rels))

def ndcg_at_k(true_items, pred_items, k):
    idcg = idcg_at_k(true_items, k)
    return dcg_at_k(true_items, pred_items, k) / idcg if idcg > 0 else 0

# 2. 推薦リスト（pred_items）の作成
K = 5
# (a) ユーザーベクトル user_vec と features は既に作成済みとする
sims = cosine_similarity(user_vec.reshape(1, -1), features).flatten()
# (b) 好き映画は除外
sims[liked_idx] = -1
# (c) 上位 K 件のインデックス
topk_idx = np.argsort(sims)[::-1][:K]
pred_items = movies.iloc[topk_idx]["title"].tolist()

print("推薦リスト:", pred_items)

# 3. 真の好み（true_items）は先ほど定義済みの ground_truth["user_1"]
true_items = ground_truth["user_1"]

# 4. 指標を計算して表示
print(f"Precision@{K}:", precision_at_k(true_items, pred_items, K))
print(f"Recall@{K}   :", recall_at_k   (true_items, pred_items, K))
print(f"NDCG@{K}     :", ndcg_at_k     (true_items, pred_items, K))


推薦リスト: ["Amidst the Devil's Wings", 'Inception', 'Gladiator', 'Sherlock Holmes', 'Superman']
Precision@5: 0.8
Recall@5   : 0.06349206349206349
NDCG@5     : 0.6608397947263839


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 例：1番目に推薦された映画
movie_idx = top5_idx[0]

# 映画ベクトル
movie_vec = features[movie_idx].toarray().flatten()

# 寄与度ベクトル
contrib = user_vec * movie_vec

# TF–IDF 用語の寄与度部分
terms = vec.get_feature_names_out()
tfidf_contrib = contrib[:len(terms)]
top_tfidf_idx = np.argsort(tfidf_contrib)[::-1][:10]

# ジャンルの寄与度部分
genres = mlb.classes_
genre_contrib = contrib[len(terms):]
top_genre_idx = np.argsort(genre_contrib)[::-1][:10]

# DataFrame にまとめ
df_terms = pd.DataFrame({
    'term': terms[top_tfidf_idx],
    'contribution': tfidf_contrib[top_tfidf_idx]
})
df_genres = pd.DataFrame({
    'genre': genres[top_genre_idx],
    'contribution': genre_contrib[top_genre_idx]
})

# 可視化：TF–IDF
plt.figure(figsize=(8,4))
plt.barh(df_terms['term'][::-1], df_terms['contribution'][::-1])
plt.title('Top 10 貢献 TF–IDF 語')
plt.xlabel('寄与度')
plt.tight_layout()
plt.show()

# 可視化：ジャンル
plt.figure(figsize=(6,3))
plt.barh(df_genres['genre'][::-1], df_genres['contribution'][::-1])
plt.title('Top 10 貢献 ジャンル')
plt.xlabel('寄与度')
plt.tight_layout()
plt.show()




In [None]:
import pandas as pd
import ast

# 1. データ読み込み（ファイルパスは要調整）
ratings = pd.read_csv("ratings.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

print(credits.columns)
# 2. カラム名統一
credits.rename(columns={"movie_id": "movieId"}, inplace=True)

# 3. cast 列の JSON → リスト変換
credits["cast_list"] = (
    credits["cast"]
    .apply(ast.literal_eval)
    .apply(lambda cl: [c["name"] for c in cl])
)

# 4. ★4以上評価した映画ID を抽出
liked_ids = ratings[ratings["rating"] >= 4]["movieId"].tolist()

# 5. その映画の cast_list を “explode” して俳優名を一列に
liked_casts = credits[credits["movieId"].isin(liked_ids)].explode("cast_list")

# 6. 
top_actors = liked_casts["cast_list"].value_counts().head(10)
top_actors.plot.barh(figsize=(6,4), legend=False)
plt.xlabel("出現回数（★4以上映画に登場）")
plt.title("your favourite 10 actours")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
