In [9]:
import pandas as pd
import numpy as np
from collections import defaultdict

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
# İstersen şunu da kullanabilirsin:
# from sklearn.ensemble import GradientBoostingRegressor

# =========================
# 1) DATA
# =========================
# Replace this with your own CSV: df = pd.read_csv("ratings.csv")
# The code below reads the "user_id", "item_id" and "rating" columns of this frame.
df = pd.read_csv("random_user_item_ratings_1000.csv")

# =========================
# 2) LEAVE-ONE-OUT
# =========================
# For every user, hold out the last row of that user's group as the
# evaluation target and keep all earlier rows for training.
per_user_frames = [frame for _, frame in df.groupby("user_id")]

left_out_rows = [frame.tail(1) for frame in per_user_frames]
train_rows = [frame.iloc[:-1] for frame in per_user_frames]

left_out_df = pd.concat(left_out_rows, ignore_index=True)   # held-out (test-like) rows
train_df = pd.concat(train_rows, ignore_index=True)         # remaining (train-like) rows

# =========================
# 3) SKLEARN MODEL
# =========================
# The only features are the two identifier columns: user_id, item_id
feature_cols = ["user_id", "item_id"]

# Training inputs and target
X_train = train_df[feature_cols]
y_train = train_df["rating"]

# One-hot encode the categorical ID columns; handle_unknown="ignore" keeps
# transform() from raising on IDs that were not present during fit.
preprocess = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), feature_cols)],
)

# A simple regressor with a fixed seed
regressor = RandomForestRegressor(n_estimators=120, random_state=42)

# Chain preprocessing and regression into one estimator, then train it
model = Pipeline(steps=[("prep", preprocess), ("reg", regressor)])
model.fit(X_train, y_train)

# =========================
# 4) TOP-N ÇIKARMA (model.predict ile)
# =========================
def get_top_n_sklearn(model, train_df, n=10, min_rating=4.0):
    """Build per-user top-N recommendation lists from a fitted model.

    For every user in ``train_df``, each item the user has no row for is
    scored with ``model.predict``; items whose predicted rating is at least
    ``min_rating`` are kept, ordered by predicted rating (highest first) and
    cut to ``n`` entries.

    Args:
        model: Object with a ``predict`` method that accepts a DataFrame
            with "user_id" and "item_id" columns and returns one score
            per row.
        train_df: DataFrame with "user_id" and "item_id" columns.
        n: Maximum number of recommendations per user.
        min_rating: Minimum predicted rating for an item to be recommended.

    Returns:
        defaultdict(list) mapping user_id -> list of
        (item_id, predicted_rating) tuples. Users with no unseen items get
        no entry; users whose candidates all score below ``min_rating`` get
        an empty list.
    """
    recommendations = defaultdict(list)

    catalog = train_df["item_id"].unique().tolist()
    rated_by_user = train_df.groupby("user_id")["item_id"].apply(set).to_dict()

    for user_id in train_df["user_id"].unique():
        already_rated = rated_by_user.get(user_id, set())

        # Only items this user has not interacted with are candidates.
        unseen = [item for item in catalog if item not in already_rated]
        if not unseen:
            continue

        frame = pd.DataFrame({
            "user_id": [user_id] * len(unseen),
            "item_id": unseen
        })

        # Score all candidates for this user in one predict call.
        scores = model.predict(frame[["user_id", "item_id"]])

        # Keep only candidates that reach the rating threshold.
        qualifying = [
            (item, float(score))
            for item, score in zip(unseen, scores)
            if score >= min_rating
        ]

        # Best predictions first, then truncate to n entries.
        recommendations[user_id] = sorted(
            qualifying, key=lambda pair: pair[1], reverse=True
        )[:n]

    return recommendations

# Build, for every training user, up to 10 unseen items whose predicted rating is at least 4.0.
topN = get_top_n_sklearn(model, train_df, n=10, min_rating=4.0)

# =========================
# 5) METRİKLER
# =========================
def hit_rate(topN, left_out_df):
    """Return the fraction of held-out rows whose item is in the user's top-N list.

    Args:
        topN: Mapping of user_id -> list of (item_id, estimated_rating) pairs.
        left_out_df: DataFrame with "user_id" and "item_id" columns, one
            held-out row per evaluation case.

    Returns:
        Number of hits divided by the number of held-out rows, or 0.0 when
        ``left_out_df`` has no rows.
    """
    hits = 0
    total = 0
    # Iterate the ID columns directly rather than via DataFrame.iterrows():
    # iterrows() builds one Series per row, which upcasts integer IDs to
    # float whenever the frame also holds a float column (e.g. "rating"),
    # corrupting IDs above 2**53 and making the lookup below miss.
    for uid, true_item in zip(left_out_df["user_id"], left_out_df["item_id"]):
        recommended = topN.get(uid, [])
        # IDs are compared as ints so numpy and Python integers match.
        if any(int(true_item) == int(item_id) for item_id, _ in recommended):
            hits += 1
        total += 1
    return hits / total if total > 0 else 0.0

def cumulative_hit_rate(topN, left_out_df, rating_cutoff=4.0):
    """Return the hit rate over held-out rows rated at least ``rating_cutoff``.

    Held-out rows whose actual rating is below the cutoff are ignored
    entirely: they count neither as hits nor towards the total.

    Args:
        topN: Mapping of user_id -> list of (item_id, estimated_rating) pairs.
        left_out_df: DataFrame with "user_id", "item_id" and "rating" columns.
        rating_cutoff: Minimum actual rating for a held-out row to be counted.

    Returns:
        Number of hits divided by the number of counted rows, or 0.0 when no
        row reaches the cutoff.
    """
    hits = 0
    total = 0
    # Iterate the columns directly rather than via DataFrame.iterrows():
    # iterrows() builds one Series per row, which upcasts integer IDs to
    # float next to a float "rating" column, corrupting IDs above 2**53 and
    # making the lookup below miss.
    columns = zip(
        left_out_df["user_id"], left_out_df["item_id"], left_out_df["rating"]
    )
    for uid, true_item, actual_rating in columns:
        if actual_rating < rating_cutoff:
            continue
        recommended = topN.get(uid, [])
        # IDs are compared as ints so numpy and Python integers match.
        if any(int(true_item) == int(item_id) for item_id, _ in recommended):
            hits += 1
        total += 1
    return hits / total if total > 0 else 0.0

def average_reciprocal_hit_rank(topN, left_out_df):
    """Return the average reciprocal hit rank (ARHR) over the held-out rows.

    Each held-out row contributes ``1 / rank`` when its item is found at
    1-based position ``rank`` in the user's top-N list, and 0 when it is not
    found. The sum is divided by the number of held-out rows.

    Args:
        topN: Mapping of user_id -> list of (item_id, estimated_rating)
            pairs, in recommendation order.
        left_out_df: DataFrame with "user_id" and "item_id" columns.

    Returns:
        The mean reciprocal rank, or 0.0 when ``left_out_df`` has no rows.
    """
    s = 0.0
    total = 0
    # Iterate the ID columns directly rather than via DataFrame.iterrows():
    # iterrows() builds one Series per row, which upcasts integer IDs to
    # float whenever the frame also holds a float column (e.g. "rating"),
    # corrupting IDs above 2**53 and making the lookup below miss.
    for uid, true_item in zip(left_out_df["user_id"], left_out_df["item_id"]):
        ranked = enumerate(topN.get(uid, []), start=1)
        for rank, (item_id, _) in ranked:
            # Only the first matching position counts.
            if int(item_id) == int(true_item):
                s += 1.0 / rank
                break
        total += 1
    return s / total if total > 0 else 0.0

# =========================
# 6) RESULTS
# =========================
print("Top-N: \n", dict(topN))

# Each metric is evaluated lazily, right before its own line is printed.
metric_reports = (
    ("HitRate:", lambda: hit_rate(topN, left_out_df)),
    (
        "CumulativeHitRate (>=4):",
        lambda: cumulative_hit_rate(topN, left_out_df, rating_cutoff=4.0),
    ),
    ("ARHR:", lambda: average_reciprocal_hit_rank(topN, left_out_df)),
)
for label, compute in metric_reports:
    print(label, compute())

# Example: show the recommendations user by user, in ascending user-id order
for uid, recs in sorted(topN.items(), key=lambda entry: entry[0]):
    print(f"Kullanıcı {uid} için öneriler:", recs)


Top-N: 
 {np.int64(1): [(49, 4.030555555555555)], np.int64(2): [], np.int64(3): [(31, 4.183333333333334), (49, 4.1)], np.int64(4): [(31, 4.280555555555556), (49, 4.222222222222222), (39, 4.113888888888889)], np.int64(5): [], np.int64(6): [(31, 4.158333333333333), (36, 4.029166666666667)], np.int64(7): [(49, 4.49375), (31, 4.4875), (36, 4.245833333333334), (39, 4.178333333333333), (65, 4.033194444444445), (17, 4.00875)], np.int64(8): [(49, 4.325), (31, 4.233333333333333)], np.int64(9): [(26, 4.155555555555555), (31, 4.13125)], np.int64(10): [(49, 4.165277777777778), (26, 4.069444444444445)], np.int64(11): [], np.int64(12): [(49, 4.520833333333333), (26, 4.42638888888889), (31, 4.341666666666667), (17, 4.004166666666666)], np.int64(13): [(49, 4.4), (31, 4.38611111111111), (36, 4.184722222222223)], np.int64(14): [(49, 4.25), (31, 4.108333333333333), (26, 4.010555555555555)], np.int64(15): [], np.int64(16): [(31, 4.225)], np.int64(17): [(26, 4.466666666666667), (31, 4.441666666666666), (39