# 🎬 Hệ gợi ý (Model-based CF) với ALS (Explicit)

Notebook này huấn luyện **ALS tường minh (Explicit ALS)** cho bài toán dự đoán điểm phim và xuất dự đoán theo định dạng **`Id,Score`** (bắt đầu `Id` từ 1) giống file mẫu.

**Quy trình**
1. Đọc `train.txt` / `test.txt`
2. Ánh xạ ID gốc → chỉ số liên tục (0..n-1)
3. Tính **baseline** (μ, độ lệch người dùng bᵤ, độ lệch item bᵢ) để dự phòng cold-start
4. Xây ma trận thưa user–item
5. Huấn luyện **ALS tường minh** (tối thiểu hóa sai số bình phương + L2)
6. (Tuỳ chọn) Đánh giá nhanh RMSE trên tập holdout
7. Dự đoán cho test và xuất `submission.csv`


In [None]:

import os
import numpy as np
import pandas as pd
from scipy import sparse
from pathlib import Path
from typing import Optional, Tuple

# --- Input / output locations (edit as needed) ---
TRAIN_PATH = "data/train.txt"                # e.g. "/kaggle/input/.../train.txt"
TEST_PATH = "data/test.txt"                  # e.g. "/kaggle/input/.../test.txt"
SUB_PATH = "output/CF_ALS_submission.csv"    # where the predictions are written

# Echo the absolute locations the two relative input paths resolve to,
# one per line, so a wrong working directory is obvious immediately.
print(Path(TRAIN_PATH).resolve(), Path(TEST_PATH).resolve(), sep="\n")


In [None]:

# --- Load the data ---
def _read_whitespace_table(path, columns):
    """Parse a headerless, whitespace-delimited text file into a DataFrame.

    `columns` supplies the column names, in file order.
    """
    return pd.read_csv(
        path,
        sep=r"\s+",
        header=None,
        names=columns,
        engine="python",
    )


# Training rows carry a rating; test rows are (user, movie) pairs only.
df_train = _read_whitespace_table(TRAIN_PATH, ["userid", "movieid", "rating"])
df_test = _read_whitespace_table(TEST_PATH, ["userid", "movieid"])

print(df_train.shape, df_test.shape)
display(df_train.head())
display(df_test.head())


In [None]:

# --- Map raw IDs to contiguous indices (0..n-1); train and test are pooled
# --- so that every ID appearing in either file receives an index ---
def _encode_ids(raw_ids, lookup):
    """Translate a column of raw IDs into int32 dense indices via `lookup`."""
    return raw_ids.map(lookup).astype(np.int32)


all_u = pd.concat([df_train["userid"], df_test["userid"]], ignore_index=True)
all_i = pd.concat([df_train["movieid"], df_test["movieid"]], ignore_index=True)

# Distinct IDs in order of first appearance; position in this order is the index.
uid_uniques = all_u.drop_duplicates()
iid_uniques = all_i.drop_duplicates()
uid_map = pd.Series(
    np.arange(len(uid_uniques), dtype=np.int32),
    index=uid_uniques.values,
)
iid_map = pd.Series(
    np.arange(len(iid_uniques), dtype=np.int32),
    index=iid_uniques.values,
)

df_train["u_idx"] = _encode_ids(df_train["userid"], uid_map)
df_train["i_idx"] = _encode_ids(df_train["movieid"], iid_map)
df_test["u_idx"] = _encode_ids(df_test["userid"], uid_map)
df_test["i_idx"] = _encode_ids(df_test["movieid"], iid_map)

n_users = len(uid_map)
n_items = len(iid_map)
print(f"n_users={n_users}, n_items={n_items}, train_rows={len(df_train)}, test_rows={len(df_test)}")


In [None]:

# --- Baseline (mu + b_u + b_i), used as the fallback for cold-start pairs ---
mu = df_train["rating"].mean()
lambda_reg_bias = 10.0  # shrinkage added to each rating count in the bias denominators

# Sparse user x item matrix of all training ratings (duplicate pairs are summed
# by the COO -> CSR conversion).
R_tmp = sparse.coo_matrix(
    (df_train["rating"].astype(np.float32), (df_train["u_idx"], df_train["i_idx"])),
    shape=(n_users, n_items),
).tocsr()

# Index lists of rated items per user / rating users per item.
# NOTE: in CSR, `.indices` stores COLUMN indices, so `R_tmp[:, i].indices` is a
# vector of zeros (the slice has a single column), not the users who rated item
# i. The per-item user lists must therefore be read from a CSC view.
_R_tmp_csc = R_tmp.tocsc()
users_items = [
    R_tmp.indices[R_tmp.indptr[u]:R_tmp.indptr[u + 1]] for u in range(n_users)
]
items_users = [
    _R_tmp_csc.indices[_R_tmp_csc.indptr[i]:_R_tmp_csc.indptr[i + 1]]
    for i in range(n_items)
]

# Flat (user, item, rating) triplets of the de-duplicated matrix; the bias
# updates below are grouped sums over these triplets.
_obs = R_tmp.tocoo()
_obs_u, _obs_i = _obs.row, _obs.col
_obs_r = _obs.data.astype(np.float64)
_count_u = np.bincount(_obs_u, minlength=n_users)
_count_i = np.bincount(_obs_i, minlength=n_items)

b_u = np.zeros(n_users, dtype=np.float32)
b_i = np.zeros(n_items, dtype=np.float32)

# A few rounds of alternating updates. Each user bias depends only on the item
# biases (and vice versa), so updating all users at once is equivalent to the
# one-at-a-time sweep. Users/items without ratings get 0 / lambda = 0.
for _ in range(3):
    # b_u = sum_i (r_ui - mu - b_i) / (lambda + n_u)
    b_u = (
        np.bincount(_obs_u, weights=_obs_r - mu - b_i[_obs_i], minlength=n_users)
        / (lambda_reg_bias + _count_u)
    ).astype(np.float32)
    # b_i = sum_u (r_ui - mu - b_u) / (lambda + n_i)
    b_i = (
        np.bincount(_obs_i, weights=_obs_r - mu - b_u[_obs_u], minlength=n_items)
        / (lambda_reg_bias + _count_i)
    ).astype(np.float32)

def predict_baseline(u: int, i: int, clip: Optional[Tuple[float,float]]=(1,5)) -> float:
    """Return the baseline estimate mu + b_u[u] + b_i[i] for one pair.

    Args:
        u: Dense user index into the module-level `b_u`.
        i: Dense item index into the module-level `b_i`.
        clip: `(low, high)` bounds applied to the estimate, or `None` to
            return the raw value.

    Returns:
        The (optionally clipped) estimate as a Python float.
    """
    raw = mu + b_u[u] + b_i[i]
    if clip is None:
        return float(raw)
    low, high = clip[0], clip[1]
    return float(np.clip(raw, low, high))

print("Baseline s·∫µn s√†ng. Global mean:", mu)


In [None]:

# --- Carve out a quick holdout set for estimating RMSE (optional) ---
rng = np.random.default_rng(2024)
mask = rng.random(len(df_train)) < 0.1  # each row lands in the holdout with probability 0.1
df_valid = df_train.loc[mask].reset_index(drop=True)
df_tr = df_train.loc[~mask].reset_index(drop=True)
print("S·ªë d√≤ng train d√πng ALS:", df_tr.shape, "| holdout:", df_valid.shape)

# --- Sparse CSR rating matrix built from the non-holdout rows only ---
R = sparse.coo_matrix(
    (
        df_tr["rating"].astype(np.float32),
        (df_tr["u_idx"], df_tr["i_idx"]),
    ),
    shape=(n_users, n_items),
).tocsr()

# Optional: fit the factors on ratings with the global mean subtracted.
CENTER = True
r_mean = mu


In [None]:

# --- Explicit-feedback ALS ---
def _als_half_step(target, fixed, indptr, indices, values, reg_eye, offset):
    """Re-solve, in place, every row of `target` that has observations.

    Row `r` of `target` is set to the ridge-regression solution of
    (F^T F + reg I) x = F^T (ratings - offset), where F stacks the rows of
    `fixed` listed in `indices[indptr[r]:indptr[r + 1]]` and `ratings` are the
    matching entries of `values`. Rows with no observations are left untouched.
    """
    for row in range(target.shape[0]):
        lo, hi = indptr[row], indptr[row + 1]
        if lo == hi:
            continue
        F = fixed[indices[lo:hi]]          # (m, k) factors of the observed partners
        resid = values[lo:hi] - offset     # (m,) ratings, optionally centered
        target[row] = np.linalg.solve(F.T @ F + reg_eye, F.T @ resid)


def als_explicit(R_csr: sparse.csr_matrix, 
                 n_users: int, n_items: int, 
                 k: int = 64, 
                 reg: float = 0.1, 
                 n_iters: int = 10,
                 center: bool = True,
                 mean_val: float = 0.0,
                 random_state: int = 42):
    r"""Factorize observed ratings with alternating least squares.

    Objective:
        minimize \sum_{(u,i) in Obs} (r_ui - U_u^T V_i)^2 + reg * (||U||^2 + ||V||^2)

    Only stored entries of `R_csr` contribute to the loss. With `center=True`
    the model is fitted on (r_ui - mean_val); the caller adds `mean_val` back
    at prediction time.

    Args:
        R_csr: Sparse (n_users x n_items) matrix holding the observed ratings.
        n_users: Number of user rows; must equal `R_csr.shape[0]`.
        n_items: Number of item columns; must equal `R_csr.shape[1]`.
        k: Number of latent dimensions.
        reg: L2 regularization strength.
        n_iters: Number of full (users, then items) sweeps.
        center: Whether to subtract `mean_val` from the ratings before fitting.
        mean_val: Value subtracted when `center` is true.
        random_state: Seed for the factor initialization.

    Returns:
        `(U, V)`: float32 arrays of shape (n_users, k) and (n_items, k). Rows of
        users/items without any observed rating are all zeros, so callers can
        detect cold-start rows with `np.any(U[u])` / `np.any(V[i])`.

    Raises:
        ValueError: If `R_csr.shape` is not `(n_users, n_items)`.
    """
    if R_csr.shape != (n_users, n_items):
        raise ValueError(
            f"R_csr has shape {R_csr.shape}, expected {(n_users, n_items)}"
        )

    rng = np.random.default_rng(random_state)
    U = 0.1 * rng.standard_normal((n_users, k)).astype(np.float32)
    V = 0.1 * rng.standard_normal((n_items, k)).astype(np.float32)

    # Row-major view gives each user's items; column-major view gives each
    # item's users. (Reading `.indices` of a CSR column slice would return
    # column indices - all zeros - rather than user ids.)
    by_user = R_csr.tocsr()
    by_item = by_user.tocsc()

    # Rows that are never solved for would otherwise keep their random
    # initialization; zero them so they carry no signal and are recognizable.
    U[np.diff(by_user.indptr) == 0] = 0.0
    V[np.diff(by_item.indptr) == 0] = 0.0

    reg_eye = reg * np.eye(k, dtype=np.float32)
    offset = mean_val if center else 0.0

    for it in range(n_iters):
        # --- Update U with V fixed: (V_i^T V_i + reg I) U_u = V_i^T r_u ---
        _als_half_step(U, V, by_user.indptr, by_user.indices, by_user.data,
                       reg_eye, offset)
        # --- Update V with U fixed: (U_u^T U_u + reg I) V_i = U_u^T r_i ---
        _als_half_step(V, U, by_item.indptr, by_item.indices, by_item.data,
                       reg_eye, offset)
        print(f"Ho√†n t·∫•t v√≤ng l·∫∑p ALS {it+1}/{n_iters}.")
    return U, V


In [None]:

# --- Fit the ALS model ---
# Latent dimensions, L2 strength, and number of ALS sweeps, respectively.
K, LAMBDA, N_ITERS = 64, 0.1, 10

U, V = als_explicit(
    R,
    n_users,
    n_items,
    k=K,
    reg=LAMBDA,
    n_iters=N_ITERS,
    center=CENTER,
    mean_val=r_mean,
    random_state=2025,
)
print("K√≠ch th∆∞·ªõc nh√¢n t·ªë:", U.shape, V.shape)


In [None]:

# --- Estimate RMSE on the holdout set (optional) ---
def predict_dot(u, i, U, V, center=True, mean_val=0.0):
    """Score one (user, item) pair as the dot product of their factor rows.

    `mean_val` is added to the dot product when `center` is true.
    """
    score = float(U[u] @ V[i])
    return score + mean_val if center else score

def rmse_holdout(df_valid, U, V, clip=(1,5), center=True, mean_val=0.0):
    """Root-mean-square error of `predict_dot` over the rows of `df_valid`.

    Reads the `u_idx`, `i_idx` and `rating` columns. Each prediction is
    clipped to `clip` unless `clip` is `None`. An empty frame yields 0.
    """
    squared_error_total = 0.0
    row_count = 0
    triples = df_valid[["u_idx","i_idx","rating"]].itertuples(index=False)
    for row_count, (user, item, truth) in enumerate(triples, start=1):
        estimate = predict_dot(user, item, U, V, center=center, mean_val=mean_val)
        if clip is not None:
            estimate = np.clip(estimate, clip[0], clip[1])
        squared_error_total += (estimate - truth) ** 2
    # max(..., 1) guards the division when there are no rows.
    return np.sqrt(squared_error_total / max(row_count, 1))

val_rmse = rmse_holdout(
    df_valid, U, V,
    clip=(1,5),
    center=CENTER,
    mean_val=r_mean,
)
print(f"Holdout RMSE: {val_rmse:.5f}")


In [None]:

# --- Predict for the test set, falling back to the baseline on cold-start ---
def predict_with_fallback(u, i, U, V, clip=(1,5)) -> float:
    """Predict one rating from the factors, or from the baseline if unavailable.

    A user (item) counts as available when its index is inside `U` (`V`) and
    its factor row has at least one non-zero entry. When both are available
    the result is the dot product, plus the module-level `r_mean` if `CENTER`
    is set, clipped to `clip` unless `clip` is `None`. Otherwise the result
    is `predict_baseline(u, i, clip=clip)`.
    """
    user_ok = (0 <= u < U.shape[0]) and np.any(U[u])
    item_ok = (0 <= i < V.shape[0]) and np.any(V[i])

    # Missing user or item factors: defer to the bias-only baseline.
    if not (user_ok and item_ok):
        return predict_baseline(u, i, clip=clip)

    score = float(U[u] @ V[i])
    if CENTER:
        score += r_mean
    if clip is not None:
        score = float(np.clip(score, clip[0], clip[1]))
    return score

CLIP = (1, 5)
# One prediction per test row, in file order, stored as float32.
preds = np.fromiter(
    (
        predict_with_fallback(user, item, U, V, clip=CLIP)
        for user, item in zip(df_test["u_idx"].values, df_test["i_idx"].values)
    ),
    dtype=np.float32,
    count=len(df_test),
)

print("V√≠ d·ª• 10 d·ª± ƒëo√°n ƒë·∫ßu:", preds[:10])


In [None]:

# --- Write the submission file in the sample format (Id starts at 1) ---
submission = pd.DataFrame({
    "Id": np.arange(1, len(df_test) + 1, dtype=np.int64),
    "Score": preds
})
# `to_csv` does not create missing directories; make sure the target folder
# (e.g. "output/") exists so the export cannot fail after training is done.
Path(SUB_PATH).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(SUB_PATH, index=False, float_format="%.6f")
print("ƒê√£ l∆∞u:", SUB_PATH, submission.shape)
display(submission.head())
