In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# === 1. Load and preprocess data ===
# The CSV is read without a header row, so the five columns are named here.
df = pd.read_csv("steam-200k.csv", header=None)
df.columns = ["user_id", "game", "action", "playtime", "_"]

# Keep only "play" rows, drop the columns that are no longer needed, and
# discard rows whose playtime is not strictly positive.
df = (
    df.loc[df["action"] == "play"]
    .drop(columns=["action", "_"])
    .loc[lambda plays: plays["playtime"] > 0]
)


In [16]:
# Store log(1 + playtime) next to the raw playtime column
df["log_playtime"] = np.log1p(df["playtime"])

# Give every distinct user / game a contiguous 0-based index
user_ids = df["user_id"].unique()
game_ids = df["game"].unique()
user_map = dict(zip(user_ids, range(len(user_ids))))
game_map = dict(zip(game_ids, range(len(game_ids))))
# Inverse lookup: index -> game
game_map_rev = dict(enumerate(game_ids))



In [26]:
# === 2. User-wise train/test split ===
def userwise_train_test_split(df, test_size=0.2, random_state=42):
    """Split interactions into train/test independently for every user.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction table; must contain a ``user_id`` column.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` for each user's group of rows.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``; the default reproduces the
        previously hard-coded seed.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Both keep the columns of ``df``. Users with fewer than two rows are
        placed entirely in the training frame. Either frame may be empty.
    """
    train_rows, test_rows = [], []
    for _, group in df.groupby("user_id"):
        if len(group) < 2:
            # A single interaction cannot be split; keep it for training.
            train_rows.append(group)
            continue
        train, test = train_test_split(
            group, test_size=test_size, random_state=random_state
        )
        train_rows.append(train)
        test_rows.append(test)

    # pd.concat raises ValueError on an empty list, which happens when no user
    # has at least two rows (or when df itself is empty). Fall back to an
    # empty frame that keeps the input's columns.
    train_df = pd.concat(train_rows) if train_rows else df.iloc[0:0].copy()
    test_df = pd.concat(test_rows) if test_rows else df.iloc[0:0].copy()
    return train_df, test_df

# Split every user's interactions into train/test using the function's default test_size.
df_train, df_test = userwise_train_test_split(df)

# === 3. Build user-item matrix ===
def build_matrix(df_subset, user_map, game_map, value_col="log_playtime"):
    """Build a dense user x game matrix from an interaction table.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Must contain ``user_id``, ``game`` and ``value_col`` columns.
    user_map, game_map : dict
        Map raw user ids / games to 0-based row / column indices.
    value_col : str, default "log_playtime"
        Column whose values fill the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(user_map), len(game_map))``. Cells without an
        interaction stay 0. Rows whose user or game is missing from the maps
        are ignored. If a (user, game) pair occurs more than once, the last
        occurrence wins.
    """
    R = np.zeros((len(user_map), len(game_map)))
    if len(df_subset) == 0:
        return R

    # Translate ids to indices in one pass; ids missing from a map become NaN.
    entries = pd.DataFrame({
        "u": df_subset["user_id"].map(user_map).to_numpy(dtype=float),
        "i": df_subset["game"].map(game_map).to_numpy(dtype=float),
        "v": df_subset[value_col].to_numpy(),
    }).dropna(subset=["u", "i"])

    # NumPy does not guarantee which value survives when an index pair is
    # repeated in a fancy assignment, so keep only the last occurrence.
    entries = entries.drop_duplicates(subset=["u", "i"], keep="last")

    rows = entries["u"].to_numpy(dtype=np.intp)
    cols = entries["i"].to_numpy(dtype=np.intp)
    R[rows, cols] = entries["v"].to_numpy()
    return R

# Dense user x game matrix of training log-playtimes; cells with no training interaction stay 0.
R_train = build_matrix(df_train, user_map, game_map)

# === 4. ALS Implementation ===
def als(R, num_factors=20, num_iters=10, lambda_reg=0.1, seed=None):
    """Factorise ``R`` as ``U @ V.T`` with alternating least squares.

    Only strictly positive entries of ``R`` are treated as observed; every
    other cell is ignored when fitting and when computing the training RMSE.

    Parameters
    ----------
    R : numpy.ndarray
        2-D user x item matrix.
    num_factors : int, default 20
        Number of latent factors.
    num_iters : int, default 10
        Number of full (users, then items) sweeps.
    lambda_reg : float, default 0.1
        Ridge term added to the diagonal of every normal-equation matrix.
    seed : int or None, default None
        If given, the factors are initialised from
        ``np.random.default_rng(seed)`` so the run is reproducible. If None,
        NumPy's global random state is used, as before.

    Returns
    -------
    (U, V) : tuple of numpy.ndarray
        ``U`` has shape ``(num_users, num_factors)`` and ``V`` has shape
        ``(num_items, num_factors)``. Rows for users/items with no observed
        entry keep their random initial values.
    """
    num_users, num_items = R.shape
    # Both the np.random module and a Generator expose .normal(scale=, size=).
    rng = np.random if seed is None else np.random.default_rng(seed)
    U = rng.normal(scale=1./num_factors, size=(num_users, num_factors))
    V = rng.normal(scale=1./num_factors, size=(num_items, num_factors))

    # Loop-invariant pieces: R never changes while fitting.
    mask = R > 0
    num_observed = mask.sum()
    ridge = lambda_reg * np.eye(num_factors)

    for it in range(num_iters):
        # Fix V, solve a ridge regression for every user with observations.
        for u in range(num_users):
            observed = mask[u]
            if not observed.any():
                continue
            V_i = V[observed]
            U[u] = np.linalg.solve(V_i.T @ V_i + ridge, V_i.T @ R[u, observed])

        # Fix U, solve a ridge regression for every item with observations.
        for i in range(num_items):
            observed = mask[:, i]
            if not observed.any():
                continue
            U_u = U[observed]
            V[i] = np.linalg.solve(U_u.T @ U_u + ridge, U_u.T @ R[observed, i])

        # Training RMSE over observed cells only (NaN if nothing is observed).
        residual = (R - U @ V.T)[mask]
        if num_observed:
            train_rmse = np.sqrt((residual ** 2).sum() / num_observed)
        else:
            train_rmse = float("nan")
        print(f"Iteration {it+1}: RMSE = {train_rmse:.4f}")

    return U, V

# === 5. Train ALS ===
# Fit 20-factor user (U) and item (V) matrices on the training matrix only.
U, V = als(R_train, num_factors=20, num_iters=10, lambda_reg=0.1)


Iteration 1: RMSE = 0.9008
Iteration 2: RMSE = 0.4624
Iteration 3: RMSE = 0.3737
Iteration 4: RMSE = 0.3276
Iteration 5: RMSE = 0.2975
Iteration 6: RMSE = 0.2757
Iteration 7: RMSE = 0.2592
Iteration 8: RMSE = 0.2461
Iteration 9: RMSE = 0.2354
Iteration 10: RMSE = 0.2264


In [28]:
# === 6. Recommend games for a user ===
def recommend(user_index, U, V, R, top_n=5, index_to_game=None):
    """Return the highest-scoring games the user has not played yet.

    Parameters
    ----------
    user_index : int
        Row index of the user in ``U`` and ``R`` (not a raw user id).
    U, V : numpy.ndarray
        User and item factor matrices.
    R : numpy.ndarray
        Interaction matrix; entries > 0 mark games the user already played.
    top_n : int, default 5
        Maximum number of recommendations. Values <= 0 give an empty list.
    index_to_game : dict or None, default None
        Maps item indices to game names. Falls back to the notebook-level
        ``game_map_rev`` when omitted.

    Returns
    -------
    list of (game, score)
        Sorted by descending score. Shorter than ``top_n`` when fewer
        unplayed games exist; already-played games are never returned.
    """
    if index_to_game is None:
        index_to_game = game_map_rev
    # Guard explicitly: a slice like [-0:] would select every item.
    if top_n <= 0:
        return []

    played = R[user_index] > 0
    scores = U[user_index] @ V.T
    scores[played] = -np.inf  # push played games to the bottom of the ranking
    ranked = np.argsort(scores)[::-1][:top_n]
    # Drop played games that only made the cut because too few candidates exist.
    return [(index_to_game[i], scores[i]) for i in ranked if not played[i]]

# Example: Recommend for user 0
# Here 0 is a row index into U / R_train (a value of user_map), not a raw user_id.
print("\n🎮 Top Recommendations for User 0:")
recs = recommend(0, U, V, R_train)
for game, score in recs:
    print(f"{game}: {score:.2f}")

# === 7. Evaluate on test set (RMSE & MAE) ===
def evaluate_on_test(df_test, U, V, user_map, game_map, value_col="log_playtime"):
    """Compute and print MAE and RMSE of ``U @ V.T`` on held-out interactions.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Must contain ``user_id``, ``game`` and ``value_col`` columns.
    U, V : numpy.ndarray
        User and item factor matrices.
    user_map, game_map : dict
        Map raw user ids / games to row indices of ``U`` / ``V``.
    value_col : str, default "log_playtime"
        Column holding the true values.

    Returns
    -------
    (mae, rmse) : tuple of float
        Rows whose user or game is missing from the maps are skipped. Both
        values are NaN when no row can be scored.
    """
    # Ids missing from a map become NaN and are filtered out below.
    u_idx = df_test["user_id"].map(user_map).to_numpy(dtype=float)
    i_idx = df_test["game"].map(game_map).to_numpy(dtype=float)
    known = ~(np.isnan(u_idx) | np.isnan(i_idx))

    if not known.any():
        # Nothing to score: report NaN rather than averaging an empty array.
        mae = rmse = float("nan")
    else:
        rows = u_idx[known].astype(np.intp)
        cols = i_idx[known].astype(np.intp)
        truth = df_test[value_col].to_numpy(dtype=float)[known]
        # Row-wise dot product between each user's and game's factors.
        errors = truth - (U[rows] * V[cols]).sum(axis=1)
        mae = np.mean(np.abs(errors))
        rmse = np.sqrt(np.mean(errors ** 2))

    print(f"\n📊 Test Evaluation:\nMAE  = {mae:.4f}\nRMSE = {rmse:.4f}")
    return mae, rmse

# Evaluate the trained factors on the held-out interactions (prints MAE and RMSE).
evaluate_on_test(df_test, U, V, user_map, game_map)



def grid_search_als(R_train, df_test, user_map, game_map,
                    num_factors_list=(10, 20),
                    lambda_list=(0.01, 0.1),
                    num_iters_list=(5, 10)):
    """Exhaustively try ALS hyper-parameters and report the best settings.

    Every combination is trained with ``als`` on ``R_train`` and scored with
    ``evaluate_on_test`` on ``df_test``.

    NOTE(review): the winning settings are chosen on the same data used for
    the final test metrics, so the reported scores are optimistic. Consider
    passing a separate validation split as ``df_test``.

    Parameters
    ----------
    R_train : numpy.ndarray
        Training interaction matrix passed to ``als``.
    df_test : pandas.DataFrame
        Held-out interactions passed to ``evaluate_on_test``.
    user_map, game_map : dict
        Id-to-index maps passed to ``evaluate_on_test``.
    num_factors_list, lambda_list, num_iters_list : iterable
        Candidate values; the defaults are the previously hard-coded grids.

    Returns
    -------
    (best_params_rmse, best_params_mae) : tuple of dict
        Settings with the lowest RMSE and the lowest MAE. A dict is empty if
        no combination produced a finite improvement (e.g. an empty grid).
    """
    from itertools import product

    best_rmse = float('inf')
    best_mae = float('inf')
    best_params_rmse = {}
    best_params_mae = {}

    print("🔍 Starting grid search...\n")

    # product() iterates in the same order as the equivalent nested loops.
    for num_factors, lambda_reg, num_iters in product(
            num_factors_list, lambda_list, num_iters_list):
        print(f"▶ Trying: factors={num_factors}, lambda={lambda_reg}, iters={num_iters}")
        U, V = als(R_train, num_factors=num_factors, num_iters=num_iters, lambda_reg=lambda_reg)
        mae, rmse = evaluate_on_test(df_test, U, V, user_map, game_map)

        params = {
            "num_factors": num_factors,
            "lambda_reg": lambda_reg,
            "num_iters": num_iters
        }

        if rmse < best_rmse:
            best_rmse = rmse
            best_params_rmse = dict(params)

        if mae < best_mae:
            best_mae = mae
            best_params_mae = dict(params)

    print("\n✅ Grid Search Results:")
    print(f"🔹 Best RMSE = {best_rmse:.4f} with {best_params_rmse}")
    print(f"🔸 Best MAE  = {best_mae:.4f} with {best_params_mae}")
    return best_params_rmse, best_params_mae

# best_rmse_params, best_mae_params = grid_search_als(R_train, df_test, user_map, game_map)


🎮 Top Recommendations for User 0:
Overlord: 5.80
Half-Life Opposing Force: 5.62
Star Wars Jedi Knight Jedi Academy: 5.21
Resident Evil 5 / Biohazard 5: 5.19
Elite Dangerous: 4.94

📊 Test Evaluation:
MAE  = 1.9808
RMSE = 2.6276


In [18]:
# Games that appear in the test split but never in the training split.
# They still have a column index (game_map is built from the full df), but
# als() skips items with no positive training entry, so their rows of V keep
# the random initial values.
train_games = set(df_train["game"])
test_games = set(df_test["game"])
cold_start_games = test_games - train_games
print(f"{len(cold_start_games)} cold-start games in test set")


225 cold-start games in test set


In [19]:
# Users that appear in the test split but never in the training split.
# Presumably always empty: the split keeps single-row users in train and
# divides every other user's rows between both frames (verify for other test_size values).
train_users = set(df_train["user_id"])
test_users = set(df_test["user_id"])
cold_start_users = test_users - train_users
print(f"{len(cold_start_users)} cold-start users in test set")


0 cold-start users in test set


In [27]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 1. Filter test set to games seen in training
known_games = set(df_train["game"])
filtered_test_df = df_test[df_test["game"].isin(known_games)].copy()

# 2. Map user_id and game to indices used in U and V
filtered_test_df["user_idx"] = filtered_test_df["user_id"].map(user_map)
filtered_test_df["game_idx"] = filtered_test_df["game"].map(game_map)

# 3. Predict using U and V
# Drop rows whose ids failed to map, then cast to integers: a single NaN
# turns the whole index column into floats, which cannot index U or V.
scorable = filtered_test_df.dropna(subset=["user_idx", "game_idx"])
u_idx = scorable["user_idx"].to_numpy(dtype=np.intp)
i_idx = scorable["game_idx"].to_numpy(dtype=np.intp)

# Row-wise dot product between each user's and game's factors.
preds = (U[u_idx] * V[i_idx]).sum(axis=1)
# Targets are on the same log1p scale the model was trained on.
actuals = np.log1p(scorable["playtime"].to_numpy())

# 4. Evaluate
rmse = np.sqrt(mean_squared_error(actuals, preds))
mae = mean_absolute_error(actuals, preds)

print(f"Filtered Test RMSE: {rmse:.4f}")
print(f"Filtered Test MAE : {mae:.4f}")


Filtered Test RMSE: 2.6398
Filtered Test MAE : 1.9910
