In [24]:
# =========================================================
# Cell 1 â€” Imports, paths, MLflow setup
# =========================================================
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.neighbors import NearestNeighbors

import mlflow
import mlflow.sklearn

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("âœ… Imports OK")

# ---------------------------------------------------------
# Robust project root detection
# ---------------------------------------------------------
ROOT = Path().resolve()  # current dir (probably .../models)

if not (ROOT / "data").exists():
    ROOT = ROOT.parent  # go up to project root

DATA_RAW = ROOT / "data" / "raw"
DATA_PROCESSED = ROOT / "data" / "processed"

print("Project root:", ROOT)
print("Raw data folder:", DATA_RAW)
print("Processed data folder:", DATA_PROCESSED)

# ---------------------------------------------------------
# MLflow tracking: use SAME DB as scripts
# ---------------------------------------------------------
mlflow.set_tracking_uri(f"sqlite:///{ROOT / 'mlflow.db'}")
mlflow.set_experiment("MovieLens_Recs_Models")

print("MLflow tracking URI:", mlflow.get_tracking_uri())
print("âœ… MLflow configured")



âœ… Imports OK
Project root: C:\Users\hibaz\PycharmProjects\MLOPSmovierecommendation
Raw data folder: C:\Users\hibaz\PycharmProjects\MLOPSmovierecommendation\data\raw
Processed data folder: C:\Users\hibaz\PycharmProjects\MLOPSmovierecommendation\data\processed
MLflow tracking URI: sqlite:///C:\Users\hibaz\PycharmProjects\MLOPSmovierecommendation\mlflow.db
âœ… MLflow configured


In [21]:
# =========================================================
# Cell 2 â€” Load data & build userâ€“item matrix
# =========================================================
# Ratings: we only need userId, movieId, rating
ratings_raw = pd.read_csv(DATA_RAW / "ratings.csv")
ratings = ratings_raw[["userId", "movieId", "rating"]].copy()

# Movies: use processed if available (better genres); fallback to raw
movies_path_processed = DATA_PROCESSED / "movies_processed.csv"
if movies_path_processed.exists():
    movies = pd.read_csv(movies_path_processed)
else:
    movies = pd.read_csv(DATA_RAW / "movies.csv")

print("Ratings shape:", ratings.shape)
print("Movies shape:", movies.shape)

# ------- Train/Test split on ratings (for RMSE) -------
train_ratings, test_ratings = train_test_split(
    ratings,
    test_size=0.2,
    random_state=RANDOM_STATE
)

print("Train ratings:", train_ratings.shape)
print("Test ratings:", test_ratings.shape)

# ------- Build userâ€“item matrix from TRAIN ONLY -------
user_item_matrix = train_ratings.pivot_table(
    index="userId",
    columns="movieId",
    values="rating",
    fill_value=0.0
)

print("Train matrix shape:", user_item_matrix.shape)
print("âœ… Data & matrix ready")

C:\Users\hibaz\PycharmProjects\MLOPSmovierecommendation\models
False
['Implementationmodels.ipynb', 'mlruns', 'model_experiments.ipynb', 'svd_embeddings.pkl', 'tfidf_vectorizer.pkl']


In [25]:
# =========================================================
# Cell 3 â€” Helper metrics for Top-N recommendation
# =========================================================
def precision_at_k(rec, rel, k=10):
    return len(set(rec[:k]) & set(rel)) / float(k) if k > 0 else 0.0


def recall_at_k(rec, rel, k=10):
    return len(set(rec[:k]) & set(rel)) / float(len(rel)) if rel else 0.0


def hit_rate_at_k(rec, rel, k=10):
    return 1.0 if len(set(rec[:k]) & set(rel)) > 0 else 0.0


def ndcg_at_k(rec, rel, k=10):
    dcg = 0.0
    for i, m in enumerate(rec[:k]):
        if m in rel:
            dcg += 1.0 / np.log2(i + 2)
    ideal_len = min(len(rel), k)
    if ideal_len == 0:
        return 0.0
    idcg = sum(1.0 / np.log2(i + 2) for i in range(ideal_len))
    return dcg / idcg if idcg > 0 else 0.0


def evaluate_recommender(ratings_df, recommend_fn, k=10, max_users=200):
    """
    ratings_df: DataFrame with userId, movieId, rating
    recommend_fn: function(user_id, k) -> list of movieIds
    """
    precision, recall, hit, ndcg = [], [], [], []

    for user in ratings_df["userId"].unique()[:max_users]:
        liked = ratings_df[
            (ratings_df["userId"] == user) &
            (ratings_df["rating"] >= 4.0)
            ]["movieId"].tolist()

        if len(liked) < 2:
            continue

        recs = recommend_fn(user, k=k)
        if not recs:
            continue

        precision.append(precision_at_k(recs, liked, k))
        recall.append(recall_at_k(recs, liked, k))
        hit.append(hit_rate_at_k(recs, liked, k))
        ndcg.append(ndcg_at_k(recs, liked, k))

    if not precision:
        return {
            "precision_at_10": 0.0,
            "recall_at_10": 0.0,
            "hit_rate_at_10": 0.0,
            "ndcg_at_10": 0.0,
        }

    return {
        "precision_at_10": float(np.mean(precision)),
        "recall_at_10": float(np.mean(recall)),
        "hit_rate_at_10": float(np.mean(hit)),
        "ndcg_at_10": float(np.mean(ndcg)),
    }


print("âœ… Metrics helpers ready")

âœ… Metrics helpers ready


In [26]:

# =========================================================
# Cell 4 â€” Baseline model (global popularity)
# =========================================================
def train_baseline(train_ratings_df):
    """
    Baseline: predict using mean rating per movie (popularity).
    Returns a Series indexed by movieId, sorted descending.
    """
    movie_mean = train_ratings_df.groupby("movieId")["rating"].mean()
    movie_mean = movie_mean.sort_values(ascending=False)
    return movie_mean


def rmse_baseline(train_ratings_df, test_ratings_df):
    movie_mean = train_ratings_df.groupby("movieId")["rating"].mean()
    global_mean = train_ratings_df["rating"].mean()

    def predict(row):
        return movie_mean.get(row["movieId"], global_mean)

    preds = test_ratings_df.apply(predict, axis=1)
    mse = mean_squared_error(test_ratings_df["rating"], preds)
    return float(np.sqrt(mse))


# ----- Train + evaluate + log in MLflow -----
with mlflow.start_run(run_name="baseline_popularity"):
    print("Training Popularity Baseline...")

    movie_rankings = train_baseline(train_ratings)
    rmse_b = rmse_baseline(train_ratings, test_ratings)


    def baseline_rec_fn(user_id, k=10):
        return list(movie_rankings.head(k).index)


    baseline_metrics = evaluate_recommender(ratings, baseline_rec_fn, k=10)

    # Log to MLflow
    mlflow.log_param("model_type", "baseline_popularity")
    mlflow.log_metric("rmse", rmse_b)
    for mk, mv in baseline_metrics.items():
        mlflow.log_metric(mk, mv)

    print("Baseline RMSE:", rmse_b)
    print("Baseline metrics:", baseline_metrics)

print("âœ… Baseline model finished")

Training Popularity Baseline...
Baseline RMSE: 0.9827389937822489
Baseline metrics: {'precision_at_10': 0.001, 'recall_at_10': 7.402753496503496e-05, 'hit_rate_at_10': 0.01, 'ndcg_at_10': 0.0011200278075758777}
âœ… Baseline model finished


In [27]:

# =========================================================
# Cell 5 â€” Content-based TF-IDF model
# =========================================================
def build_tfidf_content_model(movies_df, max_features=5000):
    """
    Content-based model: TF-IDF on title + genres.
    Returns (tfidf_vectorizer, tfidf_matrix, cosine_sim, movie_ids)
    """
    df = movies_df.copy()

    # Make sure columns exist
    if "genres" not in df.columns:
        df["genres"] = ""
    if "title" not in df.columns:
        df["title"] = ""

    df["genres"] = df["genres"].fillna("")
    df["title"] = df["title"].fillna("")
    df["combined_text"] = df["title"] + " " + df["genres"]

    tfidf = TfidfVectorizer(stop_words="english", max_features=max_features)
    tfidf_matrix = tfidf.fit_transform(df["combined_text"])

    cosine_sim = cosine_similarity(tfidf_matrix)
    movie_ids = df["movieId"].values

    return tfidf, tfidf_matrix, cosine_sim, movie_ids


with mlflow.start_run(run_name="content_tfidf"):
    print("Training TF-IDF content model...")

    tfidf, tfidf_matrix, cosine_sim, cb_movie_ids = build_tfidf_content_model(
        movies, max_features=5000
    )

    # index mapping: movieId -> row index in cosine_sim
    movieid_to_index = {m: i for i, m in enumerate(cb_movie_ids)}


    def tfidf_rec_fn(user_id, k=10):
        user_ratings = ratings[
            (ratings["userId"] == user_id) &
            (ratings["rating"] >= 4.0)
            ]
        liked_movies = user_ratings["movieId"].tolist()
        if not liked_movies:
            return []

        anchor = liked_movies[0]
        if anchor not in movieid_to_index:
            return []

        anchor_idx = movieid_to_index[anchor]
        sims = cosine_sim[anchor_idx]

        # sort all movies by similarity
        sorted_idx = np.argsort(sims)[::-1]

        # remove movies already rated by user
        rated_movies = ratings[ratings["userId"] == user_id]["movieId"].unique().tolist()
        rated_set = set(rated_movies)

        rec_ids = []
        for idx in sorted_idx:
            movie_id = cb_movie_ids[idx]
            if movie_id in rated_set:
                continue
            rec_ids.append(movie_id)
            if len(rec_ids) >= k:
                break

        return rec_ids


    content_metrics = evaluate_recommender(ratings, tfidf_rec_fn, k=10)

    # Log to MLflow
    mlflow.log_param("model_type", "content_tfidf")
    mlflow.log_param("tfidf_max_features", 5000)
    for mk, mv in content_metrics.items():
        mlflow.log_metric(mk, mv)

    print("Content-based metrics:", content_metrics)

print("âœ… Content-based model finished")

Training TF-IDF content model...
Content-based metrics: {'precision_at_10': 0.0, 'recall_at_10': 0.0, 'hit_rate_at_10': 0.0, 'ndcg_at_10': 0.0}
âœ… Content-based model finished


In [28]:

# =========================================================
# Cell 6 â€” SVD Collaborative Filtering
# =========================================================
def train_svd_cf(user_item_mat, n_components=50):
    """
    SVD on userâ€“item rating matrix (train).
    Returns dict with factors + RMSE on test set.
    """
    # Matrix R (users x movies)
    R = user_item_mat.values.astype(np.float32)
    user_ids = user_item_mat.index.values
    movie_ids = user_item_mat.columns.values

    svd = TruncatedSVD(n_components=n_components, random_state=RANDOM_STATE)
    user_factors = svd.fit_transform(R)
    movie_factors = svd.components_.T  # shape: n_movies x n_components

    model_dict = {
        "user_ids": user_ids,
        "movie_ids": movie_ids,
        "user_factors": user_factors,
        "movie_factors": movie_factors,
        "n_components": n_components,
    }

    return model_dict


def svd_rmse(model_dict, train_ratings_df, test_ratings_df):
    user_ids = model_dict["user_ids"]
    movie_ids = model_dict["movie_ids"]
    user_factors = model_dict["user_factors"]
    movie_factors = model_dict["movie_factors"]

    user_index_map = {u: i for i, u in enumerate(user_ids)}
    movie_index_map = {m: i for i, m in enumerate(movie_ids)}

    preds = []
    actual = []

    for row in test_ratings_df.itertuples():
        u = row.userId
        m = row.movieId
        r = row.rating

        if (u not in user_index_map) or (m not in movie_index_map):
            continue

        u_idx = user_index_map[u]
        m_idx = movie_index_map[m]

        pred = float(np.dot(user_factors[u_idx], movie_factors[m_idx]))
        preds.append(pred)
        actual.append(r)

    if not preds:
        return None

    mse = mean_squared_error(actual, preds)
    return float(np.sqrt(mse))


with mlflow.start_run(run_name="svd_cf"):
    print("Training SVD CF model...")

    n_components = 50
    model_svd = train_svd_cf(user_item_matrix, n_components=n_components)

    # RMSE
    rmse_svd = svd_rmse(model_svd, train_ratings, test_ratings)

    user_ids_svd = model_svd["user_ids"]
    movie_ids_svd = model_svd["movie_ids"]
    user_factors_svd = model_svd["user_factors"]
    movie_factors_svd = model_svd["movie_factors"]

    user_index_map_svd = {u: i for i, u in enumerate(user_ids_svd)}
    movie_index_map_svd = {m: i for i, m in enumerate(movie_ids_svd)}


    def svd_rec_fn(user_id, k=10):
        if user_id not in user_index_map_svd:
            return []

        u_idx = user_index_map_svd[user_id]
        scores = np.dot(user_factors_svd[u_idx], movie_factors_svd.T)

        scores_series = pd.Series(scores, index=movie_ids_svd)

        if user_id in user_item_matrix.index:
            already_rated = user_item_matrix.loc[user_id]
            scores_series = scores_series[already_rated == 0]

        top_movies = scores_series.sort_values(ascending=False).head(k).index
        return list(top_movies)


    svd_metrics = evaluate_recommender(ratings, svd_rec_fn, k=10)

    # Log to MLflow
    mlflow.log_param("model_type", "svd_cf")
    mlflow.log_param("n_components", n_components)
    if rmse_svd is not None:
        mlflow.log_metric("rmse", rmse_svd)
    for mk, mv in svd_metrics.items():
        mlflow.log_metric(mk, mv)

    print("SVD RMSE:", rmse_svd)
    print("SVD metrics:", svd_metrics)

print("âœ… SVD CF model finished")

Training SVD CF model...
SVD RMSE: 3.1668501056771423
SVD metrics: {'precision_at_10': 0.20200000000000004, 'recall_at_10': 0.043289030557360365, 'hit_rate_at_10': 0.755, 'ndcg_at_10': 0.23131219349474694}
âœ… SVD CF model finished


In [29]:

# =========================================================
# Cell 7 â€” NMF Collaborative Filtering (FIXED)
# =========================================================

from sklearn.decomposition import NMF

def train_nmf_cf(user_item_mat, n_components=40, max_iter=200):
    """
    NMF-based CF: factorization of userâ€“item matrix with non-negative factors.
    """
    R = user_item_mat.values.astype(np.float32)
    user_ids = user_item_mat.index.values
    movie_ids = user_item_mat.columns.values

    nmf = NMF(
        n_components=n_components,
        init="random",
        max_iter=max_iter,
        random_state=RANDOM_STATE,
    )
    user_factors = nmf.fit_transform(R)
    movie_factors = nmf.components_.T  # n_movies x n_components

    model_dict = {
        "user_ids": user_ids,
        "movie_ids": movie_ids,
        "user_factors": user_factors,
        "movie_factors": movie_factors,
        "n_components": n_components,
    }

    return model_dict


def nmf_rmse(model_dict, train_ratings_df, test_ratings_df):
    user_ids = model_dict["user_ids"]
    movie_ids = model_dict["movie_ids"]
    user_factors = model_dict["user_factors"]
    movie_factors = model_dict["movie_factors"]

    user_index_map = {u: i for i, u in enumerate(user_ids)}
    movie_index_map = {m: i for i, m in enumerate(movie_ids)}

    preds = []
    actual = []

    for row in test_ratings_df.itertuples():
        u = row.userId
        m = row.movieId
        r = row.rating

        if (u not in user_index_map) or (m not in movie_index_map):
            continue

        u_idx = user_index_map[u]
        m_idx = movie_index_map[m]

        pred = float(np.dot(user_factors[u_idx], movie_factors[m_idx]))
        preds.append(pred)
        actual.append(r)

    if not preds:
        return None

    mse = mean_squared_error(actual, preds)
    return float(np.sqrt(mse))


# ---------- RUN + LOG TO MLFLOW ----------
with mlflow.start_run(run_name="nmf_cf"):
    print("Training NMF CF model...")

    n_components_nmf = 40
    model_nmf = train_nmf_cf(user_item_matrix, n_components=n_components_nmf)

    rmse_nmf = nmf_rmse(model_nmf, train_ratings, test_ratings)

    user_ids_nmf = model_nmf["user_ids"]
    movie_ids_nmf = model_nmf["movie_ids"]
    user_factors_nmf = model_nmf["user_factors"]
    movie_factors_nmf = model_nmf["movie_factors"]

    user_index_map_nmf = {u: i for i, u in enumerate(user_ids_nmf)}

    def nmf_rec_fn(user_id, k=10):
        if user_id not in user_index_map_nmf:
            return []

        u_idx = user_index_map_nmf[user_id]
        scores = np.dot(user_factors_nmf[u_idx], movie_factors_nmf.T)
        scores_series = pd.Series(scores, index=movie_ids_nmf)

        if user_id in user_item_matrix.index:
            already_rated = user_item_matrix.loc[user_id]
            scores_series = scores_series[already_rated == 0]

        top_movies = scores_series.sort_values(ascending=False).head(k).index
        return list(top_movies)

    nmf_metrics = evaluate_recommender(ratings, nmf_rec_fn, k=10)

    mlflow.log_param("model_type", "nmf_cf")
    mlflow.log_param("n_components", n_components_nmf)

    if rmse_nmf is not None:
        mlflow.log_metric("rmse", rmse_nmf)

    for mk, mv in nmf_metrics.items():
        mlflow.log_metric(mk, mv)

    print("NMF RMSE:", rmse_nmf)
    print("NMF metrics:", nmf_metrics)

print("âœ… NMF CF model finished")


Training NMF CF model...
NMF RMSE: 3.1282046075529926
NMF metrics: {'precision_at_10': 0.1865, 'recall_at_10': 0.04039343523428452, 'hit_rate_at_10': 0.715, 'ndcg_at_10': 0.21350527968302185}
âœ… NMF CF model finished


In [30]:
# =========================================================
# Cell 8 â€” Item-based KNN Collaborative Filtering
# =========================================================
def train_item_knn_cf(user_item_mat, n_neighbors=40):
    """
    Item-based CF using cosine similarity between movie rating vectors.
    """
    # Transpose: movies x users
    item_matrix = user_item_mat.T  # shape: n_movies x n_users
    movie_ids = item_matrix.index.values

    knn_model = NearestNeighbors(
        n_neighbors=n_neighbors,
        metric="cosine",
        algorithm="brute"
    )
    knn_model.fit(item_matrix.values)

    return knn_model, movie_ids


with mlflow.start_run(run_name="item_knn_cf"):
    print("Training item-based KNN CF model...")

    n_neighbors_knn = 40
    knn_model, knn_movie_ids = train_item_knn_cf(user_item_matrix, n_neighbors=n_neighbors_knn)

    movie_index_map_knn = {m: i for i, m in enumerate(knn_movie_ids)}


    def item_knn_rec_fn(user_id, k=10):
        # movies user already rated (from TRAIN)
        if user_id not in user_item_matrix.index:
            return []
        user_ratings_vec = user_item_matrix.loc[user_id]
        rated_movies = user_ratings_vec[user_ratings_vec > 0].index.tolist()
        if not rated_movies:
            return []

        scores_accum = {}

        for m in rated_movies:
            if m not in movie_index_map_knn:
                continue
            m_idx = movie_index_map_knn[m]
            movie_vec = user_item_matrix.T.values[m_idx].reshape(1, -1)
            distances, indices = knn_model.kneighbors(movie_vec, n_neighbors=n_neighbors_knn)

            for dist, idx in zip(distances.flatten(), indices.flatten()):
                neighbor_movie = knn_movie_ids[idx]
                if neighbor_movie in rated_movies:
                    continue
                sim = 1.0 - dist  # cosine distance â†’ similarity
                scores_accum[neighbor_movie] = scores_accum.get(neighbor_movie, 0.0) + sim

        if not scores_accum:
            return []

        scores_series = pd.Series(scores_accum)
        top_movies = scores_series.sort_values(ascending=False).head(k).index
        return list(top_movies)


    knn_cf_metrics = evaluate_recommender(ratings, item_knn_rec_fn, k=10)

    mlflow.log_param("model_type", "item_knn_cf")
    mlflow.log_param("n_neighbors", n_neighbors_knn)
    for mk, mv in knn_cf_metrics.items():
        mlflow.log_metric(mk, mv)

    print("Item KNN CF metrics:", knn_cf_metrics)

print("âœ… Item-based KNN CF finished")

Training item-based KNN CF model...
Item KNN CF metrics: {'precision_at_10': 0.163, 'recall_at_10': 0.03579994611193533, 'hit_rate_at_10': 0.67, 'ndcg_at_10': 0.18358496430193874}
âœ… Item-based KNN CF finished


In [33]:
# =========================================================
# Cell 9 â€” Simple hybrid: SVD + TF-IDF fusion
# =========================================================
with mlflow.start_run(run_name="hybrid_svd_tfidf"):
    print("Training Hybrid model (SVD + TF-IDF)...")


    # We reuse SVD and TF-IDF artifacts from above cells:
    #   - svd_rec_fn: rating-based recs
    #   - tfidf_rec_fn: similarity-based recs
    # For a hybrid, we combine scores.

    # Precompute for efficiency:
    #  - We'll build score dictionaries for each user on the fly.
    #    To keep it simple, we combine rankings (not continuous scores).

    def hybrid_rec_fn(user_id, k=10):
        rec_svd = svd_rec_fn(user_id, k=50) or []
        rec_cb = tfidf_rec_fn(user_id, k=50) or []

        scores = {}

        # Higher weight for SVD (CF is usually stronger)
        for rank, m in enumerate(rec_svd):
            scores[m] = scores.get(m, 0.0) + (50 - rank)

        for rank, m in enumerate(rec_cb):
            scores[m] = scores.get(m, 0.0) + 0.5 * (50 - rank)

        if not scores:
            return []

        scores_series = pd.Series(scores)
        top_movies = scores_series.sort_values(ascending=False).head(k).index
        return list(top_movies)


    hybrid_metrics = evaluate_recommender(ratings, hybrid_rec_fn, k=10)

    mlflow.log_param("model_type", "hybrid_svd_tfidf")
    for mk, mv in hybrid_metrics.items():
        mlflow.log_metric(mk, mv)

    print("Hybrid metrics:", hybrid_metrics)

print("âœ… Hybrid model finished")
print("ðŸŽ‰ All notebook experiments completed. Open MLflow UI from project root with:  mlflow ui")

Training Hybrid model (SVD + TF-IDF)...
Hybrid metrics: {'precision_at_10': 0.196, 'recall_at_10': 0.04146593289036713, 'hit_rate_at_10': 0.755, 'ndcg_at_10': 0.21651720340362204}
âœ… Hybrid model finished
ðŸŽ‰ All notebook experiments completed. Open MLflow UI from project root with:  mlflow ui
