# Content-Based Recommender (TF‑IDF) — No Train/Test Split
This notebook builds a pure **content-based** recommender using only the course catalog (`data/processed/courses_with_id.csv`).
It includes a few **sample tests** (interest prompts → recommended courses) and prints short, human-readable explanations based on overlapping keywords.

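Why no train/test split here? Offline ranking metrics such as Precision@k / NDCG@k require held-out user interactions, which the course catalog alone does not provide. The main body of this notebook therefore focuses on a working baseline plus sanity checks; the final cell adds a leave-one-out evaluation that additionally requires `data/processed/user_interactions.csv`.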

In [1]:
# 1) Imports + Load catalog
from __future__ import annotations

from pathlib import Path
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The catalog is looked up under the *parent* of the current working directory.
ROOT = Path.cwd()
CATALOG_PATH = ROOT.parent.joinpath("data", "processed", "courses_with_id.csv")
assert CATALOG_PATH.exists(), f"Missing: {CATALOG_PATH}"

# Read the catalog and print a quick structural summary before previewing rows.
df = pd.read_csv(CATALOG_PATH)
print("Catalog shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head(3)

Catalog shape: (3408, 10)
Columns: ['course_id', 'Course Name', 'University', 'Difficulty Level', 'Course Rating', 'Course URL', 'Course Description', 'skills_cleaned', 'difficulty_numeric', 'recommended_grade']


Unnamed: 0,course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,skills_cleaned,difficulty_numeric,recommended_grade
0,0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,peering film dialogue creative writing wri...,1,70
1,1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",finance business plan persona (user experien...,1,70
2,2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics solar energy film lambda...,3,85


In [2]:
# 2) Build a text field for each course
# Catalog columns whose text is concatenated into each course's document.
TEXT_COLS_PREFERRED = ["Course Name", "University", "Course Description", "skills_cleaned"]

# Report (but tolerate) any expected column the loaded catalog does not have.
missing = [col for col in TEXT_COLS_PREFERRED if col not in df.columns]
if len(missing) > 0:
    print("Warning: missing expected columns:", missing)

def clean_text(s: str) -> str:
    """Normalize a raw value into lowercase, single-spaced text.

    Args:
        s: Any value; non-strings are converted with ``str``.

    Returns:
        The cleaned text. ``None`` and float NaN are treated as missing and
        yield ``""`` (previously NaN produced the literal token ``"nan"``).
    """
    # Same missing-value test that canonical_difficulty uses.
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""
    s = str(s)
    s = s.replace("\uFFFD", " ")  # Unicode replacement char left by bad decoding
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s

def canonical_difficulty(x: str) -> str | None:
    """Map difficulty strings to a stable token (beginner/intermediate/advanced)."""
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return None
    s = clean_text(x)
    if not s:
        return None
    if "beginner" in s:
        return "beginner"
    if "intermediate" in s:
        return "intermediate"
    if "advanced" in s:
        return "advanced"
    # fall back to cleaned raw string if it's something unexpected
    return s

def build_course_text(row) -> str:
    """Build the text document for one course.

    Concatenates the cleaned values of ``TEXT_COLS_PREFERRED`` that are
    present and non-missing in ``row``, then appends a ``difficulty_<level>``
    token when the row has a usable "Difficulty Level".

    Args:
        row: A mapping-like record (e.g. a DataFrame row) keyed by column name.

    Returns:
        The parts joined with ``" \\n"``; an empty string if nothing usable.
    """
    parts = [
        clean_text(row[c])
        for c in TEXT_COLS_PREFERRED
        if c in row and pd.notna(row[c])
    ]

    # Only the row itself is inspected: the function no longer depends on the
    # notebook-global `df`, so it can be applied to any record.
    if "Difficulty Level" in row and pd.notna(row["Difficulty Level"]):
        d = canonical_difficulty(row["Difficulty Level"])
        if d:
            parts.append(f"difficulty_{d}")

    return " \n".join([p for p in parts if p])

# One document per course, stored alongside the catalog columns.
df["_text"] = df.apply(build_course_text, axis=1)

# Share of courses that ended up with no text at all (should be ~0).
empty_rate = df["_text"].str.len().eq(0).mean()
print("Empty _text rate:", round(float(empty_rate), 4))
df[["course_id", "Course Name", "skills_cleaned"]].head(3)

Empty _text rate: 0.0


Unnamed: 0,course_id,Course Name,skills_cleaned
0,0,Write A Feature Length Screenplay For Film Or ...,peering film dialogue creative writing wri...
1,1,Business Strategy: Business Model Canvas Analy...,finance business plan persona (user experien...
2,2,Silicon Thin Film Solar Cells,chemistry physics solar energy film lambda...


In [3]:
# 3) Fit TF-IDF over course texts
# Unigrams + bigrams with English stop words removed; min_df / max_df prune
# very rare and very common terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", min_df=2, max_df=0.9)

# Rows of X line up with the rows of df, in order.
X = vectorizer.fit_transform(df["_text"])
print("TF-IDF matrix:", X.shape)
print("Vocab size:", len(vectorizer.vocabulary_))

TF-IDF matrix: (3408, 53323)
Vocab size: 53323


In [4]:
# 4) Recommender helpers (by interest text, or by similar course)
# Column index -> term lookup used when explaining a match.
FEATURE_NAMES = np.asarray(vectorizer.get_feature_names_out())

def top_overlap_terms(query_vec, item_vec, top_n=8):
    """List the terms shared by a query and an item, strongest first.

    The per-term contribution is the product of the two TF-IDF weights, so
    only terms present in both vectors can appear. Returns at most ``top_n``
    term strings, or ``[]`` when the vectors share no term.
    """
    shared = query_vec.multiply(item_vec)  # sparse element-wise product
    if shared.nnz == 0:
        return []
    shared = shared.tocoo()
    strongest = np.argsort(shared.data)[::-1][:top_n]
    return FEATURE_NAMES[shared.col[strongest]].tolist()

def _minmax_0_1(scores: np.ndarray) -> np.ndarray:
    """Rescale scores linearly so the minimum maps to 0 and the maximum to 1.

    An empty input is returned as an empty float array; when all values are
    equal (no spread) every entry becomes 1.
    """
    arr = np.asarray(scores, dtype=float)
    if arr.size == 0:
        return arr
    lo, hi = float(arr.min()), float(arr.max())
    return np.ones_like(arr) if hi <= lo else (arr - lo) / (hi - lo)

def recommend_by_interest_text(
    query: str,
    top_k=10,
    *,
    filter_zero: bool = True,
    normalize_scores: bool = False,
    explain_top_n_terms: int = 8,
):
    """Recommend courses for a free-text interest query.

    Args:
        query: Free-text description of interests.
        top_k: Maximum number of courses to return (negative values are
            treated as 0).
        filter_zero: If True, only courses with non-zero similarity are
            returned (sparse top-k); if False, a dense ranking that may
            include zero-score courses is used.
        normalize_scores: If True, min-max normalize returned scores to [0, 1].
        explain_top_n_terms: Maximum number of overlapping terms listed in
            ``why_keywords``.

    Returns:
        DataFrame with the catalog metadata columns plus ``score`` and
        ``why_keywords``, best match first. Empty (same columns) when
        ``filter_zero`` is True and nothing overlaps the query.
    """
    meta_cols = ["course_id", "Course Name", "University", "Difficulty Level", "Course Rating", "Course URL", "skills_cleaned"]
    top_k = max(int(top_k), 0)

    query = clean_text(query)
    qv = vectorizer.transform([query])

    if filter_zero:
        # Sparse dot product stays sparse; only overlapping courses become non-zero.
        sims_sparse = (qv @ X.T).tocsr()
        if sims_sparse.nnz == 0:
            return pd.DataFrame(columns=meta_cols + ["score", "why_keywords"])
        idxs = sims_sparse.indices
        vals = sims_sparse.data
        order = np.argsort(vals)[::-1][:top_k]
        top_idx = idxs[order]
        top_scores = vals[order]
    else:
        # Dense fallback (includes zeros); ok for small catalogs / debugging.
        sims = cosine_similarity(qv, X).ravel()
        top_idx = np.argsort(sims)[::-1][:top_k]
        top_scores = sims[top_idx]

    # top_idx holds ROW POSITIONS of X, so select catalog rows positionally.
    # Label-based .loc only worked while df kept its default RangeIndex.
    out = df.iloc[top_idx][meta_cols].copy()
    out["score"] = top_scores
    if normalize_scores:
        out["score"] = _minmax_0_1(out["score"].to_numpy())

    # Explanations: terms shared by the query and each recommended course.
    out["why_keywords"] = [
        ", ".join(top_overlap_terms(qv, X[int(i)], top_n=explain_top_n_terms))
        for i in top_idx
    ]
    return out.reset_index(drop=True)

def recommend_similar_to_course(
    course_id: int,
    top_k=10,
    *,
    filter_zero: bool = True,
    normalize_scores: bool = False,
):
    """Recommend the courses most similar to an existing catalog course.

    Args:
        course_id: Value of the ``course_id`` column identifying the seed.
        top_k: Maximum number of neighbors to return (negative -> 0).
        filter_zero: If True, only courses with non-zero similarity are
            considered; if False, a dense ranking is used.
        normalize_scores: If True, min-max normalize returned scores to [0, 1].

    Returns:
        DataFrame with the catalog metadata columns plus ``score``, most
        similar first and never containing the seed course itself.

    Raises:
        ValueError: If the catalog has no ``course_id`` column or the id is
            not present.
    """
    meta_cols = ["course_id", "Course Name", "University", "Difficulty Level", "Course Rating", "Course URL", "skills_cleaned"]
    if "course_id" not in df.columns:
        raise ValueError("course_id column not found in catalog")

    # Row POSITION of the seed course. X is addressed by position, so an index
    # label must not be used here (they differ once df is filtered/re-indexed).
    positions = np.flatnonzero((df["course_id"] == course_id).to_numpy())
    if positions.size == 0:
        raise ValueError(f"course_id {course_id} not found")
    i0 = int(positions[0])
    top_k = max(int(top_k), 0)

    if filter_zero:
        sims_sparse = (X[i0] @ X.T).tocsr()
        idxs = sims_sparse.indices
        vals = sims_sparse.data
        # exclude itself (if present among nonzeros)
        keep = idxs != i0
        idxs = idxs[keep]
        vals = vals[keep]
        if vals.size == 0:
            return pd.DataFrame(columns=meta_cols + ["score"])
        order = np.argsort(vals)[::-1][:top_k]
        top_idx = idxs[order]
        top_scores = vals[order]
    else:
        sims = cosine_similarity(X[i0], X).ravel()
        sims[i0] = -1  # push the seed course to the bottom of the ranking
        top_idx = np.argsort(sims)[::-1][:top_k]
        top_scores = sims[top_idx]

    # Positional selection keeps catalog rows aligned with the rows of X.
    out = df.iloc[top_idx][meta_cols].copy()
    out["score"] = top_scores
    if normalize_scores:
        out["score"] = _minmax_0_1(out["score"].to_numpy())
    return out.reset_index(drop=True)

## Optional: Precompute top-K similar courses (faster repeated queries)
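If you call `recommend_similar_to_course` a lot, you can precompute each course's nearest neighbors once using `NearestNeighbors(metric="cosine")`; later queries are then O(k) lookups into the precomputed arrays (limited to the `TOPK_NEIGHBORS` neighbors stored per course).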

In [5]:
from sklearn.neighbors import NearestNeighbors

# Number of neighbors stored per course; one extra is requested below because
# each course is normally returned as its own nearest neighbor.
TOPK_NEIGHBORS = 50
nn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=TOPK_NEIGHBORS + 1).fit(X)

# Query the fitted index with the same matrix. Cosine distance is
# 1 - cosine similarity, so convert back to similarities (larger = closer).
distances, indices = nn.kneighbors(X, return_distance=True)
similarities = 1.0 - distances

def fast_similar_to_course(course_id: int, top_k=10):
    """Look up precomputed nearest neighbors of a catalog course.

    Args:
        course_id: Value of the ``course_id`` column identifying the seed.
        top_k: Maximum number of neighbors to return; capped by the number of
            precomputed neighbors available for the course.

    Returns:
        DataFrame with the catalog metadata columns plus ``score`` (cosine
        similarity), most similar first, excluding the seed course.

    Raises:
        ValueError: If ``course_id`` is not in the catalog.
    """
    # Row POSITION of the seed; `indices` / `similarities` are positional.
    positions = np.flatnonzero((df["course_id"] == course_id).to_numpy())
    if positions.size == 0:
        raise ValueError(f"course_id {course_id} not found")
    i0 = int(positions[0])

    neigh_idx = np.asarray(indices[i0])
    neigh_sim = np.asarray(similarities[i0])

    # Drop the seed by identity rather than by position: when another course
    # has an identical vector (distance tie), the seed is not guaranteed to be
    # the first neighbor, and slicing off entry 0 would drop the twin instead.
    keep = neigh_idx != i0
    neigh_idx = neigh_idx[keep]
    neigh_sim = neigh_sim[keep]

    top_k = max(0, min(int(top_k), len(neigh_idx)))
    top_idx = neigh_idx[:top_k]
    top_scores = neigh_sim[:top_k]

    out = df.iloc[top_idx][["course_id", "Course Name", "University", "Difficulty Level", "Course Rating", "Course URL", "skills_cleaned"]].copy()
    out["score"] = top_scores
    return out.reset_index(drop=True)

# Example usage:
# display(fast_similar_to_course(int(df.loc[0, "course_id"]), top_k=8))

## Sample tests (no split)
These are simple sanity checks: provide an interest prompt and inspect the top recommendations + `why_keywords` explanations.


In [6]:
# Free-text interest prompts used as qualitative sanity checks; each one is a
# bag of keywords for a different subject area.
tests = [
    "biology ecology environment design drawing architecture landscape",
    "data analysis python statistics machine learning visualization",
    "finance accounting budgeting business strategy management",
    "creative writing storytelling film screenwriting dialogue",
    "health nutrition fitness mental wellbeing psychology",
    "islamic studies quran hadith fiqh theology",
    "python development programming coding software engineering",
]

# For every prompt: print a separator and the query, then show the top-5
# recommendations (default settings, i.e. zero-similarity courses excluded).
for q in tests:
    print("\n" + "="*90)
    print("QUERY:", q)
    recs = recommend_by_interest_text(q, top_k=5)
    display(recs)


QUERY: biology ecology environment design drawing architecture landscape


Unnamed: 0,course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,skills_cleaned,score,why_keywords
0,2382,Ecology: from cells to Gaia,National Research Tomsk State University,Intermediate,4.3,https://www.coursera.org/learn/ecology,ecosystems ecosystem sustainability ecology...,0.242658,"ecology, biology, environment"
1,93,Making Architecture,IE Business School,Intermediate,4.8,https://www.coursera.org/learn/making-architec...,modeling architecture thought analysis,0.168555,"architecture, design, environment"
2,2702,Systems Biology and Biotechnology Capstone,Icahn School of Medicine at Mount Sinai,Advanced,4.2,https://www.coursera.org/learn/systems-biology...,computational biology systems design biology,0.153731,"biology, design"
3,1678,Ecology: Ecosystem Dynamics and Conservation,American Museum of Natural History,Beginner,4.8,https://www.coursera.org/learn/ecology-conserv...,ecology ecosystems thought biology biodive...,0.152531,"ecology, biology"
4,1239,"Biological Diversity (Theories, Measures and D...",National Research Tomsk State University,Intermediate,4.2,https://www.coursera.org/learn/biological-dive...,ecology biodiversity evolution climate chan...,0.15042,"ecology, biology ecology, biology"



QUERY: data analysis python statistics machine learning visualization


Unnamed: 0,course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,skills_cleaned,score,why_keywords
0,1614,Exploratory Data Analysis,Coursera Project Network,Beginner,4.3,https://www.coursera.org/learn/exploratory-dat...,general statistics exploratory data analysis ...,0.340398,"data analysis, analysis, data, visualization, ..."
1,1995,Data Analysis with Python,IBM,Conversant,4.6,https://www.coursera.org/learn/data-analysis-w...,data model regression python programming re...,0.28483,"data analysis, analysis python, data, analysis..."
2,2473,Statistical Data Visualization in Python,Coursera Project Network,Advanced,3.7,https://www.coursera.org/learn/statistical-vis...,project mine chart project computer graphic...,0.278912,"visualization, statistics machine, python, mac..."
3,2280,Exploratory Data Analysis With Python and Pandas,Coursera Project Network,Beginner,4.7,https://www.coursera.org/learn/exploratory-dat...,data analysis analysis exploratory data anal...,0.26214,"data analysis, analysis, python, analysis pyth..."
4,2464,Introduction to Applied Machine Learning,Alberta Machine Intelligence Institute,Intermediate,4.7,https://www.coursera.org/learn/machine-learnin...,algorithms test set machine learning ordere...,0.259973,"machine, machine learning, learning, data, dat..."



QUERY: finance accounting budgeting business strategy management


Unnamed: 0,course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,skills_cleaned,score,why_keywords
0,630,Finance for Non-Financial Professionals,"University of California, Irvine",Conversant,4.5,https://www.coursera.org/learn/finance-for-non...,analysis cost finance financial statement ...,0.285476,"finance, budgeting, accounting, finance accoun..."
1,2984,Strategy Formulation,Copenhagen Business School,Beginner,4.5,https://www.coursera.org/learn/strategy-formul...,leadership and management business strategy ...,0.263314,"business strategy, strategy, business, management"
2,1819,Advanced Business Strategy,University of Virginia,Beginner,4.7,https://www.coursera.org/learn/uva-darden-Adva...,strategy business strategy strategic managem...,0.252424,"strategy, business strategy, business, management"
3,829,Financial Reporting Capstone,University of Illinois at Urbana-Champaign,Advanced,4.5,https://www.coursera.org/learn/financial-repor...,finance accounting financial statement,0.210628,"accounting, finance accounting, finance"
4,1120,Core Concepts of Accounting � Numbers and People,Moscow Institute of Physics and Technology,Advanced,4.8,https://www.coursera.org/learn/core-concepts-o...,inventory cost financial accounting finance...,0.208611,"accounting, finance, business, management"



QUERY: creative writing storytelling film screenwriting dialogue


Unnamed: 0,course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,skills_cleaned,score,why_keywords
0,2355,Writing Stories About Ourselves,Wesleyan University,Advanced,3.7,https://www.coursera.org/learn/writing-about-o...,film writing essay writing storytelling cr...,0.425466,"dialogue, storytelling, film, creative, writin..."
1,1411,Creative Writing: The Craft of Setting and Des...,Wesleyan University,Advanced,4.7,https://www.coursera.org/learn/craft-of-settin...,copywriting storytelling human learning fil...,0.314068,"creative writing, creative, writing, storytell..."
2,2911,Writing a Personal Essay,Wesleyan University,Advanced,4.4,https://www.coursera.org/learn/personal-essay,chemistry film essay writing creative writi...,0.30305,"creative writing, writing, writing storytellin..."
3,330,Memoir and Personal Essay: Managing Your Relat...,Wesleyan University,Advanced,4.2,https://www.coursera.org/learn/memoir-reader-r...,relative change and difference film essay wr...,0.287111,"writing, writing storytelling, creative writin..."
4,523,Writing in First Person Point of View,Wesleyan University,Advanced,4.6,https://www.coursera.org/learn/first-person-pov,poetry writing film storytelling writing c...,0.246754,"writing, storytelling, creative writing, film,..."



QUERY: health nutrition fitness mental wellbeing psychology


Unnamed: 0,course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,skills_cleaned,score,why_keywords
0,409,Positive Psychiatry and Mental Health,The University of Sydney,Intermediate,4.7,https://www.coursera.org/learn/positive-psychi...,leadership and management mindfulness stress...,0.308314,"mental, health, wellbeing, psychology"
1,689,The Social Context of Mental Health and Illness,University of Toronto,Beginner,4.5,https://www.coursera.org/learn/mental-health,mental health social psychology anthropology,0.283688,"mental, health, psychology"
2,422,Biohacking Your Brain's Health,Emory University,Beginner,4.7,https://www.coursera.org/learn/biohacking-your...,neuroscience nutrition exercise mental heal...,0.237779,"health, nutrition, fitness, mental"
3,2710,Hacking Exercise For Health. The surprising ne...,McMaster University,Intermediate,4.7,https://www.coursera.org/learn/hacking-exercis...,protein training mental health exercise,0.2316,"fitness, health, mental"
4,2161,Major Depression in the Population: A Public H...,Johns Hopkins University,Intermediate,4.7,https://www.coursera.org/learn/public-health-d...,public health mental health clinical study d...,0.20307,"mental, health"



QUERY: islamic studies quran hadith fiqh theology


Unnamed: 0,course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,skills_cleaned,score,why_keywords
0,3210,Constitutional Struggles in the Muslim World,University of Copenhagen,Beginner,4.8,https://www.coursera.org/learn/muslim-world,law international law religion reason econ...,0.15671,islamic
1,2962,Toledo: Deciphering Secrets of Medieval Spain,University of Colorado System,Advanced,4.2,https://www.coursera.org/learn/toledo-decipher...,culture history poetry writing r&d manageme...,0.137719,islamic
2,3330,Re-imaging God in Korean Context,Yonsei University,Intermediate,4.7,https://www.coursera.org/learn/god-korean-context,religion interpretation korean language,0.111337,theology
3,462,Understanding Medical Research: Your Facebook ...,Yale University,Beginner,4.9,https://www.coursera.org/learn/medical-research,experiment relative change and difference ge...,0.065896,studies
4,2777,Intellectual Humility: Practice,The University of Edinburgh,Beginner,4.6,https://www.coursera.org/learn/intellectual-hu...,intellectual educational psychology evaluati...,0.054278,"theology, islamic"



QUERY: python development programming coding software engineering


Unnamed: 0,course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,skills_cleaned,score,why_keywords
0,61,Python Tricks and Hacks for Productivity,Coursera Project Network,Advanced,4.1,https://www.coursera.org/learn/python-hacks,computer programming python programming soft...,0.215293,"python, coding, software engineering, software..."
1,2263,Python Data Analysis,Rice University,Advanced,4.6,https://www.coursera.org/learn/python-analysis,data visualization jpeg python programming ...,0.193238,"python, python development, programming, devel..."
2,722,Python Data Representations,Rice University,Advanced,4.7,https://www.coursera.org/learn/python-represen...,python programming process iteration comput...,0.189139,"python, python development, programming, devel..."
3,1137,Software Development Processes and Methodologies,University of Minnesota,Advanced,4.4,https://www.coursera.org/learn/software-processes,modeling software scrum (software developmen...,0.172159,"software, software engineering, development, e..."
4,3386,Requirements Gathering for Secure Software Dev...,University of Colorado System,Beginner,4.6,https://www.coursera.org/learn/requirements-ga...,software agile software development software...,0.166632,"software, development, software engineering, e..."


In [7]:
# Similar-items test: pick a random course and find close neighbors.
# Fixed seed so the same course is drawn on every run.
rng = np.random.default_rng(42)
i = int(rng.integers(0, len(df)))
# `i` is a row position in [0, len(df)), so select positionally; label-based
# .loc only matched while df kept its default RangeIndex.
seed = df.iloc[i]
print("Seed course:")
display(seed[["course_id","Course Name","University","Difficulty Level","Course Rating","skills_cleaned"]])

print("\nMost similar courses:")
display(recommend_similar_to_course(int(seed["course_id"]), top_k=8))

Seed course:


course_id                                                         304
Course Name                                    Astro 101: Black Holes
University                                      University of Alberta
Difficulty Level                                             Beginner
Course Rating                                                     4.8
skills_cleaned      path (variable)  angular  energy  materials  p...
Name: 304, dtype: object


Most similar courses:


Unnamed: 0,course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,skills_cleaned,score
0,1797,The Evolving Universe,Caltech,Intermediate,4.6,https://www.coursera.org/learn/evolvinguniverse,astronomy evolution process theory of relat...,0.256373
1,2528,Introduction into General Theory of Relativity,National Research University Higher School of ...,Advanced,4.6,https://www.coursera.org/learn/general-relativity,theory of relativity astronomy tailored acce...,0.212041
2,200,Data-driven Astronomy,The University of Sydney,Advanced,4.8,https://www.coursera.org/learn/data-driven-ast...,computer programming python programming sql ...,0.175112
3,753,The Sun and the Total Eclipse of August 2017,University of Colorado Boulder,Beginner,4.6,https://www.coursera.org/learn/eclipse,planning solar systems angular process rec...,0.15714
4,1809,Analyzing the Universe,Rutgers the State University of New Jersey,Beginner,4.6,https://www.coursera.org/learn/analyze,astronomy sources mechanical engineering qu...,0.155273
5,445,AstroTech: The Science and Technology behind A...,The University of Edinburgh,Beginner,4.7,https://www.coursera.org/learn/astronomy-techn...,solar systems energy astronomy chemistry p...,0.151155
6,2469,Physical Basics of Quantum Computing,Saint Petersburg State University,Intermediate,4.2,https://www.coursera.org/learn/physical-basis-...,quantum mechanics physics,0.113803
7,981,Image Processing with Python,Coursera Project Network,Beginner,4.2,https://www.coursera.org/learn/image-processin...,opencv image processing project python prog...,0.102313


## What to look for in the sample tests
- Does `why_keywords` match your query intent?
- Are recommendations diverse enough, or too repetitive?
- If everything looks "samey", increase `ngram_range`, include more columns, or lower `min_df`.

## Export model artifacts
This saves the fitted TF‑IDF vectorizer and the course TF‑IDF matrix so you can reuse them in the API without refitting.
It also saves a minimal course table needed to map indices back to course metadata.

In [8]:
from pathlib import Path
import json
import joblib

from datetime import datetime, timezone

import sklearn

# Artifacts are written under <parent of working dir>/models/content_based.
EXPORT_DIR = Path.cwd().parent / "models" / "content_based"
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

artifacts_path = EXPORT_DIR / "tfidf_artifacts.joblib"
meta_path = EXPORT_DIR / "tfidf_metadata.json"

# Keep only what we need to serve recommendations.
# Row order must stay aligned with the rows of X.
catalog_min = df[["course_id", "Course Name", "University", "Difficulty Level", "Course Rating", "Course URL", "skills_cleaned"]].copy()

# Single bundle: fitted vectorizer, course TF-IDF matrix, and the catalog rows.
joblib.dump(
    {
        "vectorizer": vectorizer,
        "X": X,
        "catalog": catalog_min,
    },
    artifacts_path,
    compress=3,
)

# Human-readable sidecar describing how and from what the bundle was built.
meta = {
    # Timezone-aware "now" replaces the deprecated datetime.utcnow(); tzinfo is
    # stripped so the string keeps the same "<naive ISO timestamp>Z" format.
    "created_utc": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "catalog_rows": int(len(df)),
    "tfidf_shape": [int(X.shape[0]), int(X.shape[1])],
    "text_columns": TEXT_COLS_PREFERRED,
    "catalog_source": str(CATALOG_PATH),
    # The pickled vectorizer is tied to the scikit-learn version that built it.
    "sklearn_version": sklearn.__version__,
}
meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

print("Saved:", artifacts_path)
print("Saved:", meta_path)

Saved: c:\Users\Jed\Desktop\9raya\ML\models\content_based\tfidf_artifacts.joblib
Saved: c:\Users\Jed\Desktop\9raya\ML\models\content_based\tfidf_metadata.json


In [18]:
from pathlib import Path
import re
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics.pairwise import cosine_similarity

# Same location the export cell writes to.
ARTIFACTS_PATH = Path.cwd().parent.joinpath("models", "content_based", "tfidf_artifacts.joblib")
assert ARTIFACTS_PATH.exists(), f"Missing: {ARTIFACTS_PATH}"

# NOTE: joblib.load unpickles the file; only load bundles you produced yourself.
bundle = joblib.load(ARTIFACTS_PATH)
# Unpack the bundle: fitted vectorizer, TF-IDF matrix, minimal catalog saved during export.
vectorizer2, X2, catalog2 = (bundle[key] for key in ("vectorizer", "X", "catalog"))
feature_names2 = np.asarray(vectorizer2.get_feature_names_out())

def clean_text2(s: str) -> str:
    """Normalize a raw value into lowercase, single-spaced text.

    Applies the same steps as the text cleaning used when the TF-IDF model was
    fitted, so queries are tokenized consistently.

    Args:
        s: Any value; non-strings are converted with ``str``.

    Returns:
        The cleaned text. ``None`` and float NaN are treated as missing and
        yield ``""`` (previously NaN produced the literal token ``"nan"``).
    """
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""
    s = str(s)
    s = s.replace("\uFFFD", " ")  # Unicode replacement char left by bad decoding
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s

def top_overlap_terms2(query_vec, item_vec, top_n=8):
    """List the terms shared by a query and an item, strongest first.

    Uses the feature names of the loaded vectorizer. Returns at most ``top_n``
    term strings, or ``[]`` when the two vectors share no term.
    """
    shared = query_vec.multiply(item_vec)  # sparse element-wise product
    if shared.nnz == 0:
        return []
    shared = shared.tocoo()
    strongest = np.argsort(shared.data)[::-1][:top_n]
    term_cols = shared.col[strongest]
    return feature_names2[term_cols].tolist()

def recommend_loaded(query: str, top_k=10, *, filter_zero: bool = True):
    """Recommend courses for a free-text query using the loaded artifacts.

    Args:
        query: Free-text description of interests.
        top_k: Maximum number of courses to return (negative -> 0).
        filter_zero: If True (default, matching ``recommend_by_interest_text``),
            courses with zero similarity are dropped, so a query that matches
            nothing yields an empty frame instead of arbitrary courses.

    Returns:
        DataFrame with the saved catalog columns plus ``score`` and
        ``why_keywords``, best match first.
    """
    q = clean_text2(query)
    qv = vectorizer2.transform([q])
    sims = cosine_similarity(qv, X2).ravel()
    top_idx = np.argsort(sims)[::-1][: max(int(top_k), 0)]
    if filter_zero:
        # A zero score means no shared vocabulary at all: not a recommendation.
        top_idx = top_idx[sims[top_idx] > 0]
    out = catalog2.iloc[top_idx].copy()
    out["score"] = sims[top_idx]
    out["why_keywords"] = [", ".join(top_overlap_terms2(qv, X2[i], top_n=8)) for i in top_idx]
    return out.reset_index(drop=True)

# Confirm what was loaded: the catalog row count should equal the first
# dimension of the TF-IDF matrix (one matrix row per course).
print("Loaded:", ARTIFACTS_PATH)
print("Catalog rows:", len(catalog2), "TF-IDF shape:", X2.shape)

# Smoke test of the reloaded model with a short two-keyword query.
display(recommend_loaded("math python", top_k=5))

Loaded: c:\Users\Jed\Desktop\9raya\ML\models\content_based\tfidf_artifacts.joblib
Catalog rows: 3408 TF-IDF shape: (3408, 53323)


Unnamed: 0,course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,skills_cleaned,score,why_keywords
0,354,Data Science Math Skills,Duke University,Conversant,4.4,https://www.coursera.org/learn/datasciencemath...,general statistics probability euler's totie...,0.438081,math
1,111,Improving Math Engagement with Prodigy,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/teaching-prodigy,accounting mathematics child strategy proj...,0.384888,math
2,2263,Python Data Analysis,Rice University,Advanced,4.6,https://www.coursera.org/learn/python-analysis,data visualization jpeg python programming ...,0.275204,python
3,2579,Math behind Moneyball,University of Houston,Advanced,4.2,https://www.coursera.org/learn/mathematics-sport,regression sports microsoft excel mathemati...,0.267617,math
4,16,Python Programming Essentials,Rice University,Beginner,4.8,https://www.coursera.org/learn/python-programming,semantics python programming problem solving...,0.264439,python


In [10]:
# Evaluate content-based model using user_interactions.csv (leave-one-out per user)
import time
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import normalize
import scipy.sparse as sp

K = 10  # cut-off for HitRate / Precision / NDCG
MAX_USERS = 5000  # set None for all users; keep small if it runs slow
RANDOM_SEED = 42

# Prefer exported artifacts if loaded; otherwise fall back to in-memory objects
if "X2" in globals():
    X_eval = X2
else:
    X_eval = X
if "catalog2" in globals():
    catalog_eval = catalog2
else:
    catalog_eval = df[["course_id", "Course Name", "University", "Difficulty Level", "Course Rating", "Course URL", "skills_cleaned"]].copy()

# course_id -> row position in X_eval (catalog rows and matrix rows are aligned)
course_ids = catalog_eval["course_id"].astype(int).to_numpy()
id_to_index = {int(cid): pos for pos, cid in enumerate(course_ids)}
valid_course_ids = set(id_to_index)

INTERACTIONS_PATH = Path.cwd().parent.joinpath("data", "processed", "user_interactions.csv")
assert INTERACTIONS_PATH.exists(), f"Missing: {INTERACTIONS_PATH}"
inter = pd.read_csv(INTERACTIONS_PATH)
print("Interactions shape:", inter.shape)
print("Interactions columns:", inter.columns.tolist())

# Guess user and item columns: lowercase name -> original name
lower = {name.lower(): name for name in inter.columns}
def pick_col(candidates):
    """Return the original name of the first column matching a candidate.

    Candidates are tried in order; a column matches when the candidate is a
    substring of its lowercased name (which also covers exact and suffix
    matches). Returns ``None`` when no candidate matches any column.
    """
    for wanted in candidates:
        for lowered, original in lower.items():
            if wanted in lowered:
                return original
    return None

user_col = pick_col(["user_id", "userid", "user"])
item_col = pick_col(["course_id", "program_id", "item_id", "content_id", "course"])
if user_col is None or item_col is None:
    raise ValueError(
        f"Could not infer user/item columns. Found user={user_col}, item={item_col}. Columns={list(inter.columns)}"
    )

# Keep only the two id columns, as integers, restricted to known course_ids
inter = inter[[user_col, item_col]].dropna().astype(int)
inter = inter.loc[inter[item_col].isin(valid_course_ids)]
print("After filtering to known courses:", inter.shape)

# Leave-one-out needs at least 2 interactions per user
counts = inter.groupby(user_col)[item_col].count()
eligible_users = counts.index[(counts >= 2).to_numpy()].to_numpy()
print("Eligible users (>=2 interactions):", len(eligible_users))
if MAX_USERS is not None and len(eligible_users) > MAX_USERS:
    # Reproducible sub-sample when there are more users than the cap
    rng = np.random.default_rng(RANDOM_SEED)
    eligible_users = rng.choice(eligible_users, size=MAX_USERS, replace=False)
    print("Sampled users:", len(eligible_users))

# user -> list of catalog row positions, in the order the rows appear in the file
user_items = {
    uid: [id_to_index[int(cid)] for cid in grp.to_numpy()]
    for uid, grp in inter[inter[user_col].isin(eligible_users)].groupby(user_col)[item_col]
}

t0 = time.perf_counter()
hits = 0
prec_sum = 0.0
ndcg_sum = 0.0
n_eval = 0

for _, items in user_items.items():
    # Collapse repeated interactions with the same course, keeping first-seen
    # order. Without this, a held-out course that also appears earlier in the
    # user's history ends up in `train_items`, gets masked with -inf below, and
    # can never be counted as a hit.
    items = list(dict.fromkeys(items))
    if len(items) < 2:
        continue
    # leave-one-out: last distinct item as test (random would also be OK)
    test_i = items[-1]
    train_items = items[:-1]

    # build user profile as mean of training item vectors
    profile = X_eval[train_items].mean(axis=0)
    profile = profile.tocsr() if sp.issparse(profile) else sp.csr_matrix(profile)
    profile = normalize(profile)

    # dot product of the normalized profile with every course vector
    # (equals cosine similarity when the rows of X_eval are l2-normalized)
    scores = (profile @ X_eval.T).toarray().ravel()
    # exclude training items from ranking
    scores[train_items] = -np.inf

    # top-K indices, best first
    if K >= len(scores):
        top_idx = np.argsort(scores)[::-1]
    else:
        # partial selection of the K largest, then sort just those K
        cand = np.argpartition(scores, -K)[-K:]
        top_idx = cand[np.argsort(scores[cand])[::-1]]

    # find position of held-out item if present in top-k
    where = np.where(top_idx == test_i)[0]
    if where.size > 0:
        pos0 = int(where[0])  # 0-based position within top-k
        hits += 1
        # exactly one relevant item per user, so precision@K is 1/K on a hit
        prec_sum += 1.0 / K
        ndcg_sum += 1.0 / np.log2(pos0 + 2)  # rank=pos0+1 => DCG = 1/log2(rank+1) = 1/log2(pos0+2)
    n_eval += 1

dt = time.perf_counter() - t0
print("\nEvaluation summary")
print("Users evaluated:", n_eval)
print(f"HitRate@{K}: {hits / max(n_eval, 1):.4f}")
print(f"Precision@{K}: {prec_sum / max(n_eval, 1):.6f}")
print(f"NDCG@{K}: {ndcg_sum / max(n_eval, 1):.6f}")
print("Seconds:", round(dt, 2))

Interactions shape: (11935, 4)
Interactions columns: ['user_id', 'course_id', 'interaction', 'score']
After filtering to known courses: (11935, 2)
Eligible users (>=2 interactions): 600

Evaluation summary
Users evaluated: 600
HitRate@10: 0.0067
Precision@10: 0.000667
NDCG@10: 0.002797
Seconds: 6.5
