In [13]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

## 1. LOAD DATA

In [14]:
df = pd.read_excel("online_course_recommendation_v2.xlsx")

print("Data shape:", df.shape)
print(df.head())

Data shape: (100000, 14)
   user_id  course_id                       course_name       instructor  \
0    15796       9366              Python for Beginners      Emma Harris   
1      861       1928   Cybersecurity for Professionals  Alexander Young   
2    38159       9541  DevOps and Continuous Deployment   Dr. Mia Walker   
3    44733       3708   Project Management Fundamentals   Benjamin Lewis   
4    11285       3361       Ethical Hacking Masterclass     Daniel White   

   course_duration_hours certification_offered difficulty_level  rating  \
0                   39.1                   Yes         Beginner     5.0   
1                   36.3                   Yes         Beginner     4.3   
2                   13.4                   Yes         Beginner     3.9   
3                   58.3                   Yes         Beginner     3.1   
4                   30.8                   Yes         Beginner     2.8   

   enrollment_numbers  course_price  feedback_score study_material_

## 2. BASIC CLEANING

In [15]:
# We'll focus on these columns for recommendation:
# user_id, course_id, rating, enrollment_numbers, course_price, course_duration_hours, etc.

necessary_cols = ["user_id", "course_id", "rating", "course_name",
                  "enrollment_numbers", "course_price", "course_duration_hours"]

for col in necessary_cols:
    if col not in df.columns:
        print(f"WARNING: {col} not found in dataset!")

# Drop rows with missing user_id / course_id / rating
df = df.dropna(subset=["user_id", "course_id", "rating"])

# Keep only relevant columns (others are still available if needed)
df = df[necessary_cols]

# Ensure types
df["user_id"] = df["user_id"].astype(int)
df["course_id"] = df["course_id"].astype(int)
df["rating"] = df["rating"].astype(float)

# Fill missing numeric with 0 (for business features like enrollment, price)
for col in ["enrollment_numbers", "course_price", "course_duration_hours"]:
    if col in df.columns:
        df[col] = df[col].fillna(0)

print("\nAfter cleaning:")
print(df.head())


After cleaning:
   user_id  course_id  rating                       course_name  \
0    15796       9366     5.0              Python for Beginners   
1      861       1928     4.3   Cybersecurity for Professionals   
2    38159       9541     3.9  DevOps and Continuous Deployment   
3    44733       3708     3.1   Project Management Fundamentals   
4    11285       3361     2.8       Ethical Hacking Masterclass   

   enrollment_numbers  course_price  course_duration_hours  
0               21600        317.50                   39.1  
1               15379         40.99                   36.3  
2                6431        380.81                   13.4  
3               48245        342.80                   58.3  
4               34556        381.01                   30.8  


## 3. TRAIN/TEST SPLIT (FOR EVALUATION OF RECOMMENDER)

In [16]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("\nTrain interactions:", train_df.shape)
print("Test interactions:", test_df.shape)


Train interactions: (80000, 7)
Test interactions: (20000, 7)


## 4. BUILD USER-COURSE MATRIX (TRAIN ONLY)

In [17]:
# Pivot: rows = users, columns = courses, values = rating
user_course_matrix = train_df.pivot_table(
    index="user_id",
    columns="course_id",
    values="rating"
).fillna(0.0)

print("\nUser-Course Matrix shape:", user_course_matrix.shape)


User-Course Matrix shape: (40003, 9997)


## 5. ITEM-BASED COLLABORATIVE FILTERING (COURSE-COURSE SIMILARITY)

In [18]:
# We compute cosine similarity between course vectors (columns)
# transpose to shape: courses x users
course_user_matrix = user_course_matrix.T  # rows = course_id, columns = user_id

# Compute cosine similarity
course_similarity = cosine_similarity(course_user_matrix)
course_similarity_df = pd.DataFrame(
    course_similarity,
    index=course_user_matrix.index,
    columns=course_user_matrix.index
)

print("\nCourse-Course similarity matrix shape:", course_similarity_df.shape)


Course-Course similarity matrix shape: (9997, 9997)


## 6. BUSINESS FEATURES: POPULARITY & AVG RATING

In [19]:
# Popularity: number of enrollments / number of ratings
course_stats = train_df.groupby("course_id").agg(
    avg_rating=("rating", "mean"),
    num_ratings=("rating", "count"),
    avg_price=("course_price", "mean"),
    avg_duration=("course_duration_hours", "mean"),
    avg_enrollment=("enrollment_numbers", "mean")
).reset_index()

print("\nCourse business stats:")
print(course_stats.head())

# Merge course name (if available)
course_names = df[["course_id", "course_name"]].drop_duplicates()
course_stats = course_stats.merge(course_names, on="course_id", how="left")


Course business stats:
   course_id  avg_rating  num_ratings   avg_price  avg_duration  \
0          1    3.855556            9  205.188889     52.933333   
1          2    4.125000            8  285.628750     57.900000   
2          3    3.950000            2  358.815000     48.800000   
3          4    3.857143            7  322.171429     51.157143   
4          5    3.866667            6  233.215000     61.000000   

   avg_enrollment  
0    23704.111111  
1    29101.875000  
2    48984.000000  
3    21731.142857  
4    23480.833333  


## 7. HELPER: GET TOP SIMILAR COURSES

In [20]:
def get_similar_courses(course_id, top_n=10, min_similarity=0.0):
    """
    Returns top N courses similar to given course_id based on CF similarity.
    Includes avg_rating and num_ratings for business use.
    """
    if course_id not in course_similarity_df.index:
        print(f"Course {course_id} not found in similarity matrix.")
        return pd.DataFrame()

    # All similarity scores for this course
    sims = course_similarity_df.loc[course_id].drop(course_id)  # drop self

    # Filter by minimum similarity
    sims = sims[sims >= min_similarity]

    # Sort descending
    sims = sims.sort_values(ascending=False).head(top_n)

    sim_df = sims.reset_index()
    sim_df.columns = ["course_id", "similarity"]

    # Attach course stats (avg_rating, num_ratings, name)
    sim_df = sim_df.merge(course_stats, on="course_id", how="left")

    return sim_df


# Example:
print("\nExample similar courses:")
example_course_id = course_stats["course_id"].iloc[0]
print("Example course_id:", example_course_id)
print(get_similar_courses(example_course_id, top_n=5))


Example similar courses:
Example course_id: 1
    course_id  similarity  avg_rating  num_ratings   avg_price  avg_duration  \
0        9720    0.177031    3.960000            5  286.990000     64.260000   
1        9720    0.177031    3.960000            5  286.990000     64.260000   
2        9720    0.177031    3.960000            5  286.990000     64.260000   
3        9720    0.177031    3.960000            5  286.990000     64.260000   
4        9720    0.177031    3.960000            5  286.990000     64.260000   
5        8004    0.157632    3.877778            9  266.477778     61.944444   
6        8004    0.157632    3.877778            9  266.477778     61.944444   
7        8004    0.157632    3.877778            9  266.477778     61.944444   
8        8004    0.157632    3.877778            9  266.477778     61.944444   
9        8004    0.157632    3.877778            9  266.477778     61.944444   
10       8004    0.157632    3.877778            9  266.477778     61.944

## 8. BUSINESS-STYLE USER RECOMMENDATION

In [21]:
def recommend_for_user(user_id, top_n=10,
                       alpha_sim=0.6, alpha_pop=0.2, alpha_rating=0.2):
    """
    Recommend courses to a user using:
      - CF similarity (item-based)
      - Popularity (num_ratings or enrollments)
      - Avg rating

    Business objective:
      score = alpha_sim * similarity
             + alpha_pop * normalized_popularity
             + alpha_rating * normalized_avg_rating

    - alpha_* coefficients decide the business focus:
        sim high  -> very personalized
        pop high  -> push popular/high-traffic courses
        rating high -> push high quality courses
    """
    # If user has no interactions in train, fallback to popular courses
    if user_id not in user_course_matrix.index:
        print(f"User {user_id} not found in training data. Returning popular courses.")
        # Fallback: business ranking only
        popular = course_stats.copy()
        # Use num_ratings as popularity if avg_enrollment is missing
        popular["popularity"] = popular["avg_enrollment"].fillna(0) + \
                                popular["num_ratings"].fillna(0)
        # Normalize features
        popular["popularity_norm"] = (popular["popularity"] - popular["popularity"].min()) / \
                                     (popular["popularity"].max() - popular["popularity"].min() + 1e-9)
        popular["avg_rating_norm"] = (popular["avg_rating"] - popular["avg_rating"].min()) / \
                                     (popular["avg_rating"].max() - popular["avg_rating"].min() + 1e-9)
        # No similarity info here, so alpha_sim not used
        popular["business_score"] = alpha_pop * popular["popularity_norm"] + \
                                    alpha_rating * popular["avg_rating_norm"]
        popular = popular.sort_values("business_score", ascending=False).head(top_n)
        return popular[["course_id", "course_name", "avg_rating", "num_ratings", "business_score"]]

    # Courses user already interacted with
    user_ratings = user_course_matrix.loc[user_id]
    watched_courses = user_ratings[user_ratings > 0].index.tolist()

    if len(watched_courses) == 0:
        print(f"User {user_id} has no rated courses. Returning popular courses.")
        return recommend_for_user(user_id=-1, top_n=top_n)  # trigger cold-start branch

    # Weighted sum of similarities: courses similar to those user liked
    sim_scores = pd.Series(dtype=float)

    for course in watched_courses:
        rating = user_ratings[course]
        # Get similarity scores for 'course'
        sims = course_similarity_df.loc[course]

        # Weight by user's rating (more weight for highly rated courses)
        sims = sims * rating

        # Accumulate
        sim_scores = sim_scores.add(sims, fill_value=0.0)

    # Remove courses already watched
    sim_scores = sim_scores.drop(labels=watched_courses, errors="ignore")

    # Turn into DataFrame
    rec_df = sim_scores.reset_index()
    rec_df.columns = ["course_id", "sim_score"]

    # Attach business stats
    rec_df = rec_df.merge(course_stats, on="course_id", how="left")

    # Fill missing stats
    rec_df["num_ratings"] = rec_df["num_ratings"].fillna(0)
    rec_df["avg_rating"] = rec_df["avg_rating"].fillna(rec_df["avg_rating"].mean())

    # Popularity proxy: enrollments + num_ratings
    rec_df["popularity"] = rec_df["avg_enrollment"].fillna(0) + rec_df["num_ratings"]

    # Normalize features for business scoring
    # (avoid division by zero with small epsilon)
    def normalize(col):
        return (col - col.min()) / (col.max() - col.min() + 1e-9)

    rec_df["sim_norm"] = normalize(rec_df["sim_score"])
    rec_df["pop_norm"] = normalize(rec_df["popularity"])
    rec_df["rating_norm"] = normalize(rec_df["avg_rating"])

    # Business score: combine personalization + popularity + rating
    rec_df["business_score"] = (
        alpha_sim * rec_df["sim_norm"] +
        alpha_pop * rec_df["pop_norm"] +
        alpha_rating * rec_df["rating_norm"]
    )

    # Sort by business_score
    rec_df = rec_df.sort_values("business_score", ascending=False).head(top_n)

    return rec_df[[
        "course_id",
        "course_name",
        "business_score",
        "sim_score",
        "avg_rating",
        "num_ratings",
        "avg_enrollment",
        "avg_price",
        "avg_duration"
    ]]


# Example: recommend for some user
example_user_id = user_course_matrix.index[0]
print("\nExample recommendations for user:", example_user_id)
print(recommend_for_user(example_user_id, top_n=5))


Example recommendations for user: 1
       course_id                           course_name  business_score  \
78321       9955       Cybersecurity for Professionals        0.768074   
78322       9955        Fitness and Nutrition Coaching        0.768074   
78323       9955   Stock Market and Trading Strategies        0.768074   
78320       9955     Mobile App Development with Swift        0.768074   
78319       9955  Networking and System Administration        0.768074   

       sim_score  avg_rating  num_ratings  avg_enrollment  avg_price  \
78321   0.783244         3.9            5         17821.2     99.194   
78322   0.783244         3.9            5         17821.2     99.194   
78323   0.783244         3.9            5         17821.2     99.194   
78320   0.783244         3.9            5         17821.2     99.194   
78319   0.783244         3.9            5         17821.2     99.194   

       avg_duration  
78321         37.68  
78322         37.68  
78323         37.68

## 9. SIMPLE EVALUATION: HIT-RATE @ K

In [22]:
def hit_rate_at_k(k=10):
    """
    Simple offline metric:
      For each user in test, check if at least one of the test courses
      appears in top-k recommendations from train-based model.
      Metric ~ "what fraction of users got at least one relevant course in top-k?"
    """
    # Users that exist in train user_course_matrix
    users_in_train = set(user_course_matrix.index)

    # group test interactions by user
    user_test_courses = test_df.groupby("user_id")["course_id"].apply(set)

    hits = 0
    total = 0

    for user_id, test_courses in user_test_courses.items():
        if user_id not in users_in_train:
            continue  # skip cold-start users for this metric
        total += 1

        recs = recommend_for_user(user_id, top_n=k)
        recommended_courses = set(recs["course_id"].tolist())

        # hit if any of the test courses is in recommended list
        if len(recommended_courses.intersection(test_courses)) > 0:
            hits += 1

    if total == 0:
        print("No overlapping users between train & test for evaluation.")
        return None

    return hits / total

hr10 = hit_rate_at_k(k=10)
print("\nHit-Rate @ 10:", hr10)


Hit-Rate @ 10: 0.0005308258133009783


## 10. SAVE OBJECTS FOR DEPLOYMENT (OPTIONAL)

In [23]:
import joblib

joblib.dump(user_course_matrix, "user_course_matrix.joblib")
joblib.dump(course_similarity_df, "course_similarity_df.joblib")
joblib.dump(course_stats, "course_stats.joblib")

print("\nSaved:")
print("- user_course_matrix.joblib")
print("- course_similarity_df.joblib")
print("- course_stats.joblib")


Saved:
- user_course_matrix.joblib
- course_similarity_df.joblib
- course_stats.joblib


## 11. SIMPLE FLASK API (RECOMMENDATION SERVICE)
Save the following block into a file named `recommender_api.py`.

In [24]:
flask_code = r"""
from flask import Flask, request, jsonify
import joblib
import pandas as pd

# Load saved artifacts
user_course_matrix = joblib.load('user_course_matrix.joblib')
course_similarity_df = joblib.load('course_similarity_df.joblib')
course_stats = joblib.load('course_stats.joblib')

app = Flask(__name__)

def normalize(col):
    return (col - col.min()) / (col.max() - col.min() + 1e-9)

def recommend_for_user_api(user_id, top_n=10,
                           alpha_sim=0.6, alpha_pop=0.2, alpha_rating=0.2):
    if user_id not in user_course_matrix.index:
        popular = course_stats.copy()
        popular["popularity"] = popular["avg_enrollment"].fillna(0) + popular["num_ratings"].fillna(0)
        popular["popularity_norm"] = normalize(popular["popularity"])
        popular["avg_rating_norm"] = normalize(popular["avg_rating"])
        popular["business_score"] = alpha_pop * popular["popularity_norm"] + \
                                    alpha_rating * popular["avg_rating_norm"]
        popular = popular.sort_values("business_score", ascending=False).head(top_n)
        return popular[["course_id", "course_name", "avg_rating", "num_ratings", "business_score"]].to_dict(orient="records")

    user_ratings = user_course_matrix.loc[user_id]
    watched_courses = user_ratings[user_ratings > 0].index.tolist()

    if len(watched_courses) == 0:
        return recommend_for_user_api(-1, top_n=top_n)

    sim_scores = pd.Series(dtype=float)
    for course in watched_courses:
        rating = user_ratings[course]
        sims = course_similarity_df.loc[course] * rating
        sim_scores = sim_scores.add(sims, fill_value=0.0)

    sim_scores = sim_scores.drop(labels=watched_courses, errors="ignore")
    rec_df = sim_scores.reset_index()
    rec_df.columns = ["course_id", "sim_score"]

    rec_df = rec_df.merge(course_stats, on="course_id", how="left")
    rec_df["num_ratings"] = rec_df["num_ratings"].fillna(0)
    rec_df["avg_rating"] = rec_df["avg_rating"].fillna(rec_df["avg_rating"].mean())
    rec_df["popularity"] = rec_df["avg_enrollment"].fillna(0) + rec_df["num_ratings"]

    rec_df["sim_norm"] = normalize(rec_df["sim_score"])
    rec_df["pop_norm"] = normalize(rec_df["popularity"])
    rec_df["rating_norm"] = normalize(rec_df["avg_rating"])

    rec_df["business_score"] = (
        alpha_sim * rec_df["sim_norm"] +
        alpha_pop * rec_df["pop_norm"] +
        alpha_rating * rec_df["rating_norm"]
    )

    rec_df = rec_df.sort_values("business_score", ascending=False).head(top_n)

    return rec_df[[
        "course_id",
        "course_name",
        "business_score",
        "sim_score",
        "avg_rating",
        "num_ratings",
        "avg_enrollment",
        "avg_price",
        "avg_duration"
    ]].to_dict(orient="records")

@app.route("/recommend", methods=["GET"])
def recommend():
    user_id = request.args.get("user_id", type=int)
    top_n = request.args.get("top_n", default=10, type=int)

    if user_id is None:
        return jsonify({"error": "user_id is required as query param"}), 400

    recs = recommend_for_user_api(user_id, top_n=top_n)
    return jsonify({"user_id": user_id, "recommendations": recs})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000, debug=True)
"""

with open("recommender_api.py", "w") as f:
    f.write(flask_code)

print("\nFlask recommender API written to 'recommender_api.py'.")
print("Run:  python recommender_api.py  and hit:  /recommend?user_id=123&top_n=5")


Flask recommender API written to 'recommender_api.py'.
Run:  python recommender_api.py  and hit:  /recommend?user_id=123&top_n=5


## 9. SIMPLE EVALUATION: HIT-RATE @ K

In [10]:
def hit_rate_at_k(k=10):
    """
    Simple offline metric:
      For each user in test, check if at least one of the test courses
      appears in top-k recommendations from train-based model.
      Metric ~ "what fraction of users got at least one relevant course in top-k?"
    """
    # Users that exist in train user_course_matrix
    users_in_train = set(user_course_matrix.index)

    # group test interactions by user
    user_test_courses = test_df.groupby("user_id")["course_id"].apply(set)

    hits = 0
    total = 0

    for user_id, test_courses in user_test_courses.items():
        if user_id not in users_in_train:
            continue  # skip cold-start users for this metric
        total += 1

        recs = recommend_for_user(user_id, top_n=k)
        recommended_courses = set(recs["course_id"].tolist())

        # hit if any of the test courses is in recommended list
        if len(recommended_courses.intersection(test_courses)) > 0:
            hits += 1

    if total == 0:
        print("No overlapping users between train & test for evaluation.")
        return None

    return hits / total

hr10 = hit_rate_at_k(k=10)
print("\nHit-Rate @ 10:", hr10)


Hit-Rate @ 10: 0.0005308258133009783


## 10. SAVE OBJECTS FOR DEPLOYMENT (OPTIONAL)

In [11]:
import joblib

joblib.dump(user_course_matrix, "user_course_matrix.joblib")
joblib.dump(course_similarity_df, "course_similarity_df.joblib")
joblib.dump(course_stats, "course_stats.joblib")

print("\nSaved:")
print("- user_course_matrix.joblib")
print("- course_similarity_df.joblib")
print("- course_stats.joblib")


Saved:
- user_course_matrix.joblib
- course_similarity_df.joblib
- course_stats.joblib


## 11. SIMPLE FLASK API (RECOMMENDATION SERVICE)
Save the following block into a file named `recommender_api.py`.

In [12]:
flask_code = r"""
from flask import Flask, request, jsonify
import joblib
import pandas as pd

# Load saved artifacts
user_course_matrix = joblib.load('user_course_matrix.joblib')
course_similarity_df = joblib.load('course_similarity_df.joblib')
course_stats = joblib.load('course_stats.joblib')

app = Flask(__name__)

def normalize(col):
    return (col - col.min()) / (col.max() - col.min() + 1e-9)

def recommend_for_user_api(user_id, top_n=10,
                           alpha_sim=0.6, alpha_pop=0.2, alpha_rating=0.2):
    if user_id not in user_course_matrix.index:
        popular = course_stats.copy()
        popular["popularity"] = popular["avg_enrollment"].fillna(0) + popular["num_ratings"].fillna(0)
        popular["popularity_norm"] = normalize(popular["popularity"])
        popular["avg_rating_norm"] = normalize(popular["avg_rating"])
        popular["business_score"] = alpha_pop * popular["popularity_norm"] + \
                                    alpha_rating * popular["avg_rating_norm"]
        popular = popular.sort_values("business_score", ascending=False).head(top_n)
        return popular[["course_id", "course_name", "avg_rating", "num_ratings", "business_score"]].to_dict(orient="records")

    user_ratings = user_course_matrix.loc[user_id]
    watched_courses = user_ratings[user_ratings > 0].index.tolist()

    if len(watched_courses) == 0:
        return recommend_for_user_api(-1, top_n=top_n)

    sim_scores = pd.Series(dtype=float)
    for course in watched_courses:
        rating = user_ratings[course]
        sims = course_similarity_df.loc[course] * rating
        sim_scores = sim_scores.add(sims, fill_value=0.0)

    sim_scores = sim_scores.drop(labels=watched_courses, errors="ignore")
    rec_df = sim_scores.reset_index()
    rec_df.columns = ["course_id", "sim_score"]

    rec_df = rec_df.merge(course_stats, on="course_id", how="left")
    rec_df["num_ratings"] = rec_df["num_ratings"].fillna(0)
    rec_df["avg_rating"] = rec_df["avg_rating"].fillna(rec_df["avg_rating"].mean())
    rec_df["popularity"] = rec_df["avg_enrollment"].fillna(0) + rec_df["num_ratings"]

    rec_df["sim_norm"] = normalize(rec_df["sim_score"])
    rec_df["pop_norm"] = normalize(rec_df["popularity"])
    rec_df["rating_norm"] = normalize(rec_df["avg_rating"])

    rec_df["business_score"] = (
        alpha_sim * rec_df["sim_norm"] +
        alpha_pop * rec_df["pop_norm"] +
        alpha_rating * rec_df["rating_norm"]
    )

    rec_df = rec_df.sort_values("business_score", ascending=False).head(top_n)

    return rec_df[[
        "course_id",
        "course_name",
        "business_score",
        "sim_score",
        "avg_rating",
        "num_ratings",
        "avg_enrollment",
        "avg_price",
        "avg_duration"
    ]].to_dict(orient="records")

@app.route("/recommend", methods=["GET"])
def recommend():
    user_id = request.args.get("user_id", type=int)
    top_n = request.args.get("top_n", default=10, type=int)

    if user_id is None:
        return jsonify({"error": "user_id is required as query param"}), 400

    recs = recommend_for_user_api(user_id, top_n=top_n)
    return jsonify({"user_id": user_id, "recommendations": recs})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000, debug=True)
"""

with open("recommender_api.py", "w") as f:
    f.write(flask_code)

print("\nFlask recommender API written to 'recommender_api.py'.")
print("Run:  python recommender_api.py  and hit:  /recommend?user_id=123&top_n=5")


Flask recommender API written to 'recommender_api.py'.
Run:  python recommender_api.py  and hit:  /recommend?user_id=123&top_n=5
