In [3]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
from pulp import LpMaximize, LpProblem, LpVariable, lpSum, LpContinuous
from sklearn.preprocessing import StandardScaler

In [None]:
# Helper Functions
def extract_courses_for_skill(course_df, skill_name):
    """
    Return an independent copy of the rows of ``course_df`` whose "skill"
    value equals ``skill_name`` when both are compared in lower case.
    """
    wanted = skill_name.lower()
    is_match = course_df["skill"].str.lower() == wanted
    return course_df.loc[is_match].copy()

def standardize_focus_scores(skill_list):
    """
    Rescale the focus scores of a skill list so that they add up to 1.

    Parameters:
    - skill_list (list): entries of the form [skill name, focus-score, confidence level].
    Returns:
    - list: new [skill name, focus-score, confidence level] entries with the focus
      score divided by the total focus. When the total is not positive the focus
      scores are passed through unchanged (as floats).
    """
    raw_focus = np.array([item[1] for item in skill_list], dtype=np.float64)
    focus_sum = np.sum(raw_focus)

    # Only divide when there is a positive total to divide by
    scaled_focus = raw_focus / focus_sum if focus_sum > 0 else raw_focus

    # Rebuild each entry around its rescaled focus score
    rescaled = []
    for item, new_focus in zip(skill_list, scaled_focus):
        rescaled.append([item[0], float(new_focus), item[2]])
    return rescaled

# Function to compute match score for a course based on skill embeddings
def compute_match_score(course, main_skill, skill_list_with_weights, skill_embeddings, job_title_embedding):
    """
    Compute the match score of one course against the user's skills and job title.

    For every [skill, focus, confidence] entry the cosine similarity between the
    skill embedding and the course's description / title embeddings is combined as
        (desc_sim * w_desc + title_sim * w_title) * focus * (1 - confidence)
    and summed over all skills. The weights are (0.6, 0.6) for the main skill;
    for the other skills they are (0.8, 0.8) when the course description starts
    with "list" and (1.0, 0.8) otherwise. Finally
        0.5 * (job/description similarity + job/title similarity)
    is added.

    Parameters:
    - course (Series or dict-like): must provide "description_embedding",
      "title_embedding" and "description".
    - main_skill (str): the skill courses are being selected for; compared
      case-insensitively with the names in ``skill_list_with_weights``.
    - skill_list_with_weights (list): [skill name, focus-score, confidence level] entries.
    - skill_embeddings (dict): lower-cased skill name -> embedding. A skill without
      an entry is treated as a zero vector, i.e. similarity 0.
    - job_title_embedding: embedding of the job title.
    Returns:
    - The total match score (a number).
    """
    # Safe cosine similarity function to avoid NaN values
    def safe_cosine_similarity(vec1, vec2):
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)

        # If either vector has zero norm, return 0 similarity
        if norm1 == 0 or norm2 == 0:
            return 0.0

        return np.dot(vec1, vec2) / (norm1 * norm2)

    # Extract course embeddings
    description_emb = np.array(course["description_embedding"])
    title_emb = np.array(course["title_embedding"])

    # Descriptions starting with "list" mark the structured (Coursera) format.
    # A missing description (e.g. NaN read from a CSV) is not a string and is
    # treated as unstructured instead of raising AttributeError.
    description = course["description"]
    is_coursera = isinstance(description, str) and description.startswith("list")

    # Skill names are matched case-insensitively everywhere else (embedding
    # lookup, course filtering), so the main-skill check does the same.
    main_skill_key = main_skill.lower()

    total_score = 0

    # Process each skill (main + side skills)
    for skill, focus, confidence in skill_list_with_weights:
        skill_key = skill.lower()
        skill_emb = np.array(skill_embeddings.get(skill_key, np.zeros_like(description_emb)))

        desc_similarity = safe_cosine_similarity(description_emb, skill_emb)
        title_similarity = safe_cosine_similarity(title_emb, skill_emb)

        # Main skill uses fixed weights; side skills depend on the description format
        if skill_key == main_skill_key:
            weight_desc, weight_title = 0.6, 0.6
        else:
            weight_desc, weight_title = (0.8, 0.8) if is_coursera else (1.0, 0.8)

        # Low confidence (1 - confidence large) and high focus both raise the contribution
        total_score += (desc_similarity * weight_desc + title_similarity * weight_title) * focus * (1 - confidence)

    # Job title similarity against both the description and the title
    job_desc_similarity = safe_cosine_similarity(description_emb, job_title_embedding)
    job_title_similarity = safe_cosine_similarity(title_emb, job_title_embedding)
    total_score += 0.5 * (job_desc_similarity + job_title_similarity)

    return total_score

def compute_difficulty_scores(row, main_skill, skill_list):
    """
    Score how well a single course's difficulty fits the user's confidence in the main skill.

    Parameters:
    - row (Series or dict-like): one course; "difficulty_numeric" and "course_type" are read.
    - main_skill (str): the skill being evaluated, looked up case-insensitively in skill_list.
    - skill_list (list): A list of skills with their focus-score and confidence level.
    Returns:
    - number: 1 - |ideal difficulty - course difficulty| + certificate score, where the
      certificate score is +10 / -10 / 0 depending on the course type and confidence.
    """
    # (minimum confidence, ideal difficulty) pairs, checked from the highest threshold down
    difficulty_ladder = ((0.5, 3), (0.3, 2), (0.2, 1.5), (0.1, 1))

    # Confidence of the first entry matching the main skill; 0 when the skill is not listed
    user_confidence = 0
    for name, _focus, level in skill_list:
        if name.lower() == main_skill.lower():
            user_confidence = level
            break

    course_difficulty = row["difficulty_numeric"]
    is_pro_certificate = row["course_type"] == "Certificate"

    # Ideal difficulty is 0 when the confidence is below every threshold
    ideal_difficulty = 0
    for threshold, difficulty in difficulty_ladder:
        if user_confidence >= threshold:
            ideal_difficulty = difficulty
            break

    # Certificates are rewarded for confident users and penalized for unconfident ones
    certificate_score = 0
    if is_pro_certificate:
        if user_confidence >= 0.6:
            certificate_score = 10
        elif user_confidence <= 0.3:
            certificate_score = -10

    difficulty_penalty = np.abs(ideal_difficulty - course_difficulty)
    return 1 - difficulty_penalty + certificate_score

# Function to solve the ILP for selecting courses with a dynamic duration constraint
def solve_course_selection_pulp(course_df, skill, D_ideal, alpha, beta, lambda_, gamma):
    """
    Select courses for one skill by solving a binary integer linear program.

    Objective (maximized):
        sum(match_score) + alpha * sum(difficulty_score) + beta * sum(wilson_score)
        - lambda_ * sum(price) - gamma * (number of selected courses)
    Constraints:
    - at least one course is selected;
    - the total "duration" of the selected courses lies within ±10% of D_ideal.

    Parameters:
    - course_df (DataFrame): courses with the columns "skill", "match_score",
      "difficulty_score", "wilson_score", "price" and "duration".
    - skill (str): The skill being optimized; matched case-insensitively against "skill".
    - D_ideal (float): The ideal total duration for the selected courses.
    - alpha, beta, lambda_, gamma: weights of the difficulty, review, price and
      course-count terms of the objective.

    Returns:
    - list: index labels of the selected courses. Empty when there is no course for
      the skill, or when the solver does not reach an optimal solution (for example
      because no combination of courses fits the duration window).
    """
    # Extract relevant courses for the given skill
    relevant_courses = course_df[course_df["skill"].str.lower() == skill.lower()].copy()
    if relevant_courses.empty:
        return []  # No courses available for this skill

    # Local import: the file's top-level pulp import does not bring this constant in.
    from pulp import LpStatusOptimal

    course_ids = list(relevant_courses.index)

    # Decision variables (binary: select the course or not)
    x = {i: LpVariable(f"x_{i}", cat="Binary") for i in course_ids}

    def weighted_total(column):
        # Sum of a course column over the selected courses, as a linear expression
        return lpSum(relevant_courses.loc[i, column] * x[i] for i in course_ids)

    problem = LpProblem("Course_Selection", LpMaximize)

    number_selected = lpSum(x[i] for i in course_ids)

    # Objective: reward match, difficulty fit and reviews; penalize price and course count
    problem += (
        weighted_total("match_score") +
        alpha * weighted_total("difficulty_score") +
        beta * weighted_total("wilson_score") -
        lambda_ * weighted_total("price") -
        gamma * number_selected
    )

    # Skill Coverage Constraint: At least one course must be selected
    problem += number_selected >= 1

    # Duration Constraint: total selected duration within ±10% of the ideal duration
    duration_tolerance = 0.1 * D_ideal
    total_duration = weighted_total("duration")
    problem += total_duration <= (D_ideal + duration_tolerance)
    problem += total_duration >= (D_ideal - duration_tolerance)

    problem.solve()

    # Variable values are only meaningful for an optimal solve; an infeasible or
    # otherwise failed solve must not be reported as a valid selection.
    if problem.status != LpStatusOptimal:
        return []

    # Solvers return floats, so compare against 0.5 rather than testing == 1
    return [i for i in course_ids if x[i].varValue is not None and x[i].varValue > 0.5]

def suggest_courses(course_df, skill_list, total_weeks, weekly_hours, job_title, portion = 0.8, alpha = 0.5, beta = 0.5, lambda_ = 0.5, gamma = 1):
    """
    Recommend courses for every skill in ``skill_list``.

    For each skill the matching courses are embedded, scored (match score and
    difficulty score), standardized column by column, and passed to the ILP
    solver with an ideal duration of
        portion * total_weeks * weekly_hours * normalized focus of the skill.

    Parameters:
    - course_df (DataFrame): course catalogue; "skill", "title", "description",
      "price" and "wilson_score" are read here, further columns by the helpers.
    - skill_list (list): [skill name, focus-score, confidence level] entries.
    - total_weeks, weekly_hours: available study time.
    - job_title (str): target job title, embedded and compared with each course.
    - portion: fraction of the available time to plan for.
    - alpha, beta, lambda_, gamma: objective weights forwarded to the solver.
    Returns:
    - list: one (skill name, list of selected course index labels) tuple per skill,
      in the order of ``skill_list``. A skill without any course gets an empty list.
    """
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    skill_list = standardize_focus_scores(skill_list)

    # These embeddings do not depend on the skill being processed, so compute them once.
    # encode() returns numpy arrays by default, which is what the scoring code converts to anyway.
    skill_embeddings = {entry[0].lower(): embedding_model.encode(entry[0].lower()) for entry in skill_list}
    # The job title is generally not one of the skill names, so it has to be
    # encoded itself; looking it up among the skill embeddings would yield a
    # zero vector and silence the job-title part of the match score.
    job_title_embedding = embedding_model.encode(job_title.lower())

    scaler = StandardScaler()
    columns_to_standardize = ["match_score", "difficulty_score", "price", "wilson_score"]

    result = []
    for main_skill, focus, _confidence in skill_list:
        courses = extract_courses_for_skill(course_df, main_skill)
        if courses.empty:
            # Nothing to score or scale; StandardScaler would raise on zero rows
            result.append((main_skill, []))
            continue

        # Encode all texts of a column in one batch instead of one model call per row
        courses["description_embedding"] = list(embedding_model.encode(courses["description"].tolist()))
        courses["title_embedding"] = list(embedding_model.encode(courses["title"].tolist()))

        courses["match_score"] = courses.apply(lambda row: compute_match_score(row, main_skill, skill_list, skill_embeddings, job_title_embedding), axis=1)
        courses["difficulty_score"] = courses.apply(lambda row: compute_difficulty_scores(row, main_skill, skill_list), axis = 1)

        # Put every objective term on a comparable scale within this skill's courses
        for column in columns_to_standardize:
            courses[column] = scaler.fit_transform(courses[[column]])

        D_ideal = portion * total_weeks * weekly_hours * focus

        selected = solve_course_selection_pulp(courses, main_skill, D_ideal, alpha, beta, lambda_, gamma)
        result.append((main_skill, selected))
    return result

In [40]:
# Locations of the cleaned Coursera and Udemy course tables
coursera_file_path = "../dataset-scrappers/cleaned-df/df_coursera.csv"
udemy_file_path = "../dataset-scrappers/cleaned-df/df_udemy.csv"

# Columns kept from each source, in the order they should appear
new_column_order = [
    "skill", "link", "image_link", "partner", "title", "description", "rating",
    "num_review", "duration", "price", "difficulty_numeric", "course_type", "wilson_score",
]

# Read each CSV and keep only the shared columns
coursera_df = pd.read_csv(coursera_file_path).loc[:, new_column_order]
udemy_df = pd.read_csv(udemy_file_path).loc[:, new_column_order]

# Stack both sources; reset_index() gives a fresh 0..n-1 index and keeps the
# previous per-file row labels in an "index" column
course_df = pd.concat([coursera_df, udemy_df], axis=0).reset_index()

# Add a "difficulty" alias and turn prices into rounded values ending in .99
course_df = course_df.assign(
    difficulty=course_df["difficulty_numeric"],
    price=course_df["price"].round(0) + 0.99,
)


In [50]:
# Input
# Each entry of skill_list is [skillname, focus-score, confidence level].
# Alternative, larger example input:
# skill_list = [["Statistical Analysis", 0.4, 0.7],
#             ["Python", 0.5, 0.5],
#             ["Machine Learning (ML)", 0.4, 0.1],
#             ["Tableau Desktop", 0.2, 0],
#             ["Excel", 0.3, 0.3],
#             ["SQL Queries", 0.4, 0.2]]
skill_list = [
    ["Python", 0.5, 0.6],
    ["Excel", 0.3, 0.4],
    ["Pytorch", 0.2, 0],
]
total_weeks = 10
weekly_hours = 20
job_title = "Data Analyst"

# Use the whole available time (portion = 1) when planning the courses
suggestions = suggest_courses(
    course_df=course_df,
    skill_list=skill_list,
    total_weeks=total_weeks,
    weekly_hours=weekly_hours,
    job_title=job_title,
    portion=1,
)

# NOTE: the values in this table are hard-coded strings; they are not read
# from the variables defined above.
df = pd.DataFrame(
    {
        "00": ["job title", "total hours", "weekly hours"],
        "ok": ["Analyst", "10", "20"],
    }
)
df

Unnamed: 0,00,ok
0,job title,Analyst
1,total hours,10
2,weekly hours,20


In [44]:
# Show the courses selected for the third entry of `suggestions`
# (each entry is a (skill, selected course index labels) pair)
selected_course_ids = suggestions[2][1]
summary_columns = ["title", "difficulty", "duration", "course_type", "price", "rating", "num_review"]
course_df.loc[selected_course_ids, summary_columns]

Unnamed: 0,title,difficulty,duration,course_type,price,rating,num_review
61744,Facial Expression Recognition with PyTorch,1.0,2.0,Course,1.99,3.8,53
200976,Practical Deep Learning: Master PyTorch in 15 ...,1.0,16.5,Course,14.99,4.7,85
200979,PyTorch Ultimate 2024: From Basics to Cutting-...,1.5,19.0,Course,14.99,4.7,702
