In [20]:
# !pip install -q scikit-learn pandas numpy sentence-transformers transformers tqdm

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import tqdm
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the preprocessed course table. Missing descriptions are replaced with ''
# so every row contributes a string to the embedding steps below.
df = pd.read_csv('course_preprocessed.csv')
descriptions = df['description'].fillna('').tolist()
print(f"Loaded {len(descriptions)} course descriptions.")

Loaded 93 course descriptions.


In [3]:
# Baseline representation: fit a TF-IDF vocabulary on the descriptions (default
# TfidfVectorizer settings) and transform them in the same call.
# The result is later densified with .toarray() before saving / similarity search.
tfidf_vectorizer = TfidfVectorizer()
tfidf_embeddings = tfidf_vectorizer.fit_transform(descriptions)
print(f"TF-IDF shape: {tfidf_embeddings.shape}")

TF-IDF shape: (93, 4173)


In [4]:
# Load GloVe embeddings
def load_glove(filepath):
    """Load word vectors from a GloVe-style text file.

    Every non-blank line is split on whitespace: the first field is the word and
    the remaining fields are parsed as the float32 components of its vector.

    Parameters
    ----------
    filepath : str or path-like
        Path to a UTF-8 encoded text file with one "<word> <v1> <v2> ..." entry per line.

    Returns
    -------
    dict
        Maps each word to a 1-D float32 numpy array. If a word occurs on several
        lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    glove_dict = {}
    with open(filepath, 'r', encoding='utf8') as f:
        for line in f:
            # str.split() with no separator already ignores leading/trailing whitespace.
            parts = line.split()
            # Blank or whitespace-only lines (e.g. a stray newline at end of file)
            # carry no entry; without this guard parts[0] raises IndexError.
            if not parts:
                continue
            glove_dict[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return glove_dict

# Parse the pre-downloaded GloVe vectors (file name indicates the 6B-token, 300-d release)
# into a {word: vector} dict. The file must already exist under ./data.
glove = load_glove('./data/glove.6B.300d.txt')

# Embed each description
def average_glove_embedding(texts, embedding_dict, dim=300):
    """Embed each text as the mean of the word vectors of its known tokens.

    Tokens are produced by lower-casing the text and splitting on whitespace, so
    a token with punctuation attached (e.g. "learning,") only matches if that
    exact string is a key of `embedding_dict`.

    Parameters
    ----------
    texts : iterable of str
        Texts to embed.
    embedding_dict : dict
        Maps a token to its 1-D vector (as returned by `load_glove`).
    dim : int, default 300
        Vector length used for the all-zero fallback row. It should equal the
        length of the vectors in `embedding_dict`.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of texts, dim). A text with no known token gets
        an all-zero row. An empty `texts` yields an array of shape (0, dim).
    """
    vectors = []
    for text in tqdm.tqdm(texts):
        word_vecs = [embedding_dict[tok] for tok in text.lower().split() if tok in embedding_dict]
        if word_vecs:
            vectors.append(np.mean(word_vecs, axis=0))
        else:
            # float32 matches the vectors produced by load_glove; a float64 zero
            # row would make np.vstack upcast the entire matrix.
            vectors.append(np.zeros(dim, dtype=np.float32))

    # np.vstack raises ValueError on an empty sequence, so handle "no texts" explicitly.
    if not vectors:
        return np.zeros((0, dim), dtype=np.float32)
    return np.vstack(vectors)

# One averaged GloVe vector per course description (dim defaults to 300).
glove_embeddings = average_glove_embedding(descriptions, glove)
print(f"GloVe Embeddings shape: {glove_embeddings.shape}")

100%|██████████| 93/93 [00:00<00:00, 2951.66it/s]

GloVe Embeddings shape: (93, 300)





In [5]:
# Encode every description with the all-MiniLM-L6-v2 sentence-transformer.
# The model is fetched from the Hugging Face Hub on first use (see the download output below).
minilm_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
minilm_embeddings = minilm_model.encode(descriptions, show_progress_bar=True)
print(f"MiniLM shape: {minilm_embeddings.shape}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


MiniLM shape: (93, 384)


In [6]:
# Load the E5-small-v2 tokenizer and encoder from the Hugging Face Hub.
# NOTE: e5_embed() below reads `tokenizer` and `model` as module-level globals,
# so these names must not be reassigned to something else later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-small-v2")
model = AutoModel.from_pretrained("intfloat/e5-small-v2")

def e5_embed(texts):
    """Embed texts with the module-level E5 `tokenizer` and `model`.

    Each text is prefixed with "passage: ", tokenized on its own (with truncation),
    and passed through the model with gradients disabled. The token states are
    then mean-pooled, weighted by the attention mask.

    Parameters
    ----------
    texts : iterable of str
        Texts to embed, processed one at a time.

    Returns
    -------
    numpy.ndarray
        The pooled vectors stacked vertically, one row per input text.
    """
    pooled_rows = []
    for raw_text in tqdm.tqdm(texts):
        encoded = tokenizer("passage: " + raw_text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            model_out = model(**encoded)
        token_states = model_out.last_hidden_state

        # Broadcast the attention mask over the hidden dimension so that only
        # real (non-padding) tokens contribute to the average.
        weights = encoded['attention_mask'].unsqueeze(-1).expand(token_states.size()).float()
        weighted_sum = (token_states * weights).sum(dim=1)
        # Clamp the token count to avoid dividing by zero.
        token_count = weights.sum(dim=1).clamp(min=1e-9)

        pooled_rows.append((weighted_sum / token_count).squeeze().numpy())
    return np.vstack(pooled_rows)

# Embed every course description with E5 (one forward pass per description).
e5_embeddings = e5_embed(descriptions)
print(f"E5-small-v2 shape: {e5_embeddings.shape}")

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

100%|██████████| 93/93 [00:30<00:00,  3.10it/s]

E5-small-v2 shape: (93, 384)





In [8]:
# Persist each embedding matrix under ./data as a .npy file.
# The TF-IDF matrix is converted to a dense array with .toarray() before saving.
np.save('./data/embeddings_tfidf.npy', tfidf_embeddings.toarray())
np.save('./data/embeddings_glove.npy', glove_embeddings)
np.save('./data/embeddings_minilm.npy', minilm_embeddings)
np.save('./data/embeddings_e5.npy', e5_embeddings)

## Model Evaluation
We randomly select five courses, use the same five for every model, and evaluate each model on the five courses it recommends for each of them.

We use two methods to assess model performance.

1. Objective: Count how many of the recommended courses share a `fulfills` domain with the selected course. (1: The courses share a domain / 0: The courses do not share a domain)
2. Subjective: Use human judgment to see if the recommendations make sense for each course. (1: The user is likely highly interested in the course / 0.5: The user may be somewhat interested, but the two courses are not strongly related / 0: The user is likely to show low interest in the course)

Under each criterion, the maximum score for a course is 5 (one point per recommendation) and the minimum is 0. For each model, we add up the scores of the five selected courses (maximum 25 per criterion) and compare these totals across all models.

In [9]:
def get_top_k_indices(index, embeddings, k=5):
    """Return the indices of the k rows most cosine-similar to row `index`.

    Parameters
    ----------
    index : int
        Row of `embeddings` used as the query.
    embeddings : numpy.ndarray
        2-D array with one embedding per row.
    k : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Row indices ordered from most to least similar, with the query row excluded.
    """
    query_row = embeddings[index].reshape(1, -1)
    scores = cosine_similarity(query_row, embeddings)[0]
    # Highest similarity first; drop the query row itself before truncating to k.
    ranked = np.argsort(scores)[::-1]
    return list(ranked[ranked != index][:k])

In [21]:
def evaluate_selected_courses(df, model_name, selected_titles, k=5):
    """
    Print top-k recommendations (with fulfills) for selected courses using the specified model.
    You will manually evaluate the relevance based on fulfills and human intuition.

    Parameters
    ----------
    df : pandas.DataFrame
        Course table with 'title' and 'fulfills' columns. Row i of `df` must
        correspond to row i of the embedding matrices. Rows are addressed by
        position, so `df` does not need a default RangeIndex.
    model_name : str
        One of 'tfidf', 'glove', 'minilm', 'e5'. Selects the module-level
        embedding matrix to search.
    selected_titles : iterable of str
        Course titles to look up, matched case-insensitively against df['title'].
        If several rows match, the first one is used. Titles with no match are
        reported and skipped.
    k : int, default 5
        Number of recommendations printed per course.

    Raises
    ------
    ValueError
        If `model_name` is not one of the supported names.
    """
    # Select embedding matrix
    if model_name == 'tfidf':
        embeddings = tfidf_embeddings.toarray()
    elif model_name == 'glove':
        embeddings = glove_embeddings
    elif model_name == 'minilm':
        embeddings = minilm_embeddings
    elif model_name == 'e5':
        embeddings = e5_embeddings
    else:
        raise ValueError("Invalid model name. Choose from 'tfidf', 'glove', 'minilm', 'e5'")

    print(f"\n{'=' * 90}")
    print(f"MODEL: {model_name.upper()}")
    print(f"{'=' * 90}")

    # Lower-case the title column once instead of once per requested title.
    titles_lower = df['title'].str.lower()

    for title in selected_titles:
        is_match = (titles_lower == title.lower()).fillna(False).to_numpy(dtype=bool)
        positions = np.flatnonzero(is_match)
        if positions.size == 0:
            print(f"Course not found: {title}")
            continue

        # Use row positions (not index labels) throughout: the embedding matrices
        # are positional, so labels are only valid when df has a default RangeIndex.
        pos = int(positions[0])
        print("\n" + "-" * 80)
        print(f"Course: {df['title'].iloc[pos]}")
        print(f"Fulfills: {df['fulfills'].iloc[pos]}")
        # Report the requested k rather than a hard-coded 5.
        print(f"Top {k} Recommendations:\n")

        top_k = get_top_k_indices(pos, embeddings, k=k)
        for i, rec_pos in enumerate(top_k, 1):
            rec_title = df['title'].iloc[rec_pos]
            rec_fulfills = df['fulfills'].iloc[rec_pos]
            print(f"{i}. {rec_title}")
            print(f"   Fulfills: {rec_fulfills}\n")

In [26]:
# Query courses used for every model's evaluation (the same list is passed to each call below).
# Titles are matched case-insensitively against df['title']; a title with no match
# is reported as "Course not found" by evaluate_selected_courses.
selected_courses = [
    "MPCS 53113 Natural Language Processing (Summer 2025)",
    "MPCS 51030 iOS Application Development (Winter 2026)",
    "MPCS 53110 Foundations of Computational Data Analysis (Spring 2026)",
    "MPCS 55005 Advanced Algorithms (Spring 2026)",
    "MPCS 57200 Generative AI (Winter 2026)"
]

### TF-IDF (Baseline model)



In [27]:
# Print the recommendations (default k=5) for each selected course using the TF-IDF embeddings.
evaluate_selected_courses(df, model_name='tfidf', selected_titles=selected_courses)


MODEL: TFIDF

--------------------------------------------------------------------------------
Course: MPCS 53113 Natural Language Processing (Summer 2025)
Fulfills: ElectiveSpecialization - Data Analytics (DA-2)
Top 5 Recommendations:

1. MPCS 53120 Applied Data Analysis (Winter 2026)
   Fulfills: ElectiveSpecialization - High Performance Computing (HPC-2)

2. MPCS 53120 Applied Data Analysis (Spring 2026)
   Fulfills: ElectiveSpecialization - High Performance Computing (HPC-2)

3. MPCS 53111 Machine Learning (Spring 2026)
   Fulfills: ElectiveSpecialization - Data Analytics (DA-1)Specialization - High Performance Computing (HPC-2)

4. MPCS 53111 Machine Learning (Autumn 2025)
   Fulfills: ElectiveSpecialization - Data Analytics (DA-1)Specialization - High Performance Computing (HPC-2)

5. MPCS 50101 Concepts of Programming (Autumn 2025)
   Fulfills: Immersion Programming


--------------------------------------------------------------------------------
Course: MPCS 51030 iOS Applica

**Objective score**: 2 + 1 + 3 + 4 + 1 = 11

**Subjective score**: 4.5 + 3 + 5 + 4.5 + 1 = 18

### GloVe

In [28]:
# Print the recommendations (default k=5) for each selected course using the averaged GloVe embeddings.
evaluate_selected_courses(df, model_name='glove', selected_titles=selected_courses)


MODEL: GLOVE

--------------------------------------------------------------------------------
Course: MPCS 53113 Natural Language Processing (Summer 2025)
Fulfills: ElectiveSpecialization - Data Analytics (DA-2)
Top 5 Recommendations:

1. MPCS 53120 Applied Data Analysis (Winter 2026)
   Fulfills: ElectiveSpecialization - High Performance Computing (HPC-2)

2. MPCS 53120 Applied Data Analysis (Spring 2026)
   Fulfills: ElectiveSpecialization - High Performance Computing (HPC-2)

3. MPCS 51042 Python Programming (Winter 2026)
   Fulfills: Core Programming

4. MPCS 53112 Advanced Data Analytics (Autumn 2025)
   Fulfills: ElectiveSpecialization - Data Analytics (DA-2)

5. MPCS 51042 Python Programming (Autumn 2025)
   Fulfills: Core Programming


--------------------------------------------------------------------------------
Course: MPCS 51030 iOS Application Development (Winter 2026)
Fulfills: ElectiveSpecialization - Application Development (APP-1)
Top 5 Recommendations:

1. MPCS 510

**Objective score**: 1 + 1 + 3 + 2 + 2 = 9

**Subjective score**: 4 + 1.5 + 5 + 2 + 1.5 = 14

### MiniLM

In [29]:
# Print the recommendations (default k=5) for each selected course using the MiniLM embeddings.
evaluate_selected_courses(df, model_name='minilm', selected_titles=selected_courses)


MODEL: MINILM

--------------------------------------------------------------------------------
Course: MPCS 53113 Natural Language Processing (Summer 2025)
Fulfills: ElectiveSpecialization - Data Analytics (DA-2)
Top 5 Recommendations:

1. MPCS 57200 Generative AI (Winter 2026)
   Fulfills: ElectiveSpecialization - Data Analytics (DA-2)

2. MPCS 57200 Generative AI (Autumn 2025)
   Fulfills: ElectiveSpecialization - Data Analytics (DA-2)

3. MPCS 51042 Python Programming (Winter 2026)
   Fulfills: Core Programming

4. MPCS 51087 High Performance Computing (Winter 2026)
   Fulfills: ElectiveSpecialization - High Performance Computing (HPC-1)

5. MPCS 51042 Python Programming (Autumn 2025)
   Fulfills: Core Programming


--------------------------------------------------------------------------------
Course: MPCS 51030 iOS Application Development (Winter 2026)
Fulfills: ElectiveSpecialization - Application Development (APP-1)
Top 5 Recommendations:

1. MPCS 51032 Advanced iOS Applicati

**Objective score**: 2 + 4 + 3 + 2 + 2 = 13

**Subjective score**: 3 + 4 + 5 + 5 + 2 = 19

### E5

In [30]:
# Print the recommendations (default k=5) for each selected course using the E5 embeddings.
evaluate_selected_courses(df, model_name='e5', selected_titles=selected_courses)


MODEL: E5

--------------------------------------------------------------------------------
Course: MPCS 53113 Natural Language Processing (Summer 2025)
Fulfills: ElectiveSpecialization - Data Analytics (DA-2)
Top 5 Recommendations:

1. MPCS 53120 Applied Data Analysis (Winter 2026)
   Fulfills: ElectiveSpecialization - High Performance Computing (HPC-2)

2. MPCS 53120 Applied Data Analysis (Spring 2026)
   Fulfills: ElectiveSpecialization - High Performance Computing (HPC-2)

3. MPCS 57200 Generative AI (Winter 2026)
   Fulfills: ElectiveSpecialization - Data Analytics (DA-2)

4. MPCS 57200 Generative AI (Autumn 2025)
   Fulfills: ElectiveSpecialization - Data Analytics (DA-2)

5. MPCS 53112 Advanced Data Analytics (Autumn 2025)
   Fulfills: ElectiveSpecialization - Data Analytics (DA-2)


--------------------------------------------------------------------------------
Course: MPCS 51030 iOS Application Development (Winter 2026)
Fulfills: ElectiveSpecialization - Application Developm

**Objective score**: 3 + 3 + 3 + 4 + 3 = 16

**Subjective score**: 5 + 3 + 5 + 4 + 3 = 20

According to the model experiments and evaluation above, the E5 model has the highest objective and subjective scores. As a result, we will use the E5 model to build the course recommendation system.