In [1]:
import html
import os
import re

import faiss
import numpy as np
import pandas as pd
from canvasapi import Canvas
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
API_URL = 'https://canvas.ubc.ca/'
# Read the access token from the environment so it is never saved inside the notebook.
# When CANVAS_API_KEY is unset this falls back to the empty placeholder used previously.
API_KEY = os.environ.get('CANVAS_API_KEY', '')
canvas = Canvas(API_URL, API_KEY)
course_id = 161721
course = canvas.get_course(course_id)

# Attributes copied from every discussion topic; the names double as column names.
discussion_fields = [
    'id',
    'title',
    'message',
    'posted_at',
    'user_name',
    'discussion_type',
    'published',
    'locked',
    'locked_for_user',
    'delayed_post_at',
    'require_initial_post',
    'subscribed',
    'read_state',
    'assignment_id',
    'group_category_id',
    'root_topic_id',
]

# Output column name -> attribute name on a discussion entry.
entry_fields = {
    'entry_id': 'id',
    'user_id': 'user_id',
    'user_name': 'user_name',
    'created_at': 'created_at',
    'updated_at': 'updated_at',
    'message': 'message',
    'read_state': 'read_state',
    'parent_entry_id': 'parent_id',
}

# Lists to store discussion and reply data
discussion_data = []
reply_data = []

# Get all discussions for the course
discussions = course.get_discussion_topics()
for discussion in discussions:
    # getattr(..., None): canvasapi only sets attributes for keys present in the API
    # response, so plain attribute access raises AttributeError when a field is missing.
    discussion_data.append(
        {field: getattr(discussion, field, None) for field in discussion_fields}
    )

    # Get all entries (replies) for the current discussion using get_topic_entries()
    # NOTE(review): the Canvas endpoint behind this call is documented as returning
    # top-level entries only, so nested replies are presumably not collected here - confirm.
    entries = discussion.get_topic_entries()
    for entry in entries:
        reply_row = {'discussion_id': discussion.id}
        # Missing fields (presumably e.g. on deleted entries - verify) become None
        # instead of aborting the whole download.
        reply_row.update(
            {column: getattr(entry, attr, None) for column, attr in entry_fields.items()}
        )
        reply_data.append(reply_row)

# Explicit column lists keep the schema stable even when nothing was returned.
discussions_df = pd.DataFrame(discussion_data, columns=discussion_fields)
replies_df = pd.DataFrame(reply_data, columns=['discussion_id', *entry_fields])

In [3]:
# Preview the first rows of the discussions table as a quick sanity check
discussions_df.head()

Unnamed: 0,id,title,message,posted_at,user_name,discussion_type,published,locked,locked_for_user,delayed_post_at,require_initial_post,subscribed,read_state,assignment_id,group_category_id,root_topic_id
0,2379816,Innovation Hub,<p>The Innovation Hub is an inclusive and coll...,2024-10-17T18:21:55Z,Tue Hoang,threaded,True,False,False,,,False,read,,,
1,2379658,AI: How-to setup Local and Cloud-Based Environ...,"<p>Responsible AI:</p>\n<div class=""page"" titl...",2024-10-17T17:13:17Z,Tue Hoang,threaded,True,False,False,,,False,read,,,
2,2375553,HELP: Adding additional group members,"<p>Hi, I created a group and added 1 other gro...",2024-10-12T22:18:55Z,Susannah Sun,threaded,True,False,False,,,False,read,,,
3,2349487,Data Questions,<p>Ask your data questions here!&nbsp;</p>,2024-10-16T19:05:20Z,,threaded,True,False,False,,,False,read,,,
4,2349486,General Hackathon Questions,<p>Any questions about the hackathon that we h...,2024-09-25T19:05:20Z,,threaded,True,False,False,,,False,read,,,


In [4]:
# Function to extract text inside <p> and links
def extract_message_and_links(html_text):
    """Convert an HTML message into plain paragraph text followed by its links.

    Only text inside ``<p>`` elements is kept (with or without attributes on the
    tag); content outside paragraphs is still dropped. Nested tags are removed,
    HTML entities such as ``&nbsp;`` / ``&amp;`` are decoded, and runs of
    whitespace are collapsed so the embedding model sees clean text.

    Parameters
    ----------
    html_text : str or None
        Raw HTML of the message. Non-string values (``None``, ``NaN``) are
        treated as an empty message instead of raising ``TypeError``.

    Returns
    -------
    str
        ``"<paragraph text>. Links: <comma-separated href values>"``.
    """
    # Missing messages reach us as None/NaN when applied over a DataFrame column.
    if not isinstance(html_text, str):
        html_text = ""

    # `\b[^>]*` accepts <p class="..."> while still rejecting <pre>, <param>, ...
    paragraphs = re.findall(r'<p\b[^>]*>(.*?)</p>', html_text, re.DOTALL | re.IGNORECASE)

    cleaned_paragraphs = []
    for paragraph in paragraphs:
        # Strip tags before decoding entities so an escaped "&lt;b&gt;" stays literal text;
        # a space is substituted so "one<br>two" does not collapse into "onetwo".
        text = re.sub(r'<[^>]+>', ' ', paragraph)
        text = html.unescape(text)
        # str.split() also treats the non-breaking space produced by &nbsp; as whitespace.
        text = " ".join(text.split())
        if text:
            cleaned_paragraphs.append(text)
    message_text = " ".join(cleaned_paragraphs)  # Join in case of multiple <p> tags

    # Extract links after href= (entities such as &amp; inside URLs are decoded)
    links = [html.unescape(link) for link in re.findall(r'href="(.*?)"', html_text)]
    links_text = ", ".join(links)  # Join multiple links

    # Combine message and links
    return f"{message_text}. Links: {links_text}"

# Derive a plain-text version of every message and store it next to the raw HTML
cleaned_messages = discussions_df['message'].apply(extract_message_and_links)
discussions_df['cleaned_message'] = cleaned_messages

# Assemble the frame fed to the embedding model: one id and one text per discussion
discussion_text = (
    "Title: "
    + discussions_df['title']
    + ", Message: "
    + discussions_df['cleaned_message']
)
discussions_ready_for_model = pd.DataFrame(
    {
        'discussion_id': discussions_df['id'],
        'discussion_text': discussion_text,
    }
)

In [5]:
# Work on a copy so the prepared frame itself is left untouched
df = discussions_ready_for_model.copy()

# Sentence embedding model (auto-detects GPU if available)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every discussion text and hold the result as a numpy array
discussion_texts = df['discussion_text'].tolist()
embeddings = np.array(
    model.encode(discussion_texts, batch_size=16, show_progress_bar=True)
)

# The embedding width is the size of the second axis
d = embeddings.shape[1]

# Empty flat FAISS index using the L2 (Euclidean) metric
index = faiss.IndexFlatL2(d)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Start from an empty index so that re-running this cell (e.g. to try another query)
# does not append the same vectors again and return duplicate hits.
index.reset()
index.add(embeddings)
print(f"Total vectors indexed: {index.ntotal}")

# Example query
query_text = "What is the best condiment?"

# Generate embedding for the query
query_embedding = model.encode([query_text])

k = 5  # Search for top 5 neighbors initially
# Example cut-off. NOTE: IndexFlatL2 reports *squared* Euclidean distances,
# so this value is in squared units.
threshold = 1.5

# Perform the search
distances, indices = index.search(np.array(query_embedding), k)

# Filter results based on the threshold. FAISS pads the result with label -1 when the
# index holds fewer than k vectors; skip those so they can never be used as row positions.
filtered_results = [
    (idx, dist)
    for idx, dist in zip(indices[0], distances[0])
    if idx >= 0 and dist <= threshold
]

Total vectors indexed: 7


In [7]:
# Report the query followed by every discussion that passed the distance filter
print("\nQuery:", query_text)
print("\nMatching discussions below the threshold:")

if filtered_results:
    for idx, dist in filtered_results:
        match = df.iloc[idx]  # positional lookup: FAISS labels are row positions
        print(f"Discussion ID: {match['discussion_id']}, Text: '{match['discussion_text']}', Distance: {dist}")
else:
    print("No discussions found below the threshold.")


Query: What is the best condiment?

Matching discussions below the threshold:
Discussion ID: 2349485, Text: 'Title: Ketchup or Mustard?, Message: Do you prefer ketchup or mustard? Why?. Links: ', Distance: 1.045323371887207
