In [4]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [5]:
# Scrape the PyCon AU 2024 programme: collect every link inside <main> on the
# listing page, fetch each linked page, and record its <title> text and the
# text of its "lede" <div>. The result is `df` with columns Title/Description.
base_url = "https://2024.pycon.org.au"
url = urljoin(base_url, "/program/list/")
REQUEST_TIMEOUT = 30  # seconds; requests has no default timeout, so a stalled server would hang the cell

data = []

# One Session reuses the underlying connection across the ~60 page fetches.
with requests.Session() as http:
    # Step 1: Fetch the main program page (fail loudly on an HTTP error
    # instead of silently parsing an error page).
    response = http.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Step 2: Extract URLs within the <main> section
    main_section = soup.find('main')
    if main_section is None:
        raise RuntimeError(f"No <main> element found at {url}; the page layout may have changed")
    session_links = main_section.find_all('a', href=True)

    # Resolve each href against the listing page so root-relative, relative and
    # absolute links all become valid URLs, then drop repeats while keeping the
    # page order (dict preserves insertion order).
    session_urls = list(dict.fromkeys(urljoin(url, link['href']) for link in session_links))

    # Step 3 & 4: Extract the title and lede content from each session page
    for session_url in session_urls:
        try:
            session_response = http.get(session_url, timeout=REQUEST_TIMEOUT)
            session_response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable page should not throw away the rest of the scrape.
            print(f"Skipping {session_url}: {exc}")
            continue

        session_soup = BeautifulSoup(session_response.text, 'html.parser')

        # Extract title; fall back to the URL when the page has no <title>.
        title_tag = session_soup.find('title')
        title = title_tag.text.strip() if title_tag else session_url

        # Match on the "lede" class alone: the "astro-…" suffix the page also
        # carries looks build-generated, so pinning it would break on a rebuild.
        lede_content = session_soup.find('div', class_='lede')
        if lede_content:
            description = lede_content.text.strip()
        else:
            description = "No description available"

        data.append([title, description])

# Step 5: Create DataFrame
df = pd.DataFrame(data, columns=['Title', 'Description'])


In [6]:
# Display the scraped sessions (one row per fetched page: Title, Description).
df

Unnamed: 0,Title,Description
0,A Lazy Person's Guide to Building REST Clients...,Sick of writing lots of dedicated methods to p...
1,Adding File System context to pathlib—PyCon AU...,Adding file system specific information (e.g. ...
2,AI perceptions of gender—PyCon AU 2024,Why does AI perceive gender? Is this something...
3,As a Teacher. I have no Time to learn Programm...,Teaching is a demanding job that takes away th...
4,"Astronomy with Python, for non-astronomers—PyC...",Astronomers have been dealing with digital dat...
...,...,...
56,What Python Can Learn From Other Languages—PyC...,Python is great! It's been a mainstay of web d...
57,Where am I? What am I doing? Mobile App develo...,A modern mobile phone isn't just a powerful CP...
58,Who tests the testers? Making and testing pipe...,You likely have a good pipeline that either ru...
59,Why UUIDs are Secretly Incredibly Fascinating—...,What even is '06462f89-b4ef-7f7d-8000-edda1bba...


In [8]:
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch

# Streamlit UI: pick a session and list the sessions whose descriptions are
# most similar to it (cosine similarity of sentence-transformer embeddings).
#
# Data: uses the `df` built by the scraping step (columns 'Title' and
# 'Description'). To run this as a standalone script, load it instead, e.g.
# df = pd.read_csv('session_data.csv')


@st.cache_resource
def _load_model():
    """Load the pre-trained sentence-transformer once and reuse it across reruns."""
    return SentenceTransformer('all-MiniLM-L6-v2')


@st.cache_resource
def _embed_descriptions(descriptions):
    """Encode a tuple of description strings into one embedding tensor.

    Cached on the tuple's contents, so widget interactions (which rerun the
    whole script) do not re-encode every description.
    """
    return _load_model().encode(list(descriptions), convert_to_tensor=True)


if df.empty:
    st.warning("No sessions available to compare.")
else:
    model = _load_model()

    # Compute embeddings for all descriptions
    embeddings = _embed_descriptions(tuple(df['Description'].tolist()))

    # Streamlit UI to select a title. The options are row positions (shown as
    # titles) so that two sessions sharing a title remain distinguishable.
    selected_pos = st.selectbox(
        "Select a session title",
        list(range(len(df))),
        format_func=lambda pos: df['Title'].iloc[pos],
    )
    selected_title = df['Title'].iloc[selected_pos]
    selected_description = df['Description'].iloc[selected_pos]

    # The selected description was already encoded above; reuse that row.
    selected_embedding = embeddings[selected_pos]

    # Compute cosine similarities between the selected description and all others
    cosine_similarities = util.pytorch_cos_sim(selected_embedding, embeddings).flatten()

    # Exclude the selected session by position. Dropping "the top hit" is not
    # reliable: sessions with identical descriptions tie with the selected one,
    # so it is not guaranteed to rank first.
    ranked_similarities = cosine_similarities.clone()
    ranked_similarities[selected_pos] = float('-inf')

    # Find the top N most similar descriptions
    N = st.slider("Number of similar sessions to display", 1, 10, 5)
    k = min(N, len(df) - 1)  # torch.topk raises if k exceeds the number of candidates
    top_n_indices = torch.topk(ranked_similarities, k).indices

    # Display the most similar descriptions
    st.write(f"Top {N} most similar sessions to '{selected_title}':")
    for index in top_n_indices:
        index = int(index)  # Convert tensor index to int
        st.write(f"**{df.iloc[index]['Title']}**")
        st.write(df.iloc[index]['Description'])
        st.write("----")
