In [1]:
# Semester databases: the previous semester's file is copied to create the new one.
old_db_path, new_db_path = "data/FS2025.db", "data/HS2025.db"

# INI web endpoints read by the scraping cells.
# profile_url_prefix is the base used to resolve relative profile links.
profile_url_prefix = "https://services.ini.uzh.ch/"
persons_url = "https://services.ini.uzh.ch/admin/modules/uzh/people.php"
colloquia_url = "https://services.ini.uzh.ch/admin/modules/uzh/colloquia"

In [2]:
# Create the new semester's database as a copy of the previous semester's.
import os.path as path
import shutil

# Refuse to overwrite an existing database: copying again would replace a file
# that may already contain this semester's updates. An explicit exception is
# used instead of `assert`, which is stripped when Python runs with -O and
# carries no explanation for the reader.
if path.exists(new_db_path):
    raise FileExistsError(
        f"{new_db_path} already exists; delete or rename it before re-creating it"
    )
shutil.copy2(old_db_path, new_db_path)

'data/HS2025.db'

In [3]:
# Update person information with the information available online
from bs4 import BeautifulSoup
import requests
import sqlite3
import time
from tqdm import tqdm
from urllib.parse import urljoin

# One log file per run, named after the time at which the cell was started.
import os

timestamp = time.strftime('%Y-%m-%d %H-%M-%S')
log_path = f"logs/{timestamp}.txt"
# Create the log directory when it does not exist yet; open() would otherwise
# fail with FileNotFoundError.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
# Person names contain non-ASCII characters, so pin the encoding instead of
# relying on the platform default.
f_log = open(log_path, "w", encoding="utf-8")

# Step 1: Get the person information available online.
# requests has no default timeout, and without the status check an HTTP error
# page would be parsed as if it were the people listing.
response = requests.get(persons_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# We need only the information under these tables of the people page.
student_categories = ("PhD Students", "NSC Master Students")

# One (name, position, email_address, profile_url) tuple per listed student.
persons_data = []
for section in tqdm(soup.find_all("div", class_="headerfirst peopleTable")):
    # Skip sections that are not a student table or lack the expected markup.
    h2 = section.find("h2")
    if h2 is None:
        continue
    category = h2.get_text()
    if category not in student_categories:
        continue
    table = section.find("table")
    tbody = table.find("tbody") if table is not None else None
    if tbody is None:
        continue

    # "PhD Students" -> "PhD Student"; the same for every row of this table.
    position = category.rstrip("s")

    for tr in tbody.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) < 2:
            continue

        # The name is read from the link in the first cell; fall back to the
        # plain cell text when a row has no link.
        name_tag = tds[0].find("a")
        name_source = name_tag if name_tag is not None else tds[0]
        hard_to_read_name = name_source.get_text(strip=True)
        if not hard_to_read_name:
            # No name at all, so the row cannot describe a person.
            continue

        # Listed names have two comma-separated parts (presumably "Last, First");
        # swap them and title-case the result.
        hard_to_read_name_parts = [
            part.strip() for part in hard_to_read_name.split(",")
        ]
        if len(hard_to_read_name_parts) != 2:
            # Fail loudly: a silently skipped person would later be marked
            # as no longer being a student.
            raise ValueError(
                f"Unexpected name format on the people page: {hard_to_read_name!r}"
            )
        easy_to_read_name = " ".join(hard_to_read_name_parts[::-1]).title()

        # Profile links may be relative; resolve them against the site prefix.
        profile_url = name_tag.get("href") if name_tag is not None else None
        if profile_url and not profile_url.startswith(profile_url_prefix):
            profile_url = urljoin(profile_url_prefix, profile_url)

        email_address_tag = tds[1].find("a", href=lambda x: x and x.startswith("mailto:"))
        email_address = email_address_tag.get_text(strip=True) if email_address_tag else None

        persons_data.append((easy_to_read_name, position, email_address, profile_url))

# Step 3: Update existing persons positions, email addresses, and profile urls + Insert new persons
# NOTE(review): for existing persons a new address is only added to EmailAddresses;
# Persons.ContactEmailAddress is left untouched - confirm that this is intended.
conn = sqlite3.connect(new_db_path)
cursor = conn.cursor()

for name, position, email_address, profile_url in tqdm(persons_data):
    # Persons are matched by exact name.
    cursor.execute("SELECT PersonID, Position, ProfileURL FROM Persons WHERE Name = ?", (name,))
    rows = cursor.fetchall()

    # Ambiguous match: log it and skip only this person. Stopping the loop
    # here would silently leave every remaining scraped person unprocessed.
    if len(rows) > 1:
        print(f"Warning: Potentially duplicated person ({name=})", file=f_log)
        continue

    # Person is new
    if len(rows) == 0:
        cursor.execute("""
            INSERT INTO Persons (Name, Position, ContactEmailAddress, ProfileURL)
            VALUES (?, ?, ?, ?)
        """, (name, position, email_address, profile_url))

        # Also insert email_address into EmailAddresses
        if email_address:
            # lastrowid still refers to the Persons row inserted just above.
            person_id = cursor.lastrowid
            cursor.execute("INSERT INTO EmailAddresses (PersonID, EmailAddress) VALUES (?, ?)", (person_id, email_address))

    # Person already exists
    else:
        person_id, current_position, current_profile_url = rows[0]

        # Update position if different
        if position and position != current_position:
            print(f"Updating {name}: {current_position=} -> {position=}", file=f_log)
            cursor.execute("UPDATE Persons SET Position = ? WHERE PersonID = ?", (position, person_id))

        # Update profile URL if different
        if profile_url and profile_url != current_profile_url:
            print(f"Updating {name}: {current_profile_url} -> {profile_url}", file=f_log)
            cursor.execute("UPDATE Persons SET ProfileURL = ? WHERE PersonID = ?", (profile_url, person_id))

        # Add email address if new
        if email_address:
            cursor.execute("SELECT 1 FROM EmailAddresses WHERE PersonID = ? AND EmailAddress = ?", (person_id, email_address))
            if not cursor.fetchone():
                print(f"Updating {name}: {email_address}", file=f_log)
                cursor.execute("INSERT INTO EmailAddresses (PersonID, EmailAddress) VALUES (?, ?)", (person_id, email_address))

    # Commit per person so an interruption keeps everything processed so far.
    conn.commit()

# Step 4: All people not among the online information are no longer students at INI, so update their position value
scraped_names = [name for name, _, _, _ in persons_data]
if not scraped_names:
    # SQLite accepts an empty list and `x NOT IN ()` is true for every row, so
    # an empty scrape would relabel the whole Persons table. Stop instead.
    print("Error: no persons were scraped; Positions left unchanged", file=f_log)
    conn.close()
    f_log.close()
    raise RuntimeError(
        "No persons were scraped; refusing to mark every person as 'Not INI Student'"
    )

cursor.execute("SELECT COUNT(*) FROM Persons WHERE Position = ?", ("Not INI Student",))
old_n_non_students = cursor.fetchone()[0]

# sqlite3 cannot bind a list to a single parameter, so generate one "?" placeholder per name
placeholders = ",".join("?" for _ in scraped_names)
cursor.execute(f"""
    UPDATE Persons
    SET Position = 'Not INI Student'
    WHERE Name NOT IN ({placeholders})
""", scraped_names)

cursor.execute("SELECT COUNT(*) FROM Persons WHERE Position = ?", ("Not INI Student",))
new_n_non_students = cursor.fetchone()[0]
print(f"Number of non students: {old_n_non_students} -> {new_n_non_students}", file=f_log)
conn.commit()

# Step 5: Update everyone with their supervisor/mentor name
# The try/finally guarantees that the database connection and the log file are
# released even when a profile page cannot be processed.
try:
    cursor.execute("SELECT PersonID, Name, ProfileURL FROM Persons WHERE ProfileURL IS NOT NULL")
    people_with_profiles = cursor.fetchall()

    for person_id, name, profile_url in tqdm(people_with_profiles):
        # Stays None when the page cannot be fetched or names no supervisor;
        # the stored value is then left unchanged.
        supervisor_name = None
        try:
            response = requests.get(profile_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            print(f"Could not fetch profile of {name=}: {error}", file=f_log)
        else:
            soup = BeautifulSoup(response.text, "html.parser")
            # The supervisor is the first link after the "Mentor"/"Supervisor" heading.
            supervisor_tag = soup.find("h2", string=lambda text: text and ("Mentor" in text or "Supervisor" in text))
            supervisor_link = supervisor_tag.find_next("a") if supervisor_tag is not None else None
            if supervisor_link is None:
                print(f"No supervisor/mentor found for {name=}", file=f_log)
            else:
                supervisor_name = supervisor_link.get_text(strip=True)

        if supervisor_name is not None:
            cursor.execute("SELECT Supervisor FROM Persons WHERE PersonID = ?", (person_id,))
            rows = cursor.fetchall()
            if len(rows) == 1:
                current_supervisor_name = rows[0][0]
                if current_supervisor_name != supervisor_name:
                    cursor.execute("UPDATE Persons SET Supervisor = ? WHERE PersonID = ?", (supervisor_name, person_id))
                    print(f"Updated supervisor for {name=}: {current_supervisor_name} -> {supervisor_name}", file=f_log)
                    conn.commit()

        # Pause between requests to avoid hammering the server.
        time.sleep(0.1)

    conn.commit()
finally:
    conn.close()
    f_log.close()

100%|██████████| 19/19 [00:00<00:00, 6493.26it/s]
100%|██████████| 91/91 [00:00<00:00, 314.73it/s]
100%|██████████| 105/105 [00:48<00:00,  2.17it/s]


In [4]:
# Assign each person a score (based on a random number and weights)
import sqlite3
import pandas as pd
import numpy as np

from data.names_of_unavailable import names_of_unavailable

# Connect to SQLite
conn = sqlite3.connect(new_db_path)

# Load people with their past aperos counts.
# ORDER BY makes the row order explicit: the seeded random numbers below are
# assigned by row position, so the scores are only reproducible if the rows
# always arrive in the same order.
query = """
SELECT 
    p.PersonID,
    p.Name,
    p.ContactEmailAddress,
    SUM(CASE WHEN a.Status='done' THEN 1 ELSE 0 END) AS DoneCount,
    SUM(CASE WHEN a.Status='found_sub' THEN 1 ELSE 0 END) AS FoundSubCount,
    SUM(CASE WHEN a.Status='awol' THEN 1 ELSE 0 END) AS AWOLCount
FROM Persons p
LEFT JOIN AperoAssignments a ON p.PersonID = a.PersonID
WHERE p.Position IN ('PhD Student', 'NSC Master Student')
GROUP BY p.PersonID
ORDER BY p.PersonID
"""
try:
    df = pd.read_sql(query, conn)
finally:
    # Release the connection even if the query fails.
    conn.close()

# Write down predicted availability
# Get names from admin, ad-hoc identify the corresponding names in the db and use those here
# For some people this includes middle names, for others not. Some names have accents
df["Available"] = 1
df.loc[df["Name"].isin(names_of_unavailable), "Available"] = 0

# Base random number
np.random.seed(17)
df["Random"] = np.random.rand(len(df))
df["TotalAssigned"] = df["DoneCount"] + df["FoundSubCount"] + df["AWOLCount"]

# Define weights
weight_done = 1.0      # reduce score for done
weight_found_sub = 0.5 # reduce score for found_sub
weight_awol = -1.5     # negative weight → more AWOL increases score
penalty_never_assigned = 0.8  # slightly reduce score if total assigned = 0
# Lower bound for the denominator. With the weights above every positive
# denominator is at least 0.5, so this bound only affects rows whose AWOL
# count drives the denominator to zero or below.
min_denominator = 0.25

# Compute score and multiply by Availability.
# The negative AWOL weight can make the raw denominator zero or negative, which
# would give inf/NaN or a negative score; clipping such a score to zero would
# rank AWOL people last, the opposite of the intent stated for weight_awol.
# Bounding the denominator keeps every score finite and non-negative and gives
# those people a high score instead.
denominator = (
    0.5
    + df["DoneCount"] * weight_done
    + df["FoundSubCount"] * weight_found_sub
    + df["AWOLCount"] * weight_awol
    + np.where(df["TotalAssigned"] == 0, penalty_never_assigned, 0.0)
)
df["Score"] = df["Available"] * df["Random"] / denominator.clip(lower=min_denominator)

# Sort descending
df = df.sort_values("Score", ascending=False).reset_index(drop=True)

df[["Name", "ContactEmailAddress", "DoneCount", "FoundSubCount", "AWOLCount", "Score"]].head(20)

Unnamed: 0,Name,ContactEmailAddress,DoneCount,FoundSubCount,AWOLCount,Score
0,Xiaoliang Wang,xiaoliang.wang@uzh.ch,0,0,0,0.738728
1,Raimon Bullich,raimonbullich@ini.ethz.ch,0,0,0,0.660404
2,Sandro Ackermann,sandro@ini.uzh.ch,1,0,0,0.640525
3,Giovanni Camisa,giovanni.camisa@uzh.ch,1,0,0,0.636225
4,Maryada Maryada,maryada@ini.uzh.ch,0,0,0,0.611996
5,Florian Bolli,bollif@ethz.ch,1,0,0,0.61074
6,Emilly Zoë Jeanne Sidaine--Daumiller,esidai@ini.uzh.ch,0,0,0,0.605373
7,Leonardo Martinelli,leomar@ini.uzh.ch,0,0,0,0.603911
8,Maris Basha,maris@ini.uzh.ch,1,0,0,0.591988
9,Liyuan Li,liyli@ini.uzh.ch,1,0,0,0.589243


In [5]:
# Create assignment table
from datetime import datetime
from bs4 import BeautifulSoup
import requests

# requests has no default timeout, and without the status check an HTTP error
# page would be parsed and silently yield an empty schedule.
response = requests.get(colloquia_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract apero info: one record per table row that has at least four cells
apero_records = []
for tr in soup.find_all("tr"):
    tds = tr.find_all("td")
    if len(tds) < 4:
        continue

    # Convert date from DD.MM.YY to ISO format
    raw_date = tds[0].get_text(strip=True)
    date = datetime.strptime(raw_date, "%d.%m.%y").strftime("%Y-%m-%d")

    # Speaker cell: keep all of its text lines (name, affiliation), newline-separated
    speaker_info = tds[2].get_text(separator="\n", strip=True)

    # Talk title
    title = tds[3].get_text(strip=True)

    apero_records.append({
        "Date": date,
        "Speaker": speaker_info,
        "Title": title,
    })

# Passing the columns explicitly keeps them present even when no row was found.
aperos = pd.DataFrame(apero_records, columns=["Date", "Speaker", "Title"])
aperos

Unnamed: 0,Date,Speaker,Title
0,2025-09-19,Herbert Jaeger\nUniversity of Groningen (Groni...,"""Hrhhmm... what does ‘Computing’ mean anyway? ..."
1,2025-09-26,Jean-Pascal Pfister\nDepartment of Physiology...,"""Why spikes? An information theory perspective"""
2,2025-10-03,Gregor Schuhknecht\nMPI Brain Research (Frankf...,"""Structure and function of biological neural n..."
3,2025-10-10,Tor Stensola\nUniversity of Agder (Kristiansan...,TBA
4,2025-10-17,Johannes Sarnthein\nUniversitätsspital Zürich ...,"""The hippocampus in human working memory"""
5,2025-10-24,Elisabetta Chicca\nUniversity of Groningen (Gr...,TBA
6,2025-11-07,Steffen Schneider\nHelmholtz Munich ( Neuherbe...,TBA
7,2025-11-14,"Luca Benini\nETH Zurich, D-ITET (Zurich, Switz...",""" End-to-End Open Platforms for Embodied Gener..."
8,2025-11-21,Nathalie Rochefort\nUniversity of Edinburgh (...,TBA
9,2025-11-28,"Adil Khan\nInstitute of Psychiatry, Psychology...","""Neural circuits underlying flexible behaviour"""


In [None]:
# assign people with top scores to the aperos
n_persons_per_apero = 3

# df is sorted by score. People flagged as unavailable have a score of zero
# and sit at the bottom, but would still be picked when there are fewer
# available people than slots, so exclude them explicitly.
candidates = df[df["Available"] == 1]
n_slots = len(aperos) * n_persons_per_apero
top_candidates = candidates[:n_slots]
contact_lines = [
    f"{name}\n{email_address}"
    for name, email_address in zip(top_candidates["Name"], top_candidates["ContactEmailAddress"])
]

# One row per apero; pad the last row so that every row has exactly
# n_persons_per_apero entries even when the candidates run out.
contact_rows = []
for i in range(0, len(contact_lines), n_persons_per_apero):
    row = contact_lines[i:i+n_persons_per_apero]
    row += [None] * (n_persons_per_apero - len(row))
    contact_rows.append(row)

contacts = pd.DataFrame(
    contact_rows,
    columns = [f"Person {i}" for i in range(1, n_persons_per_apero+1)],
)
schedule = pd.concat([aperos, contacts], axis=1)
schedule

Unnamed: 0,Date,Speaker,Title,Person 1,Person 2,Person 3
0,2025-09-19,Herbert Jaeger\nUniversity of Groningen (Groni...,"""Hrhhmm... what does ‘Computing’ mean anyway? ...",Xiaoliang Wang\nxiaoliang.wang@uzh.ch,Raimon Bullich\nraimonbullich@ini.ethz.ch,Sandro Ackermann\nsandro@ini.uzh.ch
1,2025-09-26,Jean-Pascal Pfister\nDepartment of Physiology...,"""Why spikes? An information theory perspective""",Giovanni Camisa\ngiovanni.camisa@uzh.ch,Maryada Maryada\nmaryada@ini.uzh.ch,Florian Bolli\nbollif@ethz.ch
2,2025-10-03,Gregor Schuhknecht\nMPI Brain Research (Frankf...,"""Structure and function of biological neural n...",Emilly Zoë Jeanne Sidaine--Daumiller\nesidai@i...,Leonardo Martinelli\nleomar@ini.uzh.ch,Maris Basha\nmaris@ini.uzh.ch
3,2025-10-10,Tor Stensola\nUniversity of Agder (Kristiansan...,TBA,Liyuan Li\nliyli@ini.uzh.ch,Jacob Ayers\njayers@ethz.ch,Luca Nicolas Yapura\nlyapura@ethz.ch
4,2025-10-17,Johannes Sarnthein\nUniversitätsspital Zürich ...,"""The hippocampus in human working memory""",Orhun Eren\noreren@student.ethz.ch,Robin Duvoisin\nrobinarnaud.duvoisin@uzh.ch,Chonghao Cai\nchonghao.cai@uzh.ch
5,2025-10-24,Elisabetta Chicca\nUniversity of Groningen (Gr...,TBA,Tianshu Shen\ntianshu.shen@uzh.ch,Siqi Liu\nsiqi@ini.ethz.ch,Joelle Faybishenko\njoellev.faybishenko@uzh.ch
6,2025-11-07,Steffen Schneider\nHelmholtz Munich ( Neuherbe...,TBA,Aidan Truel\naidangarrick.truel@uzh.ch,Patrick Bösch\nboeschpa@ethz.ch,Julien Schmidt\njulien@ini.uzh.ch
7,2025-11-14,"Luca Benini\nETH Zurich, D-ITET (Zurich, Switz...",""" End-to-End Open Platforms for Embodied Gener...",Valentin Magis\nvalentinwilfriedfernandosimon....,Marcel Socoró Garrigosa\nmarcel.socorogarrigos...,Victoria Ploerer\nvictoriaisabella.ploerer@uzh.ch
8,2025-11-21,Nathalie Rochefort\nUniversity of Edinburgh (...,TBA,Alessandro Bifulco\nalessandro.bifulco@uzh.ch,William Buxton\nwilliamjoseph.buxton@uzh.ch,Zhining Zhang\nzhining.zhang@uzh.ch
9,2025-11-28,"Adil Khan\nInstitute of Psychiatry, Psychology...","""Neural circuits underlying flexible behaviour""",Simon Steffens\nssteffens@student.ethz.ch,Xueqian Ma\nxueqma@student.ethz.ch,Elena Cucculelli\nelena.cucculelli@uzh.ch


In [None]:
# export schedule as csv, stored next to the semester database and named after it.
# Deriving the file name from new_db_path keeps it in step with the semester
# configured in the first cell, so a new semester cannot overwrite an older schedule.
from pathlib import Path

schedule_csv_path = Path(new_db_path).with_suffix(".csv")
schedule.to_csv(schedule_csv_path, index=False)