# Student Thesis Supervisor Matching

The problem to solve is matching student preferences to the available supervisor slots. Ideally, each supervisor ends up with an equal number of students and each student gets their highest-preference topic.

## Extract supervisors and topics

Topics and supervisors are collected from the website. Note that if a person has left the group, the supervisor of their topic is not detected correctly: the website automatically shows the contact for thesis coordination instead, and that contact is falsely taken as the supervisor.

In [51]:
import requests
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Scrape the paginated thesis-topic listing, follow every topic link, follow
# every contact-card link on the topic page, and export two tables:
#   supervisors.csv - one row per distinct person page
#   topics.csv      - one row per topic with up to two supervisor links
web_url = 'https://www.wur.nl'
topics_page_url = urljoin(web_url,'/en/research-results/chair-groups/environmental-sciences/laboratory-of-geo-information-science-and-remote-sensing/education/thesis/msc-thesis-topics/all-topics.htm?f46940599=')

id_prefix = "mgi_24_"

# Column layout (and order) of the two exported tables.
TOPIC_COLUMNS = ["ID", "Topic", "Supervisor", "Second Supervisor", "Link", "Max. Students"]
SUPERVISOR_COLUMNS = ["ID", "Name", "Titled", "Role", "Affiliation", "Link", "Max. Students"]

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the whole crawl.
REQUEST_TIMEOUT = 30

# Student capacity is only differentiated for people whose affiliation
# contains this marker; everybody else gets OUTSIDE_GROUP_CAPACITY.
GROUP_MARKER = "Geo-information"
OUTSIDE_GROUP_CAPACITY = 1
# Capacity for group members whose role matches none of the keywords below
# (the original script labelled this fallback "postdoc").
DEFAULT_GROUP_CAPACITY = 2
# (keyword, capacity) pairs matched against the lower-cased role. The first
# hit wins, so order matters: "universitair docent" must match
# "universitair" (5) before it can match "docent" (3).
ROLE_CAPACITY = (
    ("professor", 5),
    ("universitair", 5),
    ("lecturer", 3),
    ("docent", 3),
    ("promovendus", 1),
    ("phd", 1),
)


def _fetch_soup(url):
    """Download *url* and return the page parsed with ``html.parser``."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'html.parser')


def _max_students(role, affiliation):
    """Return how many students a person with this role/affiliation may take."""
    if GROUP_MARKER not in affiliation:
        return OUTSIDE_GROUP_CAPACITY
    role_lower = role.lower()
    for keyword, capacity in ROLE_CAPACITY:
        if keyword in role_lower:
            return capacity
    return DEFAULT_GROUP_CAPACITY


def _parse_person(person_soup):
    """Extract ``(role, name, affiliation)`` from a parsed person page.

    The role is read from ``<meta name="position">``. Name and affiliation
    come from the breadcrumb entry that carries no link, whose text is split
    on " - " into affiliation (first part) and name (second part). Missing
    pieces are returned as empty strings.
    """
    role = ""
    for meta in person_soup.find_all("meta", {"name": "position"}):
        content = meta.get('content')
        print(content)
        if content:
            role = content

    name = ""
    affil = ""
    for nav in person_soup.find_all("nav", {"aria-label": "Breadcrumbs"}):
        for crumb in nav.find_all("li", {"class": "breadcrumbs__crumb"}):
            if crumb.find("a"):
                continue
            # get_text() also works when the crumb has several child nodes,
            # where .string would be None.
            parts = crumb.get_text().split(" - ")
            affil = " ".join(parts[0].split())
            # Tolerate a crumb without the " - " separator.
            name = " ".join(parts[1].split()) if len(parts) > 1 else ""
            print(affil)
            print(name)
    return role, name, affil


# Rows are collected as plain dicts and turned into DataFrames once at the
# end instead of concatenating a one-row frame per record.
topic_rows = []
supervisor_rows = []
known_person_links = set()

topic_nr = 1
for offset in range(0, 130, 10):
    listing_soup = _fetch_soup(topics_page_url + str(offset))
    for li in listing_soup.find_all("li", {"class": "list-item"}):
        hrefs_in_item = set()
        for topic_anchor in li.find_all('a'):
            href = topic_anchor.get('href')
            # Skip anchors without a target (urljoin would silently resolve
            # them to the site root) and repeated links inside one list item.
            if not href or href in hrefs_in_item:
                continue
            hrefs_in_item.add(href)

            full_topic_url = urljoin(web_url, href)
            print(full_topic_url)
            topic_soup = _fetch_soup(full_topic_url)
            topic_title = topic_soup.title.string if topic_soup.title else None
            print(topic_title)

            people = []
            for address in topic_soup.find_all("address", {"class": "vcard"}):
                for a in address.find_all("a", {"class": "navigation"}):
                    print(a.get('title'))
                    full_person_url = urljoin(web_url, a.get('href'))
                    print(full_person_url)
                    people.append(full_person_url)

                    # Each person page is downloaded and stored only once.
                    if full_person_url in known_person_links:
                        continue
                    known_person_links.add(full_person_url)

                    role, name, affil = _parse_person(_fetch_soup(full_person_url))
                    num_students = _max_students(role, affil)
                    print(num_students)
                    supervisor_rows.append({
                        "ID": len(supervisor_rows),
                        "Name": name,
                        "Titled": a.get('title'),
                        "Role": role,
                        "Affiliation": affil,
                        "Link": full_person_url,
                        "Max. Students": num_students,
                    })

            # Pad to two entries so both supervisor columns can be filled;
            # anyone beyond the second person is not recorded on the topic.
            people += [""] * (2 - len(people))
            topic_rows.append({
                "ID": id_prefix + f"{topic_nr:02d}",
                "Topic": topic_title,
                "Supervisor": people[0],
                "Second Supervisor": people[1],
                "Link": full_topic_url,
                "Max. Students": 1,
            })
            # Advance per stored row so that every topic row has a unique ID.
            topic_nr += 1

supervisors_df = pd.DataFrame(supervisor_rows, columns=SUPERVISOR_COLUMNS)
topics_df = pd.DataFrame(topic_rows, columns=TOPIC_COLUMNS)
supervisors_df.to_csv("supervisors.csv")
topics_df.to_csv("topics.csv")

https://www.wur.nl/en/article/msc-thesis-topic-9000-pigs-why-humans-cannot-understand-large-numbers-and-what-this-means-for-the-planet-and-society.htm
MSc thesis topic: 9000 Pigs – Why humans cannot understand large numbers and what this means for the planet and society - WUR
prof.dr. AKAJ (Alexander) Klippel
https://www.wur.nl/en/persons/alexander-klippel.htm
Personal Professor
Cultural Geography
Alexander Klippel
1
https://www.wur.nl/en/article/msc-thesis-topic-agent-based-modeling-and-simulation-of-shared-space-in-virtual-reality.htm
MSc thesis topic: Agent-based modeling and simulation of shared space in virtual reality - WUR
J (Jiayan) Zhao PhD
https://www.wur.nl/en/persons/jiayan-zhao.htm
Universitair docent
Laboratory of Geo-information Science and Remote Sensing
Jiayan Zhao
5
dr.ir. A (Arend) Ligtenberg
https://www.wur.nl/en/persons/arend-ligtenberg.htm
Assistant Professor
Laboratory of Geo-information Science and Remote Sensing
Arend Ligtenberg
5
https://www.wur.nl/en/article/