In [1]:
# Description: Scrapes the schedule from the ISI 2023 website and saves it as a csv file.
# Then, ranks events for each start time according to user's interests using GPT-3.5

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from tqdm import tqdm

In [18]:
# start by defining the options
options = webdriver.ChromeOptions()
options.headless = True  # it's more scalable to work in headless mode
# normally, selenium waits for all resources to download
# we don't need it as the page also populated with the running javascript code.
options.page_load_strategy = "none"
# this returns the path web driver downloaded
chrome_path = ChromeDriverManager().install()
chrome_service = Service(chrome_path)
# pass the defined options and service objects to initialize the web driver
driver = Chrome(options=options, service=chrome_service)
driver.implicitly_wait(5)

  options.headless = True # it's more scalable to work in headless mode


In [21]:
start_url = "https://www.isi2023.org/conferences/15/programme/"
base_url = "https://www.isi2023.org"

driver.get(start_url)
time.sleep(2)
page_source = driver.page_source
page_source

soup = BeautifulSoup(page_source)

In [22]:
# Schedule is within #programme_new div.is-10-fullhd
schedule = soup.find(id="programme_new")
# Within schedule, find the div with class "is-10-fullhd"
schedule = schedule.find(class_="is-10-fullhd")
# Each day is within a child div of the schedule div
# Get all direct > div descendents of schedule as a list
days = schedule.find_all("div", recursive=False)
len(days)

4

In [None]:
# Create a list of all the events
events = []
for day in list(days):
    # The h3 tag contains the date in this format: "Monday, 17 July 2023"
    date = day.find("h3").get_text()
    print(date)
    # All the events are within a child div.column
    day_events = day.find("div", recursive=False)
    event_list = day_events.find_all("div", {"class": "item"})

    # Loop through all the events using TQDM to show progress bar
    for i in tqdm(range(len(event_list))):
        event_divs = event_list[i].find_all("div", {"class": "column"})
        event = {}
        # Each of these has 4 child divs: time, event title, location and Link to details
        time_div = event_divs[0]
        title_div = event_divs[1]
        location_div = event_divs[2]
        link_div = event_divs[3]

        print(title_div.get_text())

        # Time div contains two spans: Start and end time (format 16:00)
        time_spans = time_div.find_all("span")
        start_time = time_spans[0].get_text()
        end_time = time_spans[1].get_text()
        event["start_time"] = start_time.strip()
        event["end_time"] = end_time.strip()

        event["title"] = title_div.get_text()
        event["location"] = location_div.get_text()
        event["link"] = link_div.find("a")["href"]
        # Manipulate link to absolute URL
        event["link"] = base_url + event["link"]

        # Add date to event
        event["date"] = date

        # Now follow the link to get the description
        event_page = driver.get(event["link"])
        time.sleep(1)
        event_page_source = driver.page_source
        event_soup = BeautifulSoup(event_page_source)

        # There are 4 kinds of events: CPS, IPS, Poster, and SIPS
        # They are in "span.category"
        category = event_soup.find("span", {"class": "category"})
        if category:
            event["category"] = category.get_text()

        # Paper titles are in h3.is-size-4
        # There can be multiple paper titles
        paper_titles = event_soup.find_all("h3", {"class": "is-size-4"})
        event["paper_titles"] = ";".join([title.get_text() for title in paper_titles])

        # Authors are in div.author
        # There can be multiple authors
        authors = event_soup.find_all("div", {"class": "author"})
        event["authors"] = ";".join([author.get_text() for author in authors])

        # Sometimes there is a description also in div.description
        description = event_soup.find("div", {"class": "description"})
        if description:
            event["description"] = description.get_text()

        # Add event to list of events
        events.append(event)

In [41]:
# Create a dataframe from the list of events
df = pd.DataFrame(events)

df

Unnamed: 0,start_time,end_time,title,location,link,date,category,paper_titles,authors,description
0,08:30,09:40,CPS 01 - Statistical methodology II,CPS Room 108,https://www.isi2023.org/conferences/session/53...,"Monday, 17 July 2023",Category: CPS,Performance Metrics for Sample Selection Bias ...,\n AL\n Ms An-C...,Ms An-Chiao Liu Mr Marc Vidal Mária Pécs Karen...
1,08:30,09:40,CPS 02 - Aspects of official statistics II,CPS Room 101,https://www.isi2023.org/conferences/session/53...,"Monday, 17 July 2023",Category: CPS,Quality Management in Statistics Portugal – ne...,\n MZ\n Mrs Mar...,Mrs Maria Zilhão Zilhão\nMr Ali Al flaiti\n
2,08:30,09:40,CPS 03 - Environmental statistics II and CPS 4...,CPS Room 104,https://www.isi2023.org/conferences/session/52...,"Monday, 17 July 2023",Category: CPS,From skepticism to conviction: The emerging st...,\n BS\n Prof. B...,Prof. Bashiru I.I. Saeed\nProf. Lillian Pazvak...
3,08:30,09:40,CPS 04 - Finance and business statistics II,CPS Room 105,https://www.isi2023.org/conferences/session/52...,"Monday, 17 July 2023",Category: CPS,Application of Geographically Weighted Regress...,\n LG\n Prof. L...,Prof. Luigi Grossi\nMr Bob Barugahare
4,08:30,09:40,CPS 05 - Statistical modelling VI,CPS Room 201,https://www.isi2023.org/conferences/session/52...,"Monday, 17 July 2023",Category: CPS,Modelling consumer preferences in Multilateral...,Prof. Tiziana Laureti ; Prof. Claudia Adriana...,Prof. Tiziana Laureti\nProf. Claudia Adriana C...
...,...,...,...,...,...,...,...,...,...,...
294,14:00,15:40,IPS 406 - Advances in Symbolic Data Analysis,IPS Room 102,https://www.isi2023.org/conferences/session/45...,"Thursday, 20 July 2023",Category: IPS,From Numbers to Intervals to Distributions;Hyp...,\n PB\n PROF. D...,This session intends to focus on some of the m...
295,14:00,15:40,IPS 408 - Recent advancements in data governan...,IPS Room 206,https://www.isi2023.org/conferences/session/45...,"Thursday, 20 July 2023",Category: IPS,European Initiatives to sustainably use new da...,\n AW\n Mr Albr...,The session will present a series of recent in...
296,14:00,15:40,IPS 413 - Reimagining data literacy education ...,IPS Room 207,https://www.isi2023.org/conferences/session/44...,"Thursday, 20 July 2023",Category: IPS,Educating Nonmajors: Real Life Experiences of ...,\n SB\n Prof. S...,Description This session aims to promote shift...
297,14:00,15:40,IPS 423 - Design and Analysis of Order-of-Addi...,IPS Room 107,https://www.isi2023.org/conferences/session/46...,"Thursday, 20 July 2023",Category: IPS,A Position-Based Approach for Design and Analy...,\n HX\n Prof. H...,"Traditionally, factorial design is one of the ..."


In [43]:
# Extract the day from the date
df["day"] = df["date"].apply(lambda x: x.split(",")[0])
df["date"] = df["date"].apply(lambda x: x.split(",")[1])

In [44]:
df2 = df[
    [
        "date",
        "day",
        "start_time",
        "end_time",
        "title",
        "location",
        "category",
        "paper_titles",
        "authors",
        "description",
        "link",
    ]
]
df2.sort_values(by=["date", "start_time", "title"], inplace=True)

In [46]:
df2.to_csv("schedule_sorted.csv", index=False)

## Start here to skip the scraping

In [7]:
import pandas as pd
import time
from tqdm import tqdm
import json

import openai
import os

# Load the API key ("OPENAI_API_KEY") from the .env file
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

df2 = pd.read_csv("schedule_sorted.csv")
df2["start_date_time"] = df2["date"] + " " + df2["start_time"]

In [5]:
system_prompt = """
You are a scheduling bot for the World Statistics Conference which finds events that would most interest the user.
You will be provided with an event title and some additional info.
For each event, you will determine a relevance score between 0 and 1, where 1 is the most relevant and 0 is the least relevant.
Return the scoring in the following JSON format: [{title:<title1>, score:<score1>}, {title:<title2>, score:<score2>}, ...]. Do not include additional text.
"""

user_prompt = """
My interests are:
Public Health, Machine Learning, Government, Natural Language Processing (NLP), Reinforcement Learning, Data Science.
These are the events I have to choose from:\n
"""

In [6]:
responses = []
delay = 1
for i, start_date_time in enumerate(tqdm(df2["start_date_time"].unique())):
    events = df2[df2["start_date_time"] == start_date_time][
        ["title", "paper_titles", "description"]
    ].to_dict("records")

    for event in events:
        event["description"] = event["description"][:140]

    user_full_prompt = user_prompt + str(events)

    while True:
        # Get response with exponential backoff
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_full_prompt},
                ],
                temperature=0,
                max_tokens=1024,
                top_p=1.0,
                frequency_penalty=0.0,
                presence_penalty=0.0,
            )
            delay = 1
            break
        except Exception as e:
            print(e)
            print("Waiting", delay, "seconds...")
            time.sleep(delay)
            delay *= 2

    responses.append(
        {
            "start_date_time": start_date_time,
            "response": response.choices[0]["message"]["content"],
        }
    )
    print(response.choices[0]["message"]["content"])

  0%|          | 0/18 [00:09<?, ?it/s]

[{"title": "CPS 01 - Statistical methodology II", "score": 0.2}, {"title": "CPS 02 - Aspects of official statistics II", "score": 0.1}, {"title": "CPS 03 - Environmental statistics II and CPS 44 - Statistical Modelling II", "score": 0.3}, {"title": "CPS 04 - Finance and business statistics II", "score": 0.1}, {"title": "CPS 05 - Statistical modelling VI", "score": 0.2}, {"title": "CPS 06 - Clustering", "score": 0.1}, {"title": "CPS 07 - Statistical estimation II", "score": 0.2}, {"title": "CPS 08 - Statistics and climate I and CPS 64 - Statistics and climate II", "score": 0.3}, {"title": "CPS 09 - Impact of covid III", "score": 0.2}, {"title": "CPS 10 - Disease and mortality modelling", "score": 0.3}]





In [11]:
json_responses = []
for response in responses:
    start_date_time = response['start_date_time']
    event_scores = json.loads(response['response'])
    for event_score in event_scores:
        event_score['start_date_time'] = start_date_time
    json_responses.extend(event_scores)
json_responses[:3]

[{'title': 'CPS 01 - Statistical methodology II',
  'score': 0.2,
  'start_date_time': ' 17 July 2023 08:30'},
 {'title': 'CPS 02 - Aspects of official statistics II',
  'score': 0.1,
  'start_date_time': ' 17 July 2023 08:30'},
 {'title': 'CPS 03 - Environmental statistics II and CPS 44 - Statistical Modelling II',
  'score': 0.3,
  'start_date_time': ' 17 July 2023 08:30'}]

In [10]:
scores_df = pd.DataFrame(json_responses)
# Join with events dataframe
ranked_df = scores_df.merge(df2, on=["title", "start_date_time"]).sort_values(by=["start_date_time", "score"], ascending=[True, False])
ranked_df.head()

Unnamed: 0,title,score,start_date_time,date,day,start_time,end_time,location,category,paper_titles,authors,description,link
2,CPS 03 - Environmental statistics II and CPS 4...,0.3,17 July 2023 08:30,17 July 2023,Monday,08:30,09:40,CPS Room 104,Category: CPS,From skepticism to conviction: The emerging st...,\n BS\n Prof. B...,Prof. Bashiru I.I. Saeed\nProf. Lillian Pazvak...,https://www.isi2023.org/conferences/session/52...
7,CPS 08 - Statistics and climate I and CPS 64 -...,0.3,17 July 2023 08:30,17 July 2023,Monday,08:30,09:40,CPS Room 202,Category: CPS,The relationship between population aging and ...,\n LX\n Dr Lixi...,Dr Lixiao Xu \nDr Jonathan Owen\nSarah Assem M...,https://www.isi2023.org/conferences/session/47...
9,CPS 10 - Disease and mortality modelling,0.3,17 July 2023 08:30,17 July 2023,Monday,08:30,09:40,CPS Room 103,Category: CPS,Non-Normal Estimation of Multiple Spatial Patt...,\n AK\n Dr Kass...,Ayalew Kassahun Samuel Manda Ngianga-Bakwin Ka...,https://www.isi2023.org/conferences/session/46...
0,CPS 01 - Statistical methodology II,0.2,17 July 2023 08:30,17 July 2023,Monday,08:30,09:40,CPS Room 108,Category: CPS,Performance Metrics for Sample Selection Bias ...,\n AL\n Ms An-C...,Ms An-Chiao Liu Mr Marc Vidal Mária Pécs Karen...,https://www.isi2023.org/conferences/session/53...
4,CPS 05 - Statistical modelling VI,0.2,17 July 2023 08:30,17 July 2023,Monday,08:30,09:40,CPS Room 201,Category: CPS,Modelling consumer preferences in Multilateral...,Prof. Tiziana Laureti ; Prof. Claudia Adriana...,Prof. Tiziana Laureti\nProf. Claudia Adriana C...,https://www.isi2023.org/conferences/session/52...
