In [252]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import yaml

In [253]:
# Download the "Past Workshops" page and parse its HTML.
# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get("https://ucsbcarpentry.github.io/past-workshops/", timeout=30)
# Fail here on an HTTP error rather than silently parsing an error page.
response.raise_for_status()
pastw = response.text
soup = BeautifulSoup(pastw, 'html.parser')

In [254]:
# Take the first <table> element of the parsed page.
table = soup.find('table')
# find() returns None when nothing matches; stop with a clear message instead
# of an opaque AttributeError when the table is used later on.
if table is None:
    raise ValueError("No <table> element found on the past-workshops page")

In [255]:
# One record per linked table cell: the link target plus the link text with
# runs of whitespace collapsed to single spaces.
wlist = [
    {'href': cell.a.get('href'), 'title_home': ' '.join(cell.a.text.split())}
    for tr in table.tbody.find_all("tr")
    for cell in tr.find_all("td")
    if cell.a
]

In [256]:
# Enrich every entry of `wlist` in place with 'date', 'instructors' and 'helpers'.
# Two kinds of link are understood:
#   * '/workshop/YYYY/MM/DD/slug.html' - the matching Markdown post is read from
#     the UCSBCarpentry/ucsbcarpentry.github.io repository.
#   * 'https://<owner>.github.io/<repo>/' - the YAML front matter of the index
#     page is read from that repository's gh-pages branch.
# Every entry ends up with all three keys (empty lists when nothing could be
# determined), so later cells can index them unconditionally.

REQUEST_TIMEOUT = 30  # seconds to wait for one page before giving up on it
POSTS_URL = 'https://raw.githubusercontent.com/UCSBCarpentry/ucsbcarpentry.github.io/main/_posts/'


def _fetch_text(url):
    """Return the body of `url`, or None if the request fails or is not HTTP 200."""
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f'request failed for {url}: {exc}')
        return None
    return response.text if response.status_code == 200 else None


def _names_after(field, text):
    """Return the ', '-separated values following '<field>:' in `text` ([] if absent)."""
    match = re.search(field + r':\s(.+)', text)
    return match.group(1).split(', ') if match else []


def _front_matter(text):
    """Return the YAML block between the leading '---' lines as a dict ({} if missing or invalid)."""
    parts = text.strip().split('---\n')
    if len(parts) < 2:
        return {}
    try:
        meta = yaml.safe_load(parts[1])
    except yaml.YAMLError:
        return {}
    return meta if isinstance(meta, dict) else {}


def _as_list(value):
    """Normalise a front-matter value to a list: None -> [], scalar -> [scalar]."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


for w in wlist:
    href = w['href'] or ''  # a link without an href attribute yields None
    parts = href.split('/')

    # Defaults first, so that no code path can leave a key undefined.
    w['date'] = []
    w['instructors'] = []
    w['helpers'] = []

    if len(parts) > 1 and parts[1] == 'workshop':
        # '/workshop/2024/06/10/slug.html' -> '2024-06-10-slug.html'
        filename = '-'.join(parts[2:6])
        md_filename = filename[:-4] + 'md'
        w['date'] = filename[:10]
        wtext = _fetch_text(POSTS_URL + md_filename) or ''
        # Parsed independently: a missing 'helpers:' line must not discard
        # instructors that were found (and vice versa).
        w['instructors'] = _names_after('instructors', wtext)
        w['helpers'] = _names_after('helpers', wtext)
    elif len(parts) > 2 and parts[2].endswith('github.io'):
        owner = re.search(r'https://(.*?)\.github\.io', href)
        if owner is None:
            # e.g. an http:// link; keep the defaults
            continue
        # NOTE: taking the second-to-last segment assumes the link ends with '/'.
        page = parts[-2]
        website = 'https://raw.githubusercontent.com/' + owner.group(1) + '/' + page + '/gh-pages/index'
        wtext = _fetch_text(website + '.md')
        if wtext is None:
            # Fall back to index.html when index.md could not be fetched.
            wtext = _fetch_text(website + '.html') or ''
        meta = _front_matter(wtext)  # parsed once, reused for all three fields
        startdate = meta.get('startdate')
        if startdate is not None:
            w['date'] = str(startdate)
        w['instructors'] = _as_list(meta.get('instructor'))
        w['helpers'] = _as_list(meta.get('helper'))
    else:
        print('error')

In [257]:
# Display the enriched workshop records (href, title_home, date, instructors, helpers).
wlist

[{'href': '/workshop/2024/06/10/ucsb-geospatial.html',
  'title_home': 'Introduction to Geospatial Raster and Vector Data with R',
  'date': '2024-06-10',
  'instructors': ['Julien Brun',
   'Jon Jablonski',
   'Kristi Liu',
   'Jose Niño Muriel'],
  'helpers': ['Patty Park']},
 {'href': '/workshop/2024/06/04/ucsb-containers.html',
  'title_home': 'Container-driven Reproducible Research Made Simple',
  'date': '2024-06-04',
  'instructors': ['Ronald Lencevičius'],
  'helpers': ['Sang-Yun Oh', 'Yu-Chieh "Jay" Chi', 'Jose Niño Muriel']},
 {'href': '/workshop/2024/05/01/ucsb-r.html',
  'title_home': 'Introduction to Data Analysis with R for Social Science',
  'date': '2024-05-01',
  'instructors': ['Seth Erickson',
   'Jon Jablonski',
   'Greg Janée',
   'Jose Nino Muriel',
   'Donovan Rasamoelison'],
  'helpers': ['Dave Hunter', 'Kristi Liu', 'Noah Spahn']},
 {'href': '/workshop/2024/02/27/ucsb-webscraping.html',
  'title_home': 'Introduction to Web Scraping',
  'date': '2024-02-27',
  '

In [259]:
# Build a wide role table: one row per workshop, one column per person, each
# cell holding 'instructor', 'helper' or '' (no role in that workshop).

# Step 1: Turn every workshop record into a flat {column: value} row
data = []
for workshop in wlist:
    row = {
        # .get(): a record may lack 'date' if its link could not be classified
        'date': workshop.get('date', ''),
        'title_home': workshop['title_home']
    }
    # `or []` covers both a missing key and a key that holds None
    for instructor in workshop.get('instructors') or []:
        row[instructor] = 'instructor'
    # Written second: a person listed in both roles ends up as 'helper'
    for helper in workshop.get('helpers') or []:
        row[helper] = 'helper'
    data.append(row)

df = pd.DataFrame(data)

# Step 2: Replace NaN with empty strings
df = df.fillna('')

# Step 3: Get the list of all columns except 'date' and 'title_home'
columns = df.columns.tolist()
fixed_columns = ['date', 'title_home']
name_columns = sorted([col for col in columns if col not in fixed_columns])

# Step 4: Reorder the DataFrame columns (fixed columns first, then names A-Z).
# reindex() rather than df[...] so an empty `wlist` yields an empty frame with
# the two fixed columns instead of raising KeyError.
df = df.reindex(columns=fixed_columns + name_columns)

In [260]:
# Preview the first rows of the workshop-by-person role table.
df.head()

Unnamed: 0,date,title_home,Alexandre de Siqueira (UCB),Amanda Ho,Amber Budden,Amelia Meyer,Amelia Meyers,Andrea Medina-Smith,Anita Carraher,Anna Sackmann (UCB),...,Susan Mazer,TBD,Thao Phan,Tim Dennis (UCLA),Torin White,Vania Wang,"Yu-Chieh ""Jay"" Chi",Yuan Wu,Zach Sisco,Zhiyuan Yao (UCLA)
0,2024-06-10,Introduction to Geospatial Raster and Vector D...,,,,,,,,,...,,,,,,,,,,
1,2024-06-04,Container-driven Reproducible Research Made Si...,,,,,,,,,...,,,,,,,helper,,,
2,2024-05-01,Introduction to Data Analysis with R for Socia...,,,,,,,,,...,,,,,,,,,,
3,2024-02-27,Introduction to Web Scraping,,,,,,,,,...,,,,,,,,,,
4,2024-01-30,Version Control with Git,,,,,,,,,...,,,,,,,,,,
