In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import yaml

In [4]:
# Get table of past workshops
pastw = requests.get("https://ucsbcarpentry.github.io/past-workshops/").text
soup = BeautifulSoup(pastw, 'html.parser')

In [5]:
# Search tables in the data
table = soup.find('table')

In [6]:
wlist = []
for row in table.tbody.find_all("tr"):
    columns = row.find_all("td")
    for td in columns:
        if td.a:
            wlist.append({'href': td.a.get('href'), 'title_home': ' '.join(td.a.text.split())})

In [7]:
for w in wlist:
    if w['href'].rsplit('/')[1] == 'workshop':
        filename = '-'.join(w['href'].rsplit('/')[2:6])
        md_filename = filename[:-4] + 'md'
        wsite = 'https://raw.githubusercontent.com/UCSBCarpentry/ucsbcarpentry.github.io/main/_posts/' + md_filename
        wtext = requests.get(wsite).text
        w['date'] = filename[:10]
        try:
            instructors = re.search(r'instructors:\s(.+)', wtext).group(1).split(', ')
            helpers = re.search(r'helpers:\s(.+)', wtext).group(1).split(', ')
            w['instructors'] = instructors
            w['helpers'] = helpers
        except AttributeError:
            w['instructors'] = []
            w['helpers'] = []
    elif w['href'].rsplit('/')[2].endswith('github.io'):
        try:
            github = re.search(r'https://(.*?)\.github\.io', w['href']).group(1)
            page = w['href'].rsplit('/')[-2]
            website = 'https://raw.githubusercontent.com/' + github + '/' + page + '/gh-pages/index'
            wtext = requests.get(website + '.md').text
            if wtext == '404: Not Found':
                wtext = requests.get(website + '.html').text
            w['date'] = str(yaml.safe_load(wtext.strip().split('---\n')[1]).get('startdate', []))
            w['instructors'] = yaml.safe_load(wtext.strip().split('---\n')[1]).get('instructor', [])
            w['helpers'] = yaml.safe_load(wtext.strip().split('---\n')[1]).get('helper', [])
        except AttributeError:
            w['date'] = []
            w['instructors'] = []
            w['helpers'] = []
    else:
        print('error')

In [8]:
wlist

[{'href': '/workshop/2025/02/25/ucsb-ml.html',
  'title_home': 'Introduction to Machine Learning in Python',
  'date': '2025-02-25',
  'instructors': ['Ronald Lencevičius', 'Jose Niño Muriel'],
  'helpers': ['Arieanna Balbar', 'Jay Chi', 'Julien Brun']},
 {'href': '/workshop/2025/02/18/ucsb-intermediater.html',
  'title_home': 'Intermediate R',
  'date': '2025-02-18',
  'instructors': ['Sigrid Van Den Abeele', 'Jose Niño Muriel'],
  'helpers': ['Ronald Lencevičius', 'Seth Erickson']},
 {'href': '/workshop/2025/02/05/ucsb-containers.html',
  'title_home': 'Container-driven Reproducible Research Made Simple',
  'date': '2025-02-05',
  'instructors': ['Ronald Lencevičius'],
  'helpers': ['Seth Erickson',
   'Jose Niño Muriel',
   'Sang-Yun Oh',
   'Kinji Leslie']},
 {'href': '/workshop/2025/02/03/ucsb-computing.html',
  'title_home': 'Research Computing - What to do when your work gets too big for your laptop',
  'date': '2025-02-03',
  'instructors': ['Brian Emery',
   'Jay Chi',
   'Jos

In [11]:
# Check for names written in different ways

# Initialize an empty set to collect unique names
unique_names = set()

# Iterate over each dictionary in the list
for workshop in wlist:
    # Add all instructors and helpers to the set
    unique_names.update(workshop['instructors'])
    unique_names.update(workshop['helpers'])

# Convert the set back to a list
unique_names_list = sorted(list(unique_names))

print(unique_names_list)

['Alejandra Sicairos (UCSB)', 'Alexandre de Siqueira (UCB)', 'Allie Caughman', 'Amanda Ho', 'Amber Budden', 'Amber Chen', 'Amelia Meyer', 'Andrea Medina-Smith', 'Anita Carraher', 'Anna Sackmann (UCB)', 'Annajiat Alim Rasel', 'Annie Huang', 'Arieanna Balbar', 'Austin Covey', 'Brian Dincau', 'Brian Emery', 'Britta Schumacher', 'Camila Vargas-Poulsen', "Casey O'Hara", 'Chris Kibler', 'Christine Wells (UCLA)', 'Connor Levenson', 'Dave George (UCLA)', 'Dave Hunter', 'David Palmquist', 'David Palmquist (CSU Fullerton)', 'Derek Devnich (UC Merced)', 'Donovan Rasamoelison', 'Eastern Kang (UCSD)', 'Echelle Burns', 'Elnaz Amanzadeh Jajin', 'Erin Foster (UC Berkeley)', 'Erin Foster (UCB)', 'Ezra Aguimatang', 'Fernanda Ribeiro', 'Gavin McDonald', 'Geno Sanchez (UCLA)', 'Geoffrey Boushey (UCSF)', 'Grace Nunnelley', 'Greg Janée', 'Harry Zhou (UCSD)', 'Helper', 'Henry Stokes', 'Ian Lessing', 'Isaac Park', 'Jairo Melo-Flórez', 'James Frew', 'Jamie Jamison (UCLA)', 'Jamie Montgomery', 'Jason Flower', '

In [10]:
# We see that we should replace "David Hunter" for "Dave Hunter" and "Greg Janee" for "Greg Janée"
for workshop in wlist:
    workshop['instructors'] = ['Dave Hunter' if name == 'David Hunter' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Dave Hunter' if name == 'David Hunter' else name for name in workshop['helpers']]
    workshop['instructors'] = ['Greg Janée' if name == 'Greg Janee' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Greg Janée' if name == 'Greg Janee' else name for name in workshop['helpers']]
    workshop['instructors'] = ['Kristi Liu' if name == 'Kristi Liu (UCSB)' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Kristi Liu' if name == 'Kristi Liu (UCSB)' else name for name in workshop['helpers']]
    workshop['instructors'] = ['Amelia Meyer' if name == 'Amelia Meyers' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Amelia Meyer' if name == 'Amelia Meyers' else name for name in workshop['helpers']]
    workshop['instructors'] = ['Amber Chen' if name == 'Amber Xuqian Chen (UCSB)' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Amber Chen' if name == 'Amber Xuqian Chen (UCSB)' else name for name in workshop['helpers']]
    workshop['instructors'] = ['Allie Caughman' if name == 'Allie Caughman (UCSB)' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Allie Caughman' if name == 'Allie Caughman (UCSB)' else name for name in workshop['helpers']]
    workshop['instructors'] = ['Echelle Burns' if name == 'Echelle Burns (UCSB)' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Echelle Burns' if name == 'Echelle Burns (UCSB)' else name for name in workshop['helpers']]
    workshop['instructors'] = ['Jose Niño Muriel' if name == 'Jose Nino Muriel' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Jose Niño Muriel' if name == 'Jose Nino Muriel' else name for name in workshop['helpers']]
    workshop['instructors'] = ['Jose Niño Muriel' if name == 'Jose Niño Muriel (UCSB)' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Jose Niño Muriel' if name == 'Jose Niño Muriel (UCSB)' else name for name in workshop['helpers']]
    workshop['instructors'] = ['Ronald Lencevičius' if name == 'Ron Lencevičius' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Ronald Lencevičius' if name == 'Ron Lencevičius' else name for name in workshop['helpers']]
    workshop['instructors'] = ['Sigrid Van Den Abbeele' if name == 'Sigrid Van Den Abeele' else name for name in workshop['instructors']]
    workshop['helpers'] = ['Sigrid Van Den Abbeele' if name == 'Sigrid Van Den Abeele' else name for name in workshop['helpers']]

In [12]:
data = []
for workshop in wlist:
    row = {
        'date': workshop['date'],
        'title_home': workshop['title_home']
    }
    for helper in workshop['helpers']:
        row[helper] = 'helper'
    for instructor in workshop['instructors']:
        row[instructor] = 'instructor'
    data.append(row)

df = pd.DataFrame(data)

# Step 2: Replace NaN with empty strings
df = df.fillna('')

# Step 3: Get the list of all columns except 'date' and 'wname'
columns = df.columns.tolist()
fixed_columns = ['date', 'title_home']
name_columns = sorted([col for col in columns if col not in fixed_columns])

# Step 4: Reorder the DataFrame columns
df = df[fixed_columns + name_columns]

In [13]:
df

Unnamed: 0,date,title_home,Alejandra Sicairos (UCSB),Alexandre de Siqueira (UCB),Allie Caughman,Amanda Ho,Amber Budden,Amber Chen,Amelia Meyer,Andrea Medina-Smith,...,Tim Dennis (UCLA),Torin White,Vania Wang,Xochitl Ortiz Ross (UCLA),Yuan Wu,Zach Sisco,Zhiyuan Yao (UCLA),pre_workshop_survey:,"pre_workshop_survey: ""https://ucsb.co1.qualtrics.com/jfe/form/SV_1zu1a9eHmqQPR9c""","pre_workshop_survey: ""https://ucsb.co1.qualtrics.com/jfe/form/SV_5jBlh85uuXrM9kG"""
0,2025-02-25,Introduction to Machine Learning in Python,,,,,,,,,...,,,,,,,,,,
1,2025-02-18,Intermediate R,,,,,,,,,...,,,,,,,,,,
2,2025-02-05,Container-driven Reproducible Research Made Si...,,,,,,,,,...,,,,,,,,,,
3,2025-02-03,Research Computing - What to do when your work...,,,,,,,,,...,,,,,,,,,helper,
4,2025-01-23,Working with Geospatial Data in R,,,instructor,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,2019-03-21,"Bash, Git & R",,,,,,,,,...,,,,,instructor,,,,,
88,2019-02-21,"Bash, Git & R",,,,,,,,,...,,,,,instructor,,,,,
89,2019-01-25,"Bash, Git & R",,,,,,,,,...,,,,,instructor,,,,,
90,[],R Programming Language,,,,,,,,,...,,,,,,,,,,


In [14]:
df2 = df.drop(columns=['date', 'title_home'])
# Melt the dataframe to long format
df_melted = df2.reset_index().melt(id_vars='index', var_name='Name', value_name='Role')

# Remove empty roles
df_melted = df_melted[df_melted['Role'] != '']

# Add date and title columns back to the melted dataframe
df_melted = df_melted.merge(df[['date', 'title_home']].reset_index(), on='index', how='left')
df_melted.drop(columns=['index'], inplace=True)

# Count roles for each person
role_counts = df_melted.pivot_table(index='Name', columns='Role', aggfunc='size', fill_value=0)
role_counts['Total'] = role_counts.sum(axis=1)
role_counts = role_counts.reset_index()

# Find the last workshop details for each person
last_workshop = df_melted.sort_values(by='date').groupby('Name').last().reset_index()

# Merge the last workshop details with role_counts
final_result = role_counts.merge(last_workshop[['Name', 'date', 'title_home', 'Role']], on='Name', how='left')

# Rename columns for clarity
final_result.rename(columns={'date': 'Last_Workshop_Date', 'title_home': 'Last_Workshop_Title', 'Role': 'Last_Workshop_Role'}, inplace=True)

In [15]:
final_result

Unnamed: 0,Name,helper,instructor,Total,Last_Workshop_Date,Last_Workshop_Title,Last_Workshop_Role
0,Alejandra Sicairos (UCSB),1,0,1,2024-09-09,2024 UC Carpentries Workshop Series (Remote),helper
1,Alexandre de Siqueira (UCB),0,1,1,2021-09-13,2021 UC Fall Workshop,instructor
2,Allie Caughman,1,1,2,2025-01-23,Working with Geospatial Data in R,instructor
3,Amanda Ho,11,12,23,2022-08-16,Machine Learning for Tabular Data in R,helper
4,Amber Budden,1,0,1,2023-01-23,Mondays R for Social Science,helper
...,...,...,...,...,...,...,...
121,Zach Sisco,0,2,2,2020-01-31,Databases and SQL,instructor
122,Zhiyuan Yao (UCLA),0,2,2,2022-09-06,UC Carpentries Fall 2022 Workshop Series,instructor
123,pre_workshop_survey:,1,0,1,2024-09-05,Data Carpentry: Species Interaction Data Works...,helper
124,"pre_workshop_survey: ""https://ucsb.co1.qualtri...",1,0,1,2025-02-03,Research Computing - What to do when your work...,helper


In [16]:
# Save to excel
with pd.ExcelWriter('ucsb_workshops_list.xlsx', engine='xlsxwriter') as writer:
    # Write each dataframe to a different worksheet
    df.to_excel(writer, sheet_name='comp_wkshp_instr', index=False)
    final_result.to_excel(writer, sheet_name='summary_instr', index=False)