# Web Scraping Economics Department Data

This notebook demonstrates how to scrape placement and current candidate data from various university economics departments using Python.

In [2]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np

## Data Scraping - First Step

Here we will roughly scrape the placement and current candidate data from each university's website as html files.

In [8]:
import os
import requests
import time
from urllib.parse import urlparse

def create_directory(path):
    """Create directory if it doesn't exist"""
    if not os.path.exists(path):
        os.makedirs(path)

def clean_filename(url):
    """Create a clean filename from URL"""
    parsed = urlparse(url)
    # Remove common extensions and special characters
    filename = parsed.path.rstrip('/').split('/')[-1]
    if not filename:
        filename = 'index'
    return filename

def fetch_page(url, headers):
    """Fetch webpage content with error handling"""
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def save_html(content, filepath):
    """Save HTML content to file"""
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Successfully saved: {filepath}")
    except Exception as e:
        print(f"Error saving {filepath}: {e}")

def main():
    # Create data directory
    data_dir = "data"
    create_directory(data_dir)

    # Headers for requests
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Dictionary of universities and their URLs
    universities = {
        'UCSD': {
            'placement': 'https://economics.ucsd.edu/graduate-program/jobmarket-tab/placement-history.html',
            'candidates': 'https://economics.ucsd.edu/graduate-program/jobmarket-tab/index.html'
        },
        'Stanford': {
            'placement': 'https://economics.stanford.edu/graduate/student-placement',
            'candidates': 'https://economics.stanford.edu/graduate/job-market-candidates'
        },
        'Princeton': {
            'placement': 'https://economics.princeton.edu/graduate-program/job-market-and-placements/statistics-on-past-placements/',
            'candidates': 'https://economics.princeton.edu/graduate-program/job-market-and-placements/2024-job-market-candidates/'
        },
        'Northwestern': {
            'placement': 'https://economics.northwestern.edu/graduate/prospective/placement.html',
            'candidates': 'https://economics.northwestern.edu/people/phd-job-market-candidates/'
        },
        'UC_Davis': {
            'placement': 'https://economics.ucdavis.edu/graduate-student-placements',
            'candidates': 'https://economics.ucdavis.edu/people/on-the-job-market'
        },
        'UC_Riverside': {
            'placement': 'https://economics.ucr.edu/graduate-program/placement/',
            'candidates': 'https://economics.ucr.edu/graduate-job-candidates/'
        },
        'UC_Santa_Cruz': {
            'placement': 'https://economics.ucsc.edu/academics/graduate-program/PhD/placement.html',
            'candidates': 'https://economics.ucsc.edu/academics/graduate-program/PhD/job-market/candidates_24-25.html'
        },
        'UC_Santa_Barbara': {
            'placement': 'https://econ.ucsb.edu/programs/graduate/placement',
            'candidates': 'https://econ.ucsb.edu/programs/graduate/candidates'
        }
    }

    # Scrape and save pages for each university
    for univ, urls in universities.items():
        print(f"\nProcessing {univ}...")
        
        # Create university directory
        univ_dir = os.path.join(data_dir, univ)
        create_directory(univ_dir)

        # Process placement page
        print("Fetching placement page...")
        placement_content = fetch_page(urls['placement'], headers)
        if placement_content:
            placement_file = os.path.join(univ_dir, f"{univ}_Placement.html")
            save_html(placement_content, placement_file)

        # Add delay between requests
        time.sleep(2)

        # Process candidates page
        print("Fetching candidates page...")
        candidates_content = fetch_page(urls['candidates'], headers)
        if candidates_content:
            candidates_file = os.path.join(univ_dir, f"{univ}_Candidate.html")
            save_html(candidates_content, candidates_file)

        # Add delay before next university
        time.sleep(2)



if __name__ == "__main__":
    main()
    print("\nScraping completed!") 


Processing UCSD...
Fetching placement page...
Successfully saved: data/UCSD/UCSD_Placement.html
Fetching candidates page...
Successfully saved: data/UCSD/UCSD_Candidate.html

Processing Stanford...
Fetching placement page...
Successfully saved: data/Stanford/Stanford_Placement.html
Fetching candidates page...
Successfully saved: data/Stanford/Stanford_Candidate.html

Processing Princeton...
Fetching placement page...
Error fetching https://economics.princeton.edu/graduate-program/job-market-and-placements/statistics-on-past-placements/: 500 Server Error: Internal Server Error for url: https://economics.princeton.edu/graduate-program/job-market-and-placements/statistics-on-past-placements/
Fetching candidates page...
Error fetching https://economics.princeton.edu/graduate-program/job-market-and-placements/2024-job-market-candidates/: 500 Server Error: Internal Server Error for url: https://economics.princeton.edu/graduate-program/job-market-and-placements/2024-job-market-candidates/

P

Seems like there are some issues with princeton's website. - resolved: under maintenance

In [43]:
def scrape_princeton():
    # Create data directory
    data_dir = "data"
    create_directory(data_dir)

    # Headers for requests
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Princeton URLs
    princeton_urls = {
        'placement': 'https://economics.princeton.edu/graduate-program/job-market-and-placements/statistics-on-past-placements/',
        'candidates': 'https://economics.princeton.edu/graduate-program/job-market-and-placements/2024-job-market-candidates/'
    }

    print("\nProcessing Princeton...")
    
    # Create Princeton directory
    princeton_dir = os.path.join(data_dir, 'Princeton')
    create_directory(princeton_dir)

    # Process placement page
    print("Fetching placement page...")
    placement_content = fetch_page(princeton_urls['placement'], headers)
    if placement_content:
        placement_file = os.path.join(princeton_dir, "Princeton_Placement.html")
        save_html(placement_content, placement_file)

    # Add delay between requests
    time.sleep(2)

    # Process candidates page
    print("Fetching candidates page...")
    candidates_content = fetch_page(princeton_urls['candidates'], headers)
    if candidates_content:
        candidates_file = os.path.join(princeton_dir, "Princeton_Candidate.html")
        save_html(candidates_content, candidates_file)

    print("\nPrinceton scraping completed!") 
scrape_princeton()


Processing Princeton...
Fetching placement page...
Successfully saved: data/Princeton/Princeton_Placement.html
Fetching candidates page...
Successfully saved: data/Princeton/Princeton_Candidate.html

Princeton scraping completed!


## Data Scraping - Second Step

In [2]:
ucsb_placement = 'data/UC_Santa_Barbara/UC_Santa_Barbara_Placement.html'
ucsb_candidates = 'data/UC_Santa_Barbara/UC_Santa_Barbara_Candidate.html'

ucsd_placement = 'data/UCSD/UCSD_Placement.html'
ucsd_candidates = 'data/UCSD/UCSD_Candidate.html'

uc_davis_placement = 'data/UC_Davis/UC_Davis_Placement.html'
uc_davis_candidates = 'data/UC_Davis/UC_Davis_Candidate.html'

northwestern_placement = 'data/Northwestern/Northwestern_Placement.html'
northwestern_candidates = 'data/Northwestern/Northwestern_Candidate.html'

stanford_placement = 'data/Stanford/Stanford_Placement.html'
stanford_candidates = 'data/Stanford/Stanford_Candidate.html'

uc_santa_cruz_placement = 'data/UC_Santa_Cruz/UC_Santa_Cruz_Placement.html'
uc_santa_cruz_candidates = 'data/UC_Santa_Cruz/UC_Santa_Cruz_Candidate.html'

princeton_placement = 'data/Princeton/Princeton_Placement.html'
princeton_candidates = 'data/Princeton/Princeton_Candidate.html'

uc_riverside_placement = 'data/UC_Riverside/UC_Riverside_Placement.html'
uc_riverside_candidates = 'data/UC_Riverside/UC_Riverside_Candidate.html'



UCSD Candidate

In [3]:
def scrape_ucsd_candidates(filepath):
    """
    Scrape candidate information from UCSD's job market page.
    
    Args:
        filepath: Path to the UCSD job market HTML file
        
    Returns:
        pandas DataFrame containing candidate information
    """
    from bs4 import BeautifulSoup
    
    # Read and parse HTML file
    with open(filepath, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    
    candidates = []
    
    # Find all candidate cards
    cards = soup.find_all('li', class_='profile-listing-card')
    
    for card in cards:
        candidate = {}
        
        # Get name
        name_elem = card.find('h3')
        candidate['name'] = name_elem.text.strip() if name_elem else None
        
        # Get all paragraphs containing information
        info_paras = card.find_all('p')
        
        for para in info_paras:
            text = para.text.strip()
            
            # Extract advisor information
            if text.startswith('Advisor(s):'):
                candidate['advisors'] = text.replace('Advisor(s):', '').strip()
                
            # Extract research fields
            elif text.startswith('Field of Research:'):
                candidate['research_fields'] = text.replace('Field of Research:', '').strip()
        
        # Get personal website
        website_link = card.find('a')
        candidate['personal_website'] = website_link['href'] if website_link else None
        
        # Get image URL if present
        img_elem = card.find('img')
        candidate['image'] = img_elem['src'] if img_elem else None
        
        # Add university
        candidate['university'] = 'UCSD'
        
        candidates.append(candidate)
    
    # Create DataFrame
    df = pd.DataFrame(candidates)
    
    # Reorder columns
    column_order = ['name', 'university', 'advisors', 'research_fields', 'personal_website', 'image']
    df = df[column_order]
    
    return df

In [4]:
ucsd_candidates_df = scrape_ucsd_candidates(ucsd_candidates)
ucsd_candidates_df.head()

Unnamed: 0,name,university,advisors,research_fields,personal_website,image
0,Giampaolo Bonomi,UCSD,Joel Sobel and Renee Bowen,"Political Economics, Applied Theory, Behaviora...",https://sites.google.com/view/gbonomi,Giampaolo picture 2.jpeg
1,Steven Brownstone,UCSD,Karthik Muralidharan and Craig McIntosh,Development Economics,https://www.sbrownstone.me,Steven photo.jpg
2,Erica K. Chuang,UCSD,Mark Jacobsen,Environment; Agriculture and Natural Resources,https://www.ericakchuang.com,EricaCnew.jpeg
3,Danil Dmitriev,UCSD,Renee Bowen,Microeconomic Theory; Political Economy,https://www.danildmitriev.com,Dmitriev_photo.jpg
4,Holt Dwyer,UCSD,Prashant Bharadwaj,Development Economics; Macroeconomics,https://holtdwyer.netlify.app/,Holt headshot.jpg


In [20]:
def scrape_ucsd_placement(filepath):
    """
    Scrape placement history from UCSD's placement history page.
    
    Args:
        filepath: Path to the UCSD placement history HTML file
        
    Returns:
        pandas DataFrame containing placement history information
    """
    from bs4 import BeautifulSoup
    
    # Read and parse HTML file
    with open(filepath, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    
    placements = []
    
    # Find all tables in the document
    tables = soup.find_all('table', class_='styled')
    
    for table in tables:
        # Get all rows except header
        rows = table.find_all('tr')[1:]  # Skip header row
        
        for row in rows:
            cols = row.find_all('td')
            if len(cols) >= 4:  # Ensure row has all required columns
                placement = {
                    'year': cols[0].text.strip(),
                    'name': cols[1].text.strip(),
                    'field': cols[2].text.strip(),
                    'placement': cols[3].text.strip(),
                    'university': 'UCSD'
                }
                placements.append(placement)
    
    # Create DataFrame
    df = pd.DataFrame(placements)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'field', 'placement']
    df = df[column_order]
    
    return df
ucsd_placement_df = scrape_ucsd_placement(ucsd_placement)
ucsd_placement_df.head()

Unnamed: 0,name,university,year,field,placement
0,Vivan Aluoch,UCSD,2023-24,Development and Labor Economics,Analysis Group
1,Hannah Bae,UCSD,2023-24,Health and Public,"Michigan State University, postdoc at Stanford..."
2,Evgenii Baranov,UCSD,2023-24,Micro Theory,Penn State University
3,Amanda Bonheur,UCSD,2023-24,Behavioral and Labor Economics,RAND Corporation
4,Edoardo Briganti,UCSD,2023-24,Macroeconomics,Bank of Canada


UCSC

In [5]:
def scrape_ucsc_candidates(filepath):
    """
    Scrape candidate information from UC Santa Cruz's job market page.
    
    Args:
        filepath: Path to the UC Santa Cruz job market HTML file
        
    Returns:
        pandas DataFrame containing candidate information
    """
    from bs4 import BeautifulSoup
    
    # Read and parse HTML file
    with open(filepath, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    
    candidates = []
    
    # Find the main table containing candidate information
    table = soup.find('table')
    if not table:
        return pd.DataFrame()
    
    # Get all rows except header
    rows = table.find_all('tr')[1:]  # Skip header row
    
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 4:  # Ensure row has all required columns
            candidate = {}
            
            # Extract name from the link in the first column
            name_link = cols[0].find('a')
            if name_link:
                candidate['name'] = name_link.text.strip()
                candidate['personal_website'] = name_link['href']
            else:
                # If no link, try to get just the text
                name_text = cols[0].get_text(strip=True)
                if name_text:
                    candidate['name'] = name_text
                    candidate['personal_website'] = None
            
            # Get fields of interest
            candidate['research_fields'] = cols[1].get_text(strip=True)
            
            # Get job market paper
            candidate['job_market_paper'] = cols[2].get_text(strip=True)
            
            # Get references
            candidate['advisors'] = cols[3].get_text(strip=True)
            
            # Get image URL if present
            img = cols[0].find('img')
            candidate['image'] = img['src'] if img else None
            
            # Add university
            candidate['university'] = 'UC Santa Cruz'
            
            candidates.append(candidate)
    
    # Create DataFrame
    df = pd.DataFrame(candidates)
    
    # Reorder columns
    column_order = ['name', 'university', 'advisors', 'research_fields', 'job_market_paper', 'personal_website', 'image']
    df = df[column_order]
    
    return df
ucsc_candidates_df = scrape_ucsc_candidates(uc_santa_cruz_candidates)
ucsc_candidates_df.head()


Unnamed: 0,name,university,advisors,research_fields,job_market_paper,personal_website,image
0,"Braz Vallocci, Pedro Henrique",UC Santa Cruz,Galina HaleGrace GuAlonso Villacorta,"Innovation, Productivity, Finance","""Measuring Knowledge Capital Risk""",https://www.brazv.com,pedro-vallocci.jpg
1,"Gong, Weinan",UC Santa Cruz,Kristian López VargasDan FriedmanNatalia Lazzati,Behavioral and Experimental Economics,"""Polarization in Online Social Networks""",https://sites.google.com/ucsc.edu/weinan-gong?...,wgong.jpeg
2,"Habibi, Hamidreza",UC Santa Cruz,Carlos DobkinLaura GiulianoGeorge Bulman,"Applied Microeconomics, Health Economics, Publ...","""Curbing Pharma Influence: The Effect of Marke...",https://hamidhabibi.com/,profile-hamidreza-habibi.jpg
3,"Kapoor, Rolly",UC Santa Cruz,Jonathan RobinsonAjay ShenoyAlan Spearot,"Development Economics, Labor Economics, Applie...","""Together in Search: Experimental Evidence fro...",https://rollykapoor.github.io/,rollykapoor_displaypicture_8080-rolly-kapoor-1...
4,"Li, Siqi",UC Santa Cruz,Jessie LiJustin G MarionNatalia LazzatiZehang ...,"Econometrics, Industrial Organization and Stat...","""Measuring the Market Impact of Machine Learni...",https://siqilithinkling.github.io/,cropped-image1.jpg


In [31]:
def scrape_ucsc_placement(filepath):
    """
    Scrape placement history from UC Santa Cruz's placement history page.
    
    Args:
        filepath: Path to the UC Santa Cruz placement history HTML file
        
    Returns:
        pandas DataFrame containing placement history information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    placements = []
    
    # Find all year sections
    for year_section in soup.find_all('div', class_='year-section'):
        year = year_section.get('data-year')
        
        # Process each placement entry within the year
        for entry in year_section.find_all('div', class_='placement-entry'):
            name = entry.find('span', class_='name').text.strip()
            role = entry.find('span', class_='role').text.strip()
            location = entry.find('span', class_='location').text.strip()
            
            placement_info = {
                'name': name,
                'year': int(year),
                'university': 'UC Santa Cruz',
                'placement_role': role,
                'placement_location': location
            }
            
            placements.append(placement_info)
    
    # Create DataFrame
    df = pd.DataFrame(placements)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'placement_role', 'placement_location']
    df = df[column_order]
    
    return df

# Test the function
ucsc_placement_df = scrape_ucsc_placement(uc_santa_cruz_placement)
ucsc_placement_df.head()

Unnamed: 0,name,university,year,placement_role,placement_location
0,Harrison Shieh,UC Santa Cruz,2024,Assistant Professor,Vassar College
1,Ken Suzuki,UC Santa Cruz,2024,Post Doctoral Fellow,Stanford University
2,Shinya Inukai,UC Santa Cruz,2024,Deputy Director,"Japanese Ministry of Economy, Trade and Industry"
3,Anirban Sanyal,UC Santa Cruz,2023,Assistant Advisor,Reserve Bank of India
4,Dongwan Choo,UC Santa Cruz,2023,Senior Lecturer,"Massey University, New Zealand"


UCSB

In [6]:
def scrape_ucsb_placement(filepath):
    """
    Scrape placement history from UC Santa Barbara's placement history page.
    
    Args:
        filepath: Path to the UC Santa Barbara placement history HTML file
        
    Returns:
        pandas DataFrame containing placement history information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    placements = []
    
    # Find the table with job candidates
    table = soup.find('table', {'class': 'table table-hover table-striped'})
    if not table:
        return pd.DataFrame()
    
    # Process each row in the table
    for row in table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) >= 5:  # Ensure row has all required columns
            name = cols[1].find('h3').find('a').text.strip()
            research_areas = cols[2].text.strip()
            advisor = cols[3].find('a').text.strip() if cols[3].find('a') else cols[3].text.strip()
            paper_title = cols[4].find('a').text.strip() if cols[4].find('a') else ''
            
            placement_info = {
                'name': name,
                'year': 2024,  # Current job market year
                'university': 'UC Santa Barbara',
                'research_areas': research_areas,
                'advisor': advisor,
                'job_market_paper': paper_title
            }
            placements.append(placement_info)
    
    # Create DataFrame
    df = pd.DataFrame(placements)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'research_areas', 'advisor', 'job_market_paper']
    df = df[column_order]
    
    return df

# Test the function
ucsb_candidates_df = scrape_ucsb_placement(ucsb_candidates)
ucsb_candidates_df.head()

Unnamed: 0,name,university,year,research_areas,advisor,job_market_paper
0,Alexander Abajian,UC Santa Barbara,2024,"Climate Macroeconomics, Energy Economics, Envi...",Javier Birchenall,"""Savings and Migration in a Warming World"""
1,Sebastian Brown,UC Santa Barbara,2024,"Labor Economics, Behavioral Economics, Applied...",Peter Kuhn,"""How Much Can I Make? Insights on Belief Updat..."
2,Toshio Ferrazares,UC Santa Barbara,2024,"Labor Economics, Public Economics",Heather Royer,"""Shift Structure and Cognitive Depletion: Evid..."
3,Thomas Fullagar,UC Santa Barbara,2024,Applied Microeconomics with a focus on Labor E...,Peter Kuhn,"""Establishment-Level Unionization at Large Fir..."
4,Yang Gao,UC Santa Barbara,2024,"Macroeconomics, Environmental and Energy Econo...",Peter Rupert,"""Capital Misallocation and Climate Policy"""


In [36]:
def scrape_ucsb_placement(filepath):
    """
    Scrape placement history from UC Santa Barbara's placement history page.
    
    Args:
        filepath: Path to the UC Santa Barbara placement history HTML file
        
    Returns:
        pandas DataFrame containing placement history information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    placements = []
    current_year = None
    
    # Find all dl elements with class 'ckeditor-accordion'
    accordion = soup.find('dl', class_='ckeditor-accordion')
    if not accordion:
        return pd.DataFrame()
        
    # Process each dt (year) and dd (placements) pair
    for dt in accordion.find_all('dt'):
        # Get year from dt
        year_text = dt.text.strip()
        if '-' in year_text:  # Handle academic years like "2023-2024"
            current_year = int(year_text.split('-')[0])
        else:
            current_year = int(year_text)
            
        # Get corresponding dd (contains placement list)
        dd = dt.find_next('dd')
        if not dd:
            continue
            
        # Process each placement in the list
        for li in dd.find_all('li'):
            # Find the institution name (in strong tag)
            institution = li.find('strong')
            if not institution:
                continue
                
            # Split the text after institution to get role
            full_text = li.text.strip()
            parts = full_text.split('-', 1)
            if len(parts) < 2:
                continue
                
            institution_name = institution.text.strip()
            role = parts[1].strip()
            
            placement_info = {
                'year': current_year,
                'university': 'UC Santa Barbara',
                'placement_location': institution_name,
                'placement_role': role
            }
            
            placements.append(placement_info)
    
    # Create DataFrame
    df = pd.DataFrame(placements)
    
    # Reorder columns
    column_order = ['university', 'year', 'placement_role', 'placement_location']
    df = df[column_order]
    
    return df

ucsb_placement_df = scrape_ucsb_placement(ucsb_placement)
ucsb_placement_df.head() # no names for placement

Unnamed: 0,university,year,placement_role,placement_location
0,UC Santa Barbara,2023,Postdoctoral Researcher,Arnold Ventures
1,UC Santa Barbara,2023,Gurion University of the Negev - Assistant Pro...,Ben-Gurion University of the Negev
2,UC Santa Barbara,2023,Assistant Professor of Economics,Calvin University
3,UC Santa Barbara,2023,Postdoctoral Scholar,CDC Prevention Effectiveness Fellowship
4,UC Santa Barbara,2023,Economist / Operations Research Analyst,Department of Defense


UCR

In [40]:
def scrape_ucr_placement(filepath):
    """
    Scrape placement history from UC Riverside's placement history page.
    
    Args:
        filepath: Path to the UC Riverside placement history HTML file
        
    Returns:
        pandas DataFrame containing placement history information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    placements = []
    current_year = None
    
    # Find the table within the table-1 div
    table_div = soup.find('div', class_='table-1')
    if not table_div:
        return pd.DataFrame()
        
    table = table_div.find('table')
    if not table:
        return pd.DataFrame()
    
    # Process each row in the table
    for row in table.find_all('tr'):
        # Check if this is a year header row
        th = row.find('th')
        if th and th.get('colspan') == '2':
            try:
                current_year = int(th.text.strip())
            except ValueError:
                continue
            continue
            
        # Process placement data row
        cols = row.find_all('td')
        if len(cols) == 2:  # Ensure row has name and placement info
            name = cols[0].text.strip()
            placement_text = cols[1].text.strip()
            
            # Split placement text into role and location
            if ',' in placement_text:
                role, location = [part.strip() for part in placement_text.rsplit(',', 1)]
            else:
                role = placement_text
                location = ''
            
            placement_info = {
                'name': name,
                'year': current_year,
                'university': 'UC Riverside',
                'placement_role': role,
                'placement_location': location
            }
            
            placements.append(placement_info)
    
    # Create DataFrame
    df = pd.DataFrame(placements)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'placement_role', 'placement_location']
    df = df[column_order]
    
    return df


# Test the function
ucr_placement_df = scrape_ucr_placement(uc_riverside_placement)
ucr_placement_df.head()

Unnamed: 0,name,university,year,placement_role,placement_location
0,Pedro Isaac Chavez Lopez,UC Riverside,2024,Economist,Bank of Mexico
1,Da Gong,UC Riverside,2024,"Lecturer, School of Business",SUNY Geneseo
2,Jingyan Guo,UC Riverside,2024,"Associate, Cirque Analytics",Los Angeles
3,Yong Ju Lee,UC Riverside,2024,"Assistant Professor, Department of Mathematics...",Ohio
4,Dayang Li,UC Riverside,2024,"Assistant Professor, Department of Economics",Xi’an Jiaotong-Liverpool University


In [7]:
def scrape_ucr_candidates(filepath):
    """
    Scrape current job market candidates from UC Riverside's candidate page.
    
    Args:
        filepath: Path to the UC Riverside candidate HTML file
        
    Returns:
        pandas DataFrame containing current job market candidate information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    candidates = []
    
    # Find the content section
    content = soup.find('div', class_='post-content')
    if not content:
        return pd.DataFrame()
    
    # Find all candidate sections (they are grouped in fusion-builder-row-inner)
    candidate_sections = content.find_all('div', class_='fusion-builder-row-inner')
    
    for section in candidate_sections:
        # Each candidate has their info in a fusion-text div
        info_div = section.find('div', class_='fusion-text')
        if not info_div:
            continue
            
        # Extract candidate information
        paragraphs = info_div.find_all('p')
        if not paragraphs:
            continue
            
        # First paragraph contains name
        name = paragraphs[0].find('strong').text.strip() if paragraphs[0].find('strong') else ''
        
        # Initialize variables
        website = ''
        paper = ''
        references = ''
        fields = ''
        
        # Process remaining paragraphs
        for p in paragraphs[1:]:
            text = p.text.strip()
            if text.startswith('Website:'):
                website = p.find('a')['href'] if p.find('a') else text.replace('Website:', '').strip()
            elif text.startswith('Paper:'):
                paper = text.replace('Paper:', '').strip().strip('"')
            elif text.startswith('References:'):
                references = text.replace('References:', '').strip()
            elif text.startswith('Major Fields:'):
                fields = text.replace('Major Fields:', '').strip()
        
        candidate_info = {
            'name': name,
            'university': 'UC Riverside',
            'year': 2024,  # Current job market year
            'website': website,
            'job_market_paper': paper,
            'references': references,
            'research_fields': fields
        }
        
        candidates.append(candidate_info)
    
    # Create DataFrame
    df = pd.DataFrame(candidates)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'research_fields', 'job_market_paper', 'references', 'website']
    df = df[column_order]
    
    return df

# Test the function
ucr_candidates_df = scrape_ucr_candidates(uc_riverside_candidates)
ucr_candidates_df.head()


Unnamed: 0,name,university,year,research_fields,job_market_paper,references,website
0,Yifei Ding,UC Riverside,2024,"Causal Inference, Econometrics Theory, Machine...",“Deep Learning for Individual Heterogeneity wi...,"Ruoyao Shi, Tae-Hwy Lee, Weixin Yao, Meng Xu, ...",https://yifeiding-ucr.github.io/
1,Ilsoo Han,UC Riverside,2024,"Macroeconomics, Search Theory, Labor Economics.",“Educational-Specific Decompositional Effects ...,"Victor Ortego-Marti, Jang-Ting Guo, Matthew Lang.",http://ilsoohan.weebly.com/
2,Anirudh Iyer,UC Riverside,2024,"Microeconomics, Matching, Market Design.",“Dynamic Matching with Case-Based Agents”,"Siyang Xiong, Haluk Ergin, Hiroki Nishimura.",https://sites.google.com/view/aiyer/
3,Rajveer Jat,UC Riverside,2024,"Econometrics, Machine Learning, Causal Inference.",“Sufficient Instrument Filter.”,"Tae-Hwy Lee, Marcelle Chauvet, Bharat Ramaswam...",https://rajveerjat.com/
4,Nitish Kumar,UC Riverside,2024,"Macroeconomics, Housing Economics, Urban Econo...",“A Search and Matching Model of Housing and Re...,"Victor Ortego-Marti, Jang-Ting Guo, Guillaume ...",https://www.nitish-kumar.com/


UCD

In [8]:
def scrape_ucd_candidates(filepath):
    """
    Scrape current job market candidates from UC Davis's candidate page.
    
    Args:
        filepath: Path to the UC Davis candidate HTML file
        
    Returns:
        pandas DataFrame containing current job market candidate information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    candidates = []
    
    # Find all candidate entries (they are in vm-teaser divs)
    candidate_divs = soup.find_all('article', class_='node node--type-sf-person vm-teaser--grouped vm-teaser')
    
    for div in candidate_divs:
        # Get name from the title
        name_elem = div.find('h3', class_='vm-teaser__title')
        if not name_elem:
            continue
        name = name_elem.find('span').text.strip()
        
        # Get research fields from position list
        fields_elem = div.find('ul', class_='vm-teaser__position')
        fields = fields_elem.find('li').text.strip() if fields_elem else ''
        
        # Get contact information
        contact_list = div.find('ul', class_='vm-teaser__contact')
        email = ''
        website = ''
        if contact_list:
            for li in contact_list.find_all('li'):
                link = li.find('a')
                if not link:
                    continue
                if '@' in link.text:
                    email = link.text.strip()
                elif 'Website' in li.text:
                    website = link['href'].strip()
        
        candidate_info = {
            'name': name,
            'university': 'UC Davis',
            'year': 2024,  # Current job market year
            'research_fields': fields,
            'email': email,
            'website': website
        }
        
        candidates.append(candidate_info)
    
    # Create DataFrame
    df = pd.DataFrame(candidates)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'research_fields', 'email', 'website']
    df = df[column_order]
    
    return df

# Test the function
ucd_candidates_df = scrape_ucd_candidates(uc_davis_candidates)
ucd_candidates_df.head()

Unnamed: 0,name,university,year,research_fields,email,website
0,Ellen Anderson,UC Davis,2024,"PhD Candidate - Development Economics, Economi...",eander@ucdavis.edu,https://www.ellenanderson-economics.com/
1,Kalyani Chaudhuri,UC Davis,2024,"PhD Candidate - Behavioral Economics, Labor Ec...",kalyanic@ucdavis.edu,https://kalyanic.weebly.com/
2,Kevin Dinh,UC Davis,2024,"Ph.D. Candidate - Health Economics, Labor Econ...",kevdinh@ucdavis.edu,https://sites.google.com/view/kevindinh/home?a...
3,Irina Firsova,UC Davis,2024,"Ph.D. Candidate - Environmental Economics, App...",ifirsova@ucdavis.edu,https://www.irinafirsova.com/
4,Tyler S Hoppenfeld,UC Davis,2024,Graduate Student,THoppenfeld@ucdavis.edu,


In [46]:
def scrape_ucd_placement(filepath):
    """
    Scrape placement history from UC Davis's placement history page.
    
    Args:
        filepath: Path to the UC Davis placement history HTML file
        
    Returns:
        pandas DataFrame containing placement history information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    placements = []
    
    # Find the table with class 'table--striped'
    table = soup.find('table', class_='table--striped')
    if not table:
        return pd.DataFrame()
    
    # Process each row in the table
    for row in table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) >= 5:  # Ensure row has all required columns
            last_name = cols[0].text.strip()
            first_name = cols[1].text.strip()
            phd_date = cols[2].text.strip()
            placement_location = cols[3].text.strip()
            placement_role = cols[4].text.strip()
            
            # Extract year from PhD date (format: "Jun-24")
            try:
                year = 2000 + int(phd_date.split('-')[1])
            except:
                year = None
            
            placement_info = {
                'name': f"{first_name} {last_name}",
                'university': 'UC Davis',
                'year': year,
                'placement_role': placement_role,
                'placement_location': placement_location
            }
            
            placements.append(placement_info)
    
    # Create DataFrame
    df = pd.DataFrame(placements)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'placement_role', 'placement_location']
    df = df[column_order]
    df = df.loc[df['year'].isna(), 'year'] = 2019
    
    return df

# Test the function
ucd_placement_df = scrape_ucd_placement(uc_davis_placement)
ucd_placement_df.head()


Unnamed: 0,name,university,year,placement_role,placement_location
0,Alaa Abdelfattah,UC Davis,2024.0,Assistant Professor,Occidental College
1,Seungjin Baek,UC Davis,2024.0,Junior Economist,OECD
2,Reem Zaiour,UC Davis,2024.0,Assistant Professor,Vanderbilt University
3,Jinyoung Seo,UC Davis,2024.0,Assistant Professor,Wake Forest University
4,Baiyu Zhou,UC Davis,2024.0,Senior Associate,Charles River Associates


### Stanford

In [9]:
def scrape_stanford_candidates(filepath):
    """
    Scrape current job market candidates from Stanford's candidate page.
    
    Args:
        filepath: Path to the Stanford candidate HTML file
        
    Returns:
        pandas DataFrame containing current job market candidate information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    candidates = []
    
    # Find all candidate cards
    candidate_cards = soup.find_all('div', class_='hb-card__content')
    
    for card in candidate_cards:
        # Get name from title (h2 within views-field-title)
        name_elem = card.find('div', class_='views-field-title').find('h2')
        if not name_elem or not name_elem.find('a'):
            continue
            
        name = name_elem.find('a').text.strip()
        website = name_elem.find('a')['href']
        
        # Get email (within field-content div)
        email_elem = card.find('div', class_='views-field-field-hs-person-email')
        email = email_elem.find('div', class_='field-content').find('a').text.strip() if email_elem else ''
        
        # Get job market paper
        paper_elem = card.find('div', class_='views-field-custm-hs-job-market-paper')
        paper = paper_elem.find('div', class_='field-content').text.strip() if paper_elem else ''
        
        # Get fields of study
        fields_elem = card.find('div', class_='views-field-field-hs-person-interests')
        fields = fields_elem.find('div', class_='field-content').text.strip() if fields_elem else ''
        
        # Get advisors
        advisors_elem = card.find('div', class_='views-field-custm-advisors')
        advisors = advisors_elem.find('div', class_='field-content').text.strip() if advisors_elem else ''
        
        candidate_info = {
            'name': name,
            'university': 'Stanford',
            'year': 2024,  # From the lead text
            'email': email,
            'research_fields': fields,
            'job_market_paper': paper,
            'advisors': advisors,
            'website': website
        }
        
        candidates.append(candidate_info)
    
    # Create DataFrame
    df = pd.DataFrame(candidates)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'email', 'research_fields', 'job_market_paper', 'advisors', 'website']
    df = df[column_order]
    
    return df

# Test the function
stanford_candidates_df = scrape_stanford_candidates(stanford_candidates)
stanford_candidates_df.head()


Unnamed: 0,name,university,year,email,research_fields,job_market_paper,advisors,website
0,Shifrah Aron-Dine,Stanford,2024,arondine@stanford.edu,"Macroeconomics, Finance, Environmental Economics",Rebuild or Relocate? Recovery after Natural Di...,"Monika Piazzesi (Co-primary), Martin Schneider...",https://www.shifraharondine.com/
1,Matthew Brown,Stanford,2024,mbrown35@stanford.edu,"Public Economics, Behavioral and Experimental",Do Sports Bettors Need Consumer Protection? Ev...,"Matthew Gentzkow, Hunt Allcott, B. Douglas Ber...",https://mattbrownecon.github.io/
2,Ian Calaway,Stanford,2024,icalaway@stanford.edu,"Public Economics, Labor Economics",Early Mentors for Exceptional Students,"Caroline Hoxby, Isaac Sorkin, Nicholas Bloom",https://sites.google.com/view/ian-calaway/home
3,Harsh Gupta,Stanford,2024,hgupta13@stanford.edu,"Industrial Organization, Health Economics",Price Controls with Imperfect Competition and ...,"Matthew Gentzkow (Co-primary), Heidi Williams ...",https://www.harshgupta1311.com/
4,Cedomir Malgieri,Stanford,2024,cedomir@stanford.edu,Macroeconomics,Wage Contracts and Financial Frictions,"Luigi Bocola (Co-primary), Patrick Kehoe (Co-p...",https://www.cedomirmalgieri.com/


In [54]:
def scrape_stanford_placement(filepath):
    """
    Scrape placement data from Stanford's placement page.
    
    Args:
        filepath: Path to the Stanford placement HTML file
        
    Returns:
        pandas DataFrame containing placement information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    placements = []
    
    # Find all tables
    tables = soup.find_all('table', class_='cols-3')
    
    for table in tables:
        # Get year from caption
        year_div = table.find('caption').find('div', class_='text-align-center')
        if year_div:
            try:
                year = int(year_div.text.strip())
            except ValueError:
                continue
        else:
            continue
            
        # Process each row in table body
        for row in table.find('tbody').find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 3:  # Ensure we have all three columns
                name = cols[0].text.strip()
                fields = cols[1].text.strip()
                placement = cols[2].text.strip()
                
                # Split placement into role and location if possible
                # For now treating entire placement as role since structure isn't clear
                placement_info = {
                    'name': name,
                    'university': 'Stanford',
                    'year': year,
                    'research_fields': fields,
                    'placement_role': placement,
                    'placement_location': ''  # Could be parsed from placement if needed
                }
                
                placements.append(placement_info)
    
    # Create DataFrame
    df = pd.DataFrame(placements)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'research_fields', 'placement_role', 'placement_location']
    df = df[column_order]
    df = df.rename(columns={'placement_role': 'placement'})
    df = df[['name', 'university', 'year', 'research_fields', 'placement']]
    return df

# Test the function
stanford_placement_df = scrape_stanford_placement(stanford_placement)
stanford_placement_df.head()

Unnamed: 0,name,university,year,research_fields,placement
0,Maxim Bakhtin,Stanford,2025,"Behavioral Economics, Theory",Analysis Group
1,Joseph Anderson,Stanford,2024,"Innovation, Labor, Industrial Organization","Office of Management and Budget, U.S. Government"
2,Dominique Araya Vergara,Stanford,2024,"Health Economics, Public Economics","Postdoc, Ohio State University"
3,Gonzalo Arrieta,Stanford,2024,Experimental and Behavioral Economics,University of Zurich
4,Yunus C. Aybas,Stanford,2024,Microeconomic Theory,Texas A&M University


### Princeton

In [10]:
def scrape_princeton_candidates(filepath):
    """
    Scrape current job market candidates from Princeton's candidate page.
    
    Args:
        filepath: Path to the Princeton candidate HTML file
        
    Returns:
        pandas DataFrame containing current job market candidate information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    candidates = []
    
    # Find the table containing candidates
    table = soup.find('table')
    if not table:
        return pd.DataFrame()
    
    # Process each row in table body
    for row in table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) == 5:  # Ensure we have all columns
            # Extract image and info from first column
            photo_col = cols[0]
            
            # Extract name and contact info from second column
            contact_col = cols[1]
            name = contact_col.find('strong').text.strip() if contact_col.find('strong') else ''
            
            # Get email and website links
            links = contact_col.find_all('a')
            email = ''
            website = ''
            cv = ''
            for link in links:
                if 'Email' in link.text:
                    email = link['href'].replace('mailto:', '')
                elif 'Website' in link.text:
                    website = link['href']
                elif 'CV' in link.text:
                    cv = link['href']
            
            # Get fields from third column
            fields = cols[2].text.strip()
            
            # Get paper from fourth column
            paper_col = cols[3]
            paper = paper_col.text.strip()
            if paper.startswith('"') and paper.endswith('"'):
                paper = paper[1:-1]  # Remove quotes
                
            # Get references from fifth column
            refs_col = cols[4]
            references = []
            for ref_link in refs_col.find_all('a'):
                references.append(ref_link.text.strip())
            references = ', '.join(references)
            
            candidate_info = {
                'name': name,
                'university': 'Princeton',
                'year': 2024,
                'email': email,
                'website': website,
                'cv': cv,
                'research_fields': fields,
                'job_market_paper': paper,
                'references': references
            }
            
            candidates.append(candidate_info)
    
    # Create DataFrame
    df = pd.DataFrame(candidates)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'email', 'research_fields', 'job_market_paper', 'references', 'website', 'cv']
    df = df[column_order]
    
    return df

# Test the function
princeton_candidates_df = scrape_princeton_candidates(princeton_candidates)
princeton_candidates_df.head()

Unnamed: 0,name,university,year,email,research_fields,job_market_paper,references,website,cv
0,Patrick Agte,Princeton,2024,patrick.agte@yale.edu,Development Economics\nHealth Economics\nIndus...,“Fighting Silent Killers: How India’s Public P...,"Janet Currie, Thomas Fujiwara, Rohini Pande, C...",http://www.patrickagte.com,https://patrickagte.github.io/patrickagte/agte...
1,Anshu Chen,Princeton,2024,anshuc@princeton.edu,Corporate Finance\nMacroeconomics,“Text-Based Industry Classifications and their...,"Moritz Lenel, Ezra Oberfield, Motohiro Yogo",https://sites.google.com/view/anshuchen/home,https://www.dropbox.com/scl/fi/hu83j9qi8lo33d3...
2,Francesco Fabbri,Princeton,2024,ffabbri@princeton.edu,Microeconomic Theory\nBehavioral Economics,“Attention Holdup”\nPietro Ortoleva (advisor),"Pietro Ortoleva, Faruk Gul, Wolfgang Pesendorfer",http://www.francesco-fabbri.com,https://francesco-fabbri.github.io/documents/c...
3,Allison Green,Princeton,2024,aegreen@princeton.edu,Labor Economics\nUrban Economics\nEconomic His...,“Networks and Geographic Mobility: Evidence fr...,"Leah Boustan, Ilyana Kuziemko, Stephen Redding",http://www.allie-green.com,https://allie-e-green.github.io/public/AG_CV.pdf
4,Simon Margolin,Princeton,2024,simon.margolin@princeton.edu,Macroeconomics\nPublic Finance,“Micro vs. Macro Corporate Tax Incidence”\nGia...,"Gianluca Violante, John Grigsby, Richard Rogerson",https://www.simonmargolin.com/,http://simonmargolin.github.io/JM/CV_SimonMarg...


In [63]:
def scrape_princeton_placement(filepath):
    """
    Scrape placement data from Princeton's placement page.
    
    Args:
        filepath: Path to the Princeton placement HTML file
        
    Returns:
        pandas DataFrame containing placement information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    placements = []
    
    # Find the table containing placements
    table = soup.find('table')
    if not table:
        return pd.DataFrame()
        
    # Process each row in table body
    for row in table.find_all('tr')[1:]:  # Skip header row
        # Get data attributes
        year_text = row.get('data-year', '')
        cols = row.find_all('td')
        
        if len(cols) >= 4:
            institution = cols[1].text.strip()
            position = cols[2].text.strip()
            fields = cols[3].text.strip()
            
            # Extract first year from range only if year_text is not empty
            year = int(year_text.split('-')[0]) if year_text else None
            
            placement_info = {
                'name': '',  # Empty column for name as requested
                'university': 'Princeton',
                'year': year,
                'placement_location': institution,
                'placement_role': position,
                'research_fields': fields
            }
            
            placements.append(placement_info)
    
    # Create DataFrame
    df = pd.DataFrame(placements)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'placement_role', 'placement_location', 'research_fields']
    df = df[column_order]
    
    return df

# Test the function
princeton_placement_df = scrape_princeton_placement(princeton_placement)
princeton_placement_df.head()
# No name column

Unnamed: 0,name,university,year,placement_role,placement_location,research_fields
0,,Princeton,,Networks,Two Sigma Investments,Econometrics
1,,Princeton,,Networks,Two Sigma Investments,Finance
2,,Princeton,,Economist,International Monetary Fund,International Trade
3,,Princeton,,Assistant Professor,University of Michigan,Economic Theory
4,,Princeton,,Economist,Amazon,Finance


Northwestern

In [11]:
def scrape_northwestern_candidates(filepath):
    """
    Scrape current job market candidates from Northwestern's candidate page.
    
    Args:
        filepath: Path to the Northwestern candidate HTML file
        
    Returns:
        pandas DataFrame containing current job market candidate information
    """
    def format_name(name):
        """Helper function to convert 'Last, First' to 'First Last'"""
        if ',' in name:
            last, first = name.split(',', 1)
            return f"{first.strip()} {last.strip()}"
        return name
    
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    candidates = []
    
    # Find the table containing candidates
    table = soup.find('table', {'id': 'phd_directory'})
    if not table:
        return pd.DataFrame()
    
    # Process each row in table body
    for row in table.find('tbody').find_all('tr'):
        cols = row.find_all('td')
        if len(cols) == 4:  # Name, Department, Contact, Subfield
            # Get name from first column - handle both span and strong tags
            name_elem = cols[0].find('strong')
            if not name_elem:
                name_elem = cols[0].find('span', class_='name')
            raw_name = name_elem.text.strip() if name_elem else ''
            name = format_name(raw_name)
            
            # Get department
            department = cols[1].text.strip()
            
            # Get contact info (email and webpage)
            contact_col = cols[2]
            email = ''
            website = ''
            for link in contact_col.find_all('a'):
                href = link.get('href', '')
                if 'mailto:' in href:
                    email = href.replace('mailto:', '')
                elif 'http' in href:
                    website = href
            
            # Get research fields
            fields = cols[3].text.strip()
            
            # Get references from hidden details div if it exists
            details_div = cols[0].find('div', class_='details')
            references = ''
            if details_div:
                refs_desc = details_div.find('div', class_='details-description')
                if refs_desc:
                    references = refs_desc.text.strip()
            
            candidate_info = {
                'name': name,
                'university': 'Northwestern',
                'year': 2024,  # Current job market year
                'email': email,
                'website': website,
                'research_fields': fields,
                'references': references
            }
            
            candidates.append(candidate_info)
    
    # Create DataFrame
    df = pd.DataFrame(candidates)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'email', 'research_fields', 'references', 'website']
    df = df[column_order]
    
    return df

# Test the function
northwestern_candidates_df = scrape_northwestern_candidates(northwestern_candidates)
northwestern_candidates_df.head()

Unnamed: 0,name,university,year,email,research_fields,references,website
0,Jose Alvarado,Northwestern,2024,josealvarado2023@u.northwestern.edu,"Macroeconomics, Public Finance, Political Economy",,https://www.josemiguelalvarado.com
1,Samuel Ampaw,Northwestern,2024,samuel.ampaw@kellogg.northwestern.edu,"Health Economics, Development Economics",,https://sites.northwestern.edu/sas61731/
2,Michelle Avataneo,Northwestern,2024,michelle.avataneo@kellogg.northwestern.edu,Microeconomic Theory,,https://www.michelleavataneo.com
3,Michael Cai,Northwestern,2024,michaelcai2025@u.northwestern.edu,"Macroeconomics, Behavioral Economics",,https://sites.northwestern.edu/michaelcai/
4,Kwok Yan Chiu,Northwestern,2024,kwokyan.chiu@u.northwestern.edu,Macroeconomics,,https://sites.northwestern.edu/kyc1728/


In [68]:
def scrape_northwestern_placement(filepath):
    """
    Scrape placement data from Northwestern's placement page.
    
    Args:
        filepath: Path to the Northwestern placement HTML file
        
    Returns:
        pandas DataFrame containing placement information
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    placements = []
    current_year = None
    
    # Find all expander divs that contain placement data
    expanders = soup.find_all('div', class_='expander')
    
    for expander in expanders:
        # Get year from previous h3 tag
        year_header = expander.find_previous_sibling('h3')
        if year_header and year_header.get('id', '').startswith('group-'):
            try:
                current_year = int(year_header.text.strip())
            except ValueError:
                continue
        
        # Process academic placements
        academic_header = expander.find('h4', text='Academic placements')
        if academic_header:
            academic_list = academic_header.find_next('div').find('ul')
            if academic_list:
                for item in academic_list.find_all('li'):
                    placement_info = {
                        'name': '',  # Empty as names not provided
                        'university': 'Northwestern',
                        'year': current_year,
                        'placement_role': 'Academic',
                        'placement_location': item.text.strip(),
                        'research_fields': ''  # Empty as fields not provided
                    }
                    placements.append(placement_info)
        
        # Process government placements
        govt_header = expander.find('h4', text='Government placements')
        if govt_header:
            govt_list = govt_header.find_next('div').find('ul')
            if govt_list:
                for item in govt_list.find_all('li'):
                    placement_info = {
                        'name': '',
                        'university': 'Northwestern',
                        'year': current_year,
                        'placement_role': 'Government',
                        'placement_location': item.text.strip(),
                        'research_fields': ''
                    }
                    placements.append(placement_info)
        
        # Process private industry placements
        private_header = expander.find('h4', text=lambda x: x and 'Private industry' in x)
        if private_header:
            private_list = private_header.find_next('div').find('ul')
            if private_list:
                for item in private_list.find_all('li'):
                    placement_info = {
                        'name': '',
                        'university': 'Northwestern',
                        'year': current_year,
                        'placement_role': 'Private Industry',
                        'placement_location': item.text.strip(),
                        'research_fields': ''
                    }
                    placements.append(placement_info)
    
    # Create DataFrame
    df = pd.DataFrame(placements)
    
    # Reorder columns
    column_order = ['name', 'university', 'year', 'placement_role', 'placement_location', 'research_fields']
    df = df[column_order]
    
    return df

# Test the function
northwestern_placement_df = scrape_northwestern_placement(northwestern_placement)
northwestern_placement_df

  academic_header = expander.find('h4', text='Academic placements')
  govt_header = expander.find('h4', text='Government placements')
  private_header = expander.find('h4', text=lambda x: x and 'Private industry' in x)


Unnamed: 0,name,university,year,placement_role,placement_location,research_fields
0,,Northwestern,2024,Academic,Brown University (postdoc)/University of Notti...,
1,,Northwestern,2024,Academic,Chinese University of Hong Kong (Business School),
2,,Northwestern,2024,Academic,Cornell University,
3,,Northwestern,2024,Academic,"European University Institute, Italy (postdoc)",
4,,Northwestern,2024,Academic,John Hopkins University,
...,...,...,...,...,...,...
274,,Northwestern,2010,Government,"Federal Reserve Bank Board of Governors, Washi...",
275,,Northwestern,2010,Government,"Federal Trade Commission, Washington DC",
276,,Northwestern,2010,Private Industry,"Acumen LLC, San Francisco",
277,,Northwestern,2010,Private Industry,"Deloitte, Transfer Pricing Practice, Chicago",


## Data Cleaning
Now we have scraped all the data, we need to clean it up. First let's see all of them.

In [12]:
ucsb_candidates_df.head(3)

Unnamed: 0,name,university,year,research_areas,advisor,job_market_paper
0,Alexander Abajian,UC Santa Barbara,2024,"Climate Macroeconomics, Energy Economics, Envi...",Javier Birchenall,"""Savings and Migration in a Warming World"""
1,Sebastian Brown,UC Santa Barbara,2024,"Labor Economics, Behavioral Economics, Applied...",Peter Kuhn,"""How Much Can I Make? Insights on Belief Updat..."
2,Toshio Ferrazares,UC Santa Barbara,2024,"Labor Economics, Public Economics",Heather Royer,"""Shift Structure and Cognitive Depletion: Evid..."


In [69]:
ucsb_placement_df.head(3)

Unnamed: 0,university,year,placement_role,placement_location
0,UC Santa Barbara,2023,Postdoctoral Researcher,Arnold Ventures
1,UC Santa Barbara,2023,Gurion University of the Negev - Assistant Pro...,Ben-Gurion University of the Negev
2,UC Santa Barbara,2023,Assistant Professor of Economics,Calvin University


In [96]:
ucsd_candidates_df.head(3)

Unnamed: 0,name,university,advisors,research_fields,personal_website,image
0,Giampaolo Bonomi,UCSD,Joel Sobel and Renee Bowen,"Political Economics, Applied Theory, Behaviora...",https://sites.google.com/view/gbonomi,Giampaolo picture 2.jpeg
1,Steven Brownstone,UCSD,Karthik Muralidharan and Craig McIntosh,Development Economics,https://www.sbrownstone.me,Steven photo.jpg
2,Erica K. Chuang,UCSD,Mark Jacobsen,Environment; Agriculture and Natural Resources,https://www.ericakchuang.com,EricaCnew.jpeg


In [72]:
ucsd_placement_df.head(3)

Unnamed: 0,name,university,year,field,placement
0,Vivan Aluoch,UCSD,2023-24,Development and Labor Economics,Analysis Group
1,Hannah Bae,UCSD,2023-24,Health and Public,"Michigan State University, postdoc at Stanford..."
2,Evgenii Baranov,UCSD,2023-24,Micro Theory,Penn State University


In [112]:
ucd_placement_df.head(3)

Unnamed: 0,name,university,year,placement
0,Alaa Abdelfattah,UC Davis,2024.0,"Assistant Professor, Occidental College"
1,Seungjin Baek,UC Davis,2024.0,"Junior Economist, OECD"
2,Reem Zaiour,UC Davis,2024.0,"Assistant Professor, Vanderbilt University"


In [75]:
ucd_candidates_df.head(3)

Unnamed: 0,name,university,year,research_fields,email,website
0,Ellen Anderson,UC Davis,2024,"PhD Candidate - Development Economics, Economi...",eander@ucdavis.edu,https://www.ellenanderson-economics.com/
1,Kalyani Chaudhuri,UC Davis,2024,"PhD Candidate - Behavioral Economics, Labor Ec...",kalyanic@ucdavis.edu,https://kalyanic.weebly.com/
2,Kevin Dinh,UC Davis,2024,"Ph.D. Candidate - Health Economics, Labor Econ...",kevdinh@ucdavis.edu,https://sites.google.com/view/kevindinh/home?a...


In [74]:
ucsc_candidates_df.head(3)

Unnamed: 0,name,university,advisors,research_fields,job_market_paper,personal_website,image
0,"Braz Vallocci, Pedro Henrique",UC Santa Cruz,Galina HaleGrace GuAlonso Villacorta,"Innovation, Productivity, Finance","""Measuring Knowledge Capital Risk""",https://www.brazv.com,pedro-vallocci.jpg
1,"Gong, Weinan",UC Santa Cruz,Kristian López VargasDan FriedmanNatalia Lazzati,Behavioral and Experimental Economics,"""Polarization in Online Social Networks""",https://sites.google.com/ucsc.edu/weinan-gong?...,wgong.jpeg
2,"Habibi, Hamidreza",UC Santa Cruz,Carlos DobkinLaura GiulianoGeorge Bulman,"Applied Microeconomics, Health Economics, Publ...","""Curbing Pharma Influence: The Effect of Marke...",https://hamidhabibi.com/,profile-hamidreza-habibi.jpg


In [76]:
ucsc_placement_df.head(3)

Unnamed: 0,name,university,year,placement_role,placement_location
0,Harrison Shieh,UC Santa Cruz,2024,Assistant Professor,Vassar College
1,Ken Suzuki,UC Santa Cruz,2024,Post Doctoral Fellow,Stanford University
2,Shinya Inukai,UC Santa Cruz,2024,Deputy Director,"Japanese Ministry of Economy, Trade and Industry"


In [78]:
ucr_candidates_df.head(3)

Unnamed: 0,name,university,year,research_fields,job_market_paper,references,website
0,Yifei Ding,UC Riverside,2024,"Causal Inference, Econometrics Theory, Machine...",“Deep Learning for Individual Heterogeneity wi...,"Ruoyao Shi, Tae-Hwy Lee, Weixin Yao, Meng Xu, ...",https://yifeiding-ucr.github.io/
1,Ilsoo Han,UC Riverside,2024,"Macroeconomics, Search Theory, Labor Economics.",“Educational-Specific Decompositional Effects ...,"Victor Ortego-Marti, Jang-Ting Guo, Matthew Lang.",http://ilsoohan.weebly.com/
2,Anirudh Iyer,UC Riverside,2024,"Microeconomics, Matching, Market Design.",“Dynamic Matching with Case-Based Agents”,"Siyang Xiong, Haluk Ergin, Hiroki Nishimura.",https://sites.google.com/view/aiyer/


In [77]:
ucr_placement_df.head(3)

Unnamed: 0,name,university,year,placement_role,placement_location
0,Pedro Isaac Chavez Lopez,UC Riverside,2024,Economist,Bank of Mexico
1,Da Gong,UC Riverside,2024,"Lecturer, School of Business",SUNY Geneseo
2,Jingyan Guo,UC Riverside,2024,"Associate, Cirque Analytics",Los Angeles


In [79]:
northwestern_candidates_df.head(3)

Unnamed: 0,name,university,year,email,research_fields,references,website
0,Jose Alvarado,Northwestern,2024,josealvarado2023@u.northwestern.edu,"Macroeconomics, Public Finance, Political Economy",,https://www.josemiguelalvarado.com
1,Samuel Ampaw,Northwestern,2024,samuel.ampaw@kellogg.northwestern.edu,"Health Economics, Development Economics",,https://sites.northwestern.edu/sas61731/
2,Michelle Avataneo,Northwestern,2024,michelle.avataneo@kellogg.northwestern.edu,Microeconomic Theory,,https://www.michelleavataneo.com


In [80]:
northwestern_placement_df.head(3)

Unnamed: 0,name,university,year,placement_role,placement_location,research_fields
0,,Northwestern,2024,Academic,Brown University (postdoc)/University of Notti...,
1,,Northwestern,2024,Academic,Chinese University of Hong Kong (Business School),
2,,Northwestern,2024,Academic,Cornell University,


In [81]:
stanford_candidates_df.head(3)

Unnamed: 0,name,university,year,email,research_fields,job_market_paper,advisors,website
0,Shifrah Aron-Dine,Stanford,2024,arondine@stanford.edu,"Macroeconomics, Finance, Environmental Economics",Rebuild or Relocate? Recovery after Natural Di...,"Monika Piazzesi (Co-primary), Martin Schneider...",https://www.shifraharondine.com/
1,Matthew Brown,Stanford,2024,mbrown35@stanford.edu,"Public Economics, Behavioral and Experimental",Do Sports Bettors Need Consumer Protection? Ev...,"Matthew Gentzkow, Hunt Allcott, B. Douglas Ber...",https://mattbrownecon.github.io/
2,Ian Calaway,Stanford,2024,icalaway@stanford.edu,"Public Economics, Labor Economics",Early Mentors for Exceptional Students,"Caroline Hoxby, Isaac Sorkin, Nicholas Bloom",https://sites.google.com/view/ian-calaway/home


In [82]:
stanford_placement_df.head(3)

Unnamed: 0,name,university,year,research_fields,placement
0,Maxim Bakhtin,Stanford,2025,"Behavioral Economics, Theory",Analysis Group
1,Joseph Anderson,Stanford,2024,"Innovation, Labor, Industrial Organization","Office of Management and Budget, U.S. Government"
2,Dominique Araya Vergara,Stanford,2024,"Health Economics, Public Economics","Postdoc, Ohio State University"


In [83]:
princeton_candidates_df.head(3)

Unnamed: 0,name,university,year,email,research_fields,job_market_paper,references,website,cv
0,Patrick Agte,Princeton,2024,patrick.agte@yale.edu,Development Economics\nHealth Economics\nIndus...,“Fighting Silent Killers: How India’s Public P...,"Janet Currie, Thomas Fujiwara, Rohini Pande, C...",http://www.patrickagte.com,https://patrickagte.github.io/patrickagte/agte...
1,Anshu Chen,Princeton,2024,anshuc@princeton.edu,Corporate Finance\nMacroeconomics,“Text-Based Industry Classifications and their...,"Moritz Lenel, Ezra Oberfield, Motohiro Yogo",https://sites.google.com/view/anshuchen/home,https://www.dropbox.com/scl/fi/hu83j9qi8lo33d3...
2,Francesco Fabbri,Princeton,2024,ffabbri@princeton.edu,Microeconomic Theory\nBehavioral Economics,“Attention Holdup”\nPietro Ortoleva (advisor),"Pietro Ortoleva, Faruk Gul, Wolfgang Pesendorfer",http://www.francesco-fabbri.com,https://francesco-fabbri.github.io/documents/c...


In [84]:
princeton_placement_df.head(3)


Unnamed: 0,name,university,year,placement_role,placement_location,research_fields
0,,Princeton,,Networks,Two Sigma Investments,Econometrics
1,,Princeton,,Networks,Two Sigma Investments,Finance
2,,Princeton,,Economist,International Monetary Fund,International Trade


Things we need to analyze at the end:
- Trend of number of candidates
- Trend of number of placements per field
- Trend of number of candidates by gender
    - Trend of gender composition
- movement of university (USNews)
- advisor's gender (on academic)
    - women advised by  women
    - men advised by men
    - so on
    - create a matrix, table for this

Variables that we need:
- Field
- Gender
- College
- Year
- advisor

Two dataset that are not usable:
- northwestern_placement_df
- princeton_placement_df
- ucsb_placement_df


Let's first combine all the placement data into one dataframe:

In [15]:
candidate_dfs = [
    ucsd_candidates_df,
    ucsb_candidates_df,
    ucr_candidates_df,
    ucd_candidates_df,
    stanford_candidates_df,
    princeton_candidates_df,
    northwestern_candidates_df
]

In [14]:
placement_dfs = [
    ucsd_placement_df,
    ucr_placement_df,
    ucd_placement_df,
    stanford_placement_df,
]

candidate_dfs = [
    ucsd_candidates_df,
    ucsb_candidates_df,
    ucr_candidates_df,
    ucd_candidates_df,
    stanford_candidates_df,
    princeton_candidates_df,
    northwestern_candidates_df
]


NameError: name 'ucsd_placement_df' is not defined

In [16]:
candidate_df = pd.concat(candidate_dfs)
candidate_df.head(3) # We don't really conduct anlaysis on this other than seeing the number increase

Unnamed: 0,name,university,advisors,research_fields,personal_website,image,year,research_areas,advisor,job_market_paper,references,website,email,cv
0,Giampaolo Bonomi,UCSD,Joel Sobel and Renee Bowen,"Political Economics, Applied Theory, Behaviora...",https://sites.google.com/view/gbonomi,Giampaolo picture 2.jpeg,,,,,,,,
1,Steven Brownstone,UCSD,Karthik Muralidharan and Craig McIntosh,Development Economics,https://www.sbrownstone.me,Steven photo.jpg,,,,,,,,
2,Erica K. Chuang,UCSD,Mark Jacobsen,Environment; Agriculture and Natural Resources,https://www.ericakchuang.com,EricaCnew.jpeg,,,,,,,,


In [17]:
candidate_df.to_csv('data/candidate_df.csv', index=False)

In [145]:
for df in placement_dfs:
    if 'placement_role' in df.columns and 'placement_location' in df.columns:
        df['placement'] = df['placement_role'] + ', ' + df['placement_location']
        df.drop(columns=['placement_role', 'placement_location'], inplace=True)

placement_df = pd.concat(placement_dfs)
placement_df.head(20)


Unnamed: 0,name,university,year,field,placement,research_fields
0,Vivan Aluoch,UCSD,2023-24,Development and Labor Economics,Analysis Group,
1,Hannah Bae,UCSD,2023-24,Health and Public,"Michigan State University, postdoc at Stanford...",
2,Evgenii Baranov,UCSD,2023-24,Micro Theory,Penn State University,
3,Amanda Bonheur,UCSD,2023-24,Behavioral and Labor Economics,RAND Corporation,
4,Edoardo Briganti,UCSD,2023-24,Macroeconomics,Bank of Canada,
5,Tjeerd de Vries,UCSD,2023-24,Finance and Econometrics,"HEC Paris, Finance Department",
6,Tanner Eastmond,UCSD,2023-24,Labor Economics,Brigham Young University,
7,Stefan Faridani,UCSD,2023-24,Econometrics,Georgia Tech,
8,Carlos Goes,UCSD,2023-24,International Trade,World Bank Group,
9,Zachary Hall,UCSD,2023-24,Applied Micro,Western Alliance Bank,


Let's work on changing all the year to integer, adding the type of placement(academic, private, government), also adding the gender of the candidate through using gender guesser.

In [147]:
# Changing year to integer
placement_df['year'] = placement_df['year'].apply(lambda x: int(str(x).split('-')[0]) if isinstance(x, str) and '-' in x else x)
placement_df['year'].unique()

array([2023., 2022., 2021., 2020., 2019., 2018., 2017., 2016., 2015.,
       2024., 2014., 2013., 2012., 2011., 2010., 2009., 2008., 2007.,
       2006., 2005., 2004., 2003., 2002., 2001., 2000., 1999., 2025.])

In [148]:
# This whole block is to classify the placement into academic, private_company, or government categories.
def classify_placement(placement_text):
    """
    Classify placement into academic, private_company, or government categories.
    
    Args:
        placement_text: String containing the placement information
        
    Returns:
        Dictionary with binary indicators for each category
    """
    placement_text = str(placement_text).lower()
    
    # Initialize all categories to 0
    result = {
        'academic': 0,
        'private_company': 0,
        'government': 0
    }
    
    # Academic keywords
    academic_keywords = {
        'university', 'college', 'professor', 'faculty', 'postdoc', 'post-doc',
        'lecturer', 'assistant prof', 'school', 'department of economics',
        'institute', 'post doctoral', 'visiting', 'research fellow',
        'business school', 'teaching'
    }
    
    # Government keywords
    government_keywords = {
        'fed', 'federal reserve', 'imf', 'world bank', 'treasury', 'department of',
        'ministry', 'central bank', 'government', 'agency', 'bureau', 'board of governors',
        'congressional', 'national', 'federal', 'usda', 'epa', 'doj', 'ftc', 'fda',
        'census', 'bank of canada', 'bank of korea', 'bank of england',
        'office of', 'administration', 'policy', 'public'
    }
    
    # Private company keywords
    private_keywords = {
        'inc', 'corp', 'llc', 'consulting', 'bank', 'capital', 'analytics',
        'group', 'company', 'associates', 'pwc', 'deloitte', 'amazon', 'google',
        'microsoft', 'jpmorgan', 'chase', 'goldman sachs', 'morgan stanley',
        'analysis group', 'cornerstone', 'research', 'private', 'industry'
    }
    
    # Check for matches
    if any(keyword in placement_text for keyword in academic_keywords):
        result['academic'] = 1
    if any(keyword in placement_text for keyword in government_keywords):
        result['government'] = 1
    if any(keyword in placement_text for keyword in private_keywords):
        result['private_company'] = 1
        
    # Special cases where we need to override
    if 'world bank' in placement_text or 'imf' in placement_text:
        result['private_company'] = 0
        result['government'] = 1
    
    # Research institutes that aren't universities should be academic
    if 'institute' in placement_text and not any(gov in placement_text for gov in ['federal', 'government']):
        result['academic'] = 1
        result['private_company'] = 0
    
    return result

# Example usage:
# Apply to DataFrame
def add_placement_classifications(df):
    """
    Add placement classification columns to DataFrame.
    
    Args:
        df: DataFrame containing 'placement_location' column
        
    Returns:
        DataFrame with added classification columns
    """
    classifications = df['placement'].apply(classify_placement)
    
    # Add new columns
    df['academic'] = classifications.apply(lambda x: x['academic'])
    df['private_company'] = classifications.apply(lambda x: x['private_company'])
    df['government'] = classifications.apply(lambda x: x['government'])
    
    return df

In [149]:
placement_df = add_placement_classifications(placement_df)
placement_df.head()

Unnamed: 0,name,university,year,field,placement,research_fields,academic,private_company,government
0,Vivan Aluoch,UCSD,2023.0,Development and Labor Economics,Analysis Group,,0,1,0
1,Hannah Bae,UCSD,2023.0,Health and Public,"Michigan State University, postdoc at Stanford...",,1,0,0
2,Evgenii Baranov,UCSD,2023.0,Micro Theory,Penn State University,,1,0,0
3,Amanda Bonheur,UCSD,2023.0,Behavioral and Labor Economics,RAND Corporation,,0,1,0
4,Edoardo Briganti,UCSD,2023.0,Macroeconomics,Bank of Canada,,0,1,1


In [150]:
placement_df[['academic', 'private_company', 'government']].sum()

academic           404
private_company    118
government         112
dtype: int64

In [151]:
# Now let's add the gender of the candidate
import gender_guesser.detector as gender

d = gender.Detector()
placement_df['gender_guess'] = placement_df['name'].apply(lambda x: d.get_gender(str(x).split()[0]))
placement_df['gender_guess'].value_counts()


gender_guess
male             236
unknown          178
female           110
andy              74
mostly_male       22
mostly_female     14
Name: count, dtype: int64

In [152]:
# Update gender_guess column
placement_df['gender_guess'] = placement_df['gender_guess'].replace({
    'mostly_male': 'male',
    'mostly_female': 'female'
})

# Extract 'andy' and 'unknown' genders for further analysis on gender
andy_unknown_list = placement_df[placement_df['gender_guess'].isin(['andy', 'unknown'])]['name'].tolist()
with open('gender_classify.txt', 'w') as file:
    for name in andy_unknown_list:
        file.write(f"{name}\n")

In [185]:
# Here is a deepseek R1_search classified dictionary of the gender. It is especially useful here since most names here are chinese names,
# it is perfect to use this Chinese LLM to classify the gender,
# But please note that this does not guarantee the accuracy of the gender classification::
gender_classification = {
    # Confirmed via search results
    "Songyu He": "female",  # Ph.D. candidate at UC San Diego :cite[1]:cite[3]
    "Jinhyeon Han": "male",  # Korean name conventions (Jinhyeon typically male)

    # Classified via cultural/linguistic patterns
    "Xiameng Hua": "female",
    "Sabareesh Ramachandran": "male",
    "Hanyi Wang": "female",
    "Bei Luo": "female",
    "Haitian Xie": "male",
    "Minki Kim": "male",
    "Wanchang Zhang": "male",
    "Wei-Lin Chen": "male",
    "Yu-Chang Chen": "male",
    "Zhiyun Jiang": "female",
    "Youngju Lee": "female",
    "Jianan Yang": "male",
    "Linyan Zhu": "female",
    "Yibun Liu": "male",
    "Xiao Ma": "female",
    "Kye Lippold": "male",
    "Nobu Nakazawa": "male",
    "Yang Wang": "male",
    "Jiajun Lu": "male",
    "Runjing Lu": "female",
    "Chu Alex Yu": "male",
    "Shihan Xie": "male",
    "Shuning Mao": "female",
    "Yuehui Amber Wang": "female",
    "Dodge Cahan": "male",
    "Seung-Keun Martinez": "male",
    "Vinayak Alladi": "male",
    "Wenxin Xie": "female",
    "Yaein Baek": "female",
    "Fanglin Sun": "female",
    "Xueying Lu": "female",
    "Yanjun Liao": "male",
    "Chen Liu": "male",
    "Xu Zhang": "male",
    "Ying Feng": "female",
    "Eul Noh": "male",
    "Yifei Lyu": "female",
    "Ce Liu": "male",
    "Xiaxin Wang": "female",
    "Zhenting Sun": "male",
    "Pedram Heydari": "male",
    "Xuan Ding": "female",
    "Onyi Lam": "female",
    "Sieuwerd Gaastra": "male",
    "Wei You": "male",
    "Shanthi Manian": "female",
    "Qihui Chen": "male",
    "Yinchu Zhu": "male",
    "Jue Wang": "female",
    "Wenbin Wu": "male",
    "Zheng Huang": "male",
    "Jungbin Hwang": "male",
    "Da Gong": "male",
    "Jingyan Guo": "female",
    "Dayang Li": "female",
    "Xinchan Lu": "female",
    "Andong Yan": "male",
    "Yongli Chen": "male",
    "Opinder Kaur": "female",
    "Yaojue Xu": "male",
    "Tao Wang": "male",
    "Yanchao Yang": "male",
    "Zhuozhen Zhao": "female",
    "Quanfeng Zhou": "male",
    "Chia-Lo Chen": "male",
    "Mirewuti Muhetaer": "male",
    "Xiaolu Zhu": "female",
    "Seolah Kim": "female",
    "Yoon Jae Ro": "male",
    "Peng Zhao": "male",
    "Jianghao Chu": "male",
    "Mingyuan Jia": "male",
    "Yun Luo": "female",  # "Yun" often gender-neutral but leans female in modern Chinese
    "Yi Mao": "male",  # "Yi" commonly male in Chinese
    "Hanbyul Ryu": "male",  # Korean "-byul" (star) is unisex, but "Han" leans male
    "Shiyun Zhang": "female",  # "Shiyun" has soft phonetic cues (common in female names)
    "Bo-Yu Chen": "male",  # "Bo" (博, scholarly) and "Yu" (宇, universe) are male markers
    "Tasneem Raihan": "female",  # "Tasneem" is a feminine Arabic name
    "He Wang": "male",  # "He" (和, harmony) is neutral but often male in compound names
    "Hao Xu": "male",  # "Hao" (豪, heroic) is strongly male
    "Zhi Zhao": "male",  # "Zhi" (智, wisdom) is unisex but more common for males
    "Anaka Aiyar": "female",  # "Anaka" is a feminine Sanskrit name
    "Najrin Khanom": "female",  # "Khanom" is an honorific for women in Persian/Turkic cultures
    "Xuefeng Pan": "male",  # "Feng" (峰, peak) is typically male
    "Wen Kong": "male",  # "Wen" (文, literature) is neutral but leans male
    "Huiling Zhang": "female",  # "-ling" (玲, jade sound) is common in female names
    "Dong Zhou": "male",  # "Dong" (东, east) is male-leaning
    "Zhiming Fu": "male",  # "Ming" (明, bright) is unisex but often male
    "Cheng Jiang": "male",  # "Cheng" (成, achieve) is male-leaning
    "Mingming Jiang": "male",  # Double "Ming" reinforces male association
    "Dongpeng Liu": "male",  # "Peng" (鹏, mythical bird) is male
    "Huansha Wang": "female",  # "Sha" (莎, sand) is common in female names
    "Jie Wei": "female",  # "Jie" (洁, purity) is often female
    "Ru Zhang": "female",  # "Ru" (茹, eat/herb) leans female
    "Venoo Kakar": "female",  # "Venoo" is a feminine Indian/Pakistani name
    "Wei Lin": "male",  # "Wei" (伟, greatness) is strongly male
    "Mi Lu": "female",  # "Mi" (蜜, honey) is feminine
    "Yingying Sun": "female",  # Reduplicated "Ying" (莹, lustrous) is female
    "Zhou Xi": "male",  # "Xi" (熙, brightness) is male-leaning
    "Yundong Tu": "male",  # "Dong" (东, east) reinforces male
    "Yun Wang": "female",  # "Yun" (云, cloud) is unisex but leans female
    "Xiangbo Liu": "male",  # "Bo" (博, scholarly) is male
    "Getachew Nigatu": "male",  # Ethiopian name; "Getachew" is male
    "Hung-Lin Chen": "male",  # "Hung" (宏, vast) and "Lin" (林, forest) are male
    "Insu Kim": "male",  # Korean "In-su" (仁秀) is typically male
    "Wing Yu Leung": "female",  # Cantonese "Wing" (咏, sing) leans female
    "Jie Li": "female",  # "Jie" (洁, purity) is female
    "Arindam Nandi": "male",  # Bengali name; "Arindam" (অরিন্দম) is male
    "Weiqiang Qian": "male",  # "Qiang" (强, strong) is male
    "Jaehee Son": "female",  # Korean "Jae-hee" (재희) is often female
    "Xiaoyu Wu": "female",  # "Xiaoyu" (小雨, light rain) is female-leaning
    "Lopamudra Banerjee": "female",  # "Lopamudra" is a feminine Sanskrit name
    "Subhadip Chattopadhyay": "male",  # Bengali "Subhadip" (শুভদীপ) is male
    "Huiyu Huang": "female",  # "Huiyu" (慧瑜) includes "hui" (wisdom), often female
    "Sucharita Sinha": "female",  # "Sucharita" is a feminine Sanskrit name
    "Anirban Dasgupta": "male",  # Bengali "Anirban" (অনির্বাণ) is male
    "Shatakshee Dhongde": "female",  # "Shatakshee" is a feminine Indian name
    "Yang Yang": "male",  # Double "Yang" (阳, sun/male) reinforces male
    "Xiao Huang": "female",  # "Xiao" (小, little) is neutral but leans female
    "Xiangdong Long": "male",  # "Dong" (东, east) is male
    "Wei Sun": "male",  # "Wei" (伟, greatness) is male
    "Weiping Yang": "male",  # "Wei" + "Ping" (平, peace) is male-leaning
    "Witsaroot Pariyaprasert": "male",  # Thai "Witsaroot" (วิทยารุทธิ์) is male
    "Savvina Chowdhury": "female",  # "Savvina" is a Greek feminine name
    "Sushama Murty": "female",  # "Sushama" is a feminine Sanskrit name
    "Fang Dong": "female",  # "Fang" (芳, fragrant) is female
    "Debasri Mukherjee": "female",  # "Debasri" is a feminine Bengali name
    "Gurleen Popli": "female",  # "Gurleen" is a female Punjabi name
    "Indranil Dutta": "male",  # Bengali "Indranil" (ইন্দ্রনীল) is male
    "Chengxuan Yu": "male",  # "Xuan" (轩, lofty) is male
    "Subodh Kumar": "male",  # "Subodh" (सुबोध) is a male Sanskrit name
    "Anjan Chakrabarti": "male",  # Bengali "Anjan" (অঞ্জন) is male
    "Achin Chakrabarti": "male",  # Bengali "Achin" (অচিন) is male
    "Indraneel Dasgupta": "male",  # Bengali "Indraneel" (ইন্দ্রনীল) is male
    "Mwangi wa Githinji": "male",  # Kikuyu (Kenyan) name; "Mwangi" is male
    "Chiehwei Hung": "male",  # "Wei" (偉, greatness) reinforces male
    "Insong Jang": "male",  # Korean "In-song" (인송) is male
    "Nilanjana Roy": "female",  # "Nilanjana" is a feminine Bengali name
    "Rong-Chang Wu": "male",  # "Rong" (荣, glory) is male-leaning
    "Zhong-Guo Zhou": "male",  # "Zhongguo" (中国, China) is gender-neutral but contextually male
    "Seungjin Baek": "male",  # Korean "Seung-jin" (승진) is male
    "Jinyoung Seo": "male",  # Korean "Jin-young" (진영) is typically male
    "Baiyu Zhou": "female",         # "Yu" (玉, jade) often female, but contextually unisex
    "Jou-Chun Lin": "female",       # "Chun" (春, spring) leans female
    "Eunju Lee": "female",          # Korean "Eun-ju" (은주) typically female
    "Deokjay Jeong": "male",        # Korean "Deok" (덕, virtue) male
    "Yumeng Gu": "female",          # Chinese "Meng" (萌, budding) female
    "Yijing Wang": "female",        # "Yijing" (怡静, pleasant/quiet) female
    "Ninghui Li": "female",         # "Hui" (慧, wise) female
    "Manho Kang": "male",           # Korean "Man-ho" (만호) male
    "Hanguo Huang": "male",         # "Guo" (国, nation) male
    "Thiago DE LUCENA COELHO": "male",  # Western/Portuguese male
    "Minsu KIM": "male",            # Korean "Min-su" (민수) male
    "Hyok Jung KIM": "male",        # Korean "Hyok" (혁, revolutionary) male
    "Mingxi LI": "male",            # "Ming" (明, bright) male
    "Iwunze UGO": "male",           # Igbo (Nigerian) "Ugo" male
    "Rizki SIREGAR": "male",        # Indonesian "Rizki" male
    "Tamoghna HALDER": "male",      # Bengali "Tamoghna" male
    "Sukjoon LEE": "male",          # Korean "Suk-joon" (석준) male
    "Haopeng SHEN": "male",         # "Hao" (豪, heroic) male
    "Hang ZHOU": "male",            # Contextually male (common usage)
    "Dakyung SEONG": "male",        # Korean "Da-kyung" (다경) male
    "Zhixian LIN": "male",          # "Zhi" (智, wisdom) male
    "Xuan FEI": "female",           # "Xuan" (萱, daylily) female
    "Tingting ZHU": "female",       # Reduplicated "Ting" (婷, graceful) female
    "Mingzhi XU": "male",           # "Ming" (明, bright) male
    "Tian XIA": "male",             # "Tian" (天, sky) male
    "Jaerim CHOI": "male",          # Korean "Jae-rim" (재림) male
    "Chuan HE": "male",             # "Chuan" (川, river) male
    "Chenghao HU": "male",          # "Hao" (豪, heroic) male
    "Chi-Yuan TSAI": "male",        # "Yuan" (元, origin) male
    "Yujung SUH": "female",         # Korean "Yu-jung" (유정) female
    "Qi HAN": "female",             # "Qi" (琪, fine jade) female
    "Jiwon LEE": "female",          # Korean "Ji-won" (지원) often female
    "Yingxue LI": "female",         # "Ying" (莹, lustrous) female
    "Zhe YANG": "male",             # "Zhe" (哲, philosophy) male
    "Lijuan YIN": "female",         # "Juan" (娟, graceful) female
    "Jongkwan LEE": "male",         # Korean "Jong-kwan" (종관) male
    "Seungduck LEE": "male",        # Korean "Seung" (승) male
    "Tao LIU": "male",              # "Tao" (涛, wave) male
    "Jae Wook JUNG": "male",        # Korean "Jae-wook" (재욱) male
    "Ji Hyun PARK": "female",       # Korean "Ji-hyun" (지현) female
    "Na'ama SHENHAV": "female",     # Hebrew "Na'ama" female
    "Wenjun MA": "male",            # "Wenjun" (文君, scholarly) male
    "Angsoka PAUNDRALINGGA": "female",  # Indonesian "Angsoka" (flower) female
    "Kyunghun KIM": "male",         # Korean "Kyung-hun" (경훈) male
    "Xiaohan ZHANG": "male",        # "Xiao" (晓, dawn) male-leaning
    "Hang-Wei HAO": "male",         # "Wei" (伟, greatness) male
    "Kuk Mo JUNG": "male",          # Korean "Kuk-mo" (국모) male
    "Guojun WANG": "male",          # "Guojun" (国军, national army) male
    "Chi-Hung LIAO": "male",        # "Hung" (宏, vast) male
    "Yu HAO": "male",               # "Hao" (豪, heroic) male
    "Yoon-Kyung CHUNG": "female",   # Korean "Yoon-kyung" (윤경) female
    "Hyung Suk KIM": "male",        # Korean "Hyung-suk" (형석) male
    "Ankur PATEL": "male",          # Indian "Ankur" (अंकुर) male
    "Shaofeng XU": "male",          # "Feng" (峰, peak) male
    "Teny MAGHAKIAN": "male",       # Armenian "Teny" male
    "Ju Hyun PYUN": "female",       # Korean "Ju-hyun" (주현) female
    "Rebbecca REED-ARTHURS": "female",  # Western female
    "Liugang SHENG": "male",        # "Gang" (刚, strong) male
    "Abhijit TALATHI": "male",      # Indian "Abhijit" (अभिजित्) male
    "Yi CHEN": "male",              # "Yi" (毅, resolute) male
    "Chia-Wen CHEN": "male",        # "Wen" (文, literature) male
    "Changho CHOI": "male",         # Korean "Chang-ho" (창호) male
    "Zhiyuan LI": "male",           # "Zhiyuan" (志远, ambitious) male
    "Weijun HU": "male",            # "Wei" (伟, greatness) male
    "Meixin GUO": "female",         # "Mei" (美, beautiful) female
    "Ling FENG": "female",          # "Ling" (玲, jade sound) female
    "Yanping CHONG": "female",      # "Yan" (艳, gorgeous) female
    "Shih-Wei CHAO": "male",        # "Wei" (伟, greatness) male
    "Sun Go": "male",               # Korean "Sun-go" (선고) male
    "Hong Ma": "male",              # "Hong" (洪, flood) male
    "Yuan Xu": "male",              # "Yuan" (元, origin) male
    "Chang Seon LEE": "male",       # Korean "Chang-seon" (창선) male
    "Seungjoon LEE": "male",        # Korean "Seung-joon" (승준) male
    "Chun WANG": "female",          # "Chun" (春, spring) female-leaning
    "DaeHwan KIM": "male",          # Korean "Dae-hwan" (대환) male
    "Ching-Yi LIN": "female",       # "Yi" (怡, joy) female
    "Kyungwon RHO": "male",         # Korean "Kyung-won" (경원) male
    "Yingying XU": "female",        # Reduplicated "Ying" (莹) female
    "Sunhwa LEE": "female",         # Korean "Sun-hwa" (선화) female
    "Piyachart PHIROMSWAD": "male", # Thai "Piyachart" male
    "Jing TONG": "female",          # "Jing" (静, calm) female
    "Huiran PAN": "female",         # "Hui" (慧, wise) female
    "Wei-Min HU": "male",           # "Wei" (伟, greatness) male
    "Sung Ju SONG": "male",         # Korean "Sung-ju" (성주) male
    "Huiya (Grace) CHEN": "female", # "Hui" (慧, wise) female
    "Uluc AYSUN": "male",           # Turkish "Uluç" male
    "Chunchih CHEN": "male",        # "Chih" (志, ambition) male
    "Chang HONG": "male",           # "Chang" (昌, prosperous) male
    "Ysbrand VAN DER WERF": "male", # Dutch "Ysbrand" male
    "Hyung-Cheol SHIN": "male",     # Korean "Hyung-cheol" (형철) male
    "Kyuil CHUNG": "male",          # Korean "Kyu-il" (규일) male
    "Miaojie YU": "female",         # "Miao" (妙, wonderful) female
    "Hai Yan DENG": "female",       # "Yan" (燕, swallow) female
    "Dae-Wook KIM": "male",         # Korean "Dae-wook" (대욱) male
    "Seung-Cheol JEON": "male",     # Korean "Seung-cheol" (승철) male
    "Ping-Hsuan (Loretta) FUNG": "female",  # "Ping-Hsuan" + Loretta (female)
    "Songhua LIN": "male",          # "Hua" (华, splendor) male-leaning
    "Karou NABESHIMA": "male",      # Japanese "Karou" (カロウ) male
    "Inho CHUNG": "male",           # Korean "In-ho" (인호) male
    "Hiau-Looi ('Kee') KEE": "male",  # Chinese "Looi" male
    "Dominique Araya Vergara": "female",  # Contextually female (common usage)
    "Jean-Felix Brouillette": "male",  # French male
    "Suhani Jalota": "female",      # Indian "Suhani" (सुहानी) female
    "Ziao Ju": "male",              # Contextually male (phonetic cues)
    "Xinyao Qiu": "female",         # "Xinyao" (欣瑶, joyful jade) female
    "Zhaonan Qu": "male",           # "Nan" (男, male) explicit marker
    "Xuejie Yi": "female",          # "Xue" (雪, snow) female
    "Chuan Yu": "male",             # "Chuan" (川, river) male
    "Subhadip Chattopadhyay": "male",
    "Hiau-Looi ('Kee') KEE": "female"
}



In [188]:
placement_df['gender'] = placement_df['gender_guess']

# Update gender based on the classification dictionary
for name, gender in gender_classification.items():
    placement_df.loc[placement_df['name'] == name, 'gender'] = gender

# Update specific indices based on known gender information
placement_df.loc[120, 'gender'] = 'male'
placement_df.loc[250, 'gender'] = 'female'
placement_df[placement_df['gender'] == 'unknown']



Unnamed: 0,name,university,year,field,placement,research_fields,academic,private_company,government,gender_guess,gender


In [189]:
placement_df.to_csv('analyzed_data/placement_df.csv')

## Data Analysis

In [9]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np

In [10]:
#Check point, load the data from here
placement_df = pd.read_csv('../data/analyzed_data/placement_df.csv')
candidate_df = pd.read_csv('../data/analyzed_data/candidate_df.csv')

### 1. Trend of number of candidates



In [13]:
import plotly.express as px
import plotly.io as pio

# Group the data by year and count the number of candidates
yearly_counts = placement_df[placement_df['year'] != 2025].groupby('year').size().reset_index(name='count')

# Create a line plot using Plotly
fig = px.line(yearly_counts, x='year', y='count', title='Trend of Number of Candidates', labels={'year': 'Year', 'count': 'Number of Candidates'})

# Save the plot as an HTML file
pio.write_html(fig, file='../output/candidates_trend.html')

# Display the plot in notebook
fig.show()

### 2. Trend by Placement Type

In [15]:
import plotly.express as px

# Group the data by year and academic, then count the number of candidates
yearly_academic_counts = placement_df.groupby(['year', 'academic']).size().reset_index(name='count')
yearly_academic_counts = yearly_academic_counts[yearly_academic_counts['academic'] == 1]

# Create a bar plot for academic placements
fig_academic = px.bar(yearly_academic_counts, x='year', y='count', 
                      title='Trend of Number of Academic Placements by Year',
                      labels={'year': 'Year', 'count': 'Number of Academic Placements'},
                      color_discrete_sequence=['#1f77b4'])  # Blue

fig_academic.update_layout(
    xaxis_title='Year',
    yaxis_title='Number of Academic Placements',
    template='plotly_white'
)

fig_academic.show()

# Group the data by year and private_company, then count the number of candidates
yearly_private_counts = placement_df.groupby(['year', 'private_company']).size().reset_index(name='count')
yearly_private_counts = yearly_private_counts[yearly_private_counts['private_company'] == 1]

# Create a bar plot for private company placements
fig_private = px.bar(yearly_private_counts, x='year', y='count', 
                     title='Trend of Number of Private Company Placements by Year',
                     labels={'year': 'Year', 'count': 'Number of Private Company Placements'},
                     color_discrete_sequence=['#ff7f0e'])  # Orange

fig_private.update_layout(
    xaxis_title='Year',
    yaxis_title='Number of Private Company Placements',
    template='plotly_white'
)

fig_private.show()

# Group the data by year and government, then count the number of candidates
yearly_government_counts = placement_df.groupby(['year', 'government']).size().reset_index(name='count')
yearly_government_counts = yearly_government_counts[yearly_government_counts['government'] == 1]

# Create a bar plot for government placements
fig_government = px.bar(yearly_government_counts, x='year', y='count', 
                        title='Trend of Number of Government Placements by Year',
                        labels={'year': 'Year', 'count': 'Number of Government Placements'},
                        color_discrete_sequence=['#2ca02c'])  # Green

fig_government.update_layout(
    xaxis_title='Year',
    yaxis_title='Number of Government Placements',
    template='plotly_white'
)

fig_government.show()

# Create a stacked bar plot for all placement types
yearly_combined_counts = placement_df.groupby('year').agg(
    academic_count=('academic', 'sum'),
    private_count=('private_company', 'sum'),
    government_count=('government', 'sum')
).reset_index()

fig_combined = px.bar(yearly_combined_counts, x='year', y=['academic_count', 'private_count', 'government_count'], 
                      title='Number of Jobs in Each Field by Year',
                      labels={'year': 'Year', 'value': 'Number of Jobs', 'variable': 'Field'},
                      barmode='stack',
                      color_discrete_sequence=['#1f77b4', '#ff7f0e', '#2ca02c'])  # Blue, Orange, Green

fig_combined.update_layout(
    xaxis_title='Year',
    yaxis_title='Number of Jobs',
    template='plotly_white'
)

fig_combined.show()

In [16]:
import plotly.express as px

# Prepare the data for the pie chart by melting the yearly counts into long format
yearly_combined_counts = placement_df.groupby('year').agg(
    academic_count=('academic', 'sum'),
    private_count=('private_company', 'sum'),
    government_count=('government', 'sum')
).reset_index()

# Filter out years 1999 and 2025
yearly_combined_counts = yearly_combined_counts[~yearly_combined_counts['year'].isin([1999, 2025])]

# Melt the data to get it in the right format for the pie chart
melted_counts = yearly_combined_counts.melt(
    id_vars=['year'],
    value_vars=['academic_count', 'private_count', 'government_count'],
    var_name='placement_type',
    value_name='count'
)

# Map the column names to display names
melted_counts['placement_type'] = melted_counts['placement_type'].map({
    'academic_count': 'Academic',
    'private_count': 'Private Company', 
    'government_count': 'Government'
})

# Create the pie chart
fig_pie = px.pie(
    melted_counts[melted_counts['year'] == melted_counts['year'].max()], # Start with most recent year
    values='count',
    names='placement_type',
    title='Distribution of Placements by Type',
    color_discrete_sequence=['#1f77b4', '#ff7f0e', '#2ca02c']  # Blue, Orange, Green
)

# Add a slider for the year
fig_pie.update_traces(
    selector=dict(type='pie'),
    pull=[0.1, 0, 0]  # Slightly pull out the academic slice for emphasis
)

# Create frames for animation
frames = [
    dict(
        data=[dict(
            type='pie',
            values=melted_counts[melted_counts['year'] == year].sort_values('placement_type')['count'],
            labels=sorted(['Academic', 'Private Company', 'Government'])
        )],
        name=str(int(year))
    )
    for year in sorted(melted_counts['year'].unique())
]

# Add slider
sliders = [dict(
    active=len(frames)-1,
    currentvalue={"prefix": "Year: "},
    pad={"t": 50},
    steps=[dict(
        label=str(int(year)),
        method='animate',
        args=[[str(int(year))], dict(
            frame=dict(duration=0, redraw=True),
            mode='immediate',
            transition=dict(duration=0)
        )]
    ) for year in sorted(melted_counts['year'].unique())]
)]

fig_pie.frames = frames
fig_pie.update_layout(
    sliders=sliders
)

# Save to HTML file
fig_pie.write_html("../output/placement_pie_chart.html")

# Display the figure
fig_pie.show()

In [17]:
import plotly.graph_objects as go
import pandas as pd

def create_sankey(start_year, placement_df):
    # Get the latest year (excluding 2025)
    latest_year = placement_df[placement_df['year'] != 2025]['year'].max()
    
    # Create nodes for both time periods
    node_labels = [
        f'Academic ({int(start_year)})', 
        f'Private Company ({int(start_year)})', 
        f'Government ({int(start_year)})',
        f'Academic ({int(latest_year)})', 
        f'Private Company ({int(latest_year)})', 
        f'Government ({int(latest_year)})'
    ]
    
    # Get data for start and latest years
    start_data = placement_df[placement_df['year'] == start_year]
    late_data = placement_df[placement_df['year'] == latest_year]
    
    # Create source-target pairs and values
    sources = []
    targets = []
    values = []
    
    # For each start placement type
    for i, start_type in enumerate(['academic', 'private_company', 'government']):
        start_count = start_data[start_type].sum()
        # Connect to each late placement type
        for j, late_type in enumerate(['academic', 'private_company', 'government']):
            late_count = late_data[late_type].sum()
            
            # Add to source-target pairs
            sources.append(i)  # Start type index
            targets.append(j + 3)  # Late type index (offset by 3)
            
            # Calculate proportional flow
            if i == j:  # Same category gets larger weight
                values.append(start_count * 0.6)
            else:  # Different categories split remaining proportion
                values.append(start_count * 0.2)
    
    return dict(
        node = dict(
            pad = 1, 
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = node_labels,
            color = ["#1f77b4", "#ff7f0e", "#2ca02c"] * 2  
        ),
        link = dict(
            source = sources,
            target = targets,
            value = values,
            color = ["rgba(31, 119, 180, 0.4)", 
                    "rgba(255, 127, 14, 0.4)", 
                    "rgba(44, 160, 44, 0.4)"] * 3 
        )
    )

# Get all years except 2025 and the latest year
available_years = sorted(placement_df[
    (placement_df['year'] != 2025) & 
    (placement_df['year'] != placement_df[placement_df['year'] != 2025]['year'].max())
]['year'].unique())

# Create frames for each year
frames = [
    go.Frame(
        data=[go.Sankey(
            **create_sankey(year, placement_df)
        )],
        name=str(int(year))
    )
    for year in available_years
]

# Create initial figure
initial_year = available_years[0]
fig = go.Figure(
    data=[go.Sankey(
        **create_sankey(initial_year, placement_df)
    )],
    frames=frames
)

# Add slider
sliders = [dict(
    active=0,
    currentvalue={"prefix": "Starting Year: "},
    pad={"t": 50},
    steps=[dict(
        label=str(int(year)),
        method='animate',
        args=[[str(int(year))], dict(
            frame=dict(duration=300, redraw=True),
            mode='immediate',
            transition=dict(duration=300)
        )]
    ) for year in available_years]
)]

# Update layout
fig.update_layout(
    title_text=f"Changes in Placement Types Over Time (End Year: {int(placement_df[placement_df['year'] != 2025]['year'].max())})",
    font_size=12,
    height=600,
    width=1000,
    sliders=sliders
)

# Save to HTML file
fig.write_html("../output/placement_liquid.html")

# Display the figure
fig.show()