In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
       


## Rank 1 Harvard University


In [17]:
import re
import requests

def harvard_placement(url="https://www.economics.harvard.edu/placement"):
    """
    Extracts graduate student placement data from Harvard Economics placement page for years 2024-2005.

    Args:
        url (str): The URL of the Harvard Economics placement page.
                  Defaults to https://www.economics.harvard.edu/placement

    Returns:
        DataFrame: A pandas DataFrame containing student placement records with columns:
                  year, name, fields_of_study, placement
    """
    # Get HTML content from URL
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    data = []
    
    # Find all accordion panels containing placement tables
    accordion_panels = soup.find_all('div', class_='accordion-panel')
    
    for panel in accordion_panels:
        # Find the year from the preceding h3 heading
        year_heading = panel.find_previous_sibling('h3')
        if year_heading and 'Graduate Student Placement' in year_heading.text:
            try:
                year = int(year_heading.text.split()[-1])
                # Only process years between 2024 and 2005
                if 2005 <= year <= 2024:
                    # Find table in this panel
                    table = panel.find('table')
                    if table:
                        # Get headers to determine table structure
                        headers = [th.get_text(strip=True).lower() for th in table.find_all(['th', 'td']) if th.get_text(strip=True)]
                        has_fields = 'fields of study' in headers
                        
                        # Process each row
                        for row in table.find_all('tr')[1:]:  # Skip header row
                            cells = row.find_all(['td'])
                            if cells:
                                record = {'year': year}
                                record['name'] = cells[0].get_text(strip=True)
                                
                                if has_fields:
                                    record['fields_of_study'] = cells[1].get_text(strip=True)
                                    record['placement'] = cells[2].get_text(strip=True)
                                else:
                                    record['fields_of_study'] = None
                                    record['placement'] = cells[1].get_text(strip=True)
                                    
                                data.append(record)
            except ValueError:
                # Skip if year cannot be parsed as integer
                continue

    # Convert to DataFrame
    df = pd.DataFrame(data)
    return df


hardvard_placement_df = harvard_placement()
hardvard_placement_df['ranking'] = 1
hardvard_placement_df['university'] = 'Harvard University'
hardvard_placement_df['department'] = 'Economics'
hardvard_placement_df

Unnamed: 0,year,name,fields_of_study,placement,ranking,university,department
0,2024,Jenna Anders,,University of Virginia Batten,1,Harvard University,Economics
1,2024,Martin Aragoneses,,INSTEAD,1,Harvard University,Economics
2,2024,Michael Blank,,"Stanford University, Graduate School of Business",1,Harvard University,Economics
3,2024,Phoebe Cai,,Link Logistics Real Estate,1,Harvard University,Economics
4,2024,Romaine Campbell,,Cornell Brooks Policy School,1,Harvard University,Economics
...,...,...,...,...,...,...,...
555,2005,Kate Ho,Business Economics,"Columbia, Department of Economics",1,Harvard University,Economics
556,2005,Kristin Knox,Business Economics,"Harvard, Institutional Research",1,Harvard University,Economics
557,2005,Michael Ostrovsky,Business Economics,"Stanford Graduate School of Business, Economics",1,Harvard University,Economics
558,2005,Alexander Wagner,PEG,Analysis Group,1,Harvard University,Economics


In [18]:
hardvard_placement_df.to_csv('../data/raw/Harvard/harvard_placement.csv')

## Rank 2 Massachusetts Institute of Technology
is out - doesn't have placement data

## Rank 6 University of Chicago
is out - doesn't have placement data

## Rank 10 New York University (NYU)


In [44]:
def scrape_stern_placements(html_content):
    """
    Scrapes placement data from NYU Stern's HTML content.
    
    Args:
        html_content (str): HTML content of the placement page
        
    Returns:
        DataFrame: Contains columns year, program, name, placement, title
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    data = []
    
    # Find all year sections
    year_sections = soup.find_all('h2', class_='font-nyuperstare-a')
    
    for section in year_sections:
        # Extract year if section contains "Placements"
        if 'Placements' in section.text:
            try:
                year = int(section.text.strip().split()[0])
                
                # Find the next table after this heading
                table_div = section.find_next('div', class_='table-style')
                if table_div:
                    table = table_div.find('table')
                    if table:
                        # Process each row
                        rows = table.find_all('tr')
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_all('td')
                            if len(cells) >= 4:
                                record = {
                                    'year': year,
                                    'program': cells[0].get_text(strip=True),
                                    'name': cells[1].get_text(strip=True),
                                    'placement': cells[2].get_text(strip=True),
                                    'title': cells[3].get_text(strip=True)
                                }
                                data.append(record)
            except ValueError:
                continue
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Add university and ranking info
    df['university'] = 'New York University'
    df['department'] = 'Stern School of Business'
    df['ranking'] = 10  # NYU's ranking
    
    # Reorder columns
    column_order = ['year', 'name', 'program', 'placement', 'title', 'ranking', 'university', 'department']
    df = df[column_order]
    
    return df

# Read the HTML file
with open('../data/raw/NYU/Recent Job Placements - NYU Stern.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Create DataFrame
stern_df = scrape_stern_placements(html_content)

stern_df.drop('program', axis=1).head()
stern_df.to_csv('../data/raw/NYU/stern_placement.csv')
# Merge title and placement columns
stern_df['placement'] = stern_df['title'] + ' at ' + stern_df['placement']
stern_df.to_csv('../data/raw/NYU/stern_placement.csv')
stern_df


Unnamed: 0,year,name,program,placement,title,ranking,university,department
0,2024,Stephanie Dong,Accounting,Visiting Assistant Professor at University of ...,Visiting Assistant Professor,10,New York University,Stern School of Business
1,2024,Shuqing Huang,Accounting,Assistant Professor at California State Univer...,Assistant Professor,10,New York University,Stern School of Business
2,2024,Melanie Friedrichs,Economics,Research Economist at Office of Financial Rese...,Research Economist,10,New York University,Stern School of Business
3,2024,Raja Panjwani,Economics,Visiting Researcher at London School of Economics,Visiting Researcher,10,New York University,Stern School of Business
4,2024,Agata Farina,Economics,Post-Doctoral Researcher at Princeton University,Post-Doctoral Researcher,10,New York University,Stern School of Business
...,...,...,...,...,...,...,...,...
163,2014,Tingting Fan,Marketing,Assistant Professor at Chinese University of H...,Assistant Professor,10,New York University,Stern School of Business
164,2014,Shelle Santana,Marketing,Assistant Professor at Harvard University,Assistant Professor,10,New York University,Stern School of Business
165,2014,Jigar Patel,Operations,Assistant Professor at Montclair State University,Assistant Professor,10,New York University,Stern School of Business
166,2014,Christos Zacharias,Operations,Visiting Asst. Professor at University of Miami,Visiting Asst. Professor,10,New York University,Stern School of Business


## Rank 12 Duke University


In [26]:
def scrape_duke_placements(html_content):
    """
    Scrapes placement data from Duke Economics HTML content.
    
    Args:
        html_content (str): HTML content of the placement page
        
    Returns:
        DataFrame: Contains columns year, name, position, institution
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    data = []
    
    # Find all accordion panels
    accordion_panels = soup.find_all('div', class_='card-block panel-collapse collapse')
    
    for panel in accordion_panels:
        # Get year from the panel header
        year_header = panel.find_previous('div', class_='panel-title')
        if year_header and year_header.find('a'):
            try:
                year = int(year_header.find('a').text.strip())
                
                # Find table in the panel
                table = panel.find('table', class_='tablesaw')
                if table:
                    # Process each row
                    rows = table.find_all('tr')
                    for row in rows[1:]:  # Skip header row
                        cells = row.find_all('td')
                        if len(cells) >= 3:
                            record = {
                                'year': year,
                                'name': cells[0].get_text(strip=True),
                                'position': cells[1].get_text(strip=True),
                                'institution': cells[2].get_text(strip=True)
                            }
                            data.append(record)
            except ValueError:
                continue
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Add university and ranking info
    df['university'] = 'Duke University'
    df['department'] = 'Economics'
    df['ranking'] = 12  # Duke's ranking
    
    # Reorder columns
    column_order = ['year', 'name', 'position', 'institution', 'ranking', 'university', 'department']
    df = df[column_order]
    
    return df

# Read the HTML file
with open('../data/raw/Duke/Job Market Placements _ Economics Department.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Create DataFrame
duke_df = scrape_duke_placements(html_content)
# Combine position and institution columns into placement
duke_df['placement'] = duke_df['position'] + ' at ' + duke_df['institution']

# Drop original position and institution columns
duke_df = duke_df.drop(['position', 'institution'], axis=1)

# Display the DataFrame
duke_df.to_csv('../data/raw/Duke/duke_placement.csv')
duke_df

Unnamed: 0,year,name,ranking,university,department,placement
0,2024,Saketh Aleti,12,Duke University,Economics,Senior Analyst at PanAgora Asset Management
1,2024,Xinyue Bei,12,Duke University,Economics,"Asst. Professor at University of Texas, Austin"
2,2024,Erin Denison,12,Duke University,Economics,Associate at Analysis Group
3,2024,Seohee Kim,12,Duke University,Economics,Asst. Professor at Indiana University Kelley S...
4,2024,Eun-Seok Lee,12,Duke University,Economics,Research Economist at POSCO Research Institute
...,...,...,...,...,...,...
269,2005,Stephen Ryan,12,Duke University,Economics,Assistant Professor at Massachussetts Institut...
270,2005,Joanna Vinluan Tobiason,12,Duke University,Economics,Associate at Cornerstone Research
271,2005,Jingshu Wang,12,Duke University,Economics,Health Economist at Merck Research Laboratories
272,2005,Jianguo Xu,12,Duke University,Economics,Assistant Professor at McGill University


## Rank 13 University of Michigan--Ann Arbor


In [38]:
def scrape_umich_placements(html_content):
    """
    Scrapes placement data from UMich Economics HTML content.
    
    Args:
        html_content (str): HTML content of the placement page
        
    Returns:
        DataFrame: Contains columns year, name, placement
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    data = []
    
    # Find all accordion sections with the correct class
    accordion_sections = soup.find_all('div', {'class': 'accordion aem-GridColumn aem-GridColumn--default--12'})
    
    for section in accordion_sections:
        # Get the accordion wrapper
        wrapper = section.find('div', class_='accordion-wrap')
        if wrapper:
            # Get year from the h3 header
            header = wrapper.find('h3')
            if header and 'Job Market Placements' in header.text:
                try:
                    # Extract year from header text (e.g., "2020-2021" -> 2021)
                    year_text = header.text.strip().split()[0]
                    year = int(year_text.split('-')[1])
                    
                    # Find placement data in the accordion body
                    body = wrapper.find('div', class_='accordion-body text')
                    if body:
                        # Each placement is in a <p> tag
                        paragraphs = body.find_all('p')
                        for p in paragraphs:
                            # Name is in bold
                            name_tag = p.find('b')
                            if name_tag:
                                name = name_tag.get_text(strip=True)
                                # Get placement info (everything after the name)
                                full_text = p.get_text(strip=True)
                                placement = full_text[len(name):].strip()
                                
                                if placement:
                                    record = {
                                        'year': year,
                                        'name': name,
                                        'placement': placement
                                    }
                                    data.append(record)
                except ValueError:
                    continue
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    if not df.empty:
        # Add university and ranking info
        df['university'] = 'University of Michigan'
        df['department'] = 'Economics'
        df['ranking'] = 13  # UMich's ranking
        
        # Reorder columns
        column_order = ['year', 'name', 'placement', 'ranking', 'university', 'department']
        df = df[column_order]
    
    return df

# Read the HTML file
with open('../data/raw/UMich/UMich.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Create DataFrame
umich_df = scrape_umich_placements(html_content)
umich_df.to_csv('../data/raw/UMich/umich_placement.csv')
umich_df

Unnamed: 0,year,name,placement,ranking,university,department
0,2024,Hayley Abourezk-Pinkstone,Simpson College,13,University of Michigan,Economics
1,2024,Agostina Brinatti,"Chicago Booth (Fall 2026), Yale University pos...",13,University of Michigan,Economics
2,2024,Nadim Elayan Balague,Banco Central de Chile,13,University of Michigan,Economics
3,2024,Luis Miguel Espinoza Bardales,"Texas A&M University, The Bush School of Gover...",13,University of Michigan,Economics
4,2024,Andrea Foschi,Bank of Italy,13,University of Michigan,Economics
...,...,...,...,...,...,...
189,2015,Yulia Paramonova,National Research University Higher School of ...,13,University of Michigan,Economics
190,2015,Ihsan Saracgil,Cornerstone Research,13,University of Michigan,Economics
191,2015,Bartley Tablante,Keystone Strategy,13,University of Michigan,Economics
192,2015,Desmond Toohey,"Department of Economics, University of Delaware",13,University of Michigan,Economics


## Rank 14 California Institute of Technology (Caltech)
- nope

## Rank 15 University of Wisconsin--Madison
nope_no data

## Rank 17 Brown University


In [41]:
def scrape_brown_placements(html_content):
    """
    Scrapes placement data from Brown Economics HTML content.
    
    Args:
        html_content (str): HTML content of the placement page
        
    Returns:
        DataFrame: Contains columns year, name, placement
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    data = []
    
    # Find all accordion sections
    accordion_sections = soup.find_all('div', class_='accordion_item component_item')
    
    for section in accordion_sections:
        # Get year from the button
        year_button = section.find('button', class_='accordion_trigger')
        if year_button:
            try:
                # Extract year from button title
                year = int(year_button.get('title').split()[-1])
                
                # Find placement data in the list
                placement_list = section.find('ul', class_='list-2col')
                if placement_list:
                    # Process each list item
                    for item in placement_list.find_all('li'):
                        # Split name and placement at the hyphen
                        parts = item.get_text(strip=True).split('-', 1)
                        if len(parts) == 2:
                            name = parts[0].strip()
                            # Get placement from strong tag if it exists
                            placement_tag = item.find('strong')
                            if placement_tag:
                                placement = placement_tag.get_text(strip=True)
                                
                                record = {
                                    'year': year,
                                    'name': name,
                                    'placement': placement
                                }
                                data.append(record)
            except ValueError:
                continue
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    if not df.empty:
        # Add university and ranking info
        df['university'] = 'Brown University'
        df['department'] = 'Economics'
        df['ranking'] = 17  # Brown's ranking
        
        # Reorder columns
        column_order = ['year', 'name', 'placement', 'ranking', 'university', 'department']
        df = df[column_order]
    
    return df

with open('../data/raw/Brown/Brown.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

brown_df = scrape_brown_placements(html_content)
brown_df = brown_df[1:]
brown_df.to_csv('../data/raw/Brown/brown_placement.csv')
brown_df


Unnamed: 0,year,name,placement,ranking,university,department
1,2024,Alessandro Sovera,Postdoc at Tampere University,17,Brown University,Economics
2,2024,Alex Zhou,Postdoc Research Fellow at University of Warwick,17,Brown University,Economics
3,2024,Ali Lodermeier,U.S. Census Bureau,17,Brown University,Economics
4,2024,Francesco Ferlenga,"1 year postdoc at University of Warwick, then ...",17,Brown University,Economics
5,2024,Geetika Nagpal,World Bank,17,Brown University,Economics
...,...,...,...,...,...,...
158,2010,Daniel Puskin,US Department of Labor,17,Brown University,Economics
159,2010,Nathan Schiff,University of British Columbia,17,Brown University,Economics
160,2010,Andre Switala,Boston University,17,Brown University,Economics
161,2010,Carmina Vargas,"Banco de la República, Colombia",17,Brown University,Economics


## Rank 19 Johns Hopkins University


In [47]:
def scrape_jhu_placements(html_content):
    """
    Scrapes placement data from JHU Economics HTML content.
    
    Args:
        html_content (str): HTML content of the placement page
        
    Returns:
        DataFrame: Contains columns year, name, placement
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    data = []
    
    # Find the table with id 'tablepress-14'
    table = soup.find('table', id='tablepress-14')
    if table:
        # Process each row in the table body
        rows = table.find('tbody').find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if len(cols) == 3:  # Ensure row has all three columns
                # Extract year from academic year (e.g., "2023-24" -> 2024)
                year_text = cols[0].get_text(strip=True)
                try:
                    if '-' in year_text:
                        year = int('20' + year_text.split('-')[1])  # Convert "23-24" to 2024
                    else:
                        year = int(year_text)
                        
                    record = {
                        'year': year,
                        'name': cols[1].get_text(strip=True),
                        'placement': cols[2].get_text(strip=True)
                    }
                    data.append(record)
                except ValueError:
                    continue
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    if not df.empty:
        # Add university and ranking info
        df['university'] = 'Johns Hopkins University'
        df['department'] = 'Economics'
        df['ranking'] = 19  # JHU's ranking
        
        # Reorder columns
        column_order = ['year', 'name', 'placement', 'ranking', 'university', 'department']
        df = df[column_order]
    
    return df

with open('../data/raw/JHU/JHU.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

jhu_df = scrape_jhu_placements(html_content)
jhu_df.to_csv('../data/raw/JHU/jhu_placement.csv')
jhu_df

Unnamed: 0,year,name,placement,ranking,university,department
0,2024,Huan Deng,Hong Kong Baptist University,19,Johns Hopkins University,Economics
1,2024,Aniruddha Ghosh,California Polytechnic State University,19,Johns Hopkins University,Economics
2,2024,Qingyang Han,Bates White Economic Consulting,19,Johns Hopkins University,Economics
3,2024,Zixuan Huang,IMF,19,Johns Hopkins University,Economics
4,2024,Yusuf Kulu,Ozyegin University,19,Johns Hopkins University,Economics
...,...,...,...,...,...,...
183,202001,Niamh Sheridan,International Monetary Fund,19,Johns Hopkins University,Economics
184,202001,Julie Smith,Trinity University (San Antonio),19,Johns Hopkins University,Economics
185,202001,Akinori Tomohara,Postdoctoral Fellow at Columbia University,19,Johns Hopkins University,Economics
186,202001,Junfu Zhang,Public Policy Institute of California,19,Johns Hopkins University,Economics


## Rank 20 University of Texas at Austin

In [53]:
def scrape_uta_placements(html_content):
    """
    Scrapes placement data from UT Austin Economics HTML content.
    
    Args:
        html_content (str): HTML content of the placement page
        
    Returns:
        DataFrame: Contains columns year, name, dissertation, advisor, placement
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    data = []
    
    # Find all accordion sections
    accordion_sections = soup.find_all('li', class_='accordion-item')
    
    for section in accordion_sections:
        # Get year from the accordion title
        title = section.find('span')
        if title and 'Graduates' in title.text:
            try:
                # Extract year from title (e.g., "2023-2024: 12 Graduates" -> 2024)
                year_text = title.text.split(':')[0]
                year = int(year_text.split('-')[1])
                
                # Find table in the accordion content
                table = section.find('table')
                if table:
                    rows = table.find_all('tr')
                    for row in rows:
                        cells = row.find_all('td')
                        if len(cells) >= 4:  # Ensure row has enough columns
                            record = {
                                'year': year,
                                'name': cells[0].get_text(strip=True),
                                'dissertation': cells[1].get_text(strip=True),
                                'advisor': cells[2].get_text(strip=True),
                                'placement': cells[3].get_text(strip=True)
                            }
                            # Only add if there's a name and placement
                            if record['name'] and record['placement']:
                                data.append(record)
            except (ValueError, IndexError):
                continue
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    if not df.empty:
        # Add university and ranking info
        df['university'] = 'University of Texas at Austin'
        df['department'] = 'Economics'
        df['ranking'] = 20  # UT Austin's ranking
        
        # Reorder columns
        column_order = ['year', 'name', 'placement', 'dissertation', 'advisor', 
                       'ranking', 'university', 'department']
        df = df[column_order]
    
    return df

with open('../data/raw/Univeristy_of_Texas_Austin/UTA.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

uta_df = scrape_uta_placements(html_content)
# Drop dissertation and advisor columns, and first row
uta_df = uta_df.drop(['dissertation', 'advisor'], axis=1)
uta_df = uta_df.iloc[1:]
uta_df.to_csv('../data/raw/Univeristy_of_Texas_Austin/uta_placement.csv')
uta_df.head()

Unnamed: 0,year,name,placement,ranking,university,department
1,2024,Anushka Mitra,"Federal Reserve Board, Economist",20,University of Texas at Austin,Economics
2,2024,Ebrahim Aliabadi Farahani,"Keystone Consulting, Economist",20,University of Texas at Austin,Economics
3,2024,Gue Sung Choi,"Korean Development Institute, Associate Resear...",20,University of Texas at Austin,Economics
4,2024,Ha Bui,"Grinnell College, Assistant Professor",20,University of Texas at Austin,Economics
5,2024,Hande Nur Celebi,"Universite Libre de Bruxelles, Post-Doc",20,University of Texas at Austin,Economics


# Combine all

In [96]:
# Collect all dataframes
dfs = [
    hardvard_placement_df,  # Harvard University placements
    stern_df,      # NYU Stern School of Business placements
    duke_df,     # Duke University placements 
    umich_df,    # University of Michigan placements
    brown_df,    # Brown University placements
    jhu_df,       # Johns Hopkins University placements
    uta_df       # University of Texas at Austin placements
]

# Combine all dataframes
combined_df = pd.concat(dfs, ignore_index=True)
combined_df

Unnamed: 0,year,name,fields_of_study,placement,ranking,university,department,program,title
0,2024,Jenna Anders,,University of Virginia Batten,1,Harvard University,Economics,,
1,2024,Martin Aragoneses,,INSTEAD,1,Harvard University,Economics,,
2,2024,Michael Blank,,"Stanford University, Graduate School of Business",1,Harvard University,Economics,,
3,2024,Phoebe Cai,,Link Logistics Real Estate,1,Harvard University,Economics,,
4,2024,Romaine Campbell,,Cornell Brooks Policy School,1,Harvard University,Economics,,
...,...,...,...,...,...,...,...,...,...
1595,2021,Jiwon Park,,Korea Institute for International Economic Policy,20,University of Texas at Austin,Economics,,
1596,2021,Pablo I. Varas,,"Economists Incorporated, Economist",20,University of Texas at Austin,Economics,,
1597,2021,Sangwoo Choi,,"Luohan Academy (Alibaba Group), Economist",20,University of Texas at Austin,Economics,,
1598,2021,Shenshen Yang,,"Tianjin University, Assistant Professor",20,University of Texas at Austin,Economics,,


In [97]:
# Drop unnecessary columns
combined_df = combined_df.drop(['fields_of_study', 'program', 'title'], axis=1, errors='ignore')

# Create placement_type column
combined_df['placement_type'] = combined_df['placement'].apply(
    lambda x: 'academic' if isinstance(x, str) and ('university' in x.lower() or 'college' in x.lower() or 'school' in x.lower()) 
    else 'unknown'
)

combined_df

Unnamed: 0,year,name,placement,ranking,university,department,placement_type
0,2024,Jenna Anders,University of Virginia Batten,1,Harvard University,Economics,academic
1,2024,Martin Aragoneses,INSTEAD,1,Harvard University,Economics,unknown
2,2024,Michael Blank,"Stanford University, Graduate School of Business",1,Harvard University,Economics,academic
3,2024,Phoebe Cai,Link Logistics Real Estate,1,Harvard University,Economics,unknown
4,2024,Romaine Campbell,Cornell Brooks Policy School,1,Harvard University,Economics,academic
...,...,...,...,...,...,...,...
1595,2021,Jiwon Park,Korea Institute for International Economic Policy,20,University of Texas at Austin,Economics,unknown
1596,2021,Pablo I. Varas,"Economists Incorporated, Economist",20,University of Texas at Austin,Economics,unknown
1597,2021,Sangwoo Choi,"Luohan Academy (Alibaba Group), Economist",20,University of Texas at Austin,Economics,unknown
1598,2021,Shenshen Yang,"Tianjin University, Assistant Professor",20,University of Texas at Austin,Economics,academic


In [98]:
job_list = combined_df[combined_df['placement_type'] == 'unknown']['placement'].unique().tolist()

In [66]:
len(job_list)

485

In [67]:
# Save job list to txt file
with open('job_list.txt', 'w') as f:
    for job in job_list:
        if isinstance(job, str):  # Check if job is a string
            f.write(job + '\n')

In [99]:
classification = {
    "Link Logistics Real Estate": "private_company",
    "Bank of Italy": "government",
    "Charles River Associates": "private_company",
    "IMF": "government",
    "Hoover Institution": "academic",
    "Open Philanthropy": "private_company",
    "Bank of England, Monetary Policy Outlook Division": "government",
    "Squarepoint Capital": "private_company",
    "Amazon": "private_company",
    "World Bank": "government",
    "CERGE-EI": "academic",
    "Carnegie Mellon": "academic",
    "U.S. Intelligence Community": "government",
    "Two Sigma": "private_company",
    "Federal Reserve Board": "government",
    "Federal Reserve Bank of Minneapolis": "government",
    "European Central Bank": "government",
    "Graham Capital Management": "private_company",
    "NERA": "private_company",
    "RAND": "private_company",
    "Give Well": "private_company",
    "Amazon Pharmacy": "private_company",
    "Harvard Society of Fellows": "academic",
    "International Monetary Fund": "government",
    "Jane Street": "private_company",
    "Uber": "private_company",
    "Universidad Católica de Chile": "academic",
    "PUC Rio (Toronto Postdoc)": "academic",
    "Federal Trade Commission": "government",
    "MIT, Department of Economics": "academic",
    "Boston Federal Reserve": "government",
    "Keystone": "private_company",
    "Edgeworth Economics": "private_company",
    "Federal Bank Reserve of New York": "government",
    "Upwork": "private_company",
    "Anaylsis Group": "private_company",
    "American Road & Transportation Builders Association": "private_company",
    "Facebook": "private_company",
    "Rand": "private_company",
    "California Policy Lab at UCLA Post Doc": "academic",
    "Yale SOM": "academic",
    "Vanguard": "private_company",
    "Joint Committee on Taxation": "government",
    "Mathematica": "private_company",
    "Princeton Economics Post Doc": "academic",
    "McKinsey": "private_company",
    "NBER post doc": "academic",
    "Analysis Group": "private_company",
    "EIEF Rome": "academic",
    "MIT post-doc": "academic",
    "Cornerstone Research": "private_company",
    "Goldman Sachs - Macro Research": "private_company",
    "Goldman Sachs": "private_company",
    "Federal Reserve Board of Governors": "government",
    "Dartmouth (IES post doc 2018-2019)": "academic",
    "UC Berkeley-Haas": "academic",
    "NBER post-doc": "academic",
    "Cornerstone": "private_company",
    "MIT Sloan": "academic",
    "Federal Reserve Bank of New York": "government",
    "Farallon Capital": "private_company",
    "Columbia Econ(Yale Cowles post-doc 2018-2019)": "academic",
    "Northwestern Kellogg MEDS": "academic",
    "Brown": "academic",
    "Goldman Sachs - London": "private_company",
    "Bain & Company": "private_company",
    "QuantCo.": "private_company",
    "Boston Consulting Group": "private_company",
    "Air BnB": "private_company",
    "Congressional Budget Office": "government",
    "Spotify": "private_company",
    "Cambridge Square Capital": "private_company",
    "Lyft": "private_company",
    "INSEAD": "academic",
    "Committee for Public Counsel Services, Boston": "government",
    "World Bank, Research Unit": "government",
    "Yale, Department of Economics": "academic",
    "Princeton, Department of Economics": "academic",
    "Dodge & Cox": "private_company",
    "Resources for the Future": "private_company",
    "Office of Tax Analysis-Treasury Department": "government",
    "Key Square Group": "private_company",
    "QuantCo": "private_company",
    "McKinsey & Company": "private_company",
    "UC San Diego, Department of Economics": "academic",
    "New York Federal Reserve": "government",
    "Becker Friedman Institute": "academic",
    "Deloitte Consulting": "private_company",
    "UC Berkeley, Department of Economics": "academic",
    "Wayfair": "private_company",
    "Ellington Management Group": "private_company",
    "TrueCar": "private_company",
    "Sciences Po": "academic",
    "Instituto Tecnológico Autónomo de México": "academic",
    "Boston Redevelopment Authority": "government",
    "Boston Federal Reserve Bank": "government",
    "Coursera": "private_company",
    "Treasury Department, France": "government",
    "PrepScholar Online Education": "private_company",
    "Bank of Korea": "government",
    "Dartmouth, Department of Economics": "academic",
    "OECD": "government",
    "American Enterprise Institute": "private_company",
    "Wilson Perumal & Company": "private_company",
    "Federal Reserve Board, International Division": "government",
    "Federal Reserve Bank, Research Department": "government",
    "Instituto de Estudios Superiores de la Empresa (IESE)": "academic",
    "RAND Corporation": "private_company",
    "Chinese Academy of Social Sciences, Institute of Economics": "academic",
    "Universitat Pompeu Fabra (Barcelona, Spain)": "academic",
    "Dean & Company": "private_company",
    "PSQR Capital Management": "private_company",
    "Bureau of Labor Statistics": "government",
    "US Department of Justice, Antitrust Division": "government",
    "Securities and Exchange Commission": "government",
    "MIT, Economics Department": "academic",
    "CREI": "academic",
    "TOBB-ETU, Economics Department": "academic",
    "New York Federal Reserve Board": "government",
    "McKinsey & Co.": "private_company",
    "Paulson & Co.": "private_company",
    "China National Petroleum Corporation": "private_company",
    "OC&C Strategy Consultants": "private_company",
    "International Food Policy Research Institute": "government",
    "Federal Board of Governors": "government",
    "Mathematica Policy Research, Inc": "private_company",
    "Department of Defense, Program Analysis and Evaluation": "government",
    "Princeton, Economics Department": "academic",
    "New York Fed": "government",
    "Goldman Sachs, Foreign Exchange Strategies": "private_company",
    "Lehman Brothers": "private_company",
    "NESTA": "private_company",
    "Ecole Polytechnique": "academic",
    "State Street Associates": "private_company",
    "Stanford, Department of Economics": "academic",
    "Columbia, Department of Economics": "academic",
    "Princeton, Department Economics": "academic",
    "CRA International": "private_company",
    "Goldman Sachs Asset Management": "private_company",
    "AQR Capital Management": "private_company",
    "Brookdale": "private_company",
    "CREI - Pompeu Fabra": "academic",
    "Pompeu Fabra, Barcelona": "academic",
    "Legal Economics": "private_company",
    "US Treasury, Office of International Affairs": "government",
    "INSEAD, Strategy Group": "academic",
    "Columbia GSB, Economics and Finance": "academic",
    "The Institute for the Study of Labor": "academic",
    "McKinsey & Co": "private_company",
    "US Department of the Treasury": "government",
    "NERA Intellectual Property Practice": "private_company",
    "New York Fed, International Research Group": "government",
    "Notre Dame, Department of Economics": "academic",
    "Harvard, Institutional Research": "academic",
    "Columbia, Department of Economics": "academic",
    "Research Economist at Office of Financial Research, U.S. Treasury": "government",
    "Economist at Amazon": "private_company",
    "Associate at Cornerstone Research": "private_company",
    "Senior Research Associate at JPMorgan AI Research": "private_company",
    "Assistant Professor at Stevens Institute of Technology": "academic",
    "Financial Economist at Securities and Exchange Commission": "government",
    "Economist at Revelio Labs": "private_company",
    "Economist at Federal Trade Commission Bureau of Economics": "government",
    "Economist at Department of Justice, Antitrust": "government",
    "Assistant Professor at HEC Paris": "academic",
    "Quantitative Researcher at The Citadel": "private_company",
    "Assistant Professor at Univeristy of British Columbia": "academic",
    "Applied Scientist at Amazon": "private_company",
    "Financial Economist at Federal Reserve Bank of New York": "government",
    "Researcher at Banco de Portugal": "government",
    "AI Research Associate at UN Global Pulse": "government",
    "Research Scientist at Facebook": "private_company",
    "Research Economist at Bank of Canada": "government",
    "Economist at Federal Reserve Board of Governors": "government",
    "Assistant Professor at INSEAD": "academic",
    "Quantitative Researcher at Two Sigma Investments": "private_company",
    "Assistant Professor at Georgia Institute of Technology": "academic",
    "Quantitative Researcher at Cubist Systematics of Point72": "private_company",
    "Co-Founder at Valorum Data, LLC": "private_company",
    "Lead Analytics Consultant/Data Scientist at Aetna Analytics & Behavior Change": "private_company",
    "Researcher at Financial Industry Regulatory Authority": "government",
    "Quantitative Researcher at AQR Capital Management": "private_company",
    "Assistant Professor at Univeristy of Georgia": "academic",
    "Strategist at Goldman Sachs": "private_company",
    "Assistant Professor at IESE, Spain": "academic",
    "Assistant Professor at Carnegie Mellon, Tepper": "academic",
    "Financial Economist at Federal Reserve Board": "government",
    "Senior Marketing Scientist at AOL": "private_company",
    "Assistant Professor at Hong Kong Univ. of Science and Technology": "academic",
    "Assistant Professor at UCLA Anderson": "academic",
    "Assistant Professor at SUNY Buffalo": "academic",
    "Economist at Department of Justice": "government",
    "Economist at Berkeley Research Group": "private_company",
    "Researcher at Facebook": "private_company",
    "Researcher at Integral Ad Science": "private_company",
    "Researcher at AT&T Labs": "private_company",
    "Senior Analyst at PanAgora Asset Management": "private_company",
    "Associate at Analysis Group": "private_company",
    "Research Economist at POSCO Research Institute": "private_company",
    "Senior Associate at Charles River Associates": "private_company",
    "Asst. Research Fellow at U of Nottingham post-doc; Academia Sinica Institute of Economics": "academic",
    "Economist at World Bank": "government",
    "Sr. Associate at Charles River Associates": "private_company",
    "Economist at Korea Development Institute": "government",
    "Associate at Dimensional Fund Advisors": "private_company",
    "Quantitative Researcher at Two Sigma": "private_company",
    "Applied Scientist II at Uber": "private_company",
    "Young Professionals Program at World Bank": "government",
    "Economist at Wayfair": "private_company",
    "Senior Economist at Bank of Canada": "government",
    "Assistant Professor at Texas A&M": "academic",
    "VP - Model Development Specialist at BNY Mellon": "private_company",
    "Research Economist at Bank of Portugal": "government",
    "Associate Researcher at Korea Institute for Industrial Economics and Trade": "government",
    "Postdoctoral Fellow at Carnegie Mellon": "academic",
    "Data Scientist II at Uber": "private_company",
    "Senior Data Scientist at Moody's Analytics": "private_company",
    "Economist at Federal Reserve Bank of Chicago": "government",
    "Economist at U.S. Department of Justice": "government",
    "Data Scientist at Airbnb": "private_company",
    "Assistant Professor* at Wharton, Finance Department": "academic",
    "Economist at Abt Associates": "private_company",
    "Economist at Federal Trade Commission": "government",
    "Senior Research Analyst at AidData": "private_company",
    "Data Scientist at Eli Lilly": "private_company",
    "Investment Strategies Researcher at Dimensional Fund Advisors": "private_company",
    "Economist at Bates White Economic Consulting": "private_company",
    "Associate at McKinsey & Company": "private_company",
    "Young Professional at World Bank": "government",
    "Senior Associate at PwC": "private_company",
    "Quantitative Researcher at BlackRock": "private_company",
    "Economist at European Central Bank": "government",
    "Economist at Inter-American Development Bank": "government",
    "Economist at Keystone Strategy": "private_company",
    "CDC Prevention Effectiveness Fellow at Department of Health and Human Services": "government",
    "Data Scientist at Quora": "private_company",
    "Senior Consultant at Ernst & Young": "private_company",
    "Assistant Vice President at Citi Bank": "private_company",
    "Senior Researcher at Central Bank of Peru": "government",
    "Associate Quantitative Researcher at Numeric Investors": "private_company",
    "Contractual at Inter-American Development Bank": "government",
    "Behavioral Scientist at Morningstar": "private_company",
    "Energy and Environmental Policy Analyst at National Renewable Energy Laboratory": "government",
    "Adjunct General Director at Mexican Ministry of Finance": "government",
    "Assistant Professor, Economics at UNC Greensboro": "academic",
    "Financial Economist at Boston Federal Reserve Bank": "government",
    "Assistant Professor, Economics at UC Denver": "academic",
    "Textended Term Consultant at Development Research Group, World Bank": "government",
    "Assistant Professor, Economics at Federal Reserve Board": "academic",
    "— at U.S. Department of Justice": "government",
    "Senior Economist at Siam Commercial Bank": "private_company",
    "Economist at Federal Reserve Bank of New York": "government",
    "Research Associate at Moody's Analytics": "private_company",
    "Data Scientist at CoreComplete, LLC": "private_company",
    "Economist at Federal Reserve Board": "government",
    "Research Economist at Bureau of Labor Statistics": "government",
    "Economist at ISO New England": "private_company",
    "Assistant Professor at UC Santa Cruz": "academic",
    "Economist at Bank of England": "government",
    "Postdoctoral Associate at European Center for Advanced Research in Economics and Statistics": "academic",
    "Quantitative Associate at Barclay's Capital": "private_company",
    "Assistant Professor at UCLA": "academic",
    "Post-doc at California Institute of Technology": "academic",
    "Senior Analyst at Bank of Canada": "government",
    " at Kansas City Federal Reserve Bank": "government",
    "Research Economist at Cleveland Federal Reserve Bank": "government",
    "Research Fellow at Korea Development Institute": "government",
    "VP in Decision Management at Citicards": "private_company",
    "Post-doc at ETH-Zurich": "academic",
    "Assistant Professor at Universidad Carlos Ill de Madrid": "academic",
    "Post-doc Research Fellow at Resources for the Future": "private_company",
    "Post-doc at European Economic Institute (Florence)": "academic",
    "Analyst at Central Bank of Brazil": "government",
    "Research Economist at Bank of France": "government",
    "Economist at Central Bank of Chile": "government",
    "Economist at Office of the Prime Minister of Thailand": "government",
    "Assistant Professor at Illinois Institute of Technology": "academic",
    "Associate Economist at RAND Corporation": "private_company",
    "Senior Economist at U.S. Government Accountability Office": "government",
    "Assistant Professor at UC Berkeley": "academic",
    "Associate at The Brattle Group": "private_company",
    "Senior Associate at Lehman Brothers": "private_company",
    "Quantitative Researcher at Citadel Investment Group": "private_company",
    "Consultant at Analysis Group, Inc.": "private_company",
    "Senior Researcher at Korea Information Strategy Development Institute": "government",
    "Research Associate at Cornerstone Research": "private_company",
    "Research Fellow at Korea Institute of Finance": "government",
    "Research Economist at ERS Group": "private_company",
    "Investment Manager at New York Asset Management Fund": "private_company",
    "Senior Economist at The Federal Reserve Bank of Minneapolis": "government",
    "Economist at Research Triangle Institute": "private_company",
    "Consultant at The Brattle Group": "private_company",
    "Health Economist at Research Triangle Institute": "private_company",
    "Senior Quantitative Analyst at State Street": "private_company",
    "Equity Derivatives Trader at Group One Trading, LP": "private_company",
    "Senior Vice President at China International Capital Corporation": "private_company",
    "Quantitative Research Analyst at Numeric Investors": "private_company",
    "Assistant Professor at Shanghai Advanced Institute of Finance": "academic",
    "Economist at Center for Economics, U.S.Government Accoutnability Office": "government",
    "Managing Economist at Nathan Associates, Inc.": "private_company",
    "Assistant Professor at Instituto Tecnologico Autonomo de Mexico": "academic",
    "Research Analyst/Field Representative at Center for Naval Analysis": "government",
    "Researcher at Centers for Disease Control and Prevention": "government",
    "Assistant Professor at Massachussetts Institute of Technology": "academic",
    "Health Economist at Merck Research Laboratories": "private_company",
    "Economist at Korean Development Institute": "government",
    "Banco Central de Chile": "government",
    "U.S. Department of the Treasury, Office of Tax Analysis": "government",
    "Bank of England": "government",
    "Banco de Mexico": "government",
    "Bates White Economic Consulting": "private_company",
    "World Bank Group": "government",
    'Columbia, Department of Economics': 'academic',
    'Asst. Research Fellow at U of Nottingham post-doc; Academia Sinica Institute of Economics': 'academic',
    'Director, Asia & Emerging Markets Active Equities at BlackRock': 'private_company',
    'Inter-American Development Bank': 'private_company',
    'U.S. Department of the Treasury, Office of Economic Policy': 'academic',
    'Center for Economic Studies, U.S. Census Bureau': 'government',
    'Qtron Investments': 'private_company',
    'Inter American Development Bank': 'private_company',
    'Office of Tax Analysis, Department of the Treasury': 'academic',
    'Intensity': 'private_company',
    'Consumer Financial Protection Bureau': 'government',
    'Deutsche Bundesbank': 'private_company',
    'Bank of Canada': 'private_company',
    'Office of Tax Analysis, U.S. Department of the Treasury': 'academic',
    'The Vanguard Group': 'private_company',
    'Hong Kong Monetary Authority': 'government',
    'Department of Economics, William & Mary': 'academic',
    'United States Census Bureau': 'government',
    'Federal Reserve Bank of Boston': 'private_company',
    'The World Bank': 'private_company',
    'The RAND Corporation': 'private_company',
    'Federal Reserve Bank of Richmond': 'private_company',
    'WhatsApp': 'private_company',
    'Federal Reserve Bank of San Francisco': 'private_company',
    'Department of Economics, Penn State': 'academic',
    'American Institutes for Research': 'academic',
    'Korea Institute of Public Finance': 'academic',
    'PricewaterhouseCoopers': 'private_company',
    'Board of Governors of the Federal Reserve System': 'government',
    'Abt Associates': 'private_company',
    'Compass Lexecon': 'private_company',
    'SOLO World Partners, LCC': 'private_company',
    'Bank of America': 'private_company',
    'Mathematica Policy Research': 'academic',
    'Ford Motor company': 'private_company',
    'United States Treasury': 'government',
    'Samsung Economic Research Institute': 'academic',
    'International Economics, EPFL, Lausanne': 'private_company',
    'Federal Reserve Bank of Cleveland': 'private_company',
    'CNA Corporation': 'private_company',
    'Korea Development Institute': 'academic',
    'Economic Analysis Group, Department of Justice': 'academic',
    'Nam Office of the Comptroller of the Currency (OCC), U.S. Department of the Treasury': 'academic',
    'Keystone Strategy': 'private_company',
    'U.S. Census Bureau': 'government',
    '1 year postdoc at Stanford and then AP at CEMFI, then Assistant Professor at CEMFI': 'academic',
    'Assistant Professor at Collegio Carlo Alberto': 'academic',
    'Assistant Professor at Universidad de Santiago de Chile': 'academic',
    'Assistant Professor at Universidad de Los Andes, Chile': 'academic',
    'Postdoc at CERDI': 'academic',
    'Postdoc at UCLA': 'academic',
    'Assistant Professor at': 'academic',
    'Assistant Professor at Shiv Nadar Institute of Eminence': 'academic',
    'ISO New England': 'private_company',
    'Assistant Professor at LMU Munich': 'academic',
    'OECD, Economics Department (Policy Studies Branch)': 'academic',
    'Assistant Professor at the': 'academic',
    'Postdoc at': 'academic',
    'Vice President, Economist at Morgan Stanley': 'private_company',
    'Central Bank of Ireland': 'private_company',
    'World Bank (office of Latin America)': 'private_company',
    'CSEF Naples': 'private_company',
    'Amazon Web Services': 'private_company',
    '1 year post-doc at Universitat Pompeu Fabra then ITAM Business (Mexico)': 'private_company',
    'post-doc at': 'private_company',
    'Wesleyan': 'private_company',
    'HKUST, Guangzhou': 'private_company',
    'Dartmouth': 'private_company',
    'Bank of Mexico': 'private_company',
    'Boston Fed': 'private_company',
    'Michigan (deferred by one year for MIT postdoc)': 'academic',
    'UC Davis, ARE (Agricultural and Resource Econ)': 'private_company',
    'Bank of Spain': 'private_company',
    'Dimensional Fund Advisors': 'private_company',
    'Federal Reserve Bank of Philadelphia': 'private_company',
    'Latin American Development Bank Research Group': 'academic',
    'Caltech': 'private_company',
    'Vanderbilt': 'private_company',
    'World Bank Research Group': 'academic',
    'NYU Abu Dhabi': 'private_company',
    'Colegio de Mexico': 'private_company',
    'Netflix': 'private_company',
    'UC Davis': 'private_company',
    'Capital One': 'private_company',
    'Weill Cornell Medicine': 'private_company',
    'German Federal Ministry for Economic Affairs and Climate Action': 'government',
    'Deloitte': 'private_company',
    'Universidad de Santiago de Chile': 'private_company',
    'Oxera': 'private_company',
    'Board of Governors at the Federal Reserve': 'government',
    'Universitat Pompeu Fabra': 'private_company',
    'APPRISE Incorporated': 'private_company',
    'US Department of Labor': 'academic',
    'Banco de la República, Colombia': 'private_company',
    'Korea Information Society Development Institute': 'academic',
    'NERA Economic Consulting': 'private_company',
    'Consultant at Boston Consulting Group (BCG)': 'private_company',
    'Senior Associate at PricewaterhouseCoopers, Boston': 'private_company',
    'Economist at Bank of Mexico': 'private_company',
    'Singapore Monetary Authority': 'government',
    'Ernst & Young': 'private_company',
    'Assistant Professor of Economics at California Institute of Technology': 'academic',
    'The Brattle Group': 'private_company',
    'Charles River Associates International, Inc.': 'private_company',
    'Pew Research Center': 'academic',
    'J.P. Morgan, New York': 'private_company',
    'Consultant at Competition Economics, LLC': 'private_company',
    'Post-doc at USC Schaeffer Center in Health Policy': 'private_company',
    'Deloitte and Touche': 'private_company',
    'FDIC': 'private_company',
    'Associate at Analysis Group, Boston': 'private_company',
    'Postdoc at the USC Lusk Center for Real Estate': 'academic',
    'Research Affiliates LLC': 'academic',
    'Competition Economics LLC': 'private_company',
    'Central Bank of Malaysia': 'private_company',
    'Bureau of Economic Analysis': 'government',
    'Fannie Mae': 'private_company',
    'International Food Policy Research': 'academic',
    'Department of Justice': 'academic',
    'IMPAQ International': 'private_company',
    'St. Louis Fed': 'private_company',
    'Bank of Japan': 'private_company',
    'Reserve Bank of New Zealand': 'private_company',
    'Ministry of Finance, Pakistan': 'government',
    'U.S. Department of Justice': 'academic',
    'Max Planck Institute for Evolutionary Economics': 'academic',
    'Korea Information Strategy Development Institute (KISDI)': 'academic',
    'SUNY – Stony Brook': 'private_company',
    'U. of Tokyo': 'private_company',
    'Universidad de Los Andes': 'private_company',
    'Asian Development Bank': 'private_company',
    'U. of California, Davis': 'private_company',
    'DIW / German Institute for Economic Research, Berlin': 'academic',
    'U. of Texas': 'private_company',
    'Center for Population and Health (Georgetown U.)': 'private_company',
    'Federal Communications Commission': 'government',
    'The Judge Group (Economic Consulting)': 'private_company',
    'CNA Corp (Economic Consulting)': 'private_company',
    'Bank of Ecuador': 'private_company',
    'Federal Reserve Board of Governors, International Finance Division': 'government',
    'German Central Bank': 'private_company',
    'Public Policy Institute of California': 'academic',
    'Federal Reserve Board, Economist': 'government',
    'Keystone Consulting, Economist': 'private_company',
    'Korean Development Institute, Associate Research Fellow': 'academic',
    'Universite Libre de Bruxelles, Post-Doc': 'private_company',
    'Amazon, Economist': 'private_company',
    'Bank of America, Economist': 'private_company',
    'International Monetary Fund, Economist': 'private_company',
    'Columbia, Department of Economics': 'academic',
    'Asst. Research Fellow at U of Nottingham post-doc; Academia Sinica Institute of Economics': 'academic',
    'Assistant Professor at Collegio Carlo Alberto': 'academic',
    'Assistant Professor at Universidad de Santiago de Chile': 'academic',
    'Assistant Professor at Universidad de Los Andes, Chile': 'academic',
    'Postdoc at CERDI': 'academic',
    'Postdoc at UCLA': 'academic',
    'Assistant Professor at Shiv Nadar Institute of Eminence': 'academic',
    'Assistant Professor at LMU Munich': 'academic',
    '1 year post-doc at Universitat Pompeu Fabra then ITAM Business (Mexico)': 'uncategorized',
    '- Universidad de los Andes': 'uncategorized',
    '-World Bank': 'private_company',
    'First Position': 'uncategorized',
    'Bureau of Economic Analysis, Research Economist': 'academic',
    'Bank of Portugal, Economist': 'private_company',
    'Charles River Associates, Economist': 'uncategorized',
    'Bates White, Economist': 'uncategorized',
    'Bank of Mexico, Research Economist': 'academic',
    'Georgia Institute of Technology, Lecturer': 'academic',
    'Central Bank of Chile, Senior Economist': 'private_company',
    "RIPL (Research Improving People's Lives), Economist": 'academic',
    'Federal Trade Commission, Economist': 'government',
    'Congressional Budget Office, Economist': 'uncategorized',
    'Meta (Facebook), Research Scientist': 'academic',
    'Revelio Labs, Economist': 'uncategorized',
    'UNC Greensboro, Assistant Professor': 'academic',
    'Integra, Economist': 'uncategorized',
    'Walmart, Senior Data Scientist': 'private_company',
    'Charles River Associates, Senior Associate': 'uncategorized',
    'Department of Justice (Antitrust Division), Economist': 'academic',
    'Microsoft Research, Economist': 'academic',
    'Korea Institute for International Economic Policy': 'academic',
    'Economists Incorporated, Economist': 'private_company',
    'Luohan Academy (Alibaba Group), Economist': 'private_company',
    'Indian Institute of Technology Kanpur, Assistant Professor': 'academic'
}



for key, value in classification.items():
    combined_df.loc[combined_df['placement'] == key, 'placement_type'] = value

In [100]:
job_list_2 = combined_df[combined_df['placement_type'] == 'unknown']['placement'].unique().tolist()

In [101]:
job_list_2

['INSTEAD',
 'Columbia, Department of\xa0Economics',
 'Asst. Research Fellow at U of Nottingham post-doc; Academia Sinica Institute\xa0of Economics',
 'Assistant Professor at\xa0Collegio Carlo Alberto',
 'Assistant Professor at\xa0Universidad de Santiago de Chile',
 'Assistant Professor at\xa0Universidad de Los Andes, Chile',
 'Postdoc at\xa0CERDI',
 'Postdoc at\xa0UCLA',
 'Assistant Professor at\xa0Shiv Nadar Institute of Eminence',
 'Assistant Professor at\xa0LMU Munich',
 '1\xa0year post-doc at Universitat Pompeu Fabra then ITAM Business (Mexico)',
 'Bank of Mexico,\xa0Research Economist']

In [102]:
# Change INSTEAD to private_company
combined_df.loc[combined_df['placement'] == 'INSTEAD', 'placement_type'] = 'private_company'

# Change remaining unknowns to university 
combined_df.loc[combined_df['placement_type'] == 'unknown', 'placement_type'] = 'academic'

combined_df

Unnamed: 0,year,name,placement,ranking,university,department,placement_type
0,2024,Jenna Anders,University of Virginia Batten,1,Harvard University,Economics,academic
1,2024,Martin Aragoneses,INSTEAD,1,Harvard University,Economics,private_company
2,2024,Michael Blank,"Stanford University, Graduate School of Business",1,Harvard University,Economics,academic
3,2024,Phoebe Cai,Link Logistics Real Estate,1,Harvard University,Economics,private_company
4,2024,Romaine Campbell,Cornell Brooks Policy School,1,Harvard University,Economics,academic
...,...,...,...,...,...,...,...
1595,2021,Jiwon Park,Korea Institute for International Economic Policy,20,University of Texas at Austin,Economics,academic
1596,2021,Pablo I. Varas,"Economists Incorporated, Economist",20,University of Texas at Austin,Economics,private_company
1597,2021,Sangwoo Choi,"Luohan Academy (Alibaba Group), Economist",20,University of Texas at Austin,Economics,private_company
1598,2021,Shenshen Yang,"Tianjin University, Assistant Professor",20,University of Texas at Austin,Economics,academic


In [103]:
# Create binary columns for each placement type
combined_df['academic'] = (combined_df['placement_type'] == 'academic').astype(int)
combined_df['private_company'] = (combined_df['placement_type'] == 'private_company').astype(int) 
combined_df['government'] = (combined_df['placement_type'] == 'government').astype(int)

combined_df

Unnamed: 0,year,name,placement,ranking,university,department,placement_type,academic,private_company,government
0,2024,Jenna Anders,University of Virginia Batten,1,Harvard University,Economics,academic,1,0,0
1,2024,Martin Aragoneses,INSTEAD,1,Harvard University,Economics,private_company,0,1,0
2,2024,Michael Blank,"Stanford University, Graduate School of Business",1,Harvard University,Economics,academic,1,0,0
3,2024,Phoebe Cai,Link Logistics Real Estate,1,Harvard University,Economics,private_company,0,1,0
4,2024,Romaine Campbell,Cornell Brooks Policy School,1,Harvard University,Economics,academic,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1595,2021,Jiwon Park,Korea Institute for International Economic Policy,20,University of Texas at Austin,Economics,academic,1,0,0
1596,2021,Pablo I. Varas,"Economists Incorporated, Economist",20,University of Texas at Austin,Economics,private_company,0,1,0
1597,2021,Sangwoo Choi,"Luohan Academy (Alibaba Group), Economist",20,University of Texas at Austin,Economics,private_company,0,1,0
1598,2021,Shenshen Yang,"Tianjin University, Assistant Professor",20,University of Texas at Austin,Economics,academic,1,0,0


In [104]:
combined_df['year'].unique()

array([  2024,   2023,   2022,   2021,   2020,   2019,   2018,   2017,
         2016,   2015,   2014,   2013,   2012,   2011,   2010,   2009,
         2008,   2007,   2006,   2005,   2004, 202016, 202015, 202014,
       202013, 202012, 202011, 202010, 202009, 202008, 202007, 202006,
       202005, 202004, 202003, 202002, 202001])

In [105]:
# Convert years starting with 20 to remove the prefix
combined_df.loc[combined_df['year'] >= 202000, 'year'] = combined_df.loc[combined_df['year'] >= 202000, 'year'] - 200000

combined_df['year'].unique()

array([2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014,
       2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003,
       2002, 2001])

Gender

In [106]:
import gender_guesser.detector as gender
d = gender.Detector()

# Apply gender_guesser to first names
combined_df['gender_guesser'] = combined_df['name'].apply(lambda x: d.get_gender(x.split()[0]))
combined_df[['name', 'gender_guesser']].head()


Unnamed: 0,name,gender_guesser
0,Jenna Anders,female
1,Martin Aragoneses,male
2,Michael Blank,male
3,Phoebe Cai,female
4,Romaine Campbell,female


In [107]:
combined_df['gender'] = combined_df['gender_guesser']
combined_df[combined_df['gender'] == 'unknown']

Unnamed: 0,year,name,placement,ranking,university,department,placement_type,academic,private_company,government,gender_guesser,gender
5,2024,Jiafeng Chen,Stanford University,1,Harvard University,Economics,academic,1,0,0,unknown,unknown
12,2024,Shresth Garg,University of Pennsylvania,1,Harvard University,Economics,academic,1,0,0,unknown,unknown
16,2024,Yihong Huang,Peking University,1,Harvard University,Economics,academic,1,0,0,unknown,unknown
19,2024,Ziqi Lu,Jinan University,1,Harvard University,Economics,academic,1,0,0,unknown,unknown
23,2024,Dev Patel,Brown University,1,Harvard University,Economics,academic,1,0,0,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
1588,2022,Songjie Huang,"Walmart, Senior Data Scientist",20,University of Texas at Austin,Economics,private_company,0,1,0,unknown,unknown
1589,2021,Name,First Position,20,University of Texas at Austin,Economics,uncategorized,0,0,0,unknown,unknown
1595,2021,Jiwon Park,Korea Institute for International Economic Policy,20,University of Texas at Austin,Economics,academic,1,0,0,unknown,unknown
1597,2021,Sangwoo Choi,"Luohan Academy (Alibaba Group), Economist",20,University of Texas at Austin,Economics,private_company,0,1,0,unknown,unknown


In [90]:
combined_df[combined_df['gender'] == 'unknown']['name'].unique()
unknown_names = combined_df[combined_df['gender'] == 'unknown']['name'].unique().tolist()

with open('unknown_gender_names.txt', 'w') as f:
    for name in unknown_names:
        f.write(name + '\n')



In [108]:
gender_dict = {
    "Jiafeng Chen": "male",
    "Shresth Garg": "male",
    "Yihong Huang": "male",
    "Ziqi Lu": "female",
    "Dev Patel": "male",
    "Kunal Sangani": "male",
    "Yulu Tang": "female",
    "Hanbin Yang": "male",
    "Francois-Xavier Ladant": "male",
    "Shushu Liang": "female",
    "Ayushi Narayan": "female",
    "Jiacheng Feng": "male",
    "Riako Granzier": "male",
    "Tianwang Liu": "male",
    "Ashesh Rambachan": "male",
    "Mariia Voronina": "female",
    "Rishab Guha": "male",
    "Jetlir Duraj": "male",
    "Taehoon Kim": "male",
    "Tzachi Raz": "male",
    "Ruiqing Cao": "male",
    "Ellora Derenoncourt": "female",
    "Ashvin Gandhi": "male",
    "Siddharth George": "male",
    "Yizhou Jin": "male",
    "Yosub Jung": "male",
    "Weiling Liu": "female",
    "Chenzi Xu": "male",
    "Eben Lazarus": "male",
    "Seunghyup Lee": "male",
    "Yueran Ma": "female",
    "Xiaosheng Mu": "male",
    "Wentao Xiong": "male",
    "Yuxiao Huang": "male",
    "Thummim Cho": "male",
    "Jisung Park": "male",
    "Mingzhu Tai": "female",
    "Sangram Kadam": "male",
    "Zhaoning Wang": "male",
    "Lilei Xu": "male",
    "Rezwan Haque": "male",
    "Akos Lada": "male",
    "Fanyin Zheng": "female",
    "Zhenyu Lai": "male",
    "Yuhta Ishii": "male",
    "Charles-Henri Weymuller": "male",
    "Wenxin Du": "male",
    "Jisoo Hwang": "female",
    "Troiano Ugo": "male",
    "Ruchir Agarwal": "male",
    "Hongyi Li": "male",
    "Supreet Kaur": "female",
    "Jinzhu Chen": "male",
    "Soonjin Yim": "female",
    "Keyu Jin": "male",
    "Sandip Sukhtankar": "male",
    "Afua Branoah Banful": "female",
    "Hanley Chiang": "male",
    "Quoc-Ahn Do": "male",
    "Fuhito Kojima": "male",
    "Kartini Shastry": "female",
    "C. Kirabo Jackson": "male",
    "Harini Parthassarathy": "female",
    "Parag Pathak": "male",
    "C. Cynthia Lin": "female",
    "Yuhai Xuan": "male",
    "Miklos Koren": "male",
    "Man-Keung Tang": "male",
    "Shuqing Huang": "female",
    "Yinan Wang": "male",
    "Eunkyung An": "female",
    "Weiqing Zhang": "male",
    "Yongoh Roh": "male",
    "Hangyuan Shi": "male",
    "Prasanna Parasurama": "male",
    "Manav Raj": "male",
    "Nofar Duani": "female",
    "Weichi Yao": "male",
    "Danye Wang": "female",
    "Yanting Crystal Shi": "female",
    "Abhinav Gupta": "male",
    "Abhishek Bhardwaj": "male",
    "Botao Wu": "male",
    "Youngmin Kim": "male",
    "Chenshuo Sun": "male",
    "Semi Min": "female",
    "Stephane P. Francioli": "male",
    "Fanglin Chen": "female",
    "Heeyoung Yoon": "female",
    "Haotian Song": "male",
    "Xinyi Zhao": "female",
    "Jiashuo Jiang": "male",
    "Zhuoyi Yang": "male",
    "Hyeyoon Jung": "female",
    "Hongbum Lee": "male",
    "Qianyun Zhang": "female",
    "Shixin Wang": "male",
    "Jungbae Kim": "male",
    "Beumseok Shim": "male",
    "Heebum Lee": "male",
    "Manasa Gopal": "female",
    "Siyu Yu": "female",
    "Yichen Zhang": "male",
    "Disen Huang": "male",
    "Chenqi Zhu": "male",
    "Peifan Wu": "male",
    "Saptarshi Mukherjee": "male",
    "Mohsan Bilal": "male",
    "Tianyue Ruan": "female",
    "Siddharth Vij": "male",
    "Kyeonggook Park": "male",
    "Minjung Kwon": "female",
    "Jihye Jeon": "female",
    "Dongil Keum": "male",
    "Yuqian Xu": "female",
    "Seil Kim": "male",
    "Jianchuan Luo": "male",
    "Xuyang Ma": "male",
    "Tingting Nian": "female",
    "Yuzhou Liu": "male",
    "Yunok Cho": "male",
    "Shaojun Chang": "male",
    "Xiaohan Zhang": "male",
    "Tingting Fan": "female",
    "Shelle Santana": "female",
    "Jigar Patel": "male",
    "Saketh Aleti": "male",
    "Xinyue Bei": "female",
    "Seohee Kim": "female",
    "Eun-Seok Lee": "male",
    "JoonYup Park": "male",
    "Dongyoung Kim": "male",
    "Yonggyun Kim": "male",
    "Taishi Sassano": "male",
    "Zichang Wang": "male",
    "Haozhe Zhang": "male",
    "Usaid Awan": "male",
    "Yuxuan He": "male",
    "Ranae Jabri": "female",
    "Feifan Zhang": "male",
    "Xirui Zhang": "male",
    "Elessar Chen": "male",
    "Tiancheng Chen": "male",
    "Gabor Palinko": "male",
    "Qiushi Zhang": "female",
    "Yanyou Chen": "male",
    "Jiayun Dong": "female",
    "Rudolf-Harri Oberg": "male",
    "Mingzhe Yi": "male",
    "Chuhang Yin Geissler": "male",
    "Congshan Zhang": "male",
    "Yilin Jiang": "female",
    "Mengke Wang": "male",
    "Soroush Ghazi": "male",
    "Linxi Chen": "male",
    "Xiaohua Wu": "female",
    "Xiaomin Fu": "female",
    "Tedi Skiti": "male",
    "Bingzhi Zhao": "male",
    "Yichong Zhang": "male",
    "Kristoph Kleiner": "male",
    "Chung-Ying Lee": "male",
    "Chutima Tontarawongsa": "female",
    "Wenjing Wang": "female",
    "Correa Alvaro": "male",
    "Songman Kang": "male",
    "Zhengzi (Sophia) Li": "female",
    "Iaryna Grynkiv": "female",
    "Nujin Prasertsom": "female",
    "Hernan Seoane": "male",
    "Beia Spiller": "female",
    "Tongyai Iyavarakul": "male",
    "Natalisa Sizova": "female",
    "Liad Wagman": "male",
    "Padmaja Ayyagari": "female",
    "Shanjun Li": "male",
    "Maxym Dedov": "male",
    "Jinhan Jung": "male",
    "Kyoobok Lee": "male",
    "Bentley Coffey": "male",
    "C. Allan Bester": "male",
    "Varoujan Khatchatrian": "male",
    "Haofei Chen": "male",
    "Huayu Fang": "female",
    "Lijing Ouyang": "male",
    "Jingshu Wang": "male",
    "Jianguo Xu": "male",
    "Hanwook Yoo": "male",
    "Aristos Hudson": "male",
    "Yiming Liu": "male",
    "Shwetha Raghuraman": "female",
    "Dyanne Vaught": "female",
    "Yasar Ersan": "male",
    "Keshav Garud": "male",
    "Nishaad Rao": "male",
    "Xienan Cheng": "male",
    "Jaedo Choi": "male",
    "Aibo Gong": "male",
    "Bunyada (Mos) Laoprapassorn": "female",
    "Junwei Tang": "male",
    "Jiafu Wang": "male",
    "Yishu Zeng": "female",
    "Tangren Feng": "male",
    "Bhanu Gupta": "male",
    "Anirudh Jayanti": "male",
    "Parag Mahajan": "male",
    "Dhiren Patki": "male",
    "Shuqiao Sun": "male",
    "Huayu Xu": "female",
    "Wenjian Xu": "male",
    "Vybhavi Balasundharam": "female",
    "Xinwei Ma": "male",
    "Siprapai Sitapong": "female",
    "Tejaswi Velayudhan": "female",
    "Yipei Cao": "male",
    "Traviss Cassidy": "male",
    "DongIk Kang": "male",
    "Harim Kim": "male",
    "Yiyuan Zhang": "male",
    "Xiaoqing Zhou": "female",
    "Guodong Chen": "male",
    "Chenyue Hu": "female",
    "Prachi Jain": "female",
    "Gaurav Khanna": "male",
    "Minjoon Lee": "male",
    "Taejun Oh": "male",
    "Nitya Pandalai-Nayar": "female",
    "Rishi Sharma": "male",
    "Qinggong Wu": "male",
    "Chenyu Yang": "male",
    "Fudong Zhang": "male",
    "Jiyoon Kim": "female",
    "Changkeun Lee": "male",
    "Geetika Nagpal": "female",
    "Joao Garcia": "male",
    "Samsun Knight": "female",
    "Zeky Murra": "male",
    "Balazs Zelity": "male",
    "Pavitra Govindan": "female",
    "Hyojin Han": "female",
    "Jeongbin Kim": "female",
    "Nickolai Riabov": "male",
    "Kanghyock Koh": "male",
    "Hyunjoo Yang": "female",
    "Eecheng Ong": "male",
    "Sailesh Tiwari": "male",
    "Daeho Kim": "male",
    "Mongoljin Batsaikhan": "male",
    "Kenju Kamei": "male",
    "Momotazur Rahman": "male",
    "Zhaoguo Zhan": "male",
    "Debipriya Chatterjee": "female",
    "Chisoo Kim": "female",
    "Aniruddha Ghosh": "male",
    "Qingyang Han": "male",
    "Zixuan Huang": "female",
    "Jeongwon Son": "female",
    "Mingzuo Sun": "male",
    "Fangzhu Yang": "male",
    "Silin Huang": "female",
    "Chuhan Liu": "male",
    "Jiwon Kim": "female",
    "Xinyu Zhao": "female",
    "Jakree Koosakul": "female",
    "Himanshu Verma": "male",
    "Sahan Yildiz": "male",
    "Tongli Zhang": "male",
    "Derin Aksit": "female",
    "Prerna Rakheja": "female",
    "Shiqi Wang": "male",
    "Hanchen Jiang": "male",
    "Shujaat Khan": "male",
    "Jianhui Li": "male",
    "Delong Li": "male",
    "Shaiza Qayyum": "female",
    "Mingjian Wang": "male",
    "Liuchun Deng": "male",
    "Yunting Liu": "female",
    "Sohini Mahapatra": "female",
    "Yajing Jiang": "female",
    "Burcin Kisacikoglu": "female",
    "Boqun Wang": "male",
    "Jiaxiong Yao": "male",
    "Yiyang Li": "male",
    "Weining Bao": "female",
    "Ruli Xiao": "female",
    "Jiae Yoo": "female",
    "Prathi Seneviratne": "female",
    "Xiaochen Xu": "female",
    "Yizhen Zhao": "female",
    "Kue Peng Chauh": "male",
    "Weifeng Wu": "male",
    "Yonghong An": "male",
    "Tsogbadral Galaabaatar": "male",
    "Abhishek Gupta": "male",
    "Guofang Huang": "female",
    "Haomiao Yu": "female",
    "Su-Hsin Chang": "female",
    "Viplav Saini": "male",
    "Zhixiang Zhang": "male",
    "Anubha Dhasmana": "female",
    "Migiwa Tanaka": "female",
    "Jiawei Chen": "male",
    "Kuo-Lring Chen": "male",
    "Yingyao Hu": "male",
    "Sangjik Lee": "male",
    "Junfu Zhang": "male",
    "Jinyeong Son": "female",
    "Prankur Gupta": "male",
    "Yihang Zhou": "male",
    "Zhenning Zhao": "male",
    "Ziyue Xu": "female",
    "Abolfazl Rezghi": "male",
    "Bokyung Kim": "female",
    "Changseok Ma": "male",
    "Dongchen Zhao": "male",
    "Qingsong Pan": "male",
    "Sadhika Bagga": "female",
    "Gwangmin Kim": "male",
    "Molin Li": "female",
    "Shaofei Jiang": "male",
    "Songjie Huang": "female",
    "Jiwon Park": "female",
    "Sangwoo Choi": "male",
    "Shenshen Yang": "female"
}


In [109]:
# Update gender column based on gender_dict
# Only update gender for names that exist in gender_dict
mask = combined_df['name'].isin(gender_dict.keys())
combined_df.loc[mask, 'gender'] = combined_df.loc[mask, 'name'].map(gender_dict)


In [111]:
combined_df[combined_df['gender'] == 'unknown']['name'].unique().tolist()


['Name']

In [112]:
combined_df[combined_df['name'] != 'Name']
combined_df

Unnamed: 0,year,name,placement,ranking,university,department,placement_type,academic,private_company,government,gender_guesser,gender
0,2024,Jenna Anders,University of Virginia Batten,1,Harvard University,Economics,academic,1,0,0,female,female
1,2024,Martin Aragoneses,INSTEAD,1,Harvard University,Economics,private_company,0,1,0,male,male
2,2024,Michael Blank,"Stanford University, Graduate School of Business",1,Harvard University,Economics,academic,1,0,0,male,male
3,2024,Phoebe Cai,Link Logistics Real Estate,1,Harvard University,Economics,private_company,0,1,0,female,female
4,2024,Romaine Campbell,Cornell Brooks Policy School,1,Harvard University,Economics,academic,1,0,0,female,female
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,2021,Jiwon Park,Korea Institute for International Economic Policy,20,University of Texas at Austin,Economics,academic,1,0,0,unknown,female
1596,2021,Pablo I. Varas,"Economists Incorporated, Economist",20,University of Texas at Austin,Economics,private_company,0,1,0,male,male
1597,2021,Sangwoo Choi,"Luohan Academy (Alibaba Group), Economist",20,University of Texas at Austin,Economics,private_company,0,1,0,unknown,male
1598,2021,Shenshen Yang,"Tianjin University, Assistant Professor",20,University of Texas at Austin,Economics,academic,1,0,0,unknown,female


In [114]:
combined_df.to_csv('../data/analyzed_data/missing_data.csv', index=False)

In [1]:
import pandas as pd

df = pd.read_csv('../data/analyzed_data/missing_data.csv')
df['university'].unique()

array(['Harvard University', 'New York University', 'Duke University',
       'University of Michigan', 'Brown University',
       'Johns Hopkins University', 'University of Texas at Austin'],
      dtype=object)