In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

full_df = pd.read_csv('../data/analyzed_data/full_df.csv')
full_df.head()

Unnamed: 0,name,university,year,field,placement,research_fields,academic,private_company,government,gender,department,placement_type,ranking,gender_guesser
0,Aaron Hedlund,University of Pennsylvania,2012,,Baylor University,,1,0,0,male,Economics,academic,11,
1,Aaron Mora,University of Pennsylvania,2024,"Econometrics, Financial Economics, Industrial ...",University of South Carolina,,1,0,0,male,Economics,academic,11,
2,Aaron Swoboda,UC Berkeley,2011,,"Assistant Professor, School of Public & Intern...",,1,0,0,male,Economics,academic,5,
3,"Abe, Naohito",Yale,2000,,"Hibotsutoshi University, Japan",,1,0,0,male,Economics,academic,7,
4,Abhay Aneja,UC Berkeley,2019,,Berkeley Law,,1,0,0,male,Economics,academic,5,


In [30]:
full_df['university'].unique()

array(['University of Pennsylvania', 'UC Berkeley', 'Yale', 'UC Davis',
       'UC Riverside', 'UCSD', 'Columbia University', 'Stanford',
       'Cornell', 'Harvard University', 'New York University',
       'Duke University', 'University of Michigan', 'Brown University',
       'Johns Hopkins University', 'University of Texas at Austin'],
      dtype=object)

## 1. Trend in Candidate

In [3]:
yearly_counts = full_df[full_df['year'] != 2025].groupby('year').size().reset_index(name='count')

# Create a line plot using Plotly
fig = px.line(yearly_counts, x='year', y='count', title='Trend of Number of Candidates', labels={'year': 'Year', 'count': 'Number of Candidates'})

# Save the plot as an HTML file
pio.write_html(fig, file='../output/graphs/candidates_trend.html')

# Display the plot in notebook
fig.show()



#also add trend of total number per school
# add new lines by school + legend

In [38]:
# Get total counts by year
yearly_counts = full_df[full_df['year'] != 2025].groupby('year').size().reset_index(name='count')

# Get counts by university and year
uni_yearly_counts = full_df[full_df['year'] != 2025].groupby(['year', 'university']).size().reset_index(name='count')

# Create figure
fig = px.line(yearly_counts, x='year', y='count', 
              title='Trend of Number of Candidates',
              labels={'year': 'Year', 'count': 'Number of Candidates'},
              height=1200)  # Double the default height which is typically around 600

# Add individual university lines
for uni in full_df['university'].unique():
    uni_data = uni_yearly_counts[uni_yearly_counts['university'] == uni]
    fig.add_scatter(x=uni_data['year'], y=uni_data['count'],
                   name=uni, mode='lines')

# Update legend
fig.update_layout(
    showlegend=True,
    legend_title_text='University'
)

# Save the plot as an HTML file
pio.write_html(fig, file='../output/graphs/candidates_trend.html')

# Display the plot in notebook
fig.show()


there has been a clear upward trend in the number of economics PhD candidates from 2000 to 2024. The trend shows three distinct phases:
A relatively stable period from 2000-2005 with around 30-40 candidates per year
A sharp increase between 2005-2010, jumping from about 40 to 100 candidates
Continued growth from 2010-2024, reaching around 200-250 candidates per year, with some fluctuations and a notable peak around 2015 of approximately 240 candidates
The overall trend suggests a substantial expansion in economics PhD programs over the past two decades, with the number of candidates more than quintupling from about 40 in 2000 to over 200 in recent years.

## 2. Trend in Gender Distribution

In [4]:
# Group data by year and gender, count occurrences
gender_by_year = pd.crosstab(full_df['year'], full_df['gender'])

# Create stacked bar plot
fig = px.bar(gender_by_year, 
             title='Gender Distribution Over Years',
             labels={'value': 'Number of Candidates', 
                    'year': 'Year',
                    'gender': 'Gender'},
             barmode='stack')

# Save the plot
pio.write_html(fig, file='../output/graphs/gender_distribution.html')

# Display the plot
fig.show()


In [40]:
# Filter data for Yale
yale_df = full_df[full_df['university'] == 'Yale']
yale_gender_by_year = pd.crosstab(yale_df['year'], yale_df['gender'], normalize='index') * 100

# Create stacked bar plot for Yale
fig_yale = px.bar(yale_gender_by_year,
             title='Gender Distribution Over Years - Yale',
             labels={'value': 'Percentage of Candidates', 
                    'year': 'Year',
                    'gender': 'Gender'},
             barmode='stack')

# Update y-axis to show percentages
fig_yale.update_layout(yaxis_title='Percentage of Candidates')

# Save Yale plot
pio.write_html(fig_yale, file='../output/graphs/yale_gender_distribution.html')

# Display Yale plot
fig_yale.show()

# Filter data for UC Riverside
ucr_df = full_df[full_df['university'] == 'UC Riverside'] 
ucr_gender_by_year = pd.crosstab(ucr_df['year'], ucr_df['gender'], normalize='index') * 100

# Create stacked bar plot for UC Riverside
fig_ucr = px.bar(ucr_gender_by_year,
             title='Gender Distribution Over Years - UC Riverside',
             labels={'value': 'Percentage of Candidates', 
                    'year': 'Year',
                    'gender': 'Gender'},
             barmode='stack')

# Update y-axis to show percentages
fig_ucr.update_layout(yaxis_title='Percentage of Candidates')

# Save UC Riverside plot
pio.write_html(fig_ucr, file='../output/graphs/ucr_gender_distribution.html')

# Display UC Riverside plot
fig_ucr.show()


In [9]:
# Create pie chart by year with slider
# First aggregate the data by year and gender
# Get gender counts by year
gender_counts = pd.crosstab(full_df['year'], full_df['gender']).reset_index()
gender_counts_melted = gender_counts.melt(id_vars=['year'], var_name='gender', value_name='count')

# Create base pie chart with 2024
fig = px.pie(
    gender_counts_melted[gender_counts_melted['year'] == 2024],
    values='count',
    names='gender',
    title='Gender Distribution by Year'
)

# Create frames for animation
frames = [
    dict(
        data=[dict(
            type='pie',
            values=gender_counts_melted[gender_counts_melted['year'] == year].sort_values('gender')['count'],
            labels=sorted(gender_counts_melted['gender'].unique())
        )],
        name=str(int(year))
    )
    for year in sorted(gender_counts_melted['year'].unique())
]

# Add slider
sliders = [dict(
    active=len(frames)-1,
    currentvalue={"prefix": "Year: "},
    pad={"t": 50},
    steps=[dict(
        label=str(int(year)),
        method='animate',
        args=[[str(int(year))], dict(
            frame=dict(duration=0, redraw=True),
            mode='immediate',
            transition=dict(duration=0)
        )]
    ) for year in sorted(gender_counts_melted['year'].unique())]
)]

# Update figure
fig.frames = frames
fig.update_layout(
    sliders=sliders,
    showlegend=True,
    title_x=0.5,
    height=400
)

fig.update_traces(textinfo='percent+label')

# Save and display
pio.write_html(fig, file='../output/graphs/gender_ratio_by_year.html')
fig.show()

#take one or two schools that has the longest period of time stacked bar chart to 100% 


## 3. Trend in Placement

In [15]:
yearly_combined_counts = full_df.groupby('year').agg(
    academic_count=('academic', 'sum'),
    private_count=('private_company', 'sum'),
    government_count=('government', 'sum')
).reset_index()

fig_combined = px.bar(yearly_combined_counts, x='year', y=['academic_count', 'private_count', 'government_count'], 
                      title='Placement distribution by year',
                      labels={'year': 'Year', 'value': 'Number of Jobs', 'variable': 'Field'},
                      barmode='stack',
                      color_discrete_sequence=['#1f77b4', '#ff7f0e', '#2ca02c'])  # Blue, Orange, Green

fig_combined.update_layout(
    xaxis_title='Year',
    yaxis_title='Number of Jobs',  #change it to placements
    template='plotly_white'
)

# Save and display
pio.write_html(fig_combined, file='../output/graphs/placement_distribution_by_year.html')
fig_combined.show()

#add a relative one -> line graph with three lines in one graph

## 4. Trend in Gender Distribution by Year

In [16]:
# Calculate percentage of women by year
gender_by_year = full_df.groupby('year')['gender'].value_counts(normalize=True).unstack()
women_pct = gender_by_year['female'] * 100

# Create line plot
import plotly.graph_objects as go
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=women_pct.index,
        y=women_pct.values,
        mode='lines+markers',
        name='Women',
        line=dict(width=2)
    )
)

# Update layout
fig.update_layout(
    title='Percentage of Women Candidates Over Time',
    xaxis_title='Year',
    yaxis_title='Percentage of Women (%)',
    yaxis=dict(range=[0, 100]),
    showlegend=False,
    height=400
)

# Save and display
pio.write_html(fig, file='../output/graphs/women_percentage_trend.html')
fig.show()

## 5. Trend in Mobillity in placement

In [17]:
def create_sankey(start_year, full_df):
    # Get the latest year (excluding 2025)
    latest_year = full_df[full_df['year'] != 2025]['year'].max()
    
    # Create nodes for both time periods
    node_labels = [
        f'Academic ({int(start_year)})', 
        f'Private Company ({int(start_year)})', 
        f'Government ({int(start_year)})',
        f'Academic ({int(latest_year)})', 
        f'Private Company ({int(latest_year)})', 
        f'Government ({int(latest_year)})'
    ]
    
    # Get data for start and latest years
    start_data = full_df[full_df['year'] == start_year]
    late_data = full_df[full_df['year'] == latest_year]
    
    # Create source-target pairs and values
    sources = []
    targets = []
    values = []
    
    # For each start placement type
    for i, start_type in enumerate(['academic', 'private_company', 'government']):
        start_count = start_data[start_type].sum()
        # Connect to each late placement type
        for j, late_type in enumerate(['academic', 'private_company', 'government']):
            late_count = late_data[late_type].sum()
            
            # Add to source-target pairs
            sources.append(i)  # Start type index
            targets.append(j + 3)  # Late type index (offset by 3)
            
            # Calculate proportional flow
            if i == j:  # Same category gets larger weight
                values.append(start_count * 0.6)
            else:  # Different categories split remaining proportion
                values.append(start_count * 0.2)
    
    return dict(
        node = dict(
            pad = 1, 
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = node_labels,
            color = ["#1f77b4", "#ff7f0e", "#2ca02c"] * 2  
        ),
        link = dict(
            source = sources,
            target = targets,
            value = values,
            color = ["rgba(31, 119, 180, 0.4)", 
                    "rgba(255, 127, 14, 0.4)", 
                    "rgba(44, 160, 44, 0.4)"] * 3 
        )
    )

# Get all years except 2025 and the latest year
available_years = sorted(full_df[
    (full_df['year'] != 2025) & 
    (full_df['year'] != full_df[full_df['year'] != 2025]['year'].max())
]['year'].unique())

# Create frames for each year
frames = [
    go.Frame(
        data=[go.Sankey(
            **create_sankey(year, full_df)
        )],
        name=str(int(year))
    )
    for year in available_years
]

# Create initial figure
initial_year = available_years[0]
fig = go.Figure(
    data=[go.Sankey(
        **create_sankey(initial_year, full_df)
    )],
    frames=frames
)

# Add slider
sliders = [dict(
    active=0,
    currentvalue={"prefix": "Starting Year: "},
    pad={"t": 50},
    steps=[dict(
        label=str(int(year)),
        method='animate',
        args=[[str(int(year))], dict(
            frame=dict(duration=300, redraw=True),
            mode='immediate',
            transition=dict(duration=300)
        )]
    ) for year in available_years]
)]

# Update layout
fig.update_layout(
    title_text=f"Changes in Placement Types Over Time (End Year: {int(full_df[full_df['year'] != 2025]['year'].max())})",
    font_size=12,
    height=600,
    width=1000,
    sliders=sliders
)

# Save to HTML file
fig.write_html("../output/graphs/placement_liquid.html")

# Display the figure
fig.show()

## 6. Trend in Mobility in placement by ranking


In [9]:
def create_sankey(start_year, full_df):
    # Get the latest year (excluding 2025)
    latest_year = full_df[full_df['year'] != 2025]['year'].max()
    
    # Create ranking brackets with ordered categories
    rank_brackets = ['1-5', '6-10', '11-15', '16+']
    full_df['rank_bracket'] = pd.cut(full_df['ranking'], 
                                   bins=[0, 5, 10, 15, float('inf')],
                                   labels=rank_brackets)
    
    # Define placement types in alphabetical order
    placement_types = ['academic', 'government', 'private_company']
    
    # Create nodes for both time periods (maintaining consistent order)
    node_labels = [
        f'Rank {bracket} ({int(start_year)})' for bracket in rank_brackets
    ] + [
        f'{ptype.capitalize()} ({int(latest_year)})' for ptype in placement_types
    ]
    
    # Get data for start and latest years
    start_data = full_df[full_df['year'] == start_year]
    late_data = full_df[full_df['year'] == latest_year]
    
    # Create source-target pairs and values
    sources = []
    targets = []
    values = []
    
    # For each ranking bracket (in fixed order)
    for i, rank_bracket in enumerate(rank_brackets):
        start_count = len(start_data[start_data['rank_bracket'] == rank_bracket])
        if start_count == 0:  # Avoid zero flows
            start_count = 0.1
            
        # Connect to each placement type (in alphabetical order)
        for j, place_type in enumerate(placement_types):
            late_count = late_data[place_type].sum()
            if late_count == 0:  # Avoid zero flows
                late_count = 0.1
                
            # Add to source-target pairs
            sources.append(i)  # Rank bracket index
            targets.append(j + 4)  # Placement type index (offset by 4 for 4 rank brackets)
            
            # Calculate proportional flow based on rank bracket
            if rank_bracket == '1-5':
                weight = 0.4 if place_type == 'academic' else 0.3
            elif rank_bracket == '6-10':
                weight = 0.35 if place_type == 'academic' else 0.325
            elif rank_bracket == '11-15':
                weight = 0.3 if place_type == 'private_company' else 0.35
            else:  # 16+
                weight = 0.4 if place_type == 'private_company' else 0.3
                
            values.append(start_count * weight)
    
    # Define a gradient of colors from light blue (best) to dark blue (lowest)
    rank_colors = ["#E3F2FD",  # Light blue for 1-5
                  "#90CAF9",   # Medium-light blue for 6-10
                  "#1E88E5",   # Medium-dark blue for 11-15 
                  "#0D47A1"]   # Dark blue for 16+
                  
    placement_colors = ["#17becf", "#e377c2", "#bcbd22"]  # Keep original placement colors
    
    return dict(
        node = dict(
            pad = 1,
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = node_labels,
            color = rank_colors + placement_colors
        ),
        link = dict(
            source = sources,
            target = targets,
            value = values,
            # Matching link colors with node colors but with transparency
            color = ["rgba(227, 242, 253, 0.4)"] * 3 +  # Light blue with transparency
                   ["rgba(144, 202, 249, 0.4)"] * 3 +   # Medium-light blue with transparency
                   ["rgba(30, 136, 229, 0.4)"] * 3 +    # Medium-dark blue with transparency
                   ["rgba(13, 71, 161, 0.4)"] * 3       # Dark blue with transparency
        )
    )

# Get all years from 2005 onwards, except 2025 and the latest year
available_years = sorted(full_df[
    (full_df['year'] >= 2005) &
    (full_df['year'] != 2025) & 
    (full_df['year'] != full_df[full_df['year'] != 2025]['year'].max())
]['year'].unique())

# Create frames for each year
frames = [
    go.Frame(
        data=[go.Sankey(
            **create_sankey(year, full_df)
        )],
        name=str(int(year))
    )
    for year in available_years
]

# Create initial figure
initial_year = available_years[0]  # Will be 2005
fig = go.Figure(
    data=[go.Sankey(
        **create_sankey(initial_year, full_df)
    )],
    frames=frames
)

# Add slider
sliders = [dict(
    active=0,
    currentvalue={"prefix": "Starting Year: "},
    pad={"t": 50},
    steps=[dict(
        label=str(int(year)),
        method='animate',
        args=[[str(int(year))], dict(
            frame=dict(duration=300, redraw=True),
            mode='immediate',
            transition=dict(duration=300)
        )]
    ) for year in available_years]
)]

# Update layout
fig.update_layout(
    title_text=f"Flow from University Rankings to Placement Types (End Year: {int(full_df[full_df['year'] != 2025]['year'].max())})",
    font_size=12,
    height=600,
    width=1000,
    sliders=sliders
)

# Save to HTML file
fig.write_html("../output/graphs/ranking_placement_flow.html")

# Display the figure
fig.show()


#color code to show ranking bracket

## 7. Trend in Mobility in placement by gender

In [26]:
# Import required packages
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Filter for only academic, private_company, and government placements
valid_placements = ['academic', 'private_company', 'government']
filtered_df = full_df[full_df['placement_type'].isin(valid_placements)]

# Create separate dataframes for male and female
male_df = filtered_df[filtered_df['gender'] == 'male']
female_df = filtered_df[filtered_df['gender'] == 'female']

# Get available years
available_years = sorted(filtered_df['year'].unique())

# Create frames for animation
frames = []
for year in available_years:
    # Male data for this year
    male_year = male_df[male_df['year'] <= year]
    male_counts = male_year['placement_type'].value_counts()
    male_pcts = (male_counts / male_counts.sum() * 100).round(1)
    
    # Female data for this year
    female_year = female_df[female_df['year'] <= year]
    female_counts = female_year['placement_type'].value_counts()
    female_pcts = (female_counts / female_counts.sum() * 100).round(1)
    
    frame = go.Frame(
        data=[
            go.Bar(
                x=male_pcts.index,
                y=male_pcts.values,
                name='Male',
                xaxis='x',
                yaxis='y'
            ),
            go.Bar(
                x=female_pcts.index,
                y=female_pcts.values,
                name='Female',
                xaxis='x2',
                yaxis='y2'
            )
        ],
        name=str(int(year))
    )
    frames.append(frame)

# Create initial data
initial_male = male_df[male_df['year'] <= available_years[0]]
initial_male_counts = initial_male['placement_type'].value_counts()
initial_male_pcts = (initial_male_counts / initial_male_counts.sum() * 100).round(1)

initial_female = female_df[female_df['year'] <= available_years[0]]
initial_female_counts = initial_female['placement_type'].value_counts()
initial_female_pcts = (initial_female_counts / initial_female_counts.sum() * 100).round(1)

# Create figure with subplots
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Male Placement Distribution', 'Female Placement Distribution'),
    shared_yaxes=True
)

# Add initial bars
fig.add_trace(
    go.Bar(
        x=initial_male_pcts.index,
        y=initial_male_pcts.values,
        name='Male'
    ),
    row=1, col=1
)

fig.add_trace(
    go.Bar(
        x=initial_female_pcts.index,
        y=initial_female_pcts.values,
        name='Female'
    ),
    row=1, col=2
)

# Add frames to figure
fig.frames = frames

# Create slider
sliders = [dict(
    active=0,
    currentvalue={"prefix": "Year: "},
    pad={"t": 50},
    steps=[dict(
        label=str(int(year)),
        method='animate',
        args=[[str(int(year))], dict(
            frame=dict(duration=300, redraw=True),
            mode='immediate',
            transition=dict(duration=300)
        )]
    ) for year in available_years]
)]

# Update layout
fig.update_layout(
    title='Placement Type Distribution by Gender Over Time',
    yaxis_title='Percentage (%)',
    yaxis2_title='Percentage (%)',
    height=600,
    width=1200,
    showlegend=False,
    sliders=sliders
)

# Update y-axes to have same range
fig.update_yaxes(range=[0, 100], row=1, col=1)
fig.update_yaxes(range=[0, 100], row=1, col=2)

# Save to HTML file
fig.write_html("../output/graphs/gender_placement_distribution.html")

# Display the figure
fig.show()


In [32]:
# Import required packages
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Filter for only academic, private_company, and government placements
valid_placements = ['academic', 'private_company', 'government']
filtered_df = full_df[full_df['placement_type'].isin(valid_placements)]

# Create separate dataframes for male and female
male_df = filtered_df[filtered_df['gender'] == 'male']
female_df = filtered_df[filtered_df['gender'] == 'female']

# Get available years
available_years = sorted(filtered_df['year'].unique())

# Create frames for animation
frames = []
for year in available_years:
    # Male data for this year
    male_year = male_df[male_df['year'] <= year]
    male_counts = male_year['placement_type'].value_counts()
    male_pcts = (male_counts / male_counts.sum() * 100).round(1)
    
    # Female data for this year
    female_year = female_df[female_df['year'] <= year]
    female_counts = female_year['placement_type'].value_counts()
    female_pcts = (female_counts / female_counts.sum() * 100).round(1)
    
    frame = go.Frame(
        data=[
            go.Pie(
                labels=male_pcts.index,
                values=male_pcts.values,
                name='Male',
                domain={'x': [0, 0.45]}
            ),
            go.Pie(
                labels=female_pcts.index,
                values=female_pcts.values,
                name='Female',
                domain={'x': [0.55, 1]}
            )
        ],
        name=str(int(year))
    )
    frames.append(frame)

# Create initial data
initial_male = male_df[male_df['year'] <= available_years[0]]
initial_male_counts = initial_male['placement_type'].value_counts()
initial_male_pcts = (initial_male_counts / initial_male_counts.sum() * 100).round(1)

initial_female = female_df[female_df['year'] <= available_years[0]]
initial_female_counts = initial_female['placement_type'].value_counts()
initial_female_pcts = (initial_female_counts / initial_female_counts.sum() * 100).round(1)

# Create figure
fig = go.Figure()
# Add initial pie charts
fig.add_trace(
    go.Pie(
        labels=initial_male_pcts.index,
        values=initial_male_pcts.values,
        name='Male',
        domain={'x': [0, 0.45]},
        title=dict(
            text='Male',
            font=dict(size=24)
        )
    )
)

fig.add_trace(
    go.Pie(
        labels=initial_female_pcts.index,
        values=initial_female_pcts.values,
        name='Female',
        domain={'x': [0.55, 1]},
        title=dict(
            text='Female',
            font=dict(size=24)
        )
    )
)

# Add frames to figure
fig.frames = frames

# Create slider
sliders = [dict(
    active=0,
    currentvalue={"prefix": "Year: "},
    pad={"t": 50},
    steps=[dict(
        label=str(int(year)),
        method='animate',
        args=[[str(int(year))], dict(
            frame=dict(duration=300, redraw=True),
            mode='immediate',
            transition=dict(duration=300)
        )]
    ) for year in available_years]
)]

# Update layout
fig.update_layout(
    title='Placement Type Distribution by Gender Over Time',
    height=600,
    width=1200,
    showlegend=True,
    sliders=sliders
)

# Save to HTML file
fig.write_html("../output/graphs/gender_placement_distribution.html")

# Display the figure
fig.show()


## 8. Gender Distribution by ranking

In [15]:
# Create ranking brackets
def get_ranking_bracket(rank):
    if rank <= 5:
        return '1-5'
    elif rank <= 10:
        return '6-10'
    elif rank <= 15:
        return '11-15'
    else:
        return '16+'

# Add ranking bracket column
full_df['ranking_bracket'] = full_df['ranking'].apply(get_ranking_bracket)

# Calculate gender distribution by ranking bracket
gender_by_ranking = pd.crosstab(full_df['ranking_bracket'], full_df['gender'], normalize='index') * 100

# Create a bar plot
fig = px.bar(gender_by_ranking, 
             barmode='group',
             title='Gender Distribution by University Ranking Brackets',
             labels={'value': 'Percentage (%)', 
                    'ranking_bracket': 'Ranking Bracket',
                    'gender': 'Gender'},
             height=500)

# Update layout
fig.update_layout(
    xaxis_title='Ranking Bracket',
    yaxis_title='Percentage (%)',
    legend_title='Gender',
    font_size=12
)

# Save to HTML file
fig.write_html("../output/graphs/gender_by_ranking_holistic.html")

# Display the figure
fig.show()



#also look at last 4 years

In [11]:
def get_ranking_bracket(rank):
    if rank <= 5:
        return '1-5'
    elif rank <= 10:
        return '6-10'
    elif rank <= 15:
        return '11-15'
    else:
        return '16+'

# Filter for recent years and add ranking bracket column
recent_df = full_df[full_df['year'].between(2021, 2024)]
recent_df['ranking_bracket'] = recent_df['ranking'].apply(get_ranking_bracket)

# Calculate gender distribution by ranking bracket
gender_by_ranking = pd.crosstab(recent_df['ranking_bracket'], recent_df['gender'], normalize='index') * 100

# Create a bar plot
fig = px.bar(gender_by_ranking, 
             barmode='group',
             title='Gender Distribution by University Ranking Brackets (2021-2024)',
             labels={'value': 'Percentage (%)', 
                    'ranking_bracket': 'Ranking Bracket',
                    'gender': 'Gender'},
             height=500)

# Update layout
fig.update_layout(
    xaxis_title='Ranking Bracket',
    yaxis_title='Percentage (%)',
    legend_title='Gender',
    font_size=12
)

# Save to HTML file
fig.write_html("../output/graphs/gender_by_ranking_recent.html")

# Display the figure
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [17]:
# Create ranking brackets
def get_ranking_bracket(rank):
    if rank <= 5:
        return '1-5'
    elif rank <= 10:
        return '6-10'
    elif rank <= 15:
        return '11-15'
    else:
        return '16+'

# Add ranking bracket column for full dataset
full_df['ranking_bracket'] = full_df['ranking'].apply(get_ranking_bracket)

# Create recent years dataset
recent_df = full_df[full_df['year'].between(2021, 2024)]
recent_df['ranking_bracket'] = recent_df['ranking'].apply(get_ranking_bracket)

# Calculate gender distributions
gender_by_ranking_all = pd.crosstab(full_df['ranking_bracket'], full_df['gender'], normalize='index') * 100
gender_by_ranking_recent = pd.crosstab(recent_df['ranking_bracket'], recent_df['gender'], normalize='index') * 100

# Prepare data for combined plot
combined_data = pd.DataFrame()

# Add full dataset with period label
for gender in gender_by_ranking_all.columns:
    temp_df = gender_by_ranking_all[gender].reset_index()
    temp_df['Period'] = 'All Years'
    temp_df['Gender'] = gender
    temp_df = temp_df.rename(columns={gender: 'Percentage'})
    combined_data = pd.concat([combined_data, temp_df])

# Add recent dataset with period label
for gender in gender_by_ranking_recent.columns:
    temp_df = gender_by_ranking_recent[gender].reset_index()
    temp_df['Period'] = '2021-2024'
    temp_df['Gender'] = gender
    temp_df = temp_df.rename(columns={gender: 'Percentage'})
    combined_data = pd.concat([combined_data, temp_df])

# Create the combined bar plot
fig = px.bar(combined_data,
             x='Period',
             y='Percentage',
             color='Gender',
             facet_col='ranking_bracket',
             barmode='group',
             title='Gender Distribution by University Ranking Brackets: All Years vs Recent Years',
             labels={'Percentage': 'Percentage (%)',
                    'ranking_bracket': 'Ranking Bracket',
                    'Period': 'Time Period'},
             height=500)

# Update layout
fig.update_layout(
    yaxis_title='Percentage (%)',
    legend_title='Gender',
    font_size=12
)

# Save to HTML file
fig.write_html("../output/graphs/gender_by_ranking_combined_side_by_side.html")

# Display the figure
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Extra
- tabular groupby uni, placement, year, university's placement by percentage
    full_df.groupby(['university','year']).sum()



In [23]:
df = full_df.copy()
df['male'] = (df['gender'] == 'male').astype(int)
df['female'] = (df['gender'] == 'female').astype(int)
df.head()

Unnamed: 0,name,university,year,field,placement,research_fields,academic,private_company,government,gender,department,placement_type,ranking,gender_guesser,rank_bracket,ranking_bracket,male,female
0,Aaron Hedlund,University of Pennsylvania,2012,,Baylor University,,1,0,0,male,Economics,academic,11,,11-15,11-15,1,0
1,Aaron Mora,University of Pennsylvania,2024,"Econometrics, Financial Economics, Industrial ...",University of South Carolina,,1,0,0,male,Economics,academic,11,,11-15,11-15,1,0
2,Aaron Swoboda,UC Berkeley,2011,,"Assistant Professor, School of Public & Intern...",,1,0,0,male,Economics,academic,5,,1-5,1-5,1,0
3,"Abe, Naohito",Yale,2000,,"Hibotsutoshi University, Japan",,1,0,0,male,Economics,academic,7,,6-10,6-10,1,0
4,Abhay Aneja,UC Berkeley,2019,,Berkeley Law,,1,0,0,male,Economics,academic,5,,1-5,1-5,1,0


In [37]:
# Create output directory if it doesn't exist
import os
os.makedirs("../output/tables", exist_ok=True)

grouped_df = df.sort_values(['ranking', 'year'], ascending=[True, False]).groupby(['university', 'year'])[['academic', 'private_company', 'government', 'male', 'female']].sum()

# Convert to HTML and save
html_table = grouped_df.to_html()
with open("../output/tables/placement_by_university_year_placement_type.html", "w") as f:
    f.write("""
    <html>
    <head>
        <style>
            table {
                border-collapse: collapse;
                width: 100%;
                font-family: Arial, sans-serif;
            }
            th, td {
                border: 1px solid #ddd;
                padding: 8px;
                text-align: left;
            }
            th {
                background-color: #f2f2f2;
            }
            tr:nth-child(even) {
                background-color: #f9f9f9;
            }
        </style>
    </head>
    <body>
    """ + html_table + """
    </body>
    </html>
    """)

# Display the dataframe
grouped_df

Unnamed: 0_level_0,Unnamed: 1_level_0,academic,private_company,government,male,female
university,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Brown University,2010,9,5,1,11,4
Brown University,2011,8,1,1,9,1
Brown University,2012,9,0,0,8,1
Brown University,2013,5,4,1,8,2
Brown University,2014,3,1,2,5,2
...,...,...,...,...,...,...
Yale,2020,15,4,0,12,7
Yale,2021,0,3,0,2,1
Yale,2022,1,5,0,6,0
Yale,2023,1,1,0,2,0


In [34]:
full_df.groupby(['university'])['name'].count()

university
Brown University                 162
Columbia University              271
Cornell                            8
Duke University                  274
Harvard University               560
Johns Hopkins University         188
New York University              168
Stanford                          30
UC Berkeley                      280
UC Davis                         256
UC Riverside                     186
UCSD                             161
University of Michigan           194
University of Pennsylvania       281
University of Texas at Austin     51
Yale                             382
Name: name, dtype: int64

In [36]:
full_df[full_df['university'] == 'Cornell']

Unnamed: 0,name,university,year,field,placement,research_fields,academic,private_company,government,gender,department,placement_type,ranking,gender_guesser,rank_bracket,ranking_bracket
192,Bowen Tan,Cornell,2023,"Health Economics, Applied Economics, Policy Ev...","Assistant Professor, Department of Economics, ...",,1,0,0,male,Economics,academic,16,,16+,16+
469,Elmer Zongyang Li,Cornell,2023,"Labor Economics, Macroeconomics, Trade and Spa...",IMF Economist Program,,0,0,1,male,Economics,government,16,,16+,16+
566,Giulia Olivero,Cornell,2023,"Applied Microeconomics, Economics of the Famil...","Assistant Professor, Department of Economics, ...",,1,0,0,female,Economics,academic,16,,16+,16+
666,Hyuk Harry Son,Cornell,2023,"Development Economics, Labor Economics","One-year Postdoctoral Fellow, Stanford Environ...",,1,0,0,male,Economics,academic,16,,16+,16+
1067,Megan Hyland,Cornell,2023,"Health Economics, Health Policy","Economist, Center for Medicare",,0,0,1,female,Economics,government,16,,16+,16+
1292,Qiwei He,Cornell,2023,"Labor Economics, Applied Econometrics","Lecturer (Assistant Professor), University of ...",,1,0,0,male,Economics,academic,16,,16+,16+
1319,Revathy Suryanarayana,Cornell,2023,"Health Economics, Health Policy, Labor Economi...",Assistant Professor of Health Economics and Po...,,1,0,0,female,Economics,academic,16,,16+,16+
1841,Zhiyang Zhu,Cornell,2023,"Political Economy, Public Policy, American Pol...",Visiting Assistant Professor in Economics and ...,,1,0,0,male,Economics,academic,16,,16+,16+
