In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

pd.set_option('display.max_columns', 100)


In [3]:
# Load the public survey results CSV from the local '2024' folder into `df`,
# the frame every later cell reads from.
df = pd.read_csv('2024/survey_results_public.csv')

# (rows, columns) of the loaded frame
df.shape



(65437, 114)

In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

ResponseId                 0
MainBranch                 0
Age                        0
Employment                 0
RemoteWork             10631
                       ...  
JobSatPoints_11        35992
SurveyLength            9255
SurveyEase              9199
ConvertedCompYearly    42002
JobSat                 36311
Length: 114, dtype: int64

In [5]:
def is_employed(status):
    """
    Flag whether an 'Employment' answer describes someone who is working.

    Parameters:
    status (str): Raw value from the 'Employment' column. Missing values
        (anything pd.isnull reports as null) are accepted.

    Returns:
    int: 1 when the text contains at least one employed category
        (full-time, part-time, or contractor/freelancer/self-employed),
        0 when it contains none of them or is missing.
    """
    # A missing answer counts as not employed.
    if pd.isnull(status):
        return 0

    # Substring match: an answer that combines several categories still
    # counts as employed if any one of them is an employed category.
    employed_markers = (
        "Employed, full-time",
        "Employed, part-time",
        "Independent contractor, freelancer, or self-employed",
    )
    return 1 if any(marker in status for marker in employed_markers) else 0

# Derive the binary 'Employed' flag by running is_employed over every
# 'Employment' answer.
df['Employed'] = df['Employment'].map(is_employed)

# Preview the raw answer next to its flag as a quick sanity check.
df.loc[:, ['Employment', 'Employed']].head()

Unnamed: 0,Employment,Employed
0,"Employed, full-time",1
1,"Employed, full-time",1
2,"Employed, full-time",1
3,"Student, full-time",0
4,"Student, full-time",0


In [6]:
# Write only the raw employment answer and its derived flag to disk,
# without the index column.
df.to_csv(
    '2024/survey_results_public_cleaned.csv',
    columns=['Employment', 'Employed'],
    index=False,
)

In [7]:
# Per education level: share employed (mean of the 0/1 flag), number of
# respondents (count) and number employed (sum).
df.groupby('EdLevel')['Employed'].agg(['mean', 'count', 'sum'])


Unnamed: 0_level_0,mean,count,sum
EdLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Associate degree (A.A., A.S., etc.)",0.866704,1793,1554
"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",0.891228,24942,22229
"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",0.938549,15557,14601
Primary/elementary school,0.335079,1146,384
"Professional degree (JD, MD, Ph.D, Ed.D, etc.)",0.914815,2970,2717
"Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",0.529087,5793,3065
Some college/university study without earning a degree,0.793099,7651,6068
Something else,0.638412,932,595


In [108]:
# Unemployment Rate by Education Level (Global)

# --- Interactive Unemployment Rate Chart by Education Level using Plotly ---

import plotly.express as px

# NOTE(review): "unemployed" below means Employed == 0. The preview of the
# 'Employed' column shows that this includes answers such as
# "Student, full-time", so the figure is a "not employed" share rather than a
# labour-market unemployment rate — confirm the label is what you intend.

# Per education level: respondents in the group and how many are flagged 0
edlevel_counts = df.groupby('EdLevel').agg(
    total=('Employed', 'count'),
    unemployed=('Employed', lambda x: (x == 0).sum())
).reset_index()
edlevel_counts['Unemployment Rate (%)'] = 100 * edlevel_counts['unemployed'] / edlevel_counts['total']
edlevel_counts = edlevel_counts.sort_values('Unemployment Rate (%)', ascending=False)


# Define custom colors to match my website's palette
custom_bar_color = '#A89F91'  # Taupe/gray
background_color = '#E2DED3'  # Soft beige
font_color = '#333333'        # Dark gray for text

# Create the horizontal bar chart; bar labels show the rate itself
fig = px.bar(
    edlevel_counts,
    x='Unemployment Rate (%)',
    y='EdLevel',
    orientation='h',
    color_discrete_sequence=[custom_bar_color],
    hover_data=['total', 'unemployed'],
    title='Unemployment Rate by Education Level (Global)',
    text='Unemployment Rate (%)'
)


# Format the bar labels to one decimal place and place them past the bar end
fig.update_traces(
    texttemplate='%{text:,.1f}%',
    textposition='outside',
    textfont=dict(size=12, color=font_color)
)
fig.update_layout(
    yaxis=dict(automargin=True, tickfont=dict(size=14, color=font_color)),
    xaxis=dict(tickfont=dict(size=14, color=font_color), title='Unemployment Rate (%)'),
    title=dict(x=0.5, font=dict(size=22, color=font_color)),
    plot_bgcolor=background_color,
    paper_bgcolor=background_color,
    bargap=0.3,
    coloraxis_showscale=False,
    height=500,
    font=dict(family='Arial, sans-serif', color=font_color)
)

# Show the interactive chart in the notebook
fig.show()
html_str = fig.to_html(full_html=False, include_plotlyjs='cdn')

# Save the snippet to a file. The encoding is pinned to UTF-8 so the output
# does not depend on the platform's default text encoding.
with open("unemployment_rate_by_education_level.html", "w", encoding="utf-8") as f:
    f.write(html_str)

In [107]:
# Unemployment Rate by Education Level (US)

# --- Interactive Unemployment Rate Chart by Education Level using Plotly ---

import plotly.express as px

# NOTE(review): "unemployed" below means Employed == 0. The preview of the
# 'Employed' column shows that this includes answers such as
# "Student, full-time", so the figure is a "not employed" share rather than a
# labour-market unemployment rate — confirm the label is what you intend.

# Restrict to respondents whose Country is exactly 'United States of America',
# then count, per education level, the respondents and those flagged 0
edlevel_counts = df[df['Country'] == 'United States of America'].groupby('EdLevel').agg(
    total=('Employed', 'count'),
    unemployed=('Employed', lambda x: (x == 0).sum())
).reset_index()
edlevel_counts['Unemployment Rate (%)'] = 100 * edlevel_counts['unemployed'] / edlevel_counts['total']
edlevel_counts = edlevel_counts.sort_values('Unemployment Rate (%)', ascending=False)

# Define custom colors to match my website's palette
custom_bar_color = '#A89F91'  # Taupe/gray
background_color = '#E2DED3'  # Soft beige
font_color = '#333333'        # Dark gray for text

# Create the horizontal bar chart; bar labels show the rate itself
fig = px.bar(
    edlevel_counts,
    x='Unemployment Rate (%)',
    y='EdLevel',
    orientation='h',
    color_discrete_sequence=[custom_bar_color],
    hover_data=['total', 'unemployed'],
    title='Unemployment Rate by Education Level (US)',
    text='Unemployment Rate (%)'
)


# Format the bar labels to one decimal place and place them past the bar end
fig.update_traces(
    texttemplate='%{text:,.1f}%',
    textposition='outside',
    textfont=dict(size=12, color=font_color)
)
fig.update_layout(
    yaxis=dict(automargin=True, tickfont=dict(size=14, color=font_color)),
    xaxis=dict(tickfont=dict(size=14, color=font_color), title='Unemployment Rate (%)'),
    title=dict(x=0.5, font=dict(size=22, color=font_color)),
    plot_bgcolor=background_color,
    paper_bgcolor=background_color,
    bargap=0.3,
    coloraxis_showscale=False,
    height=500,
    font=dict(family='Arial, sans-serif', color=font_color)
)

# Show the interactive chart in the notebook
fig.show()
html_str = fig.to_html(full_html=False, include_plotlyjs='cdn')

# Save the snippet to a file. The encoding is pinned to UTF-8 so the output
# does not depend on the platform's default text encoding.
with open("unemployment_rate_by_education_level_US.html", "w", encoding="utf-8") as f:
    f.write(html_str)

In [None]:
# Yearly Compensation Distribution by Education Level (with Outliers)

import plotly.express as px

# Remove rows with missing compensation or education level
df_box = df.dropna(subset=['ConvertedCompYearly', 'EdLevel'])

# Define custom theme colors
background_color = '#E2DED3'
font_color = '#333333'
box_color = '#A89F91'
outlier_color = '#6B6253'

# Create an interactive box plot with a single color for all boxes
fig = px.box(
    df_box,
    x='ConvertedCompYearly',
    y='EdLevel',
    orientation='h',
    points='outliers',  # Show outlier points
    color_discrete_sequence=[box_color],  # Use your theme color for all boxes
    title='Yearly Compensation Distribution by Education Level (with Outliers)',
    labels={
        'ConvertedCompYearly': 'Converted Compensation (Yearly)',
        'EdLevel': 'Education Level'
    },
    width=1000,
    height=600
)

# Update traces for outlier color and marker style.
# NOTE(review): the Plotly box-trace reference describes `outliercolor` as the
# highlight colour used in points='suspectedoutliers' mode; with
# points='outliers' these two settings may have no visible effect — confirm
# the rendered points actually pick up outlier_color.
fig.update_traces(
    marker=dict(
        outliercolor=outlier_color,
        line=dict(outliercolor=outlier_color, width=1)
    ),
    boxmean=True  # Also draw the mean of each box
)

# Improve layout for readability and theme
fig.update_layout(
    yaxis=dict(automargin=True, tickfont=dict(size=14, color=font_color)),
    xaxis=dict(tickfont=dict(size=14, color=font_color), title='Converted Compensation (Yearly)'),
    showlegend=False,
    plot_bgcolor=background_color,
    paper_bgcolor=background_color,
    title=dict(x=0.5, font=dict(size=22, color=font_color)),
    font=dict(family='Arial, sans-serif', color=font_color)
)

fig.show()
html_str = fig.to_html(full_html=False, include_plotlyjs='cdn')

# Save the snippet to a file. The encoding is pinned to UTF-8 so the output
# does not depend on the platform's default text encoding.
with open("box_plot_comp_by_edlevel_snippet.html", "w", encoding="utf-8") as f:
    f.write(html_str)

In [None]:
# Yearly Compensation Distribution by Education Level (Under $300,000)

import plotly.express as px

# Keep rows that have both a compensation figure and an education level,
# then restrict to compensation strictly below $300,000.
df_box = (
    df.dropna(subset=['ConvertedCompYearly', 'EdLevel'])
    .loc[lambda frame: frame['ConvertedCompYearly'] < 300000]
)

# One horizontal box per education level, each in its own color; only the
# points flagged as outliers within this range are drawn individually.
fig = px.box(
    df_box,
    x='ConvertedCompYearly',
    y='EdLevel',
    color='EdLevel',
    orientation='h',
    points='outliers',
    title='Yearly Compensation Distribution by Education Level (Under $300,000)',
    labels=dict(
        ConvertedCompYearly='Converted Compensation (Yearly)',
        EdLevel='Education Level',
    ),
    width=1000,
    height=600,
)

# Plain white plot area, centered title, and no legend.
fig.update_layout(
    title={'x': 0.5, 'font': {'size': 20}},
    xaxis={'tickfont': {'size': 12}, 'title': 'Converted Compensation (Yearly)'},
    yaxis={'automargin': True, 'tickfont': {'size': 12}},
    plot_bgcolor='white',
    showlegend=False,
)

fig.show()

In [12]:
# Median yearly compensation per education level, highest first.
(
    df.groupby('EdLevel')['ConvertedCompYearly']
    .median()
    .sort_values(ascending=False)
)

EdLevel
Professional degree (JD, MD, Ph.D, Ed.D, etc.)                                        79962.0
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)                                       68203.0
Bachelor’s degree (B.A., B.S., B.Eng., etc.)                                          67129.0
Associate degree (A.A., A.S., etc.)                                                   60147.0
Some college/university study without earning a degree                                59288.0
Something else                                                                        53793.0
Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)    45111.0
Primary/elementary school                                                             36088.0
Name: ConvertedCompYearly, dtype: float64

In [None]:
# Median Yearly Compensation by Education Level (Global)

import plotly.express as px

# Remove rows with missing compensation or education level
df_median = df.dropna(subset=['ConvertedCompYearly', 'EdLevel'])

# Group by 'EdLevel' and calculate the median compensation, highest first
median_comp = (
    df_median.groupby('EdLevel')['ConvertedCompYearly']
    .median()
    .reset_index()
    .rename(columns={'ConvertedCompYearly': 'MedianComp'})
    .sort_values('MedianComp', ascending=False)
)

# Theme colors
custom_bar_color = '#A89F91'
background_color = '#E2DED3'
font_color = '#333333'

# Create the interactive bar chart; bar labels show the median itself
fig = px.bar(
    median_comp,
    x='MedianComp',
    y='EdLevel',
    orientation='h',
    color_discrete_sequence=[custom_bar_color],
    title='Median Yearly Compensation by Education Level',
    labels={
        'MedianComp': 'Median of Converted Compensation (Yearly)',
        'EdLevel': 'Education Level'
    },
    text='MedianComp'
)

# Format labels as whole numbers with thousands separators, past the bar end
fig.update_traces(
    texttemplate='%{text:,.0f}',
    textposition='outside',
    textfont=dict(size=12, color=font_color)
)
fig.update_layout(
    yaxis=dict(automargin=True, tickfont=dict(size=14, color=font_color)),
    xaxis=dict(tickfont=dict(size=14, color=font_color), title='Median of Converted Compensation (Yearly)'),
    title=dict(x=0.5, font=dict(size=22, color=font_color)),
    plot_bgcolor=background_color,
    paper_bgcolor=background_color,
    bargap=0.3,
    coloraxis_showscale=False,
    height=500,
    font=dict(family='Arial, sans-serif', color=font_color)
)

fig.show()
html_str = fig.to_html(full_html=False, include_plotlyjs='cdn')

# Save the snippet to a file. The encoding is pinned to UTF-8 so the output
# does not depend on the platform's default text encoding.
with open("median_salary_by_edlevel_snippet.html", "w", encoding="utf-8") as f:
    f.write(html_str)

In [109]:
# Median Yearly Compensation by Age (Global)

import plotly.express as px

# Remove rows with missing compensation or age
df_median = df.dropna(subset=['ConvertedCompYearly', 'Age'])

# Group by 'Age' and calculate the median compensation, highest first
median_comp = (
    df_median.groupby('Age')['ConvertedCompYearly']
    .median()
    .reset_index()
    .rename(columns={'ConvertedCompYearly': 'MedianComp'})
    .sort_values('MedianComp', ascending=False)
)

# Theme colors
custom_bar_color = '#A89F91'
background_color = '#E2DED3'
font_color = '#333333'

# Create the interactive bar chart; bar labels show the median itself
fig = px.bar(
    median_comp,
    x='MedianComp',
    y='Age',
    orientation='h',
    color_discrete_sequence=[custom_bar_color],
    title='Median Yearly Compensation by Age',
    labels={
        'MedianComp': 'Median of Converted Compensation (Yearly)',
        'Age': 'Age'
    },
    text='MedianComp'
)

# Format labels as whole numbers with thousands separators, past the bar end
fig.update_traces(
    texttemplate='%{text:,.0f}',
    textposition='outside',
    textfont=dict(size=12, color=font_color)
)
fig.update_layout(
    yaxis=dict(automargin=True, tickfont=dict(size=14, color=font_color)),
    xaxis=dict(tickfont=dict(size=14, color=font_color), title='Median of Converted Compensation (Yearly)'),
    title=dict(x=0.5, font=dict(size=22, color=font_color)),
    plot_bgcolor=background_color,
    paper_bgcolor=background_color,
    bargap=0.3,
    coloraxis_showscale=False,
    height=500,
    font=dict(family='Arial, sans-serif', color=font_color)
)

fig.show()
html_str = fig.to_html(full_html=False, include_plotlyjs='cdn')

# Save the snippet to a file. The encoding is pinned to UTF-8 so the output
# does not depend on the platform's default text encoding.
with open("median_salary_by_age_snippet.html", "w", encoding="utf-8") as f:
    f.write(html_str)

In [None]:
# Median Yearly Compensation by Remote Work (Global)

import plotly.express as px

# Remove rows with missing compensation or RemoteWork
df_median = df.dropna(subset=['ConvertedCompYearly', 'RemoteWork'])

# Group by 'RemoteWork' and calculate the median compensation, highest first
median_comp = (
    df_median.groupby('RemoteWork')['ConvertedCompYearly']
    .median()
    .reset_index()
    .rename(columns={'ConvertedCompYearly': 'MedianComp'})
    .sort_values('MedianComp', ascending=False)
)

# Theme colors
custom_bar_color = '#A89F91'
background_color = '#E2DED3'
font_color = '#333333'

# Create the bar chart; bar labels show the median itself
fig = px.bar(
    median_comp,
    x='MedianComp',
    y='RemoteWork',
    orientation='h',
    color_discrete_sequence=[custom_bar_color],
    title='Median Yearly Compensation by Remote Work',
    labels={
        'MedianComp': 'Median of Converted Compensation (Yearly)',
        'RemoteWork': 'Remote Work'
    },
    text='MedianComp'
)

# Format labels as whole numbers with thousands separators, past the bar end
fig.update_traces(
    texttemplate='%{text:,.0f}',
    textposition='outside',
    textfont=dict(size=12, color=font_color)
)
fig.update_layout(
    yaxis=dict(automargin=True, tickfont=dict(size=14, color=font_color)),
    xaxis=dict(tickfont=dict(size=14, color=font_color), title='Median of Converted Compensation (Yearly)'),
    title=dict(x=0.5, font=dict(size=22, color=font_color)),
    plot_bgcolor=background_color,
    paper_bgcolor=background_color,
    bargap=0.3,
    coloraxis_showscale=False,
    height=500,
    font=dict(family='Arial, sans-serif', color=font_color)
)

fig.show()
html_str = fig.to_html(full_html=False, include_plotlyjs='cdn')

# Save the snippet to a file. The encoding is pinned to UTF-8 so the output
# does not depend on the platform's default text encoding.
with open("median_salary_by_remote_work_snippet.html", "w", encoding="utf-8") as f:
    f.write(html_str)

In [None]:
# Median Yearly Compensation by Country

import plotly.express as px

# Remove rows with missing compensation or country
df_median = df.dropna(subset=['ConvertedCompYearly', 'Country'])

# Group by 'Country' and calculate the median compensation, highest first
median_comp = (
    df_median
    .groupby('Country')['ConvertedCompYearly']
    .median()
    .reset_index()
    .rename(columns={'ConvertedCompYearly': 'MedianComp'})
    .sort_values('MedianComp', ascending=False)
)

# Countries left out of the chart.
# NOTE(review): the reason for dropping Gabon is not recorded here —
# presumably an outlier median from very few respondents; confirm, and
# consider a minimum-respondent threshold instead of a hard-coded name.
excluded_countries = ['Gabon']
median_comp = median_comp[~median_comp['Country'].isin(excluded_countries)]

# Theme colors
custom_bar_color = '#A89F91'
background_color = '#E2DED3'
font_color = '#333333'

# Create the bar chart; bar labels show the median itself
fig = px.bar(
    median_comp,
    x='MedianComp',
    y='Country',
    orientation='h',
    color_discrete_sequence=[custom_bar_color],
    title='Median Yearly Compensation by Country',
    labels={
        'MedianComp': 'Median of Converted Compensation (Yearly)',
        'Country': 'Country'
    },
    text='MedianComp'
)

# Format labels as whole numbers with thousands separators, past the bar end
fig.update_traces(
    texttemplate='%{text:,.0f}',
    textposition='outside',
    textfont=dict(size=12, color=font_color)
)

# There is one bar per country, so a fixed 500px height cannot fit every
# label. Grow the figure with the number of rows and keep 500px as the floor.
chart_height = max(500, 22 * len(median_comp))

fig.update_layout(
    yaxis=dict(automargin=True, tickfont=dict(size=14, color=font_color)),
    xaxis=dict(tickfont=dict(size=14, color=font_color), title='Median of Converted Compensation (Yearly)'),
    title=dict(x=0.5, font=dict(size=22, color=font_color)),
    plot_bgcolor=background_color,
    paper_bgcolor=background_color,
    bargap=0.3,
    coloraxis_showscale=False,
    height=chart_height,
    font=dict(family='Arial, sans-serif', color=font_color)
)

fig.show()
html_str = fig.to_html(full_html=False, include_plotlyjs='cdn')

# Save the snippet to a file. The encoding is pinned to UTF-8 so the output
# does not depend on the platform's default text encoding (country names
# can contain non-ASCII characters).
with open("median_salary_by_country_snippet.html", "w", encoding="utf-8") as f:
    f.write(html_str)

In [None]:
# Median Yearly Compensation by DevType (Global)

import plotly.express as px

# Remove rows with missing compensation or DevType
df_median = df.dropna(subset=['ConvertedCompYearly', 'DevType'])

# Group by 'DevType' and calculate the median compensation, highest first
median_comp = (
    df_median.groupby('DevType')['ConvertedCompYearly']
    .median()
    .reset_index()
    .rename(columns={'ConvertedCompYearly': 'MedianComp'})
    .sort_values('MedianComp', ascending=False)
)

# Theme colors
custom_bar_color = '#A89F91'
background_color = '#E2DED3'
font_color = '#333333'

# Create the bar chart; bar labels show the median itself
fig = px.bar(
    median_comp,
    x='MedianComp',
    y='DevType',
    orientation='h',
    color_discrete_sequence=[custom_bar_color],
    title='Median Yearly Compensation by DevType (Global)',
    labels={
        'MedianComp': 'Median of Converted Compensation (Yearly)',
        'DevType': 'DevType'
    },
    text='MedianComp'
)

# Format labels as whole numbers with thousands separators, past the bar end
fig.update_traces(
    texttemplate='%{text:,.0f}',
    textposition='outside',
    textfont=dict(size=12, color=font_color)
)
fig.update_layout(
    yaxis=dict(automargin=True, tickfont=dict(size=14, color=font_color)),
    xaxis=dict(tickfont=dict(size=14, color=font_color), title='Median of Converted Compensation (Yearly)'),
    title=dict(x=0.5, font=dict(size=22, color=font_color)),
    plot_bgcolor=background_color,
    paper_bgcolor=background_color,
    bargap=0.3,
    coloraxis_showscale=False,
    height=500,
    font=dict(family='Arial, sans-serif', color=font_color)
)

fig.show()
html_str = fig.to_html(full_html=False, include_plotlyjs='cdn')

# Save the snippet to a file. The encoding is pinned to UTF-8 so the output
# does not depend on the platform's default text encoding.
with open("median_salary_by_devtype_snippet.html", "w", encoding="utf-8") as f:
    f.write(html_str)

In [None]:
# Median Salary by DevType just for the US

import plotly.express as px

# Keep only respondents whose Country is exactly "United States of America",
# then remove rows with missing compensation or DevType
df_median = df[df['Country'] == "United States of America"].dropna(subset=['ConvertedCompYearly', 'DevType'])

# Group by 'DevType' and calculate the median compensation, highest first
median_comp = (
    df_median.groupby('DevType')['ConvertedCompYearly']
    .median()
    .reset_index()
    .rename(columns={'ConvertedCompYearly': 'MedianComp'})
    .sort_values('MedianComp', ascending=False)
)

# Theme colors
custom_bar_color = '#A89F91'
background_color = '#E2DED3'
font_color = '#333333'

# Create the bar chart; bar labels show the median itself
fig = px.bar(
    median_comp,
    x='MedianComp',
    y='DevType',
    orientation='h',
    color_discrete_sequence=[custom_bar_color],
    title='Median Yearly Compensation by DevType (US)',
    labels={
        'MedianComp': 'Median of Converted Compensation (Yearly)',
        'DevType': 'DevType'
    },
    text='MedianComp'
)

# Format labels as whole numbers with thousands separators, past the bar end
fig.update_traces(
    texttemplate='%{text:,.0f}',
    textposition='outside',
    textfont=dict(size=12, color=font_color)
)
fig.update_layout(
    yaxis=dict(automargin=True, tickfont=dict(size=14, color=font_color)),
    xaxis=dict(tickfont=dict(size=14, color=font_color), title='Median of Converted Compensation (Yearly)'),
    title=dict(x=0.5, font=dict(size=22, color=font_color)),
    plot_bgcolor=background_color,
    paper_bgcolor=background_color,
    bargap=0.3,
    coloraxis_showscale=False,
    height=500,
    font=dict(family='Arial, sans-serif', color=font_color)
)

fig.show()
html_str = fig.to_html(full_html=False, include_plotlyjs='cdn')

# Save the snippet to a file. The encoding is pinned to UTF-8 so the output
# does not depend on the platform's default text encoding.
with open("median_salary_by_devtype_US_snippet.html", "w", encoding="utf-8") as f:
    f.write(html_str)