### Horizontal stacked bar chart with:

- Number of students on the x-axis

- Kvinnor (women) as the first segment of each stacked bar, starting at the axis

- Män (men) stacked directly after kvinnor

- A dot marker representing the total for each educational area

In [2]:
import pandas as pd

# Read the SCB export (decoded as Latin-1) and trim stray whitespace
# from the header names in the same step.
file_path = "../data/scb/Antal antagna som påbörjat studier.csv"
df = pd.read_csv(file_path, encoding="latin1").rename(columns=str.strip)

In [3]:
def plot_stacked_bar_antagna(df, year=None):
    """
    Plot a horizontal stacked bar chart of admitted students per education area.

    For the selected year the chart shows:
    - Kvinnor and Män as stacked bars
    - A dot marker for Totalt
    Only rows whose ålder is "totalt" (case-insensitive) are used.
    Data: Antal antagna som påbörjat studier per utbildningsområde.

    Parameters:
        df: DataFrame with exactly eight columns, in the order
            kön, utbildningsområde, ålder, 2020, 2021, 2022, 2023, 2024.
            Columns are renamed by position on an internal copy, so the
            caller's DataFrame is left untouched.
        year: str or int, e.g., 2024. If None, uses the latest year.

    Returns:
        None - displays a Plotly figure.

    Raises:
        ValueError: If year is not one of the year columns, if df does not
            have eight columns (raised by pandas during the rename), or if
            there are no ålder == "totalt" rows to plot.
        KeyError: If the kön column lacks one of kvinnor, män or totalt.
    """
    year_columns = ["2020", "2021", "2022", "2023", "2024"]

    # Resolve and validate the year first so bad input fails fast with a
    # clear message instead of an opaque KeyError further down.
    year = max(year_columns) if year is None else str(year)
    if year not in year_columns:
        raise ValueError(f"Year must be one of {year_columns}")

    # Rename on a copy: assigning to df.columns directly would rename the
    # caller's DataFrame as a side effect.
    df = df.copy()
    df.columns = ["kön", "utbildningsområde", "ålder"] + year_columns
    df = df.fillna(0)

    # Non-numeric cells in the year columns are coerced to 0.
    for col in year_columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

    # Melt to long format: one row per (kön, utbildningsområde, ålder, år).
    df_long = df.melt(
        id_vars=["kön", "utbildningsområde", "ålder"],
        var_name="år",
        value_name="antal"
    )

    # Keep the all-ages rows for the selected year. Rows whose
    # utbildningsområde is itself "totalt" are not filtered out here, so a
    # grand-total bar is drawn if the data contains one.
    is_age_total = df_long["ålder"].str.lower() == "totalt"
    latest_data = df_long[is_age_total & (df_long["år"] == year)]
    if latest_data.empty:
        raise ValueError(f"No rows with ålder 'totalt' found for year {year}")

    # Pivot to get kvinnor, män, totalt as columns
    pivot_df = latest_data.pivot_table(
        index="utbildningsområde",
        columns="kön",
        values="antal",
        aggfunc="sum"
    ).fillna(0).reset_index()

    pivot_df.columns.name = None
    pivot_df = pivot_df.rename(columns={
        "kvinnor": "Kvinnor",
        "män": "Män",
        "totalt": "Totalt"
    })

    count_cols = ["Kvinnor", "Män", "Totalt"]
    missing = [col for col in count_cols if col not in pivot_df.columns]
    if missing:
        raise KeyError(f"Missing kön categories in data: {missing}")

    # fillna(0) above may have turned the counts into floats.
    pivot_df[count_cols] = pivot_df[count_cols].astype(int)

    # Sort for better visual layout
    pivot_df = pivot_df.sort_values("Totalt")

    # Imported right before use so the validation above does not depend on
    # the plotting backend.
    import plotly.graph_objects as go

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=pivot_df["Kvinnor"],
        y=pivot_df["utbildningsområde"],
        name="Kvinnor",
        orientation="h",
        marker_color="#f59e0b"
    ))
    fig.add_trace(go.Bar(
        x=pivot_df["Män"],
        y=pivot_df["utbildningsområde"],
        name="Män",
        orientation="h",
        marker_color="#0284c7"
    ))
    fig.add_trace(go.Scatter(
        x=pivot_df["Totalt"],
        y=pivot_df["utbildningsområde"],
        mode="markers",
        name="Totalt",
        marker=dict(color="#4A606C", size=10, symbol="circle"),
        showlegend=True
    ))

    fig.update_layout(
        barmode="stack",
        title=f"Antal antagna per utbildningsområde ({year})",
        # The data counts admitted students (antagna), not applicants.
        xaxis_title="Antal antagna",
        yaxis_title="Utbildningsområde",
        plot_bgcolor="white",
        paper_bgcolor="white",
        height=600,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5)
    )
    fig.show()

# Example usage: plot the 2024 figures from the DataFrame loaded earlier in this file.
plot_stacked_bar_antagna(df, year=2024)

In [4]:
def plot_stacked_bar_antagna(df: pd.DataFrame, year: int = None) -> None:
    """
    Plot a horizontal stacked bar chart of admitted students by education area.

    Kvinnor and Män are drawn as stacked bars and Totalt as a dot marker.
    Only rows whose ålder is "totalt" are used, and rows whose
    utbildningsområde is "totalt" are excluded (both case-insensitive).

    Parameters:
        df: DataFrame with exactly eight columns, in the order
            kön, utbildningsområde, ålder, 2020, 2021, 2022, 2023, 2024.
            Columns are renamed by position on an internal copy, so the
            caller's DataFrame is left untouched.
        year: Year to plot data for (int or str). If None, uses the latest
            available year.

    Returns:
        None - displays a Plotly figure.

    Raises:
        ValueError: If df is empty, year is not one of the year columns,
            df does not have eight columns (raised by pandas during the
            rename), or no rows remain after filtering.
        KeyError: If the kön column lacks one of kvinnor, män or totalt.
    """
    year_columns = ["2020", "2021", "2022", "2023", "2024"]

    # Input validation
    if df.empty:
        raise ValueError("DataFrame is empty")

    year = max(year_columns) if year is None else str(year)
    if year not in year_columns:
        raise ValueError(f"Year must be one of {year_columns}")

    # Rename on a copy: assigning to df.columns directly would rename the
    # caller's DataFrame as a side effect.
    df = df.copy()
    df.columns = ["kön", "utbildningsområde", "ålder"] + year_columns
    df = df.fillna(0)

    # Non-numeric cells in the year columns are coerced to 0.
    for col in year_columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

    # Melt to long format: one row per (kön, utbildningsområde, ålder, år).
    df_long = df.melt(
        id_vars=["kön", "utbildningsområde", "ålder"],
        var_name="år",
        value_name="antal"
    )

    # Keep the all-ages rows of the individual education areas for the year.
    current_data = df_long[
        (df_long["ålder"].str.lower() == "totalt")
        & (df_long["utbildningsområde"].str.lower() != "totalt")
        & (df_long["år"] == year)
    ]
    if current_data.empty:
        raise ValueError(f"No rows with ålder 'totalt' found for year {year}")

    # One row per education area, one column per kön category.
    pivot_df = current_data.pivot_table(
        index="utbildningsområde",
        columns="kön",
        values="antal",
        aggfunc="sum"
    ).fillna(0).reset_index()

    pivot_df.columns.name = None
    pivot_df = pivot_df.rename(columns={
        "kvinnor": "Kvinnor",
        "män": "Män",
        "totalt": "Totalt"
    })

    count_cols = ["Kvinnor", "Män", "Totalt"]
    missing = [col for col in count_cols if col not in pivot_df.columns]
    if missing:
        raise KeyError(f"Missing kön categories in data: {missing}")

    # fillna(0) above may have turned the counts into floats.
    pivot_df[count_cols] = pivot_df[count_cols].astype(int)

    # Sort by total admitted students
    pivot_df = pivot_df.sort_values("Totalt")

    # Imported right before use so the validation above does not depend on
    # the plotting backend.
    import plotly.graph_objects as go

    fig = go.Figure()

    # Stacked bars: Kvinnor first, Män stacked after it.
    fig.add_trace(go.Bar(
        x=pivot_df["Kvinnor"],
        y=pivot_df["utbildningsområde"],
        name="Kvinnor",
        orientation="h",
        marker_color="#f59e0b"
    ))
    fig.add_trace(go.Bar(
        x=pivot_df["Män"],
        y=pivot_df["utbildningsområde"],
        name="Män",
        orientation="h",
        marker_color="#0284c7"
    ))

    # Total markers
    fig.add_trace(go.Scatter(
        x=pivot_df["Totalt"],
        y=pivot_df["utbildningsområde"],
        mode="markers",
        name="Totalt",
        marker=dict(color="#4A606C", size=10, symbol="circle"),
        showlegend=True
    ))

    fig.update_layout(
        barmode="stack",
        title=f"Antal antagna per utbildningsområde ({year})",
        # The data counts admitted students (antagna), not applicants.
        xaxis_title="Antal antagna",
        yaxis_title="Utbildningsområde",
        plot_bgcolor="white",
        paper_bgcolor="white",
        height=600,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5
        )
    )

    fig.show()

# Example usage: plot the 2024 figures from the DataFrame loaded earlier in this file.
plot_stacked_bar_antagna(df, year=2024)

# Matplotlib version, sorted by the share of women

In [5]:
def plot_stacked_bar_antagna_matplotlib(df: pd.DataFrame, year: int = None) -> tuple:
    """
    Plot a horizontal stacked bar chart of admitted students by education area.

    Uses matplotlib. Areas are sorted by the percentage of women and each bar
    is annotated with the percentages of women and men. Only rows whose ålder
    is "totalt" are used, and rows whose utbildningsområde is "totalt" are
    excluded (both case-insensitive).

    Parameters:
        df: DataFrame with exactly eight columns, in the order
            kön, utbildningsområde, ålder, 2020, 2021, 2022, 2023, 2024.
            Columns are renamed by position on an internal copy, so the
            caller's DataFrame is left untouched.
        year: Year to plot data for (int or str). If None, uses the latest
            available year.

    Returns:
        (fig, ax): the matplotlib Figure and Axes, after the figure has been
        shown.

    Raises:
        ValueError: If df is empty, year is not one of the year columns,
            df does not have eight columns (raised by pandas during the
            rename), or no rows remain after filtering.
        KeyError: If the kön column lacks one of kvinnor, män or totalt.
    """
    year_columns = ["2020", "2021", "2022", "2023", "2024"]

    # Input validation
    if df.empty:
        raise ValueError("DataFrame is empty")

    year = max(year_columns) if year is None else str(year)
    if year not in year_columns:
        raise ValueError(f"Year must be one of {year_columns}")

    # Rename on a copy: assigning to df.columns directly would rename the
    # caller's DataFrame as a side effect.
    df = df.copy()
    df.columns = ["kön", "utbildningsområde", "ålder"] + year_columns
    df = df.fillna(0)

    # Non-numeric cells in the year columns are coerced to 0.
    for col in year_columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

    # Melt to long format: one row per (kön, utbildningsområde, ålder, år).
    df_long = df.melt(
        id_vars=["kön", "utbildningsområde", "ålder"],
        var_name="år",
        value_name="antal"
    )

    # Keep the all-ages rows of the individual education areas for the year.
    current_data = df_long[
        (df_long["ålder"].str.lower() == "totalt")
        & (df_long["utbildningsområde"].str.lower() != "totalt")
        & (df_long["år"] == year)
    ]
    if current_data.empty:
        raise ValueError(f"No rows with ålder 'totalt' found for year {year}")

    # One row per education area, one column per kön category.
    pivot_df = current_data.pivot_table(
        index="utbildningsområde",
        columns="kön",
        values="antal",
        aggfunc="sum"
    ).fillna(0).reset_index()

    pivot_df.columns.name = None
    pivot_df = pivot_df.rename(columns={
        "kvinnor": "Kvinnor",
        "män": "Män",
        "totalt": "Totalt"
    })

    count_cols = ["Kvinnor", "Män", "Totalt"]
    missing = [col for col in count_cols if col not in pivot_df.columns]
    if missing:
        raise KeyError(f"Missing kön categories in data: {missing}")

    # fillna(0) above may have turned the counts into floats.
    pivot_df[count_cols] = pivot_df[count_cols].astype(int)

    # Percentage of women. Areas with a zero total have no defined share;
    # they get 0.0 for sorting and are left unannotated below.
    totals = pivot_df["Totalt"]
    share = pivot_df["Kvinnor"] / totals.where(totals > 0) * 100
    pivot_df["Kvinnor_Pct"] = share.round(1).fillna(0.0)

    # Ascending sort: barh draws the first row at the bottom, so the area
    # with the highest share of women ends up at the top of the chart.
    pivot_df = pivot_df.sort_values("Kvinnor_Pct", ascending=True)

    # Imported right before use so the validation above does not depend on
    # the plotting backend.
    import matplotlib.pyplot as plt
    import numpy as np

    # Colors
    color_women = "#f59e0b"  # Orange for women
    color_men = "#0284c7"    # Blue for men
    color_total = "#4A606C"  # Dark gray for total

    fig, ax = plt.subplots(figsize=(12, 10))

    y_pos = np.arange(len(pivot_df["utbildningsområde"]))
    width = 0.8

    # Stacked bars: Män first, Kvinnor stacked after it.
    ax.barh(y_pos, pivot_df["Män"], height=width, color=color_men, label="Män")
    ax.barh(y_pos, pivot_df["Kvinnor"], height=width, color=color_women, left=pivot_df["Män"], label="Kvinnor")

    # Total markers: one scatter call yields a single legend entry.
    ax.scatter(pivot_df["Totalt"], y_pos, color=color_total, s=80, zorder=3, label="Totalt")

    # Percentage annotations just to the right of each total marker.
    for y, kvinnor_pct, total in zip(y_pos, pivot_df["Kvinnor_Pct"], pivot_df["Totalt"]):
        if total <= 0:
            continue

        # Round again: 100 - 70.1 is 29.900000000000006 in floating point.
        män_pct = round(100 - kvinnor_pct, 1)
        label_x = total + (total * 0.03)

        ax.text(
            label_x,
            y,
            f"{kvinnor_pct:.1f}% kvinnor",
            va='center',
            ha='left',
            color=color_women,
            fontweight='bold',
            fontsize=9
        )
        ax.text(
            label_x,
            y - 0.3,  # Slightly below the women's label
            f"{män_pct:.1f}% män",
            va='center',
            ha='left',
            color=color_men,
            fontsize=9
        )

    # Axis labels and title
    ax.set_yticks(y_pos)
    ax.set_yticklabels(pivot_df["utbildningsområde"])
    ax.set_xlabel("Antal antagna")
    ax.set_title(f"Antal antagna per utbildningsområde ({year})\nSorterat efter andel kvinnor",
                 fontweight='bold', fontsize=14, loc='left', pad=10)

    # Legend below the axes
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=3)

    # Remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Explanation text under the legend
    plt.figtext(
        0.5, 0.01,
        "Cirklarna representerar det totala antalet antagna studenter per område.",
        ha='center', fontsize=10, style='italic'
    )

    # Leave room at the bottom for the legend and the explanation text.
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.12)

    plt.show()

    return fig, ax

# Example usage:
# plot_stacked_bar_antagna_matplotlib(df, year=2024)

#####  A function that creates a horizontal bar chart showing the percentage distribution of students across educational areas for a selected year, sex, and age group.
This function:

- Takes parameters for year, sex, and age group
- Filters the data accordingly
- Calculates the percentage distribution
- Creates a horizontal bar chart showing what percentage of students (of the selected sex and age group) chose each educational area
- Includes percentage labels on the bars
- Sorts the areas by percentage (ascending)
- Handles input validation and error cases
- The bars show what percentage of the selected group's students is in each educational area, making it easy to compare the relative popularity of different areas for specific demographic groups.

In [6]:
def plot_education_distribution(df: pd.DataFrame, year: int = None, sex: str = "totalt", age: str = "totalt") -> None:
    """
    Plot the percentage distribution of students across educational areas.

    The rows matching the selected year, sex and age group are kept (rows
    whose utbildningsområde is "totalt" are excluded), and each area's count
    is shown as a percentage of the sum over those rows.

    Parameters:
        df: DataFrame with exactly eight columns, in the order
            kön, utbildningsområde, ålder, 2020, 2021, 2022, 2023, 2024.
            Columns are renamed by position on an internal copy, so the
            caller's DataFrame is left untouched.
        year: Year to analyze (int or str). If None, uses the latest
            available year.
        sex: "kvinnor", "män", or "totalt" (matched case-insensitively).
        age: Age group label exactly as it appears in the ålder column
            (matched case-insensitively), e.g. "-24 år", "25-29 år",
            or "totalt" for all ages.

    Returns:
        None - displays a Plotly figure.

    Raises:
        ValueError: If df is empty, year is not one of the year columns,
            df does not have eight columns (raised by pandas during the
            rename), no rows match the selection, or the matching rows sum
            to zero students (percentages would be undefined).
    """
    year_columns = ["2020", "2021", "2022", "2023", "2024"]

    # Input validation
    if df.empty:
        raise ValueError("DataFrame is empty")

    year = max(year_columns) if year is None else str(year)
    if year not in year_columns:
        raise ValueError(f"Year must be one of {year_columns}")

    # Normalize inputs for case-insensitive matching.
    sex = sex.lower()
    age = age.lower()

    # Rename on a copy: assigning to df.columns directly would rename the
    # caller's DataFrame as a side effect.
    df = df.copy()
    df.columns = ["kön", "utbildningsområde", "ålder"] + year_columns
    df = df.fillna(0)

    # Only the selected year is needed; non-numeric cells are coerced to 0.
    df[year] = pd.to_numeric(df[year], errors="coerce").fillna(0).astype(int)

    # Select the matching rows and keep just the area and its count.
    matches = (
        (df["kön"].str.lower() == sex)
        & (df["ålder"].str.lower() == age)
        & (df["utbildningsområde"].str.lower() != "totalt")
    )
    df_filtered = df.loc[matches, ["utbildningsområde", year]].rename(columns={year: "antal"})

    if df_filtered.empty:
        raise ValueError(f"No data found for year={year}, sex={sex}, age={age}")

    # Guard against dividing by zero, which would give NaN percentages and a
    # NaN axis range.
    total_students = df_filtered["antal"].sum()
    if total_students == 0:
        raise ValueError(f"No students found for year={year}, sex={sex}, age={age}")

    df_filtered["procent"] = (df_filtered["antal"] / total_students * 100).round(1)

    # Ascending sort puts the largest share at the top of the horizontal chart.
    df_filtered = df_filtered.sort_values("procent", ascending=True)

    # Imported right before use so the validation above does not depend on
    # the plotting backend.
    import plotly.graph_objects as go

    fig = go.Figure()

    # Percentage bars with their value printed outside the bar end.
    fig.add_trace(go.Bar(
        x=df_filtered["procent"],
        y=df_filtered["utbildningsområde"],
        orientation="h",
        marker_color="#0284c7",
        text=df_filtered["procent"].apply(lambda x: f"{x}%"),
        textposition="outside"
    ))

    sex_title = sex.capitalize()
    age_title = age.capitalize()

    fig.update_layout(
        title=f"Fördelning av {sex_title} per utbildningsområde ({year}, {age_title})",
        xaxis_title="Procent",
        yaxis_title="Utbildningsområde",
        plot_bgcolor="white",
        paper_bgcolor="white",
        height=600,
        showlegend=False,
        xaxis=dict(
            range=[0, df_filtered["procent"].max() * 1.1]  # Add 10% padding for labels
        )
    )

    fig.show()

# Example usage: one chart per age group for women in 2024.
for age_group in ("-24 år", "25-29 år", "30-34 år", "35-39 år", "40-44 år", "45+ år"):
    plot_education_distribution(df, year=2024, sex="kvinnor", age=age_group)
