#### Constants

In [9]:
# Color constants: hex color strings passed to Plotly by the chart cells below
GRAY_1 = "#CCCCCC"  # light gray; used for the "Män" series in the yearly charts
GRAY_2 = "#657072"  # mid gray; not referenced by the cells visible in this notebook section
GRAY_3 = "#4A606C"  # dark blue-gray; used for the "Kvinnor" line and the total labels
BLUE_1 = "#1f77b4"  # blue; used for the "Totalt" line and the "Kvinnor" bars

### Line Chart – Trend of total eligible applicants over time

In [None]:
import pandas as pd
import plotly.express as px

# Load data
file_path = "../data/scb/behoriga_sokande_YH_kurser_2020_2024.csv"
df = pd.read_csv(file_path, encoding="latin1")

# Name the five columns by position (the same layout the later loading cells
# rely on); the long applicant-count header becomes "Antal".
df.columns = ["kön", "utbildningsområde", "ålder", "år", "Antal"]

# Convert "Antal" to numeric: remove spaces, replace hyphens with "0",
# and let anything still unparsable become NaN.
df["Antal"] = pd.to_numeric(
    df["Antal"].astype(str)
    .str.replace(" ", "", regex=False)
    .str.replace("-", "0", regex=False),
    errors="coerce",
)

# The file holds "totalt" rows next to the per-age and per-area breakdown rows.
# Summing every row of a gender would therefore count the same applicants
# several times, so the yearly trend only uses rows that are totals across
# both education area and age group.
is_overall_row = (
    (df["utbildningsområde"].str.lower() == "totalt")
    & (df["ålder"].str.lower() == "totalt")
)
df_total = df[(df["kön"].str.lower() == "totalt") & is_overall_row]

# Per-gender subsets (not used by the chart below); these still contain all
# breakdown rows, exactly as before.
df_women = df[df["kön"].str.lower() == "kvinnor"]
df_men = df[df["kön"].str.lower() == "män"]

# Group by year and sum the applicants (one overall row per year remains)
yearly_summary = df_total.groupby("år")["Antal"].sum().reset_index()

# Create line plot
fig = px.line(
    yearly_summary,
    x="år",
    y="Antal",
    markers=True,
    title="Totalt antal behöriga sökande till YH-kurser (2020–2024)",
    labels={"år": "År", "Antal": "Antal behöriga sökande"}
)

fig.update_traces(line=dict(color="#0284c7", width=3), marker=dict(size=8))
fig.update_layout(
    plot_bgcolor="white",
    xaxis=dict(showline=True, linewidth=1, linecolor="#aaa"),
    yaxis=dict(showline=True, linewidth=1, linecolor="#aaa"),
    font=dict(family="Arial", size=14)
)

fig.show()

In [12]:
import pandas as pd
import plotly.graph_objects as go

# Read the export and give the five columns short positional names
file_path = "../data/scb/behoriga_sokande_YH_kurser_2020_2024.csv"
df = pd.read_csv(file_path, encoding="latin1")
df.columns = ["kön", "utbildningsområde", "ålder", "år", "antal_behöriga"]

# Strip separators from the counts and parse them; values that still cannot be
# parsed become NaN (no rows are dropped here).
df["antal_behöriga"] = pd.to_numeric(
    df["antal_behöriga"]
    .astype(str)
    .str.replace("\u202f", "", regex=False)  # narrow non-breaking space
    .str.replace(" ", "")                    # regular spaces
    .str.replace(",", ""),                   # commas, in case they are used
    errors="coerce",
)

# Keep only the rows that are totals across age and education area,
# for the three gender values of interest
filtered_df = df.loc[
    df["ålder"].str.lower().eq("totalt")
    & df["utbildningsområde"].str.lower().eq("totalt")
    & df["kön"].str.lower().isin(["totalt", "kvinnor", "män"])
]

# One row per year, one column per gender; year made numeric and sorted ascending
pivot_df = (
    filtered_df
    .pivot_table(index="år", columns="kön", values="antal_behöriga", aggfunc="sum")
    .reset_index()
    .assign(**{"år": lambda table: pd.to_numeric(table["år"])})
    .sort_values("år")
)

# One line per gender, added in the order män, kvinnor, totalt
fig = go.Figure()
fig.add_traces([
    go.Scatter(
        x=pivot_df["år"],
        y=pivot_df[column],
        mode="lines+markers",
        name=label,
        line={"color": shade, "width": 2},
    )
    for column, label, shade in (
        ("män", "Män", GRAY_1),
        ("kvinnor", "Kvinnor", GRAY_3),
        ("totalt", "Totalt", BLUE_1),
    )
])

fig.update_layout(
    title="Antal behöriga sökande till YH-kurser per år",
    xaxis_title="År",
    yaxis_title="Antal sökande",
    plot_bgcolor="white",
    paper_bgcolor="white",
    # Linear axis with a tick every whole year, printed without decimals
    xaxis={"type": "linear", "tickmode": "linear", "dtick": 1, "tickformat": ".0f"},
    # Thousands separators on the counts; axis range starts at zero
    yaxis={"tickformat": ",", "rangemode": "tozero"},
    legend={"orientation": "h", "y": 1.05, "x": 0.5, "xanchor": "center"},
)

fig.show()


### Same data presented as a stacked bar chart with:

- Years on the x-axis

- Kvinnor (women) at the bottom of the stacked bars

- Män (men) stacked on top of kvinnor

- A dot marker representing the total for each year

In [13]:
# Stacked bars per year (women at the bottom, men on top) plus a labelled
# marker showing the yearly total
fig = go.Figure()

fig.add_traces([
    # Kvinnor: bottom segment of each stack, drawn in BLUE_1
    go.Bar(
        x=pivot_df["år"],
        y=pivot_df["kvinnor"],
        name="Kvinnor",
        marker_color=BLUE_1,
    ),
    # Män: top segment of each stack, drawn in GRAY_1
    go.Bar(
        x=pivot_df["år"],
        y=pivot_df["män"],
        name="Män",
        marker_color=GRAY_1,
    ),
    # Totalt: a dot per year with its value printed above it
    go.Scatter(
        x=pivot_df["år"],
        y=pivot_df["totalt"],
        name="Totalt",
        mode="markers+text",
        marker={"color": "#4A606C", "size": 10, "symbol": "circle"},
        text=pivot_df["totalt"],
        textposition="top center",
        textfont={"color": GRAY_3},
    ),
])

# Layout: stacked bars, years treated as categories, counts with thousands separators
fig.update_layout(
    barmode="stack",
    title="Antal behöriga sökande till YH-kurser per år",
    xaxis_title="År",
    yaxis_title="Antal sökande",
    plot_bgcolor="white",
    paper_bgcolor="white",
    xaxis={"type": "category"},
    yaxis={"tickformat": ","},
    legend={"orientation": "h", "y": 1.05, "x": 0.5, "xanchor": "center"},
)

fig.show()

In [16]:
# Add each gender's share of the yearly total and its year-over-year growth,
# both in percent rounded to one decimal. The first year has no previous
# value, so its growth is NaN.
pivot_df = pivot_df.assign(**{
    "kvinnor_%": pivot_df["kvinnor"].div(pivot_df["totalt"]).mul(100).round(1),
    "män_%": pivot_df["män"].div(pivot_df["totalt"]).mul(100).round(1),
    "kvinnor_growth_%": pivot_df["kvinnor"].pct_change().mul(100).round(1),
    "män_growth_%": pivot_df["män"].pct_change().mul(100).round(1),
})
pivot_df

kön,år,kvinnor,män,totalt,kvinnor_%,män_%,kvinnor_growth_%,män_growth_%
0,2020,4418.0,2646.0,7064.0,62.5,37.5,,
1,2021,5520.0,2872.0,8392.0,65.8,34.2,24.9,8.5
2,2022,6816.0,3574.0,10390.0,65.6,34.4,23.5,24.4
3,2023,11830.0,5812.0,17642.0,67.1,32.9,73.6,62.6
4,2024,13687.0,7694.0,21381.0,64.0,36.0,15.7,32.4


In [17]:
# Preview the first rows of df to check the column names and parsed values
df.head()

Unnamed: 0,kön,utbildningsområde,ålder,år,antal_behöriga
0,totalt,Totalt,totalt,2020,7064.0
1,totalt,Totalt,totalt,2021,8392.0
2,totalt,Totalt,totalt,2022,10390.0
3,totalt,Totalt,totalt,2023,17642.0
4,totalt,Totalt,totalt,2024,21381.0


In [29]:
# Per-age rows for a single year: women and men only, all education areas combined
year = 2024
df_filtered = df.loc[
    df["år"].eq(year)
    & df["utbildningsområde"].str.lower().eq("totalt")
    & df["ålder"].str.lower().ne("totalt")  # keep the age-group rows, drop the age total
    & df["kön"].str.lower().isin(["kvinnor", "män"])
]
df_filtered


Unnamed: 0,kön,utbildningsområde,ålder,år,antal_behöriga
569,kvinnor,Totalt,-24 år,2024,411.0
574,kvinnor,Totalt,25-29 år,2024,1411.0
579,kvinnor,Totalt,30-34 år,2024,2248.0
584,kvinnor,Totalt,35-39 år,2024,2451.0
589,kvinnor,Totalt,40-44 år,2024,2105.0
594,kvinnor,Totalt,45+ år,2024,5061.0
1129,män,Totalt,-24 år,2024,400.0
1134,män,Totalt,25-29 år,2024,916.0
1139,män,Totalt,30-34 år,2024,1401.0
1144,män,Totalt,35-39 år,2024,1423.0


In [31]:
# Pivot to have one row per ålder, columns for kvinnor and män
pivot_age = df_filtered.pivot_table(
    index="ålder",
    columns="kön",
    values="antal_behöriga",
    aggfunc="sum"
).fillna(0)

# Order the age groups from youngest to oldest. The list covers the open-ended
# "45+ år" bucket that this dataset uses as well as finer five-year bins.
# Labels that are absent from the data are skipped, and any label that is not
# listed is appended at the end, so no age group is silently dropped from the
# chart (previously "45+ år" was missing from the list and disappeared).
age_order = [
    "-24 år", "25-29 år", "30-34 år", "35-39 år", "40-44 år", "45+ år",
    "45-49 år", "50-54 år", "55-59 år", "60-64 år", "65- år",
]
pivot_age = pivot_age.reindex(
    [label for label in age_order if label in pivot_age.index]
    + [label for label in pivot_age.index if label not in age_order]
)


# Plotly grouped bar: women and men side by side for each age group
fig = go.Figure()

# Add women bars
fig.add_trace(go.Bar(
    x=pivot_age.index,
    y=pivot_age["kvinnor"],
    name="Kvinnor",
    marker_color="#f59e0b"
))

# Add men bars
fig.add_trace(go.Bar(
    x=pivot_age.index,
    y=pivot_age["män"],
    name="Män",
    marker_color="#0284c7"
))

# Layout
fig.update_layout(
    barmode='group',
    title=f"Behöriga sökande per ålder ({year})",
    xaxis_title="Åldersgrupp",
    yaxis_title="Antal sökande",
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend=dict(orientation="h", y=1.1, x=0.5, xanchor="center"),
    xaxis=dict(type="category", tickangle=0),
    yaxis=dict(tickformat=",")
)

fig.show()

In [32]:
# Overlay variant of the age chart: both genders are drawn semi-transparently
# on the same x positions.
# Start from a fresh figure. Appending to the figure left over from the
# previous cell would stack these bars on top of the existing opaque ones and
# add two more traces every time this cell is re-run.
fig = go.Figure()

# Add women bars
fig.add_trace(go.Bar(
    x=pivot_age.index,
    y=pivot_age["kvinnor"],
    name="Kvinnor",
    marker_color="#f59e0b",
    opacity=0.6
))

# Add men bars
fig.add_trace(go.Bar(
    x=pivot_age.index,
    y=pivot_age["män"],
    name="Män",
    marker_color="#0284c7",
    opacity=0.6
))

# Same titles and styling as the grouped chart, with overlaid bars.
# update_layout returns the figure, so as the last expression it is displayed.
fig.update_layout(
    barmode='overlay',
    title=f"Behöriga sökande per ålder ({year})",
    xaxis_title="Åldersgrupp",
    yaxis_title="Antal sökande",
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend=dict(orientation="h", y=1.1, x=0.5, xanchor="center"),
    xaxis=dict(type="category", tickangle=0),
    yaxis=dict(tickformat=",")
)

In [35]:
# Applicants per age group as one line per gender (women first, then men).
# add_traces returns the figure, so as the last expression it is displayed.
fig = go.Figure()
fig.add_traces([
    go.Scatter(
        x=pivot_age.index,
        y=pivot_age[column],
        mode="lines+markers",
        name=label,
        line={"color": shade},
    )
    for column, label, shade in (
        ("kvinnor", "Kvinnor", "#f59e0b"),
        ("män", "Män", "#0284c7"),
    )
])


In [38]:
# Each gender's age distribution as a percentage of that gender's own total
# (every column of pivot_pct sums to 100)
pivot_pct = pivot_age.div(pivot_age.sum(), axis="columns").mul(100)

# One line per gender; add_traces returns the figure, so it is displayed
fig = go.Figure()
fig.add_traces([
    go.Scatter(
        x=pivot_pct.index,
        y=pivot_pct[column],
        mode="lines+markers",
        name=label,
        line={"color": shade},
    )
    for column, label, shade in (
        ("kvinnor", "Kvinnor", "#f59e0b"),
        ("män", "Män", "#0284c7"),
    )
])


In [47]:
import plotly.express as px

# Women only, all education areas combined, one row per age group and year
df_kvinnor = df.loc[
    df["kön"].str.lower().eq("kvinnor")
    & df["utbildningsområde"].str.lower().eq("totalt")
    & df["ålder"].str.lower().ne("totalt")
]

# One bar panel per year; age groups ordered by sorting their labels
fig = px.bar(
    df_kvinnor,
    x="ålder",
    y="antal_behöriga",
    facet_col="år",
    color_discrete_sequence=["#f59e0b"],
    category_orders={"ålder": sorted(df_kvinnor["ålder"].unique())},
)

fig.update_layout(
    title="Fördelning av behöriga kvinnor per åldersgrupp och år",
    yaxis_title="Antal sökande",
    xaxis_title="Ålder",
    height=500,
)
# Clean facet titles: keep only the text after the last "="
fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[-1]))
fig.show()

In [57]:
import plotly.express as px

def plot_histogram_by_gender(df, gender="kvinnor", color="#f59e0b"):
    """
    Show bar charts of eligible applicants per age group, one panel per year.

    Only rows for the chosen gender with utbildningsområde "totalt" and a
    specific age group (ålder other than "totalt") are plotted.

    gender: "kvinnor", "män", or "totalt" (compared case-insensitively)
    color: fallback bar color, used only when gender is not one of the three
        known values; known genders always use their fixed color
        (orange for kvinnor, blue for män, gray for totalt)

    Returns nothing; the figure is displayed with fig.show().
    """
    gender = gender.lower()
    palette = {
        "kvinnor": "#f59e0b",
        "män": "#0284c7",
        "totalt": "#4A606C",
    }
    bar_color = palette[gender] if gender in palette else color

    # Rows for this gender, all education areas combined, per age group
    wanted = (
        df["kön"].str.lower().eq(gender)
        & df["utbildningsområde"].str.lower().eq("totalt")
        & df["ålder"].str.lower().ne("totalt")
    )
    df_gender = df[wanted]

    # Age groups are ordered by sorting their labels
    age_labels = sorted(df_gender["ålder"].unique())
    fig = px.bar(
        df_gender,
        x="ålder",
        y="antal_behöriga",
        facet_col="år",
        color_discrete_sequence=[bar_color],
        category_orders={"ålder": age_labels},
    )
    fig.update_layout(
        title=f"Fördelning av behöriga {gender} per åldersgrupp och år",
        yaxis_title="Antal sökande",
        xaxis_title="Ålder",
        height=500,
    )
    # Clean facet titles: keep only the text after the last "="
    fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[-1]))
    fig.show()

# Example usage: one faceted chart per gender value
plot_histogram_by_gender(df, gender="kvinnor")
plot_histogram_by_gender(df, gender="män")
plot_histogram_by_gender(df, gender="totalt")

In [58]:
# Share (%) of each age group within its year, rounded to one decimal.
# .assign builds a new frame instead of writing a column into the filtered
# slice of df, which avoids pandas' SettingWithCopyWarning and keeps this cell
# safe to re-run.
df_kvinnor = df_kvinnor.assign(
    procent=df_kvinnor.groupby("år")["antal_behöriga"].transform(
        lambda x: round(x / x.sum() * 100, 1)
    )
)

# One bar panel per year showing the percentage per age group
fig = px.bar(
    df_kvinnor,
    x="ålder",
    y="procent",
    facet_col="år",
    color_discrete_sequence=["#f59e0b"],
    category_orders={"ålder": sorted(df_kvinnor["ålder"].unique(), key=lambda x: x)}
)

fig.update_layout(
    title="Fördelning av behöriga kvinnor per åldersgrupp och år",
    yaxis_title="Procent sökande",
    xaxis_title="Ålder",
    height=500
)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))  # Clean facet titles
fig.show()

In [60]:
import plotly.express as px

def plot_histogram_by_gender(df, gender="kvinnor", color="#f59e0b", values="absolute"):
    """
    Show bar charts of eligible applicants per age group, one panel per year.

    Only rows for the chosen gender with utbildningsområde "totalt" and a
    specific age group (ålder other than "totalt") are plotted.

    gender: "kvinnor", "män", or "totalt" (compared case-insensitively)
    color: fallback bar color, used only when gender is not one of the three
        known values; known genders always use their fixed color
        (orange for kvinnor, blue for män, gray for totalt)
    values: "normalized" plots each age group's percentage of that gender's
        total within the year (rounded to one decimal); any other value,
        including the default "absolute", plots the applicant counts

    Returns nothing; the figure is displayed with fig.show().
    """
    gender = gender.lower()
    palette = {
        "kvinnor": "#f59e0b",
        "män": "#0284c7",
        "totalt": "#4A606C",
    }
    bar_color = palette[gender] if gender in palette else color

    # Rows for this gender, all education areas combined, per age group.
    # Copied so the optional "procent" column never touches the caller's frame.
    wanted = (
        df["kön"].str.lower().eq(gender)
        & df["utbildningsområde"].str.lower().eq("totalt")
        & df["ålder"].str.lower().ne("totalt")
    )
    df_gender = df[wanted].copy()

    if values == "normalized":
        # Percentage of the year's total for this gender
        df_gender["procent"] = df_gender.groupby("år")["antal_behöriga"].transform(
            lambda counts: (counts / counts.sum() * 100).round(1)
        )
        y_col, y_title = "procent", "Procent sökande"
    else:
        y_col, y_title = "antal_behöriga", "Antal sökande"

    # Age groups are ordered by sorting their labels
    age_labels = sorted(df_gender["ålder"].unique())
    fig = px.bar(
        df_gender,
        x="ålder",
        y=y_col,
        facet_col="år",
        color_discrete_sequence=[bar_color],
        category_orders={"ålder": age_labels},
    )
    fig.update_layout(
        title=f"Fördelning av behöriga {gender} per åldersgrupp och år",
        yaxis_title=y_title,
        xaxis_title="Ålder",
        height=500,
    )
    # Clean facet titles: keep only the text after the last "="
    fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[-1]))
    fig.show()

# Example usage: absolute counts and within-year percentages for selected genders
plot_histogram_by_gender(df, gender="kvinnor", values="absolute")
plot_histogram_by_gender(df, gender="kvinnor", values="normalized")
plot_histogram_by_gender(df, gender="män", values="normalized")
plot_histogram_by_gender(df, gender="totalt", values="absolute")

### Distribution of applicants across different educational areas (utbildningsområde) by sex (kvinnor, män, totalt)
- X-axis: Educational area (utbildningsområde)
- Y-axis: Number of eligible applicants (antal_behöriga)
- Sex: one at a time (kvinnor, män, or totalt), selected with `sex_filter` or the `sex` argument
- Filter: Most recent year (e.g., 2024)
- Plot type: Horizontal bar chart

In [63]:
import pandas as pd
import plotly.express as px

# Load data
file_path = "../data/scb/behoriga_sokande_YH_kurser_2020_2024.csv"
df = pd.read_csv(file_path, encoding="latin1")

# Rename columns
df.columns = ["kön", "utbildningsområde", "ålder", "år", "antal_behöriga"]

# Replace empty or NaN values with 0
df = df.fillna(0)

# Ensure numeric for 'antal_behöriga'. The same separators are stripped as in
# the earlier loading cell (narrow non-breaking space, space, comma) so that a
# formatted count is parsed instead of being coerced to NaN and turned into 0.
df["antal_behöriga"] = pd.to_numeric(
    df["antal_behöriga"]
    .astype(str)
    .str.replace("\u202f", "", regex=False)
    .str.replace(" ", "", regex=False)
    .str.replace(",", "", regex=False),
    errors="coerce",
).fillna(0)
# Years are kept as strings from here on
df["år"] = df["år"].astype(str)

# Choose which sex to plot: "kvinnor", "män", or "totalt"
sex_filter = "kvinnor"  # <-- change this as needed

# Normalize once so the row filter and the color choice always agree
# (previously "Kvinnor" filtered correctly but was drawn in the fallback gray).
sex_key = sex_filter.lower()

# Filter for latest year, total ålder, the chosen kön, and the individual
# education areas (the "totalt" area row is excluded)
latest_year = df["år"].max()
filtered_df = df[
    (df["år"] == latest_year) &
    (df["ålder"].str.lower() == "totalt") &
    (df["kön"].str.lower() == sex_key) &
    (df["utbildningsområde"].str.lower() != "totalt")
]

# Sort for cleaner bar chart
filtered_df = filtered_df.sort_values("antal_behöriga", ascending=True)

# Plot
fig = px.bar(
    filtered_df,
    x="antal_behöriga",
    y="utbildningsområde",
    orientation="h",
    title=f"Behöriga sökande per utbildningsområde – {sex_filter.capitalize()} ({latest_year})",
    color_discrete_sequence=[
        {"kvinnor": "#f59e0b", "män": "#0284c7"}.get(sex_key, "#4A606C")
    ]
)

fig.update_layout(
    xaxis_title="Antal behöriga sökande",
    yaxis_title="Utbildningsområde",
    plot_bgcolor="white",
    paper_bgcolor="white",
    showlegend=False
)

fig.show()


In [66]:
import plotly.express as px

def plot_eligible_by_field(df, year=None, sex="kvinnor"):
    """
    Show a horizontal bar chart of eligible applicants per educational area
    (utbildningsområde) for one year and one sex.

    df: frame with the columns kön, utbildningsområde, ålder, år, antal_behöriga
    year: year to plot, compared as a string; None selects df["år"].max()
    sex: "kvinnor", "män", or "totalt" (compared case-insensitively); any
        other value is drawn in the gray fallback color

    Returns nothing; the figure is displayed with fig.show().
    """
    sex = sex.lower()
    palette = {
        "kvinnor": "#f59e0b",
        "män": "#0284c7",
        "totalt": "#4A606C",
    }
    bar_color = palette[sex] if sex in palette else "#4A606C"

    # Fall back to the latest year in the data, then compare years as text
    year = str(df["år"].max() if year is None else year)

    # Rows for that year and sex, totals across age, one row per education area
    wanted = (
        df["år"].astype(str).eq(year)
        & df["ålder"].str.lower().eq("totalt")
        & df["kön"].str.lower().eq(sex)
        & df["utbildningsområde"].str.lower().ne("totalt")
    )
    filtered_df = df[wanted].copy().sort_values("antal_behöriga", ascending=True)

    fig = px.bar(
        filtered_df,
        x="antal_behöriga",
        y="utbildningsområde",
        orientation="h",
        title=f"Behöriga sökande per utbildningsområde – {sex.capitalize()} ({year})",
        color_discrete_sequence=[bar_color],
    )
    fig.update_layout(
        xaxis_title="Antal behöriga sökande",
        yaxis_title="Utbildningsområde",
        plot_bgcolor="white",
        paper_bgcolor="white",
        showlegend=False,
    )
    fig.show()

# Example usage: the same year for each sex
plot_eligible_by_field(df, year=2022, sex="kvinnor")
plot_eligible_by_field(df, year=2022, sex="män")
plot_eligible_by_field(df, year=2022, sex="totalt")  # Omit year (or pass year=None) to use the latest year in df

In [67]:
import plotly.express as px

def plot_eligible_by_field(df, year=None, sex="kvinnor", top_n=None):
    """
    Show a horizontal bar chart of eligible applicants per educational area
    (utbildningsområde) for one year and one sex.

    df: frame with the columns kön, utbildningsområde, ålder, år, antal_behöriga
    year: year to plot, compared as a string; None selects df["år"].max()
    sex: "kvinnor", "män", or "totalt" (compared case-insensitively); any
        other value is drawn in the gray fallback color
    top_n: keep only the N areas with the most applicants (e.g., 3, 5, 10);
        None shows every area

    Returns nothing; the figure is displayed with fig.show().
    """
    sex = sex.lower()
    palette = {
        "kvinnor": "#f59e0b",
        "män": "#0284c7",
        "totalt": "#4A606C",
    }
    bar_color = palette[sex] if sex in palette else "#4A606C"

    # Fall back to the latest year in the data, then compare years as text
    year = str(df["år"].max() if year is None else year)

    # Rows for that year and sex, totals across age, one row per education area
    wanted = (
        df["år"].astype(str).eq(year)
        & df["ålder"].str.lower().eq("totalt")
        & df["kön"].str.lower().eq(sex)
        & df["utbildningsområde"].str.lower().ne("totalt")
    )
    ranked = df[wanted].copy().sort_values("antal_behöriga", ascending=False)

    # Largest areas first for the cut, then ascending again for plotting
    selected = ranked if top_n is None else ranked.head(top_n)
    filtered_df = selected.sort_values("antal_behöriga", ascending=True)

    fig = px.bar(
        filtered_df,
        x="antal_behöriga",
        y="utbildningsområde",
        orientation="h",
        title=f"Behöriga sökande per utbildningsområde – {sex.capitalize()} ({year})",
        color_discrete_sequence=[bar_color],
    )
    fig.update_layout(
        xaxis_title="Antal behöriga sökande",
        yaxis_title="Utbildningsområde",
        plot_bgcolor="white",
        paper_bgcolor="white",
        showlegend=False,
    )
    fig.show()

# Example usage: top-N areas per sex for one year
plot_eligible_by_field(df, year=2024, sex="kvinnor", top_n=3)
plot_eligible_by_field(df, year=2024, sex="män", top_n=5)
plot_eligible_by_field(df, year=2024, sex="totalt", top_n=10)
plot_eligible_by_field(df, year=2024, sex="kvinnor")  # top_n omitted: show all areas