In [70]:
import os
import json
from functools import partial

import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import to_hex
from shapely.geometry import shape
import folium
from folium import FeatureGroup
from folium.plugins import TimestampedGeoJson
import ipywidgets as widgets
from IPython.display import display, clear_output

pd.set_option('future.no_silent_downcasting', True)

# Election data

## Get Geography Data

In [71]:
# Load the geography data (';'-separated, decimal comma, '-' marks missing values)
df_geo = pd.read_csv('Data/raw/Geografi.csv', sep=';', decimal=',', na_values='-')
# Strip spaces from the column names so they can be used in query() expressions
df_geo.columns = df_geo.columns.str.replace(' ', '', regex=True)

# Keep only Copenhagen and its surroundings, reduced to the necessary columns.
# De-duplication happens last: rows that differ only in the dropped columns (or
# only in the Utterslev/Bispebjerg spelling) would otherwise survive as exact
# duplicates and multiply rows when this table is merged on KredsNr later.
df_geo_cph = (
    df_geo
    .query("Storkredsnavn=='Københavns Storkreds'")
    .loc[:, ['KredsNr', 'Kredsnavn', 'KommuneNr', 'Kommunenavn']]
    .assign(
        KredsNr=lambda geo: geo['KredsNr'].astype(int),
        # Replace Utterslev with Bispebjerg
        Kredsnavn=lambda geo: geo['Kredsnavn'].replace({'6. Utterslev': '6. Bispebjerg'}),
    )
    .drop_duplicates()
)

In [3]:
# District labels look like "<number>. <name>"; split each distinct label once
# at the first '.' into its numeric ID and its name.
kreds_labels = df_geo_cph['Kredsnavn'].drop_duplicates()
kreds_split = kreds_labels.str.split('.', n=1, expand=True)
kreds_split.columns = ['ID', 'Name']
kreds_split = kreds_split.assign(
    ID=kreds_split['ID'].astype(int),
    Name=kreds_split['Name'].str.strip(),  # drop the space that follows the '.'
)

# Lookup tables in both directions: ID -> name and name -> ID
kreds_map = {
    kreds_id: kreds_label
    for kreds_id, kreds_label in zip(kreds_split['ID'], kreds_split['Name'])
}
kreds_name_map = dict((kreds_label, kreds_id) for kreds_id, kreds_label in kreds_map.items())

## Get Election Data

In [4]:
# Load the election data (';'-separated, decimal comma, '-' marks missing values)
df_elec_data = pd.read_csv('Data/raw/Valgdata.csv', sep=';' , decimal=',', na_values='-')
df_elec_data.columns = df_elec_data.columns.str.replace(' ', '', regex=True)

# Filter the election data to only keep the relevant KredsNr.
# The comparison is done on text; casting the column first makes the filter
# work whether KredsNr was read as text or as integers.
cph_kreds_ids = [str(k) for k in kreds_map.keys()]
# .copy() gives an independent frame, so the dtype conversion below is a plain
# column assignment instead of a write through a slice of df_elec_data
# (which warns and can leave the column as object dtype).
df_elec_cph = df_elec_data[df_elec_data['KredsNr'].astype(str).isin(cph_kreds_ids)].copy()
df_elec_cph['KredsNr'] = df_elec_cph['KredsNr'].astype(int)

### Format into long format

In [5]:
# Vote columns are the ones whose header starts with "FV"
vote_columns = list(filter(lambda header: header.startswith("FV"), df_elec_cph.columns))

# Wide -> long: one row per (KredsNr, original column header)
votes_long = pd.melt(
    df_elec_cph,
    id_vars=['KredsNr'],
    value_vars=vote_columns,
    var_name='YearParty',
    value_name='Votes',
)

# The header encodes "FV<year>-<party>"; pull the two parts out
year_and_party = votes_long['YearParty'].str.extract(r'FV(\d{4})-(.+)')
year_and_party.columns = ['Year', 'Partyname']

df_elec_cph_long = (
    # Swap the raw header for its parsed parts and fix the column order
    pd.concat([votes_long.drop(columns='YearParty'), year_and_party], axis=1)
    .loc[:, ['KredsNr', 'Year', 'Partyname', 'Votes']]
    # Drop rows in which every column is NaN
    .dropna(how='all')
    # Missing vote counts are treated as 0
    .assign(Votes=lambda frame: frame['Votes'].fillna(0))
    # Year and KredsNr as integers
    .astype({'Year': int, 'KredsNr': int})
)

In [6]:
# Parties kept in the analysis, written as they appear in the vote-column
# headers (letter, dot, name without spaces).
parties = [
    'A.Socialdemokratiet',
    'B.DetRadikaleVenstre',
    'C.DetKonservativeFolkeparti',
    'D.Centrum-Demokraterne',
    'F.SF-SocialistiskFolkeparti',
    'I.LiberalAlliance',
    'K.Kristendemokraterne',
    'O.DanskFolkeparti',
    'M.Minoritetspartiet',
    'V.Venstre,DanmarksLiberaleParti',
    'Y.NyAlliance',
    'Ø.Enhedslisten-DeRød-Grønne',
    'Q.FrieGrønne,DanmarksNyeVenstrefløjsparti',
    'Å.Alternativet',
    'P.StramKurs',
    'Æ.Danmarksdemokraterne-IngerStøjberg',
    'E.KlausRiskærPedersen',
]

# Assignment of every party above to the "left" or the "right" block
blocks = dict(
    left=[
        'A.Socialdemokratiet',
        'F.SF-SocialistiskFolkeparti',
        'Ø.Enhedslisten-DeRød-Grønne',
        'Q.FrieGrønne,DanmarksNyeVenstrefløjsparti',
        'Å.Alternativet',
        'B.DetRadikaleVenstre',
        'D.Centrum-Demokraterne',
        'M.Minoritetspartiet',
    ],
    right=[
        'C.DetKonservativeFolkeparti',
        'V.Venstre,DanmarksLiberaleParti',
        'I.LiberalAlliance',
        'O.DanskFolkeparti',
        'Æ.Danmarksdemokraterne-IngerStøjberg',
        'P.StramKurs',
        'K.Kristendemokraterne',
        'Y.NyAlliance',
        'E.KlausRiskærPedersen',
    ],
)

In [7]:
# Keep only the rows of the parties listed in `parties`; renumber the index from 0
is_listed_party = df_elec_cph_long['Partyname'].isin(parties)
df_elec_cph_long_filtered = df_elec_cph_long.loc[is_listed_party].reset_index(drop=True)

In [8]:
# Party -> block lookup. "left" is written last so it wins if a party were
# listed in both blocks; anything in neither block is labelled 'other'.
party_to_block = {party: 'right' for party in blocks['right']}
party_to_block.update({party: 'left' for party in blocks['left']})

df_elec_cph_long_filtered['block'] = df_elec_cph_long_filtered['Partyname'].apply(
    lambda party: party_to_block.get(party, 'other')
)

In [None]:
# Vote totals per (Year, block, KredsNr) and per (Year, KredsNr)
votes_by_block = df_elec_cph_long_filtered.groupby(['Year', 'block', 'KredsNr']).agg({'Votes': 'sum'})
votes_by_district = df_elec_cph_long_filtered.groupby(['Year', 'KredsNr']).agg({'Votes': 'sum'})

# Each block's share of its district total, in percent.
# The two frames have different index levels; the division relies on pandas
# aligning them on the levels they share.
block_share_pct = votes_by_block / votes_by_district * 100

# Flatten the index into columns, name the value column, round to two decimals
df_block_freq = block_share_pct.reset_index()
df_block_freq = df_block_freq.rename(columns={'Votes': 'Votes (%)'})
df_block_freq = df_block_freq.round(2)

In [66]:
def blend_red_blue(pct_right):
    """Map a right-block share to a hex colour between red and blue.

    Parameters
    ----------
    pct_right : float
        Share as a fraction; values outside [0, 1] are clamped to that range.

    Returns
    -------
    str
        "#rrggbb" with no green component: 0 gives "#ff0000" (red), 1 gives
        "#0000ff" (blue). A missing share (None or NaN) gives the neutral grey
        "#808080" instead of failing in the int() conversion.
    """
    # NaN is the only value that is not equal to itself; it would slip through
    # the min/max clamp below and make int() raise.
    if pct_right is None or pct_right != pct_right:
        return "#808080"

    pct_right = min(max(pct_right, 0), 1)  # Clamp
    r = int((1 - pct_right) * 255)
    g = 0
    b = int(pct_right * 255)
    return f"#{r:02x}{g:02x}{b:02x}"

# Wide table: one row per (Year, KredsNr) with one column per block
pivot = df_block_freq.pivot_table(
    index=["Year", "KredsNr"],
    columns="block",
    values="Votes (%)",
    fill_value=0
).reset_index()

# Left/right shares of the combined left+right vote, as fractions
pivot["pct_left"] = pivot["left"] / (pivot["right"] + pivot["left"])
pivot["pct_right"] = pivot["right"] / (pivot["right"] + pivot["left"])

geojson_folder = "Data/CopenhagenGeoData"

m = folium.Map(location=[55.6761, 12.5683], zoom_start=12, tiles="cartodbpositron")

# The district outlines are the same for every election year, so each GeoJSON
# file is read at most once instead of once per year. None marks a district
# that has no file.
district_geometries = {}

for year in sorted(pivot["Year"].unique()):
    # One toggleable layer per election year
    fg = FeatureGroup(name=str(year))

    df_year = pivot[pivot["Year"] == year]

    for _, row in df_year.iterrows():
        kreds_nr = row["KredsNr"]
        kreds_name = kreds_map.get(kreds_nr)
        if not kreds_name:
            continue

        pct_right = row["pct_right"]
        pct_left = row["pct_left"]
        # 0/0 above (no left and no right votes) gives NaN, which can neither
        # be coloured nor rounded for the tooltip; skip such rows.
        if pd.isna(pct_left) or pd.isna(pct_right):
            continue

        if kreds_name not in district_geometries:
            geojson_path = os.path.join(geojson_folder, f"{kreds_name}.json")
            if os.path.exists(geojson_path):
                with open(geojson_path, "r", encoding="utf-8") as f:
                    district_geometries[kreds_name] = json.load(f)["geometry"]
            else:
                district_geometries[kreds_name] = None

        geom = district_geometries[kreds_name]
        if geom is None:
            continue

        fill_color = blend_red_blue(pct_right)

        folium.GeoJson(
            data={
                "type": "Feature",
                "geometry": geom,
                "properties": {
                    "popup": f"{kreds_name} – {round(pct_left * 100)}% left"
                }
            },
            name=f"{kreds_name}_{year}",
            # c=fill_color binds the colour of this iteration to the callback
            style_function=lambda x, c=fill_color: {
                "fillColor": c,
                "color": "black",
                "weight": 1,
                "fillOpacity": 1,
            },
            tooltip=folium.GeoJsonTooltip(fields=["popup"])
        ).add_to(fg)

    fg.add_to(m)

folium.LayerControl().add_to(m)
m.save("CopenhagenElectionMap_layers.html")

In [67]:
def blend_red_blue(pct_right):
    """Map a right-block share to a hex colour between red and blue.

    Parameters
    ----------
    pct_right : float
        Share as a fraction; values outside [0, 1] are clamped to that range.

    Returns
    -------
    str
        "#rrggbb" with no green component: 0 gives "#ff0000" (red), 1 gives
        "#0000ff" (blue). A missing share (None or NaN) gives the neutral grey
        "#808080" instead of failing in the int() conversion.
    """
    # NaN is the only value that is not equal to itself; it would slip through
    # the min/max clamp below and make int() raise.
    if pct_right is None or pct_right != pct_right:
        return "#808080"

    pct_right = min(max(pct_right, 0), 1)  # Clamp
    r = int((1 - pct_right) * 255)
    g = 0
    b = int(pct_right * 255)
    return f"#{r:02x}{g:02x}{b:02x}"

# Wide table of block shares: one row per (Year, KredsNr), one column per block
pivot = (
    df_block_freq
    .pivot_table(index=["Year", "KredsNr"], columns="block", values="Votes (%)", fill_value=0)
    .reset_index()
)

# Left/right shares of the combined left+right vote, as fractions
left_plus_right = pivot["right"] + pivot["left"]
pivot["pct_left"] = pivot["left"] / left_plus_right
pivot["pct_right"] = pivot["right"] / left_plus_right

# Dropdown listing every election year present in the data
available_years = sorted(pivot["Year"].unique())
year_selector = widgets.Dropdown(
    options=available_years,
    description="Year:",
    style={"description_width": "initial"},
    layout=widgets.Layout(width="300px"),
)

# Output area the map is rendered into
output = widgets.Output()

def generate_map(year):
    """Build a folium map of the districts coloured by block share for one year.

    Reads the module-level `pivot`, `kreds_map` and `geojson_folder`. Rows whose
    KredsNr has no name in `kreds_map`, or whose "<name>.json" file is missing
    from `geojson_folder`, are skipped.

    Parameters
    ----------
    year
        Value matched against the "Year" column of `pivot`.

    Returns
    -------
    folium.Map
        Map with one polygon per district, filled via `blend_red_blue` and
        carrying a tooltip with the district name and its left share.
    """
    year_map = folium.Map(location=[55.6761, 12.5683], zoom_start=12, tiles="cartodbpositron")
    rows_for_year = pivot.loc[pivot["Year"] == year]

    for _, district in rows_for_year.iterrows():
        district_name = kreds_map.get(district["KredsNr"])
        if not district_name:
            continue

        right_share = district["pct_right"]
        left_share = district["pct_left"]
        colour = blend_red_blue(right_share)

        outline_file = os.path.join(geojson_folder, f"{district_name}.json")
        if not os.path.exists(outline_file):
            continue

        with open(outline_file, "r", encoding="utf-8") as handle:
            outline = json.load(handle)["geometry"]

        feature = {
            "type": "Feature",
            "geometry": outline,
            "properties": {
                "popup": f"{district_name} – {round(left_share * 100)}% left"
            },
        }

        layer = folium.GeoJson(
            data=feature,
            # fill=colour binds the colour of this iteration to the callback
            style_function=lambda _feature, fill=colour: {
                "fillColor": fill,
                "color": "black",
                "weight": 1,
                "fillOpacity": 1,
            },
            tooltip=folium.GeoJsonTooltip(fields=["popup"]),
        )
        layer.add_to(year_map)

    return year_map

def update_map(change):
    """Observer callback: redraw the map for the newly selected year.

    `change["new"]` is read as the selected year. The previous content of the
    module-level `output` widget is cleared (deferred until the new map is
    ready) and the map from `generate_map` is displayed in its place.
    """
    with output:
        clear_output(wait=True)
        display(generate_map(change["new"]))

# Link the dropdown to the map update: update_map is called whenever the
# dropdown's "value" changes
year_selector.observe(update_map, names="value")

# Interactive display, currently disabled:
# # Show the widgets and map initially
# display(year_selector)
# with output:
#     display(generate_map(year_selector.value))
# display(output)

# Save the map to an HTML file.
# NOTE(review): only the map for the dropdown's current value is generated and
# saved here; the dropdown itself is not added to the saved map.
output_map = generate_map(year_selector.value)
output_map.save("CopenhagenElectionMap_layers_slicer.html")


In [72]:
# Folder that holds the district GeoJSON files
geojson_folder = "Data/CopenhagenGeoData"

# Base map centred on Copenhagen
copenhagen_map = folium.Map(zoom_start=12, location=[55.6761, 12.5683])

# Fill colours for the districts; the loop below cycles through them
colors = [
    "#1f77b4",
    "#ff7f0e",
    "#2ca02c",
    "#d62728",
    "#9467bd",
    "#8c564b",
    "#e377c2",
    "#7f7f7f",
    "#bcbd22",
    "#17becf",
    "#aec7e8",
    "#ffbb78",
]

def style_function(feature, col):
    """Return the style dict for a district polygon filled with `col`.

    `feature` is accepted so the function can serve as a folium style callback
    (with `col` pre-bound); it is not inspected.
    """
    return dict(
        fillColor=col,
        color='black',
        weight=2,
        fillOpacity=0.6,
    )

# Add each GeoJSON file to the map.
# os.listdir() returns names in arbitrary order, so the names are filtered and
# sorted first: every district then gets the same colour on every run, and
# files that are not GeoJSON no longer use up a slot in the palette.
district_files = sorted(
    name for name in os.listdir(geojson_folder) if name.endswith(".json")
)

for i, filename in enumerate(district_files):
    path = os.path.join(geojson_folder, filename)

    with open(path, 'r', encoding='utf-8') as f:
        geojson_data = json.load(f)

    # The file name without extension doubles as the district name
    district_name = os.path.splitext(filename)[0]

    color = colors[i % len(colors)]  # cycle through colors

    folium.GeoJson(
        geojson_data,
        name=district_name,
        style_function=partial(style_function, col=color),
        tooltip=district_name
    ).add_to(copenhagen_map)

    # Extract centroid from 'geometry'
    geom = shape(geojson_data["geometry"])
    centroid = geom.centroid

    # Place district name at centroid (folium expects [lat, lon] = [y, x])
    folium.Marker(
        location=[centroid.y, centroid.x],
        icon=folium.DivIcon(
            html=f"""
                <div style="
                    font-size:10pt;
                    font-weight:bold;
                    text-align:center;
                    transform: translate(-50%, -50%);
                    white-space: nowrap;
                ">
                    {district_name}
                </div>
                """
        )
    ).add_to(copenhagen_map)


# Add layer control
folium.LayerControl().add_to(copenhagen_map)
# To save to file
copenhagen_map.save("copenhagen_districts_map.html")


## Get metadata for the parties

In [None]:
# Load the parties metadata (';'-separated) and strip spaces from the column names.
# NOTE(review): the election inputs above are read from 'Data/raw/' — confirm
# that 'ElectionData_DS/' is the intended folder here.
df_parties_meta = pd.read_csv('ElectionData_DS/Partier.csv', sep=';')
df_parties_meta = df_parties_meta.rename(columns=lambda header: header.replace(' ', ''))

In [11]:
df_parties_meta.head(5)

Unnamed: 0,Partinavn,Kortpartinavn,Parti,Nummer
0,Stemmeberettigede,Berret,BE,-5
1,Afgivne stemmer,Total,TO,-4
2,Blanke stemmer,Blanke,UB,-3
3,Andre ugyldige stemmer,Ugyldige,UA,-2
4,Gyldige stemmer,Gyldige,GY,-1


In [12]:
# Merge the election data with the parties metadata.
# The long table's party column is 'Partyname' (parsed from the vote-column
# headers, whose spaces were stripped on load); it has no 'Partinavn' column,
# so merging on 'Partinavn' raises KeyError. A space-free key is built on the
# metadata side instead.
# NOTE(review): assumes 'Partinavn' equals the vote-column label apart from
# spaces — confirm against Partier.csv.
party_lookup = (
    df_parties_meta[['Partinavn', 'Kortpartinavn', 'Parti']]
    .assign(Partyname=lambda meta: meta['Partinavn'].str.replace(' ', '', regex=False))
    # One metadata row per key, so the left join cannot add rows
    .drop_duplicates(subset='Partyname')
)
df_elec_cph_long = pd.merge(
    df_elec_cph_long,
    party_lookup,
    on='Partyname',
    how='left'
)

In [13]:
# Merge with the geography data.
# df_geo_cph was de-duplicated before it was cut down to a few columns, so it
# can hold several identical rows for the three columns used here; drop them
# so the left join does not multiply the election rows.
# NOTE(review): a district whose rows list more than one municipality still
# yields one row per municipality — confirm that is intended before summing Votes.
geo_lookup = df_geo_cph[['KredsNr', 'Kredsnavn', 'Kommunenavn']].drop_duplicates()
df_elec_cph_long = pd.merge(
    df_elec_cph_long,
    geo_lookup,
    on='KredsNr',
    how='left'
)

In [76]:
df_elec_cph_long.groupby('Year')['KredsNr'].nunique()

Year
2005    12
2007    12
2011    12
2015    12
2019    12
2022    12
Name: KredsNr, dtype: int64

# Population Data

In [27]:
# Load the population data (';'-separated, decimal comma)
df_population = pd.read_csv("ElectionData_DS/Befolkning.csv", sep=';', low_memory=False, decimal=',')
df_population.columns = df_population.columns.str.replace(' ', '', regex=True)

# District numbers to keep, as text. KredsNr_list was not defined anywhere
# above (NameError on a fresh kernel); derive it from kreds_map, the same way
# the election data is filtered.
KredsNr_list = [str(kreds_nr) for kreds_nr in kreds_map]

# Filter the population data to only keep the relevant KredsNr
df_population['KredsNr'] = df_population['KredsNr'].astype(str)
# .copy() makes the subset independent of df_population for the cleaning below
df_population_cph = df_population[df_population['KredsNr'].isin(KredsNr_list)].copy()

In [None]:
df_population_cph = (
    df_population_cph
    # '-' marks a missing value
    .replace("-", np.nan)
    # Drop columns with all NaN values
    .dropna(axis=1, how='all')
    # Remaining gaps count as 0
    .fillna(0)
    # Convert every column to numbers; values that cannot be parsed become NaN
    .apply(pd.to_numeric, errors='coerce')
)

## Get demographics data

In [41]:
# Demographics columns: "FV..." headers that contain "Antalpersoneropgjortefter"
demographics_cols = [
    header for header in df_population_cph.columns
    if "Antalpersoneropgjortefter" in header and header.startswith("FV")
]

# Wide -> long: one row per (Gruppe, KredsNr, original column header)
demographics_long = pd.melt(
    df_population_cph,
    id_vars=['Gruppe', 'KredsNr'],
    value_vars=demographics_cols,
    var_name='RawColumn',
    value_name='Count',
)

# Header pattern: FV<year>-Antalpersoner..._<Gender><Age>_<CitizenshipCode>.<CitizenshipName>
demographics_parts = demographics_long['RawColumn'].str.extract(
    r'FV(\d{4})-Antalpersoner.*?_(\w+\d+-?\d*år)_(?:\d+\.)?(.+)$'
)
demographics_parts.columns = ['Year', 'GenderAge', 'Citizenship']

# "<Gender><Age>" -> separate Gender and Age (Age keeps its "år" suffix here)
gender_and_age = demographics_parts['GenderAge'].str.extract(r'(\D+)(\d+-?\d*år)')
gender_and_age.columns = ['Gender', 'Age']

# Assemble the tidy table in its final column order
demographics_order = ['Gruppe', 'KredsNr','Year', 'Gender', 'Age', 'Citizenship', 'Count']
df_pop_demographics = pd.concat(
    [
        demographics_long.drop(columns='RawColumn'),
        demographics_parts.drop(columns='GenderAge'),
        gender_and_age,
    ],
    axis=1,
)[demographics_order]

## Get the housing size data

In [20]:
# Housing-size columns: "FV..." headers that contain "boligstørrelse"
hoursing_size_columns = [
    header for header in df_population_cph.columns
    if "boligstørrelse" in header and header.startswith("FV")
]

# Wide -> long: one row per (Gruppe, KredsNr, original column header)
house_size_long = pd.melt(
    df_population_cph,
    id_vars=['Gruppe', 'KredsNr'],
    value_vars=hoursing_size_columns,
    var_name='RawColumn',
    value_name='Count',
)

# Parse year, what is counted (boliger/personer) and the size category
house_size_parts = house_size_long['RawColumn'].str.extract(
    r'FV(\d{4})-Boligerogpersonerefterboligstørrelse_\d+\.(?:Antal)?(boliger|personer)_(.+)'
)
house_size_parts.columns = ['Year', 'AggregateLevel', 'HourseSize']

# English labels for what is counted
house_size_parts['AggregateLevel'] = house_size_parts['AggregateLevel'].map({
    'boliger': 'Units',
    'personer': 'Residents'
})

# Final column order; the raw header is left out
house_size_order = ['Gruppe', 'KredsNr', 'Year', 'AggregateLevel', 'HourseSize', 'Count']
df_house_size = house_size_long.join(house_size_parts)[house_size_order]


In [22]:
df_house_size.head(3)

Unnamed: 0,Gruppe,KredsNr,Year,AggregateLevel,HourseSize,Count
0,101002,1,2015,Units,0-39m2,1884
1,101009,2,2015,Units,0-39m2,2509
2,101011,3,2015,Units,0-39m2,1305


## Get the housing type data

In [23]:
# Housing-type columns: "FV..." headers that contain "Boligtype"
housing_type_cols = [
    header for header in df_population_cph.columns
    if "Boligtype" in header and header.startswith("FV")
]

# Wide -> long: one row per (Gruppe, KredsNr, original column header)
housing_type_long = pd.melt(
    df_population_cph,
    id_vars=['Gruppe', 'KredsNr'],
    value_vars=housing_type_cols,
    var_name='RawColumn',
    value_name='Count',
)

# Parse year, what is counted (boliger/personer) and the housing type
housing_type_parts = housing_type_long['RawColumn'].str.extract(
    r'FV(\d{4})-Boligtype_Antal_(boliger|personer)_\d+\.(.+)'
)
housing_type_parts.columns = ['Year', 'AggregateLevel', 'HousingType']

# English labels for what is counted
housing_type_parts['AggregateLevel'] = housing_type_parts['AggregateLevel'].map({
    'boliger': 'Units',
    'personer': 'Residents'
})

# Final column order; the raw header is left out
housing_type_order = ['Gruppe', 'KredsNr', 'Year', 'AggregateLevel', 'HousingType', 'Count']
df_housing_type = housing_type_long.join(housing_type_parts)[housing_type_order]


In [24]:
df_housing_type.head(3)

Unnamed: 0,Gruppe,KredsNr,Year,AggregateLevel,HousingType,Count
0,101002,1,2015,Units,Stuehuseogparcelhuse,263
1,101009,2,2015,Units,Stuehuseogparcelhuse,2489
2,101011,3,2015,Units,Stuehuseogparcelhuse,163


## Get Ownership data

In [26]:
# Ownership columns: "FV..." headers that contain "Ejerforhold"
ownership_cols = [
    header for header in df_population_cph.columns
    if "Ejerforhold" in header and header.startswith("FV")
]

# Wide -> long: one row per (Gruppe, KredsNr, original column header)
ownership_long = pd.melt(
    df_population_cph,
    id_vars=['Gruppe', 'KredsNr'],
    value_vars=ownership_cols,
    var_name='RawColumn',
    value_name='Count',
)

# Parse year, what is counted (boliger/personer) and the ownership type
ownership_parts = ownership_long['RawColumn'].str.extract(
    r'FV(\d{4})-Ejerforhold_Antal_(boliger|personer)_\d+\.(.+)'
)
ownership_parts.columns = ['Year', 'AggregateLevel', 'OwnershipType']

# English labels for what is counted
ownership_parts['AggregateLevel'] = ownership_parts['AggregateLevel'].map({
    'boliger': 'Units',
    'personer': 'Residents'
})

# Final column order; the raw header is left out
ownership_order = ['Gruppe', 'KredsNr', 'Year', 'AggregateLevel', 'OwnershipType', 'Count']
df_ownership_type = ownership_long.join(ownership_parts)[ownership_order]


In [27]:
df_ownership_type.head(3)

Unnamed: 0,Gruppe,KredsNr,Year,AggregateLevel,OwnershipType,Count
0,101002,1,2015,Units,Ejerbolig,6238
1,101009,2,2015,Units,Ejerbolig,6891
2,101011,3,2015,Units,Ejerbolig,4840


## Get Income data

In [29]:
# Income columns: "FV..." headers that contain the household-income table name
income_cols = [
    header for header in df_population_cph.columns
    if "Husstandsindkomsterfordeltpåafstemningsområder" in header and header.startswith("FV")
]

# Wide -> long: one row per (Gruppe, KredsNr, original column header)
income_long = pd.melt(
    df_population_cph,
    id_vars=['Gruppe', 'KredsNr'],
    value_vars=income_cols,
    var_name='RawColumn',
    value_name='Value',
)

# Parse the year and the income metric out of the header
income_parts = income_long['RawColumn'].str.extract(
    r'FV(\d{4})-Husstandsindkomsterfordeltpåafstemningsområder_(.+)'
)
income_parts.columns = ['Year', 'IncomeMetric']

# Final column order; the raw header is left out
income_order = ['Gruppe', 'KredsNr', 'Year', 'IncomeMetric', 'Value']
df_income = income_long.drop(columns='RawColumn').join(income_parts)[income_order]


In [30]:
df_income.head(3)

Unnamed: 0,Gruppe,KredsNr,Year,IncomeMetric,Value
0,101002,1,2015,100.000-149.999kr,1717
1,101009,2,2015,100.000-149.999kr,1600
2,101011,3,2015,100.000-149.999kr,1074


## Get Education data

In [31]:
# Education columns: "FV..." headers that contain the education table name
educ_columns = [
    header for header in df_population_cph.columns
    if "Højstfuldførteerhvervsuddannelseogaldersgrupper" in header and header.startswith("FV")
]

# Wide -> long: one row per (Gruppe, KredsNr, original column header)
educ_long = pd.melt(
    df_population_cph,
    id_vars=['Gruppe', 'KredsNr'],
    value_vars=educ_columns,
    var_name='RawColumn',
    value_name='Count',
)

# Header pattern: FV<year>-Højstfuldførteer..._<Age>_<EducationLevel>
# The education part is optional in the pattern, so EducationLevel can be NaN.
educ_parts = educ_long['RawColumn'].str.extract(
    r'FV(\d{4})-Højstfuldførteerhvervsuddannelseogaldersgrupper_(\d{1,3}-?\d*år)(?:_(?:\d+\.)?(.+))?$'
)
educ_parts.columns = ['Year', 'Age', 'EducationLevel']

# Remove 'år' from age
educ_parts['Age'] = educ_parts['Age'].str.replace('år', '', regex=False)

# Final column order; the raw header is left out
educ_order = ['Gruppe', 'KredsNr', 'Year', 'Age', 'EducationLevel', 'Count']
df_pop_educ = educ_long.drop(columns=['RawColumn']).join(educ_parts)[educ_order]

# TODO: how should the missing values in 'EducationLevel' be handled?
# df_pop_educ['EducationLevel'] = df_pop_educ['EducationLevel'].fillna('Uoplyst')
# Count would then have to be summed over 'Uoplyst' and NaN

In [32]:
df_pop_educ.head(3)

Unnamed: 0,Gruppe,KredsNr,Year,Age,EducationLevel,Count
0,101002,1,2015,18-19,Grundskole,854
1,101009,2,2015,18-19,Grundskole,763
2,101011,3,2015,18-19,Grundskole,635


## Get socioeconomic data

In [33]:
# Socio-economic columns: "FV..." headers that contain the status/industry table name
socioeconomic_cols = [
    header for header in df_population_cph.columns
    if "Socio-økonomiskstatusogbrancherfordeltpåafstemningsområder" in header and header.startswith("FV")
]

# Wide -> long: one row per (Gruppe, KredsNr, original column header)
socio_long = pd.melt(
    df_population_cph,
    id_vars=['Gruppe', 'KredsNr'],
    value_vars=socioeconomic_cols,
    var_name='RawColumn',
    value_name='Count',
)

# Parse year, employment group and industry out of the header
socio_parts = socio_long['RawColumn'].str.extract(
    r'FV(\d{4})-Socio-økonomiskstatusogbrancherfordeltpåafstemningsområder_\d+\.(.+?)_(.+)'
)
socio_parts.columns = ['Year', 'EmploymentGroup', 'Industry']

# Final column order; the raw header is left out
socio_order = ['Gruppe', 'KredsNr', 'Year', 'EmploymentGroup', 'Industry', 'Count']
df_socio = socio_long.drop(columns=['RawColumn']).join(socio_parts)[socio_order]


In [34]:
df_socio.head(3)

Unnamed: 0,Gruppe,KredsNr,Year,EmploymentGroup,Industry,Count
0,101002,1,2015,Selvstændigogmedhj.,Byggeoganlæg,135
1,101009,2,2015,Selvstændigogmedhj.,Byggeoganlæg,107
2,101011,3,2015,Selvstændigogmedhj.,Byggeoganlæg,77


## Get support data

In [36]:
# Support columns: "FV..." headers that contain "Personerefterforsørgelsestype"
support_cols = [
    header for header in df_population_cph.columns
    if "Personerefterforsørgelsestype" in header and header.startswith("FV")
]

# Wide -> long: one row per (Gruppe, KredsNr, original column header)
support_long = pd.melt(
    df_population_cph,
    id_vars=['Gruppe', 'KredsNr'],
    value_vars=support_cols,
    var_name='RawColumn',
    value_name='Count',
)

# Parse the year and the support type out of the header
support_parts = support_long['RawColumn'].str.extract(
    r'FV(\d{4})-Personerefterforsørgelsestype_\d+\.(.+)'
)
support_parts.columns = ['Year', 'SupportType']

# Final column order; the raw header is left out
support_order = ['Gruppe', 'KredsNr', 'Year', 'SupportType', 'Count']
df_support = support_long.drop(columns='RawColumn').join(support_parts)[support_order]


In [37]:
df_support.head(3)

Unnamed: 0,Gruppe,KredsNr,Year,SupportType,Count
0,101002,1,2015,Arbejdsløshedsdagpenge,4618
1,101009,2,2015,Arbejdsløshedsdagpenge,4029
2,101011,3,2015,Arbejdsløshedsdagpenge,2966


## Get origin data

In [38]:
# Origin columns: headers that contain the immigrants/descendants table name
# (no "FV" prefix check in this cell)
origin_cols = list(filter(
    lambda header: "Indvandrereogefterkommerefordeltefteroprindelsesland" in header,
    df_population_cph.columns,
))

# Wide -> long: one row per (Gruppe, KredsNr, original column header)
origin_melted = pd.melt(
    df_population_cph,
    id_vars=['Gruppe', 'KredsNr'],
    value_vars=origin_cols,
    var_name='RawColumn',
    value_name='Count',
)

# Parse year, "<Gender><Age>" and origin out of the header
origin_parts = origin_melted['RawColumn'].str.extract(
    r'FV(\d{4})-Indvandrereogefterkommerefordeltefteroprindelsesland_(\D+\d+-?\d*år)_(?:\d+\.)?(.+)$'
)
origin_parts.columns = ['Year', 'GenderAge', 'Origin']

# "<Gender><Age>" -> separate Gender and Age, with the "år" suffix removed from Age
origin_gender_age = origin_parts['GenderAge'].str.extract(r'(\D+)(\d+-?\d*år)')
origin_gender_age.columns = ['Gender', 'Age']
origin_gender_age['Age'] = origin_gender_age['Age'].str.replace('år', '', regex=False)

# Put the pieces side by side; the raw header and the combined field are left out
df_origin_long = pd.concat(
    [
        origin_melted.drop(columns='RawColumn'),
        origin_parts.drop(columns='GenderAge'),
        origin_gender_age,
    ],
    axis=1,
)


In [39]:
df_origin_long.head(3)

Unnamed: 0,Gruppe,KredsNr,Count,Year,Origin,Gender,Age
0,101002,1,20,2015,Nordiskelande,Kvinder,0-4
1,101009,2,36,2015,Nordiskelande,Kvinder,0-4
2,101011,3,22,2015,Nordiskelande,Kvinder,0-4
