In [1]:
#Imports 
import pandas as pd

# Load the raw name table into a DataFrame.
# - the file is semicolon-separated
# - the first two lines of the file are skipped
# - latin1 decoding keeps the Norwegian letters intact
df = pd.read_csv(
    "../data/jentenavn.csv",
    sep=";",
    skiprows=2,
    encoding="latin1",
)
# Preview the first rows as the cell output
df.head()

Unnamed: 0,fornavn,Personer 2013,Personer 2014,Personer 2015,Personer 2016,Personer 2017,Personer 2018,Personer 2019,Personer 2020,Personer 2021,Personer 2022,Personer 2023,Personer 2024
0,Aagot,331,309,291,264,248,231,215,201,.,.,.,.
1,Aase,4049,3872,3711,3575,3421,3268,3122,2997,2963,2835,2712,2570
2,Aashild,343,337,327,315,304,293,287,282,273,262,253,246
3,Aasta,369,352,323,312,293,270,256,248,238,223,212,206
4,Abigail,.,.,.,.,.,.,214,226,250,270,299,308


In [2]:
# Convert the count columns from text to numbers.
# The columns are selected by name (everything except "fornavn") instead of by
# position, so the name column can never be coerced to NaN if the column
# layout of the input changes.
count_cols = [col for col in df.columns if col != "fornavn"]

# "." is the placeholder for a missing count in the raw data; map it to NA
# explicitly, then let errors="coerce" turn any other non-numeric text into NaN.
df[count_cols] = (
    df[count_cols]
    .replace(".", pd.NA)
    .apply(pd.to_numeric, errors="coerce")
)

In [3]:
# Shorten the column headers: "Personer 2013" -> "2013".
# Headers that do not start with "Personer " are left as they are.
def _drop_personer_prefix(label):
    """Return `label` without "Personer " if it starts with that prefix, else unchanged."""
    if not label.startswith("Personer "):
        return label
    return label.replace("Personer ", "")


df.columns = list(map(_drop_personer_prefix, df.columns))

In [4]:
# Reshape from wide to long: "fornavn" is kept as the identifier, and every
# other column becomes one row per (fornavn, column) pair.
# Resulting columns: fornavn | år (former column header) | antall (cell value)
df_long = df.melt(id_vars="fornavn", var_name="år", value_name="antall")

In [5]:
# Order the rows by name, then by year within each name, so the table is
# easier to read; afterwards renumber the index from 0 and discard the old one.
ordered_rows = df_long.sort_values(by=["fornavn", "år"])
df_long = ordered_rows.reset_index(drop=True)

In [6]:
# Persist the cleaned, long-format table in two formats:
# - CSV: easy to inspect and share (written without the index column)
# - Parquet: efficient to load from the backend/API
csv_target = "../data/renset_jentenavn.csv"
parquet_target = "../data/renset_jentenavn.parquet"

df_long.to_csv(csv_target, index=False)
df_long.to_parquet(parquet_target)