In [1]:
#Imports 
import pandas as pd

# Load the raw name table into a DataFrame.
# - the file is semicolon-separated
# - the first two lines of the file are skipped before the header row
# - latin1 decoding keeps the Norwegian letters intact
raw_csv_path = "../data/guttenavn.csv"
df = pd.read_csv(
    raw_csv_path,
    sep=";",
    skiprows=2,
    encoding="latin1",
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,fornavn,Personer 2013,Personer 2014,Personer 2015,Personer 2016,Personer 2017,Personer 2018,Personer 2019,Personer 2020,Personer 2021,Personer 2022,Personer 2023,Personer 2024
0,Aage,1476,1422,1383,1347,1315,1275,1245,1206,1163,1126,1095,1054
1,Aaron,277,306,336,374,402,438,472,504,544,576,599,617
2,Aasmund,313,312,314,313,305,299,297,295,286,280,273,271
3,Abbas,224,233,244,263,271,280,283,286,290,299,311,313
4,Abdallah,.,.,.,.,.,.,.,200,209,214,220,230


In [2]:
# Convert the count columns from text to numbers.
# Missing counts appear as "." in the loaded data; they are replaced with
# pd.NA first, and errors="coerce" turns anything else that cannot be parsed
# into NaN instead of raising.
# The columns are picked by name (everything except "fornavn") rather than by
# position, so the name column can never be coerced to NaN by accident if the
# column order of the input changes.
count_cols = [name for name in df.columns if name != "fornavn"]
for col in count_cols:
    df[col] = pd.to_numeric(df[col].replace(".", pd.NA), errors="coerce")

In [3]:
# Shorten the column headers: labels that start with "Personer " lose that
# text (e.g. "Personer 2013" -> "2013"); every other label is kept unchanged.
df.columns = df.columns.map(
    lambda label: label.replace("Personer ", "")
    if label.startswith("Personer ")
    else label
)

# Preview the renamed frame as the cell output.
df.head()

Unnamed: 0,fornavn,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Aage,1476.0,1422.0,1383.0,1347.0,1315.0,1275.0,1245.0,1206.0,1163.0,1126.0,1095.0,1054.0
1,Aaron,277.0,306.0,336.0,374.0,402.0,438.0,472.0,504.0,544.0,576.0,599.0,617.0
2,Aasmund,313.0,312.0,314.0,313.0,305.0,299.0,297.0,295.0,286.0,280.0,273.0,271.0
3,Abbas,224.0,233.0,244.0,263.0,271.0,280.0,283.0,286.0,290.0,299.0,311.0,313.0
4,Abdallah,,,,,,,,200.0,209.0,214.0,220.0,230.0


In [4]:
# Reshape from wide (one column per year) to long format, which is easier to
# analyse. "fornavn" is kept as the identifier; the former column labels go
# into "år" and the cell values into "antall", giving three columns in total.
df_long = df.melt(id_vars=["fornavn"], var_name="år", value_name="antall")

In [5]:
# Order the rows by name and then by year for readability, and renumber the
# index from 0 so it matches the new row order.
df_long = df_long.sort_values(["fornavn", "år"], ignore_index=True)

In [6]:
# Persist the cleaned, long-format dataset in two formats:
# - CSV, written without the index column, for easy inspection and sharing
# - Parquet, intended for efficient loading in the backend/API
cleaned_csv_path = "../data/renset_guttenavn.csv"
cleaned_parquet_path = "../data/renset_guttenavn.parquet"

df_long.to_csv(cleaned_csv_path, index=False)
df_long.to_parquet(cleaned_parquet_path)