In [30]:
import pandas as pd
import json

# 1. Load Raw Data

In [31]:
# Read the raw JSON export into memory and wrap it in a DataFrame.
with open("../data/cost_of_living_raw.json", mode="r", encoding="utf-8") as raw_file:
    raw_data = json.loads(raw_file.read())

df_raw = pd.DataFrame(data=raw_data)

# 2. Select and Rename Relevant Fields

In [32]:
# Source column header -> short snake_case name used for the rest of the notebook.
columns_to_keep = {
    "Average Monthly Net Salary (After Tax)": "net_salary",
    "Apartment (1 bedroom) in City Centre": "apt1_city_centre",
    "Meal, Inexpensive Restaurant": "meal_cheap",
    "Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)": "internet",
    "Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment": "utilities",
    "Cappuccino (regular)": "coffee",
    "Gasoline (1 gallon)": "gasoline",
    "Domestic Beer (0.5 liter bottle)": "beer",
    "One-way Ticket (Local Transport)": "ticket_oneway",
    "Monthly Pass (Regular Price)": "ticket_monthly",
}

# Identifier columns that are carried over under their original names.
base_cols = ["City Name", "country"]

# Take an independent copy of just the needed columns, then apply the short names.
df_reduced = (
    df_raw[[*base_cols, *columns_to_keep]]
    .copy()
    .rename(columns=columns_to_keep)
)

# 3. Convert All Values to Euro

In [33]:
# Currency suffix -> factor the amount is divided by to obtain euros.
# NOTE(review): these are fixed, hard-coded rates — confirm they match the
# date the raw data was collected.
EUR_DIVISORS = {"€": 1.0, "$": 1.15, "Fr.": 0.94}


def convert_to_eur(value, rates=None):
    """Convert a price string with a trailing currency marker to a euro amount.

    Thousands separators (",") and surrounding whitespace are removed before
    parsing, so "1,032.14 €" yields 1032.14.

    Parameters
    ----------
    value : Any
        Raw cell value. Only strings are converted.
    rates : dict[str, float] | None, optional
        Mapping of currency suffix to the divisor that turns an amount in that
        currency into euros. Defaults to ``EUR_DIVISORS``. The first suffix
        (in mapping order) that the cleaned string ends with is used.

    Returns
    -------
    float | None
        The amount in euros, or ``None`` when ``value`` is not a string, does
        not end with a known suffix, or its numeric part cannot be parsed.
    """
    if not isinstance(value, str):
        return None

    divisors = EUR_DIVISORS if rates is None else rates
    cleaned = value.replace(",", "").strip()

    for suffix, divisor in divisors.items():
        if not cleaned.endswith(suffix):
            continue
        try:
            amount = float(cleaned.replace(suffix, "").strip())
        except ValueError:
            # A known suffix with an empty or garbled amount (e.g. "€", "n/a $")
            # is treated as missing rather than aborting the whole column.
            return None
        return amount / divisor

    return None

# Replace every renamed price column with its euro-normalised values,
# converting element by element.
for price_col in columns_to_keep.values():
    df_reduced[price_col] = df_reduced[price_col].map(convert_to_eur)

In [34]:
# Give the city identifier a short lowercase name like the other columns.
df_reduced = df_reduced.rename({"City Name": "city"}, axis="columns")

# 4. Fill Missing Values

### Fill in missing values using the average per country for each column.

In [35]:
# Impute each gap with the mean of that column over rows of the same country.
for value_col in df_reduced.columns:
    if value_col in ("city", "country"):
        continue

    # Normalise to a numeric dtype first: a column in which every value was
    # None is stored as object dtype and cannot be averaged reliably.
    values = pd.to_numeric(df_reduced[value_col])

    # Per-row mean of the row's country, computed with the built-in aggregation
    # instead of a Python lambda per group.
    country_mean = values.groupby(df_reduced["country"]).transform("mean")

    # fillna only touches missing cells, so rows whose country is itself
    # missing keep their observed values (the group transform yields NaN for
    # such rows and must not be assigned back wholesale).
    df_reduced[value_col] = values.fillna(country_mean)


# 5. Round all numeric fields to two decimal places

In [36]:
# Round every numeric column to two decimals; non-numeric columns are untouched.
numeric_cols = df_reduced.select_dtypes(include="number").columns
df_reduced = df_reduced.round({name: 2 for name in numeric_cols})


# 6. Output

In [37]:
# Show a bounded preview instead of rendering the entire frame in the notebook.
df_reduced.head(10)

Unnamed: 0,city,country,net_salary,apt1_city_centre,meal_cheap,internet,utilities,coffee,gasoline,beer,ticket_oneway,ticket_monthly
0,Vienna,austria,2641.54,1032.14,15.0,36.0,285.15,4.22,6.1,1.29,2.4,40.0
1,graz,austria,2551.07,700.0,14.0,31.34,323.19,3.8,5.83,1.33,3.0,62.75
2,linz,austria,2229.89,615.44,18.0,34.67,300.42,3.8,5.7,1.34,2.7,54.45
3,salzburg,austria,2763.55,1106.25,15.0,37.33,330.32,4.12,5.72,1.29,2.4,75.0
4,Berlin,germany,2891.64,1229.35,15.0,40.59,313.46,3.77,6.54,1.01,3.5,58.0
5,munich,germany,3382.65,1523.38,18.0,40.13,349.58,3.79,6.54,1.02,4.0,58.0
6,hamburg,germany,2540.87,1043.48,8.7,130.43,79.17,4.35,2.61,2.49,1.74,86.96
7,dusseldorf,germany,3060.21,1070.0,15.0,42.05,305.82,3.59,6.57,0.9,3.4,56.5
8,frankfurt,germany,3666.67,685.0,7.0,45.42,284.0,1.0,6.52,0.89,1.95,53.5
9,hannover,germany,2434.78,1063.16,16.52,62.8,403.26,3.12,3.16,2.18,3.05,61.28


# 7. Save as CSV

In [38]:
# Persist the cleaned table without the index column and report where it went.
output_path = "../data/cost_of_living_cleaned.csv"
df_reduced.to_csv(path_or_buf=output_path, encoding="utf-8", index=False)
print("CSV-Datei gespeichert unter: " + output_path)


CSV-Datei gespeichert unter: ../data/cost_of_living_cleaned.csv
