## What happens here?
- The grunnkrets, age distribution and household data are loaded.
- For each grunnkrets only the most recent year's values are kept (the 2016 values, if they exist).
- Total population and the age groups 0-14, 15-34, 35-64 and 65-90 are aggregated per grunnkrets, district and municipality.
- Population density is computed

## Remarks
- There are 1891 NaNs (around 15%) because age data is missing for some grunnkrets.

## Further improvements
- Outlier detection: Small areas yield large densities. Small areas and small populations might be set to NaNs.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [2]:
def _latest_row_per_grunnkrets(table):
    """Return *table* with one row per grunnkrets_id: the row that sorts last by year."""
    chronological = table.sort_values(by=["grunnkrets_id", "year"])
    return chronological.drop_duplicates(subset=["grunnkrets_id"], keep='last')


# Load the raw tables: grunnkrets geometry/names, per-age population counts
# and household person counts.
grunnkrets = pd.read_csv('../../data/grunnkrets_norway_stripped.csv')
grunnkrets_ages = pd.read_csv('../../data/grunnkrets_age_distribution.csv')
grunnkrets_household = pd.read_csv("../../data/grunnkrets_households_num_persons.csv")

# Each table may hold several years per grunnkrets; keep a single row each.
grunnkrets_household = _latest_row_per_grunnkrets(grunnkrets_household)
grunnkrets_ages = _latest_row_per_grunnkrets(grunnkrets_ages)
grunnkrets = _latest_row_per_grunnkrets(grunnkrets)

In [3]:
# The year is no longer needed once a single row per grunnkrets remains;
# remove it in place so it is not counted in the row total below.
del grunnkrets_household["year"]

# Every remaining column except the id is added up into one total per row.
household_count_columns = [
    column for column in grunnkrets_household.columns if column != "grunnkrets_id"
]
grunnkrets_household["sum_people"] = grunnkrets_household[household_count_columns].sum(axis=1)

In [7]:
# Build the per-grunnkrets feature table: population totals, age-group counts
# and shares, and population densities at grunnkrets, district and
# municipality level.

# One column per single year of age; these are the same columns that are
# dropped again at the end of this cell.
age_columns = [f"age_{i}" for i in range(91)]

# Left merge keeps every grunnkrets; rows without age data get NaN ages.
# Both inputs carry a "year" column, which the merge renames to year_x / year_y.
df = grunnkrets.merge(
    grunnkrets_ages.drop_duplicates(subset=["grunnkrets_id"], keep='last'),
    how='left',
    on='grunnkrets_id',
)

# Sum the age columns explicitly (instead of "everything that is not a known
# metadata column") so an unexpected extra column can never be counted as
# people. NaNs are skipped, so a grunnkrets without age data sums to 0.
df["grunnkrets_population"] = df[age_columns].sum(axis=1)

# Where the age data gives a population of 0, fall back to the household total.
# If that is missing too, the population becomes NaN.
df = df.merge(grunnkrets_household[['grunnkrets_id', "sum_people"]], how="left", on="grunnkrets_id")
df["grunnkrets_population"] = df["grunnkrets_population"].mask(
    df["grunnkrets_population"] == 0, df["sum_people"]
)

# Broadcast district / municipality totals back onto every grunnkrets row.
# NOTE(review): grouping is by name only; if district or municipality names
# are not unique across the data set, distinct areas are pooled — confirm.
df["district_population"] = df.groupby("district_name")["grunnkrets_population"].transform("sum")
df["municipality_population"] = df.groupby("municipality_name")["grunnkrets_population"].transform("sum")
df["district_area"] = df.groupby("district_name")["area_km2"].transform("sum")
df["municipality_area"] = df.groupby("municipality_name")["area_km2"].transform("sum")

# Age bucket edges: bucket k covers ages[k] .. ages[k+1]-1.
# Other bucketings that were tried:
#   [0,3,7,13,18,25,31,41,54,65,78,91], [0,5,12,18,30,45,65,91], list(range(92))
ages = [0, 15, 35, 65, 91]
age_groups = list(zip(ages[:-1], ages[1:]))

# Grunnkrets level: head count per bucket and its share of the population.
for age_from, age_to in age_groups:
    label = f"{age_from}-{age_to - 1}"
    bucket_columns = [f"age_{i}" for i in range(age_from, age_to)]
    df[f"grunnkrets_age_{label}"] = df[bucket_columns].sum(axis=1)
    df[f"grunnkrets_age_{label}_distribution"] = df[f"grunnkrets_age_{label}"] / df["grunnkrets_population"]

# District and municipality level: sum each age column within the group,
# then add the bucket's columns together row by row.
for name in ["district_name", "municipality_name"]:
    prefix = name.split("_")[0]
    for age_from, age_to in age_groups:
        label = f"{age_from}-{age_to - 1}"
        bucket_columns = [f"age_{i}" for i in range(age_from, age_to)]
        df[f"{prefix}_age_{label}"] = (
            df[bucket_columns + [name]].groupby(name).transform("sum").sum(axis=1)
        )
        df[f"{prefix}_age_{label}_distribution"] = df[f"{prefix}_age_{label}"] / df[f"{prefix}_population"]

# There is currently an error with this, so it is currently discarded
# for name in ["grunnkrets", "district", "municipality"]:
#     print(df.columns.tolist())
#     df[f"{name}_pop_number"] = np.sum([df[f"{name}_age_{ages[j-1]}-{ages[j]-1}"]*(j - len(ages)/2)/(1-len(ages)/2) for j in range(1, len(ages))])/df[f"{name}_population"]

# Population per km2. Nothing here guards against a zero or missing area.
df["grunnkrets_density"] = df.grunnkrets_population / df.area_km2
df["district_density"] = df.district_population / df.district_area
df["municipality_density"] = df.municipality_population / df.municipality_area

# Drop the raw per-age columns and the age table's year; keep the grunnkrets
# table's year under its original name.
df = df.drop(columns=age_columns)
df = df.drop(columns=["year_y"])
df = df.rename(columns={"year_x": "year"})
# df.T

In [8]:
# Write the finished feature table to disk, without the DataFrame index.
output_csv_path = "../../own_data/grunnkrets_norway_large.csv"
df.to_csv(output_csv_path, index=False)