# Grunnkrets investigation

In [105]:
# Magic to automatically update imports if functions in utils are changed
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

from utils import group_age_columns, group_df, preprocess_grunnkrets_df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [106]:
# Load the raw age-distribution table; the trailing expression displays its (rows, columns).
age_distribution = pd.read_csv("data/grunnkrets_age_distribution.csv")
age_distribution.shape

(22620, 93)

In [107]:
# Preview the first rows: grunnkrets_id, year, then one column per age (age_0 ... age_90).
age_distribution.head()

Unnamed: 0,grunnkrets_id,year,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,...,age_81,age_82,age_83,age_84,age_85,age_86,age_87,age_88,age_89,age_90
0,16013117,2015,14,14,14,14,13,13,12,11,...,0,0,0,0,0,0,0,0,0,0
1,16013117,2016,10,10,10,10,10,10,9,9,...,0,0,0,0,0,0,0,0,0,0
2,11030206,2015,5,5,5,5,5,4,4,3,...,0,0,0,0,0,0,0,0,0,0
3,16011203,2016,2,2,2,2,2,3,3,3,...,1,1,1,1,1,1,1,0,0,0
4,3011601,2016,7,7,7,7,6,6,5,4,...,1,1,0,0,0,0,0,0,0,0


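IDEAS: group by prefixes of `grunnkrets_id`. Reading the ID as a zero-padded 8-digit code (it is stored as an integer, so a leading zero is dropped), the first 2 digits identify the "fylke" (county) and the first 4 digits the "kommune" (municipality); see https://no.wikipedia.org/wiki/Grunnkretser_i_Norge. The first 6 digits also identify a related sub-area (called "delomrade" below).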

In [108]:
# Run the shared preprocessing helper from utils; "grunnkrets_id" is presumably the name of
# the ID column to derive from (judging by later output it adds fylke/kommune/delomrade
# columns -- confirm in utils).
# NOTE(review): this rebinds `age_distribution`, so re-running the cell preprocesses an
# already-preprocessed frame; re-run the load cell first.
age_distribution = preprocess_grunnkrets_df(age_distribution, "grunnkrets_id")

In [109]:
# Collect every single-year age column as [age, column_name], ordered by age.
# Matching on the full "age_" prefix plus an all-digit remainder means unrelated columns
# (e.g. "age_group_x" or "agency") are skipped instead of raising on unpacking.
# Using a comprehension also avoids binding `_` at notebook top level, which would stop
# IPython from updating its "last output" variable.
age_prefix = "age_"
age_list = sorted(
    (
        [int(col[len(age_prefix):]), col]
        for col in age_distribution.columns
        if col.startswith(age_prefix) and col[len(age_prefix):].isdigit()
    ),
    key=lambda pair: pair[0],
)
print(age_list)

[[0, 'age_0'], [1, 'age_1'], [2, 'age_2'], [3, 'age_3'], [4, 'age_4'], [5, 'age_5'], [6, 'age_6'], [7, 'age_7'], [8, 'age_8'], [9, 'age_9'], [10, 'age_10'], [11, 'age_11'], [12, 'age_12'], [13, 'age_13'], [14, 'age_14'], [15, 'age_15'], [16, 'age_16'], [17, 'age_17'], [18, 'age_18'], [19, 'age_19'], [20, 'age_20'], [21, 'age_21'], [22, 'age_22'], [23, 'age_23'], [24, 'age_24'], [25, 'age_25'], [26, 'age_26'], [27, 'age_27'], [28, 'age_28'], [29, 'age_29'], [30, 'age_30'], [31, 'age_31'], [32, 'age_32'], [33, 'age_33'], [34, 'age_34'], [35, 'age_35'], [36, 'age_36'], [37, 'age_37'], [38, 'age_38'], [39, 'age_39'], [40, 'age_40'], [41, 'age_41'], [42, 'age_42'], [43, 'age_43'], [44, 'age_44'], [45, 'age_45'], [46, 'age_46'], [47, 'age_47'], [48, 'age_48'], [49, 'age_49'], [50, 'age_50'], [51, 'age_51'], [52, 'age_52'], [53, 'age_53'], [54, 'age_54'], [55, 'age_55'], [56, 'age_56'], [57, 'age_57'], [58, 'age_58'], [59, 'age_59'], [60, 'age_60'], [61, 'age_61'], [62, 'age_62'], [63, 'age_6

In [112]:
# Collapse the single-year age columns into spans of 5 consecutive ages
# (age_0-4, age_5-9, ...). There are 91 age columns (age_0..age_90) and 91 is not a
# multiple of 5, so the last span ("age_90-90") covers a single column.
# `group_age_columns` is imported in the import cell at the top of the notebook.
new_age_distribution = group_age_columns(age_distribution_df=age_distribution, span_size=5)

In [113]:
# Preview the grouped frame: the age columns are now ranges (age_0-4, age_5-9, ...).
new_age_distribution.head()

Unnamed: 0,grunnkrets_id,year,fylke,kommune,delomrade,age_0-4,age_5-9,age_10-14,age_15-19,age_20-24,...,age_45-49,age_50-54,age_55-59,age_60-64,age_65-69,age_70-74,age_75-79,age_80-84,age_85-89,age_90-90
6784,2190812,2015,2,219,21908,0,0,0,0,0,...,11,10,6,4,0,0,0,0,0,0.0
8175,2190914,2015,2,219,21909,18,32,41,32,28,...,33,34,27,19,15,10,5,5,4,0.0
8529,20300310,2015,20,2030,203003,2,0,0,5,8,...,5,5,5,3,0,0,0,0,0,0.0
8577,5170101,2015,5,517,51701,2,0,0,4,9,...,0,0,1,2,0,0,0,0,0,0.0
16348,2191013,2015,2,219,21910,50,54,44,24,28,...,44,36,32,33,25,20,20,15,7,0.0


This allows us to group by any of "fylke", "kommune", "delomrade" or "grunnkrets".

We can now apply the same preprocessing to all grunnkrets-related dataframes.

## Preprocessing ALL the dataframes :)

In [114]:
# Load all four grunnkrets tables from disk.
# Note: this rebinds `age_distribution` to the freshly read (unpreprocessed) CSV,
# replacing the preprocessed frame built earlier in the notebook.
age_distribution = pd.read_csv("data/grunnkrets_age_distribution.csv")
households_num_persons = pd.read_csv("data/grunnkrets_households_num_persons.csv")
income_households = pd.read_csv("data/grunnkrets_income_households.csv")
norway_stripped = pd.read_csv("data/grunnkrets_norway_stripped.csv")


In [115]:
# add all dataframes to dict
# Registry of the loaded frames, keyed by table name, so later cells can
# process them uniformly in a loop.
grunnkrets_dfs = dict(
    age_distribution=age_distribution,
    households_num_persons=households_num_persons,
    income_households=income_households,
    norway_stripped=norway_stripped,
)

In [116]:
# preprocess dataframes
# preprocess dataframes
# Rebinding existing keys while iterating is safe because the dict's size never changes.
for df_name, df in grunnkrets_dfs.items():
    grunnkrets_dfs[df_name] = preprocess_grunnkrets_df(df)

# group age categories and sum up values
# span_size determines how many age-columns should be grouped each time
span_size = 7  # 91 age columns (age_0..age_90) = 7 * 13, so all spans have the same size

# Group the *preprocessed* frame stored in the dict. The module-level `age_distribution`
# is still the raw CSV read above, so grouping it would (unless the helper mutates in
# place -- not visible here) discard the preprocessing that the grouping cell relies on.
grunnkrets_dfs["age_distribution"] = group_age_columns(
    age_distribution_df=grunnkrets_dfs["age_distribution"],
    span_size=span_size,
)

In [31]:
# add dataframes grouped by fylke, kommune and delomrade
# add dataframes grouped by fylke, kommune and delomrade
group_levels = ("fylke", "kommune", "delomrade")
level_suffixes = tuple("_" + level for level in group_levels)

# Only base frames are grouped: keys that already end in a level suffix are the results
# of a previous run of this cell. Skipping them keeps the cell idempotent (no
# "age_distribution_fylke_fylke" on re-run). The list is materialised up front so the
# dict can grow while we loop.
base_names = [name for name in grunnkrets_dfs if not name.endswith(level_suffixes)]
for df_name in base_names:
    base_df = grunnkrets_dfs[df_name]
    for level in group_levels:
        grunnkrets_dfs[df_name + "_" + level] = group_df(base_df, level)


In [32]:
# List the available frames: the four base tables plus one entry per (table, grouping level).
grunnkrets_dfs.keys()

dict_keys(['age_distribution', 'households_num_persons', 'income_households', 'norway_stripped', 'age_distribution_fylke', 'age_distribution_kommune', 'age_distribution_delomrade', 'households_num_persons_fylke', 'households_num_persons_kommune', 'households_num_persons_delomrade', 'income_households_fylke', 'income_households_kommune', 'income_households_delomrade', 'norway_stripped_fylke', 'norway_stripped_kommune', 'norway_stripped_delomrade'])

In [45]:
# Display the grunnkrets-level age-distribution frame.
# NOTE(review): the saved output still shows ungrouped age_0..age_90 columns, so it
# appears to predate the span grouping step -- re-run from a fresh kernel to confirm.
grunnkrets_dfs["age_distribution"]

Unnamed: 0,grunnkrets_id,year,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,...,age_84,age_85,age_86,age_87,age_88,age_89,age_90,fylke,kommune,delomrade
6784,2190812,2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,219,21908
8175,2190914,2015,4,3,3,4,4,5,6,6,...,1,1,1,1,1,0,0,2,219,21909
8529,20300310,2015,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,20,2030,203003
8577,5170101,2015,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,5,517,51701
16348,2191013,2015,10,10,10,10,10,10,11,11,...,2,2,2,1,1,1,1,2,219,21910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10062,1061804,2016,4,4,4,4,4,4,4,4,...,2,2,2,1,1,1,1,1,106,10618
10064,1063202,2016,4,4,4,4,4,4,4,5,...,1,1,1,1,1,0,0,1,106,10632
10065,11010402,2016,3,3,3,3,3,3,3,3,...,2,2,1,1,1,1,1,11,1101,110104
10038,9260106,2016,4,4,4,4,4,5,5,5,...,1,1,1,1,1,0,0,9,926,92601
