# Grunnkrets investigation

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from utils import drop_oldest_duplicates, group_df, create_geographical_columns

In [2]:
# Load the per-grunnkrets age distribution table and report its (rows, columns) shape.
AGE_DISTRIBUTION_CSV = "data/grunnkrets_age_distribution.csv"
age_distribution = pd.read_csv(AGE_DISTRIBUTION_CSV)
age_distribution.shape

(22620, 93)

In [3]:
# Preview the first rows of the freshly loaded age-distribution table.
age_distribution.head()

Unnamed: 0,grunnkrets_id,year,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,...,age_81,age_82,age_83,age_84,age_85,age_86,age_87,age_88,age_89,age_90
0,16013117,2015,14,14,14,14,13,13,12,11,...,0,0,0,0,0,0,0,0,0,0
1,16013117,2016,10,10,10,10,10,10,9,9,...,0,0,0,0,0,0,0,0,0,0
2,11030206,2015,5,5,5,5,5,4,4,3,...,0,0,0,0,0,0,0,0,0,0
3,16011203,2016,2,2,2,2,2,3,3,3,...,1,1,1,1,1,1,1,0,0,0
4,3011601,2016,7,7,7,7,6,6,5,4,...,1,1,0,0,0,0,0,0,0,0


In [4]:
# Add geographical columns via the project helper; the preview in the next cell
# shows `fylke`, `delomrade` and `kommune` appended to the frame.
# NOTE(review): this rebinds `age_distribution` to the helper's result. Whether the
# helper copies or mutates its input is not visible here — confirm in utils.
age_distribution = create_geographical_columns(age_distribution)

In [5]:
# Re-inspect the first rows to check the columns returned by create_geographical_columns.
age_distribution.head()

Unnamed: 0,grunnkrets_id,year,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,...,age_84,age_85,age_86,age_87,age_88,age_89,age_90,fylke,delomrade,kommune
0,16013117,2015,14,14,14,14,13,13,12,11,...,0,0,0,0,0,0,0,16,160131,1601
1,16013117,2016,10,10,10,10,10,10,9,9,...,0,0,0,0,0,0,0,16,160131,1601
2,11030206,2015,5,5,5,5,5,4,4,3,...,0,0,0,0,0,0,0,11,110302,1103
3,16011203,2016,2,2,2,2,2,3,3,3,...,1,1,1,1,0,0,0,16,160112,1601
4,3011601,2016,7,7,7,7,6,6,5,4,...,0,0,0,0,0,0,0,3,30116,301


In [52]:
# Number of characters in each ID once rendered as text (used to spot IDs that
# are shorter than the rest).
age_distribution["grunnkrets_length"] = age_distribution["grunnkrets_id"].astype(str).str.len()

In [53]:
# How many rows have each ID length (length = characters in the stringified ID).
age_distribution["grunnkrets_length"].value_counts()

8    11395
7    11225
Name: grunnkrets_length, dtype: int64

In reality, all grunnkrets IDs have 8 digits; the 7-digit values above are IDs whose leading 0 has been omitted (e.g. `3011601` stands for `03011601`).

IDEAS: group by grunnkrets_id. In the 8-digit (zero-padded) ID, the first 2 digits are the "fylke" and the first 4 digits correspond to the "kommune" (see https://no.wikipedia.org/wiki/Grunnkretser_i_Norge). The first 6 digits (the `delomrade` column above) are also related in some sense.

In [54]:
# Report the number of distinct grunnkrets IDs per year and overall.
# The per-year figures count distinct IDs (nunique) rather than rows, so they
# match the printed label even if an ID appears more than once within a year.
for report_year in (2015, 2016):
    ids_in_year = age_distribution.loc[age_distribution["year"] == report_year, "grunnkrets_id"]
    print(f'# of unique grunnkrets_ids ({report_year})  -> {ids_in_year.nunique()}')
print(f'# of unique grunnkrets_ids (total) -> {age_distribution["grunnkrets_id"].nunique()}')

# of unique grunnkrets_ids (2015)  -> 11316
# of unique grunnkrets_ids (2016)  -> 11304
# of unique grunnkrets_ids (total) -> 11379


In [56]:
# Deduplicate on grunnkrets_id using the project helper.
# NOTE(review): presumably keeps only the most recent year's row for each ID — the
# year counts shown in the next cell are consistent with that; confirm in utils.
age_distribution_clean = drop_oldest_duplicates(age_distribution, "grunnkrets_id")

In [57]:
# Check which year each remaining row comes from after deduplication.
age_distribution_clean["year"].value_counts()

2016    11304
2015       75
Name: year, dtype: int64

In [58]:
# NOTE(review): group_df comes from utils and its behaviour is not visible here;
# presumably it groups/aggregates by the given key column — TODO confirm.
# The result is bound back to `age_distribution_clean`, so re-running this cell on
# its own passes the already-grouped frame to group_df again.
age_distribution_clean = group_df(age_distribution_clean, "grunnkrets_id")