# Grunnkrets investigation

In [36]:
# Magic to automatically update imports if functions in utils are changed
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from utils import group_df, preprocess_grunnkrets_df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
# Load the raw per-grunnkrets age distribution and show its (rows, columns) shape.
age_distribution_csv = "data/grunnkrets_age_distribution.csv"
age_distribution = pd.read_csv(age_distribution_csv)
age_distribution.shape

(22620, 93)

In [38]:
# Peek at the first five rows of the raw age distribution.
age_distribution.head(n=5)

Unnamed: 0,grunnkrets_id,year,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,...,age_81,age_82,age_83,age_84,age_85,age_86,age_87,age_88,age_89,age_90
0,16013117,2015,14,14,14,14,13,13,12,11,...,0,0,0,0,0,0,0,0,0,0
1,16013117,2016,10,10,10,10,10,10,9,9,...,0,0,0,0,0,0,0,0,0,0
2,11030206,2015,5,5,5,5,5,4,4,3,...,0,0,0,0,0,0,0,0,0,0
3,16011203,2016,2,2,2,2,2,3,3,3,...,1,1,1,1,1,1,1,0,0,0
4,3011601,2016,7,7,7,7,6,6,5,4,...,1,1,0,0,0,0,0,0,0,0


Look at the distribution of stores at different granularities (grunnkrets, delomrade, kommune, fylke) 

In [39]:
from sklearn.model_selection import train_test_split

# Read the training stores and keep only the 80% training part of a seeded split,
# so the exploration below never looks at the held-out 20%.
stores_train = pd.read_csv("data/stores_train.csv")
stores_train, _ = train_test_split(stores_train, test_size=0.2, random_state=0)

# One row per store, carrying only its grunnkrets id and a constant count of 1.
stores_train_explore = stores_train[['grunnkrets_id']].assign(store_count=1)
stores_train_explore

Unnamed: 0,grunnkrets_id,store_count
2506,6020601,1
12305,2350203,1
4142,6240307,1
8561,15040206,1
818,3012312,1
...,...,...
4859,18330207,1
3264,16017522,1
9845,15040101,1
10799,12012008,1


In [40]:
from utils import create_geographical_columns, group_df
# Rebind stores_train_explore to the output of create_geographical_columns.
# NOTE(review): presumably this adds the 'fylke', 'kommune' and 'delomrade' columns that the
# following cells group by -- confirm in utils.
# NOTE(review): the cell feeds its own output back in on a re-run; check the helper tolerates that.
stores_train_explore = create_geographical_columns(stores_train_explore)


In [41]:
# Group the stores by grunnkrets, then tabulate how many groups share each store_count value.
stores_by_grunnkrets = group_df(stores_train_explore, 'grunnkrets_id')
stores_by_grunnkrets[['store_count']].value_counts()

store_count
1              1842
2               581
3               285
4               158
5               107
6                79
7                58
8                44
9                40
11               28
10               24
12               19
13               16
14               13
15               11
16               11
18               11
19                8
20                8
17                7
23                6
24                5
21                4
28                4
25                4
32                4
22                3
26                3
29                2
30                2
37                2
27                1
31                1
33                1
35                1
36                1
41                1
42                1
44                1
47                1
51                1
61                1
67                1
dtype: int64

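At grunnkrets level the data is very sparsely populated: more than half of the grunnkretser that have stores contain only a single store.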

In [42]:
# Group the stores by delomrade, then tabulate how many groups share each store_count value.
stores_by_delomrade = group_df(stores_train_explore, 'delomrade')
stores_by_delomrade[['store_count']].value_counts()

store_count
1              269
2              192
3              133
4              114
5               76
              ... 
48               1
27               1
42               1
40               1
205              1
Length: 65, dtype: int64

We see that most delomrader are also quite sparsely populated, with a few exceptions (one delomrade contains 205 stores).

In [43]:
# Group the stores by kommune, then tabulate how many groups share each store_count value.
stores_by_kommune = group_df(stores_train_explore, 'kommune')
stores_by_kommune[['store_count']].value_counts()

store_count
4              39
1              30
5              29
3              26
8              26
               ..
53              1
52              1
51              1
50              1
1481            1
Length: 76, dtype: int64

We see that most "kommuner" contain few stores

In [44]:
# Group the stores by fylke, then tabulate how many groups share each store_count value.
stores_by_fylke = group_df(stores_train_explore, 'fylke')
stores_by_fylke[['store_count']].value_counts()

store_count
213            1
505            1
944            1
860            1
854            1
592            1
586            1
581            1
573            1
462            1
241            1
414            1
379            1
364            1
360            1
345            1
275            1
258            1
1481           1
dtype: int64

We can see that each fylke contains many stores

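IDEAS: group by prefixes of grunnkrets_id. With the id zero-padded to 8 digits, the first 2 digits are the "fylke" and the first 4 digits are the "kommune" (see https://no.wikipedia.org/wiki/Grunnkretser_i_Norge). The first 6 digits give a finer subdivision, used as "delomrade" in the rest of this notebook.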

In [45]:
# Run the shared grunnkrets preprocessing on the age distribution, passing the id column name.
# NOTE(review): judging by the later output this adds 'fylke', 'kommune' and 'delomrade'
# columns -- confirm in utils.
# NOTE(review): age_distribution is rebound here, so the raw frame shown above is no longer
# available under this name and re-running the cell preprocesses already-preprocessed data.
age_distribution = preprocess_grunnkrets_df(age_distribution, "grunnkrets_id")

In [46]:
# Build [age, column_name] pairs for every age column, ordered by numeric age
# (plain string ordering would put 'age_10' before 'age_2').
age_list = []
for column_name in age_distribution.columns:
    if not column_name.startswith("age"):
        continue
    # Column names look like 'age_<n>'; exactly one underscore is expected.
    _, age_suffix = column_name.split("_")
    age_list.append([int(age_suffix), column_name])
age_list.sort(key=lambda pair: pair[0])
print(age_list)

[[0, 'age_0'], [1, 'age_1'], [2, 'age_2'], [3, 'age_3'], [4, 'age_4'], [5, 'age_5'], [6, 'age_6'], [7, 'age_7'], [8, 'age_8'], [9, 'age_9'], [10, 'age_10'], [11, 'age_11'], [12, 'age_12'], [13, 'age_13'], [14, 'age_14'], [15, 'age_15'], [16, 'age_16'], [17, 'age_17'], [18, 'age_18'], [19, 'age_19'], [20, 'age_20'], [21, 'age_21'], [22, 'age_22'], [23, 'age_23'], [24, 'age_24'], [25, 'age_25'], [26, 'age_26'], [27, 'age_27'], [28, 'age_28'], [29, 'age_29'], [30, 'age_30'], [31, 'age_31'], [32, 'age_32'], [33, 'age_33'], [34, 'age_34'], [35, 'age_35'], [36, 'age_36'], [37, 'age_37'], [38, 'age_38'], [39, 'age_39'], [40, 'age_40'], [41, 'age_41'], [42, 'age_42'], [43, 'age_43'], [44, 'age_44'], [45, 'age_45'], [46, 'age_46'], [47, 'age_47'], [48, 'age_48'], [49, 'age_49'], [50, 'age_50'], [51, 'age_51'], [52, 'age_52'], [53, 'age_53'], [54, 'age_54'], [55, 'age_55'], [56, 'age_56'], [57, 'age_57'], [58, 'age_58'], [59, 'age_59'], [60, 'age_60'], [61, 'age_61'], [62, 'age_62'], [63, 'age_6

In [47]:
from utils import group_age_columns


# Merge the single-year age columns into buckets spanning 5 years each (age_0-4, age_5-9, ...).
# The 91 age columns (age_0..age_90) do not divide evenly by 5, which is why the output below
# ends with a one-year 'age_90-90' bucket.
new_age_distribution = group_age_columns(age_distribution_df=age_distribution, span_size=5)

In [48]:
# Peek at the first five rows of the bucketed age distribution.
new_age_distribution.head(n=5)

Unnamed: 0,grunnkrets_id,year,fylke,kommune,delomrade,age_0-4,age_5-9,age_10-14,age_15-19,age_20-24,...,age_45-49,age_50-54,age_55-59,age_60-64,age_65-69,age_70-74,age_75-79,age_80-84,age_85-89,age_90-90
6784,2190812,2015,2,219,21908,0,0,0,0,0,...,11,10,6,4,0,0,0,0,0,0.0
8175,2190914,2015,2,219,21909,18,32,41,32,28,...,33,34,27,19,15,10,5,5,4,0.0
8529,20300310,2015,20,2030,203003,2,0,0,5,8,...,5,5,5,3,0,0,0,0,0,0.0
8577,5170101,2015,5,517,51701,2,0,0,4,9,...,0,0,1,2,0,0,0,0,0,0.0
16348,2191013,2015,2,219,21910,50,54,44,24,28,...,44,36,32,33,25,20,20,15,7,0.0


This allows us to group by either "fylke", "kommune", "delomrade" or "grunnkrets"

We can now do this for all grunnkrets-related dataframes

In [49]:
from utils import join_grouped_df, age_bins

# Add the geographical columns to the full training split (not just the explore frame).
full_population_df = create_geographical_columns(stores_train)

# Join only the bucketed age columns plus the 'delomrade' key onto the stores.
age_columns_with_key = age_bins(age_list) + ['delomrade']
join_grouped_df(full_population_df, new_age_distribution[age_columns_with_key], 'delomrade')




Unnamed: 0_level_0,store_id,year,store_name,plaace_hierarchy_id,sales_channel_name,grunnkrets_id,address,lat,lon,chain_name,...,age_45-49,age_50-54,age_55-59,age_60-64,age_65-69,age_70-74,age_75-79,age_80-84,age_85-89,age_90-90
delomrade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10101,999597858-999651496-455509,2016,GAIA BALANSE AS,3.2.4.0,Spas,1010105,NILS ANKERS GATE 1,59.124371,11.386474,,...,466.0,466.0,455.0,480.0,565.0,428.0,267.0,251.0,170.0,0.0
10101,913105435-913152654-673226,2016,SARAH LEGEKONTOR AS,2.8.1.0,Grocery stores,1010105,VIOLGATA 8,59.123472,11.385975,,...,466.0,466.0,455.0,480.0,565.0,428.0,267.0,251.0,170.0,0.0
10101,890542042-890577172-7948,2016,BAGORAMA TISTA SENTER,2.7.3.0,Travel accessories stores,1010105,WALKERS GATE 4,59.121578,11.381457,BAGORAMA,...,466.0,466.0,455.0,480.0,565.0,428.0,267.0,251.0,170.0,0.0
10101,946411507-971603070-42872,2016,KØHN LIBRIS,2.6.1.0,Bookstores,1010105,STORGATA 24,59.124712,11.386952,LIBRIS,...,466.0,466.0,455.0,480.0,565.0,428.0,267.0,251.0,170.0,0.0
10101,912536548-912547078-645239,2016,LN CLOTHING AS,2.4.6.0,Clothing stores,1010105,WALKERS GATE 4,59.121578,11.381457,,...,466.0,466.0,455.0,480.0,565.0,428.0,267.0,251.0,170.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203003,953341018-972184314-410,2016,G-SPORT RANDAL,2.6.3.2,Sporting goods stores,20300303,WIULLS GATE 7,69.727282,30.042873,G-SPORT,...,248.0,248.0,246.0,213.0,153.0,117.0,108.0,101.0,62.0,0.0
203003,915503586-915538878-771259,2016,STAS INTERIØR,2.9.1.0,Gifts and interior design shops,20300303,KIRKEGATA 1,69.728172,30.039062,,...,248.0,248.0,246.0,213.0,153.0,117.0,108.0,101.0,62.0,0.0
203003,917411395-917523592-849841,2016,KAFE VISIT AS,1.1.6.4,Belongs to dining,20300310,HAVNEVEIEN 1,69.726680,30.063655,,...,248.0,248.0,246.0,213.0,153.0,117.0,108.0,101.0,62.0,0.0
203003,913413105-975842983-9719,2016,REMA 1000 KIRKENES,2.8.1.0,Grocery stores,20300310,SMIEVEIEN 2,69.726299,30.065194,REMA FRANCHISE NORGE,...,248.0,248.0,246.0,213.0,153.0,117.0,108.0,101.0,62.0,0.0


## Preprocessing ALL the dataframes :)

In [49]:
# Reload every grunnkrets-level dataset from disk.
# This rebinds age_distribution to the raw, un-preprocessed frame again.
age_distribution, households_num_persons, income_households, norway_stripped = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/grunnkrets_age_distribution.csv",
        "data/grunnkrets_households_num_persons.csv",
        "data/grunnkrets_income_households.csv",
        "data/grunnkrets_norway_stripped.csv",
    )
)


In [50]:
# add all dataframes to dict
grunnkrets_dfs = {
    "age_distribution" : age_distribution, 
    "households_num_persons" : households_num_persons, 
    "income_households" : income_households, 
    "norway_stripped" : norway_stripped, 
    }

In [51]:
# preprocess dataframes
for df_name, df in grunnkrets_dfs.items():
    grunnkrets_dfs[df_name] = preprocess_grunnkrets_df(df)

# group age categories and sum up values
# span_size determines how many age-columns should be grouped each time
span_size = 7 # I chose 7 because it is a prime factor of 91, and makes all spans have same size
grunnkrets_dfs["age_distribution"] = group_age_columns(age_distribution_df=grunnkrets_dfs["age_distribution"], span_size=span_size)

In [52]:
# add dataframes grouped by fylke, kommune and delomrade
for df_name, df in grunnkrets_dfs.copy().items():
    grunnkrets_dfs[df_name + "_fylke"] = group_df(df, "fylke")
    grunnkrets_dfs[df_name + "_kommune"] = group_df(df, "kommune")
    grunnkrets_dfs[df_name + "_delomrade"] = group_df(df, "delomrade")


In [53]:
# Sanity check: the four base datasets plus a _fylke/_kommune/_delomrade variant of each.
grunnkrets_dfs.keys()

dict_keys(['age_distribution', 'households_num_persons', 'income_households', 'norway_stripped', 'age_distribution_fylke', 'age_distribution_kommune', 'age_distribution_delomrade', 'households_num_persons_fylke', 'households_num_persons_kommune', 'households_num_persons_delomrade', 'income_households_fylke', 'income_households_kommune', 'income_households_delomrade', 'norway_stripped_fylke', 'norway_stripped_kommune', 'norway_stripped_delomrade'])

In [54]:
# Inspect the preprocessed age distribution after bucketing with span_size=7.
grunnkrets_dfs["age_distribution"]

Unnamed: 0,grunnkrets_id,year,fylke,kommune,delomrade,age_0-6,age_7-13,age_14-20,age_21-27,age_28-34,age_35-41,age_42-48,age_49-55,age_56-62,age_63-69,age_70-76,age_77-83,age_84-90
6784,2190812,2015,2,219,21908,0,0,0,3,16,21,18,14,7,1,0,0,0
8175,2190914,2015,2,219,21909,29,54,46,35,23,32,44,47,33,22,12,7,5
8529,20300310,2015,20,2030,203003,2,0,6,16,24,11,7,7,7,0,0,0,0
8577,5170101,2015,5,517,51701,2,0,5,19,32,14,1,0,3,0,0,0,0
16348,2191013,2015,2,219,21910,71,70,36,44,63,76,67,50,47,37,28,25,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10062,1061804,2016,1,106,10618,28,26,25,42,33,31,41,49,48,56,29,8,9
10064,1063202,2016,1,106,10632,28,36,35,33,28,38,42,39,35,27,18,11,5
10065,11010402,2016,11,1101,110104,21,18,25,56,32,37,44,45,39,35,26,16,8
10038,9260106,2016,9,926,92601,30,38,26,21,24,31,35,39,42,47,26,9,5
