In [None]:
import pandas as pd

from attributes.individual.household_position.household_position import (get_household_position_joint_age_gender,
                                                                         read_local_household_composition)
from attributes.marginal_data_reader import read_marginal_data



In [None]:
# Marginals to load: total population, total households and the three household types.
household_marginals = ['population', 'households', 'single_person', 'without_children', 'with_children']

# Pivot to one row per neighbourhood code with one column per marginal.
df_households = (
    read_marginal_data(household_marginals, 'households')
    .pivot(index='neighb_code', columns='households', values='count')
)
df_households

households,households,population,single_person,with_children,without_children
neighb_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BU05181785,10,55,10,5,5
BU05183284,7840,15010,3735,2275,1840
BU05183387,4015,8730,1880,1525,620
BU05183396,3745,7955,1690,1370,695
BU05183398,4940,10820,2255,1830,865
BU05183399,820,1525,435,185,210
BU05183480,3225,6205,1650,1020,565
BU05183488,3465,7505,1500,1275,695
BU05183489,3230,5630,1865,860,510
BU05183536,115,155,95,5,25


From the first row, it immediately becomes clear that the total reported number of households is not very meaningful:

In [None]:
# Look at the marginals of the first neighbourhood in the table.
example_neighbourhood = 'BU05181785'
df_households.loc[example_neighbourhood]

households
households          10
population          55
single_person       10
with_children        5
without_children     5
Name: BU05181785, dtype: int64

The sum of `single_person` = 10, `with_children` = 5 and `without_children` = 5 is 20, while `households` reports a total of 10 households.
This is because of the rounding that is meant to maintain privacy. 
The true number of households is anywhere between 8 and 12, the true number of single person households is also between 8 and 12, and the numbers of households with and without children are each between 3 and 7.

In this instance, the data cannot be made consistent even under the most favourable assumptions: if the true number of households is 12 (upper bound), with 8 single-person households (lower bound) and 3 with-children and 3 without-children households (again lower bounds), the parts still sum to 14, which exceeds the total of 12.

In [None]:
# Rebuild the household total from its three parts ...
reconstructed_total = (
    df_households['single_person']
    + df_households['with_children']
    + df_households['without_children']
)
df_households['total_households'] = reconstructed_total

# ... and show how far it is from the reported total.
reconstructed_total - df_households['households']

neighb_code
BU05181785    10
BU05183284    10
BU05183387    10
BU05183396    10
BU05183398    10
BU05183399    10
BU05183480    10
BU05183488     5
BU05183489     5
BU05183536    10
BU05183620    10
BU05183637    10
BU05183638    10
BU05183639    10
dtype: int64

Based on the above, let's not look at the total reported number of households at all

In [None]:
# Neither the reported nor the reconstructed household total is used from here on.
df_households = df_households.drop(columns=['households', 'total_households'])
df_households

households,population,single_person,with_children,without_children
neighb_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BU05181785,55,10,5,5
BU05183284,15010,3735,2275,1840
BU05183387,8730,1880,1525,620
BU05183396,7955,1690,1370,695
BU05183398,10820,2255,1830,865
BU05183399,1525,435,185,210
BU05183480,6205,1650,1020,565
BU05183488,7505,1500,1275,695
BU05183489,5630,1865,860,510
BU05183536,155,95,5,25


The first data set, `df_household_position`, is on the level of individuals, while the second, `df_households` is on the level of households.
If we want to combine the two, we want to find out how many people live in each of the household types `single_person`, `with_children` and `without_children`.

A single-person household by definition contains a single individual.
The semantics of `without_children` in this case seems to indicate married or non-married couples, but also includes households with the designation `other`.
We have no idea how big an `other`-type household could be, so let's assume this just relates to couples without children, in which case this household type is by definition a two-person household type.

That means that in each neighborhood, `single_person` is also the count for number of people living in a single-person household, `without_children` is half the count of people living in a `without_children` household, which means that the rest of the population should be living in a `with_children` household.

Note that `with_children` includes married and non-married couples with children, but also single-parent households

In [None]:
# Every household without children is counted as two people.
people_without_children = df_households['without_children'] * 2

# Whoever is neither single nor in a household without children is attributed
# to a household with children.
people_with_children = df_households['population'] - df_households['single_person'] - people_without_children

df_households['in_hh_without_children'] = people_without_children
df_households['in_hh_with_children'] = people_with_children
df_households

households,population,single_person,with_children,without_children,in_hh_without_children,in_hh_with_children
neighb_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BU05181785,55,10,5,5,10,35
BU05183284,15010,3735,2275,1840,3680,7595
BU05183387,8730,1880,1525,620,1240,5610
BU05183396,7955,1690,1370,695,1390,4875
BU05183398,10820,2255,1830,865,1730,6835
BU05183399,1525,435,185,210,420,670
BU05183480,6205,1650,1020,565,1130,3425
BU05183488,7505,1500,1275,695,1390,4615
BU05183489,5630,1865,860,510,1020,2745
BU05183536,155,95,5,25,50,10


Assuming two parents in every household with children, this means the average number of children per household in each neighborhood is the following:

In [None]:
# People per household with children; subtracting 2 presumably removes the two parents.
people_per_household_with_children = df_households['in_hh_with_children'] / df_households['with_children']
people_per_household_with_children - 2

neighb_code
BU05181785    5.000000
BU05183284    1.338462
BU05183387    1.678689
BU05183396    1.558394
BU05183398    1.734973
BU05183399    1.621622
BU05183480    1.357843
BU05183488    1.619608
BU05183489    1.191860
BU05183536    0.000000
BU05183620    1.495238
BU05183637    1.354545
BU05183638    1.500000
BU05183639    1.458599
dtype: float64

There are two outliers. The small `BU05181785` neighborhood would have 5 households with children, each with 5 children, which sounds like a lot.
The other outlier, the `BU05183536` neighborhood, would have no children at all in its households with children. Let's ignore this for now.

Now let's get back to the household positions:

In [None]:
# Load the joint counts of household position by gender and age group
# (one row per gender / age_group / household_position combination).
df_household_position = get_household_position_joint_age_gender()
df_household_position

Unnamed: 0,gender,age_group,household_position,count
0,male,0-5,child,15683
1,male,5-10,child,15479
2,male,10-15,child,14741
3,male,15-20,child,13696
4,male,20-25,child,8645
...,...,...,...,...
275,female,75-80,single_parent,202
276,female,80-85,single_parent,165
277,female,85-90,single_parent,114
278,female,90-95,single_parent,69


In [None]:
# List the distinct household positions present in the data.
df_household_position['household_position'].unique()

array(['child', 'single', 'non_married_no_children',
       'married_no_children', 'non_married_with_children',
       'married_with_children', 'single_parent'], dtype=object)

The goal is to label each individual with their position in a household, _and_ the type of household they live in.

* `child` => `in_hh_with_children`
* `single` => `single_person`
* `non_married_no_children` => `in_hh_without_children`
* `married_no_children` => `in_hh_without_children`
* `non_married_with_children` => `in_hh_with_children`
* `married_with_children` => `in_hh_with_children`
* `single_parent` => `in_hh_with_children`

In [None]:
# Household type each individual household position belongs to.
household_type_by_position = {
    'child': 'in_hh_with_children',
    'single': 'single_person',
    'non_married_no_children': 'in_hh_without_children',
    'married_no_children': 'in_hh_without_children',
    'non_married_with_children': 'in_hh_with_children',
    'married_with_children': 'in_hh_with_children',
    'single_parent': 'in_hh_with_children',
}

# Index by position so that all rows of one position can be labelled at once.
df_household_position.set_index('household_position', inplace=True)
df_household_position.loc[:, 'household_type'] = None
for position_label, household_type_label in household_type_by_position.items():
    df_household_position.loc[position_label, 'household_type'] = household_type_label
df_household_position.reset_index(inplace=True)
df_household_position

Unnamed: 0,household_position,gender,age_group,count,household_type
0,child,male,0-5,15683,in_hh_with_children
1,child,male,5-10,15479,in_hh_with_children
2,child,male,10-15,14741,in_hh_with_children
3,child,male,15-20,13696,in_hh_with_children
4,child,male,20-25,8645,in_hh_with_children
...,...,...,...,...,...
275,single_parent,female,75-80,202,in_hh_with_children
276,single_parent,female,80-85,165,in_hh_with_children
277,single_parent,female,85-90,114,in_hh_with_children
278,single_parent,female,90-95,69,in_hh_with_children


In [None]:
# Number of rows that did not receive a household type (expected: 0).
df_household_position['household_type'].isna().sum()

0

In [None]:
# Show the neighbourhood code as a regular column instead of the index.
df_households.reset_index(drop=False)

households,neighb_code,population,single_person,with_children,without_children,in_hh_without_children,in_hh_with_children
0,BU05181785,55,10,5,5,10,35
1,BU05183284,15010,3735,2275,1840,3680,7595
2,BU05183387,8730,1880,1525,620,1240,5610
3,BU05183396,7955,1690,1370,695,1390,4875
4,BU05183398,10820,2255,1830,865,1730,6835
5,BU05183399,1525,435,185,210,420,670
6,BU05183480,6205,1650,1020,565,1130,3425
7,BU05183488,7505,1500,1275,695,1390,4615
8,BU05183489,5630,1865,860,510,1020,2745
9,BU05183536,155,95,5,25,50,10


In [None]:
# Columns holding the number of people (not households) per household type.
people_columns = ['single_person', 'in_hh_with_children', 'in_hh_without_children']

# Long format: one row per neighbourhood and household type.
households_long = df_households.reset_index().melt(
    id_vars='neighb_code',
    value_vars=people_columns,
    var_name='household_type',
    value_name='count',
)
households_long.groupby(['neighb_code', 'household_type']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
neighb_code,household_type,Unnamed: 2_level_1
BU05181785,in_hh_with_children,35
BU05181785,in_hh_without_children,10
BU05181785,single_person,10
BU05183284,in_hh_with_children,7595
BU05183284,in_hh_without_children,3680
BU05183284,single_person,3735
BU05183387,in_hh_with_children,5610
BU05183387,in_hh_without_children,1240
BU05183387,single_person,1880
BU05183396,in_hh_with_children,4875


In [None]:
# Number of people per household type, summed over all neighbourhoods.
people_per_household_type = (
    df_households
    .melt(value_vars=['single_person', 'in_hh_with_children', 'in_hh_without_children'],
          var_name='household_type',
          value_name='count')
    .groupby('household_type')
    .sum()
)

# Attach that total (count_y) to every gender / age / position row (count_x).
df_households_with_position = df_household_position.merge(
    people_per_household_type,
    how='left',
    on='household_type',
)
df_households_with_position

Unnamed: 0,household_position,gender,age_group,count_x,household_type,count_y
0,child,male,0-5,15683,in_hh_with_children,48750
1,child,male,5-10,15479,in_hh_with_children,48750
2,child,male,10-15,14741,in_hh_with_children,48750
3,child,male,15-20,13696,in_hh_with_children,48750
4,child,male,20-25,8645,in_hh_with_children,48750
...,...,...,...,...,...,...
275,single_parent,female,75-80,202,in_hh_with_children,48750
276,single_parent,female,80-85,165,in_hh_with_children,48750
277,single_parent,female,85-90,114,in_hh_with_children,48750
278,single_parent,female,90-95,69,in_hh_with_children,48750


Now we can distribute the totals in `count_y` over the rows, in proportion to the frequencies in `count_x`

In [None]:
# Rescale the position counts so that, within each household type, they add up
# to the number of people living in that household type (count_y).
#
# The per-type total of count_x is broadcast back onto the rows with
# transform('sum'), so every operand below has the frame's own index and the
# arithmetic is purely row-wise. This avoids multiplying a per-group Series by
# the full-frame count_y column inside a transform lambda, which only works
# through implicit index alignment.
position_total_per_type = df_households_with_position.groupby('household_type')['count_x'].transform('sum')
df_households_with_position['count'] = (
    df_households_with_position['count_x']
    / position_total_per_type
    * df_households_with_position['count_y']
)
df_households_with_position

Unnamed: 0,household_position,gender,age_group,count_x,household_type,count_y,count
0,child,male,0-5,15683,in_hh_with_children,48750,2688.296859
1,child,male,5-10,15479,in_hh_with_children,48750,2653.328258
2,child,male,10-15,14741,in_hh_with_children,48750,2526.824204
3,child,male,15-20,13696,in_hh_with_children,48750,2347.695835
4,child,male,20-25,8645,in_hh_with_children,48750,1481.880147
...,...,...,...,...,...,...,...
275,single_parent,female,75-80,202,in_hh_with_children,48750,34.625771
276,single_parent,female,80-85,165,in_hh_with_children,48750,28.283427
277,single_parent,female,85-90,114,in_hh_with_children,48750,19.541277
278,single_parent,female,90-95,69,in_hh_with_children,48750,11.827615


In [None]:
# The rescaled counts should add up to the household-type totals again.
rescaled_total_per_type = df_households_with_position.groupby('household_type')['count'].sum()
rescaled_total_per_type.reset_index()

Unnamed: 0,household_type,count
0,in_hh_with_children,48750.0
1,in_hh_without_children,15280.0
2,single_person,20850.0


That looks good

In [None]:
# Keep only the descriptive columns and the rescaled count.
output_columns = ["age_group", "gender", "household_position", "household_type", "count"]
df_households_with_position = df_households_with_position.loc[:, output_columns]
df_households_with_position

Unnamed: 0,age_group,gender,household_position,household_type,count
0,0-5,male,child,in_hh_with_children,2688.296859
1,5-10,male,child,in_hh_with_children,2653.328258
2,10-15,male,child,in_hh_with_children,2526.824204
3,15-20,male,child,in_hh_with_children,2347.695835
4,20-25,male,child,in_hh_with_children,1481.880147
...,...,...,...,...,...
275,75-80,female,single_parent,in_hh_with_children,34.625771
276,80-85,female,single_parent,in_hh_with_children,28.283427
277,85-90,female,single_parent,in_hh_with_children,19.541277
278,90-95,female,single_parent,in_hh_with_children,11.827615


# Number of children in a household

Now for our next trick:

## Finding the percentages

In [None]:
# Load the household composition counts: one row per age group of the reference
# person, one column per household composition.
df_household_composition = read_local_household_composition()
df_household_composition

Unnamed: 0,reference_person_age,single,no_children,has_children,single_parent_1_children,single_parent_2_children,single_parent_3_children,miscellaneous,couple_0_children,couple_1_children,couple_2_children,couple_3_children
0,15 tot 20 jaar,2587,175,36,29,1,0,34,141,4,0,2
1,20 tot 25 jaar,11344,2028,697,349,75,15,257,1771,203,43,12
2,25 tot 30 jaar,14158,5765,3386,922,426,128,513,5252,1302,504,104
3,30 tot 35 jaar,11041,5561,8219,1202,815,378,401,5160,3019,2197,608
4,35 tot 40 jaar,9099,3461,12223,1433,1076,599,232,3229,2982,4336,1797
5,40 tot 45 jaar,8174,2494,13170,1731,1206,560,127,2367,2368,4769,2536
6,45 tot 50 jaar,9229,2845,14542,2339,1371,505,114,2731,2712,4910,2705
7,50 tot 55 jaar,9031,3516,12341,2391,1041,263,88,3428,3006,3798,1842
8,55 tot 60 jaar,9491,4652,8622,1909,550,114,107,4545,2911,2202,936
9,60 tot 65 jaar,9221,5676,4363,1133,220,41,101,5575,1866,807,296


### Single Parents
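$f(h_c) = \frac{h_c}{\sum^{c^+}_{c'=1} h_{c'}}$, where $h_c$ is the number of households with $c$ children and $c^+$ is the largest category (3 or more children).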

In [None]:
single_parent_columns = ['single_parent_1_children', 'single_parent_2_children', 'single_parent_3_children']

# Number of single-parent households per number of children, summed over all age groups.
single_parent_household_counts = (
    df_household_composition
    .melt(value_vars=single_parent_columns, var_name='household_type', value_name='count')
    .groupby('household_type')
    .sum()
)

# Normalise to fractions of all single-parent households.
df_n_children_per_single_parent = single_parent_household_counts / single_parent_household_counts.sum()
df_n_children_per_single_parent

Unnamed: 0_level_0,count
household_type,Unnamed: 1_level_1
single_parent_1_children,0.611782
single_parent_2_children,0.28183
single_parent_3_children,0.106388


This suggests that about 61% of single parents have only one child, 28% have two and the remaining 11% have three (or more, but we ignore that) children.

This does not mean that 61% of the children who live in a single-parent household are an only child: since a 2-children household houses two children, each household has to be weighted by its number of children, which changes the fractions.


In [None]:
# Weight each household fraction by the number of children living in such a
# household (rows are ordered 1, 2, 3 children), then renormalise so the result
# is the fraction of children rather than of households.
children_weighted = df_n_children_per_single_parent['count'] * [1, 2, 3]
df_n_children_in_single_parent_household = (children_weighted / children_weighted.sum()).to_frame('count')
df_n_children_in_single_parent_household

Unnamed: 0_level_0,count
household_type,Unnamed: 1_level_1
single_parent_1_children,0.409327
single_parent_2_children,0.377129
single_parent_3_children,0.213544


The equation can be simplified a bit

$n_c = n \cdot \frac{c \cdot h_c}{\sum^{c^+}_{c'=1} c' \cdot h_{c'}}$
 

In [None]:
# Same computation as above, but directly from the household counts:
# c = children per household, f_n_c = children in such households,
# n_c = fraction of all children that live in such households.
df_tst = (
    df_household_composition
    .melt(value_vars=['single_parent_1_children', 'single_parent_2_children', 'single_parent_3_children'],
          var_name='household_type',
          value_name='count')
    .groupby('household_type')
    .sum()
    .assign(c=[1, 2, 3])
    .assign(f_n_c=lambda frame: frame['c'] * frame['count'])
    .assign(n_c=lambda frame: frame['f_n_c'] / frame['f_n_c'].sum())
)

df_tst

Unnamed: 0_level_0,count,c,f_n_c,n_c
household_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
single_parent_1_children,15141,1,15141,0.409327
single_parent_2_children,6975,2,13950,0.377129
single_parent_3_children,2633,3,7899,0.213544


### Couples with children
We can do a similar trick for couples with children:

In [None]:
couple_columns = ['couple_1_children', 'couple_2_children', 'couple_3_children']

# Number of couple households per number of children, summed over all age groups.
couple_household_counts = (
    df_household_composition
    .melt(value_vars=couple_columns, var_name='household_type', value_name='count')
    .groupby('household_type')
    .sum()
)

# Normalise to fractions of all couple households with children.
df_n_children_per_couple = couple_household_counts / couple_household_counts.sum()
df_n_children_per_couple

Unnamed: 0_level_0,count
household_type,Unnamed: 1_level_1
couple_1_children,0.389767
couple_2_children,0.419031
couple_3_children,0.191202


In [None]:
# Weight each household fraction by its number of children (rows are ordered
# 1, 2, 3 children) and renormalise to get the fraction of children.
couple_children_weighted = df_n_children_per_couple['count'] * [1, 2, 3]
df_n_children_in_couple_household = (couple_children_weighted / couple_children_weighted.sum()).to_frame('count')
df_n_children_in_couple_household

Unnamed: 0_level_0,count
household_type,Unnamed: 1_level_1
couple_1_children,0.216365
couple_2_children,0.46522
couple_3_children,0.318416


### Children
For children, it is not yet known whether they live in a single-parent or a two-parent household, nor whether the two parents are married. This means we have to split them not into three groups (1, 2 or 3+ children) but into nine groups (1, 2 or 3+ children in a single-parent, married-couple or non-married-couple household)

For this, we have to know the relative frequencies of single- and two-parent households as well.

In [None]:
# Share of each parental household position among all parents.
# NOTE(review): these shares are computed from counts of parents (individuals),
# but further down they are applied as shares of children - confirm this is intended.
parent_positions = ['married_with_children', 'non_married_with_children', 'single_parent']
is_parent = df_households_with_position['household_position'].isin(parent_positions)
parents_per_position = df_households_with_position[is_parent].groupby('household_position')["count"].sum()
df_single_vs_couple_distribution = parents_per_position / parents_per_position.sum()
df_single_vs_couple_distribution

household_position
married_with_children        0.614912
non_married_with_children    0.207845
single_parent                0.177243
Name: count, dtype: float64

So now we know that about 61% of children have two married parents and that, of those 61%, 22% live in a single-child household. We can combine those two facts:

In [None]:
# Display again the fraction of couple households with 1, 2 or 3 children.
df_n_children_per_couple

Unnamed: 0_level_0,count
household_type,Unnamed: 1_level_1
couple_1_children,0.389767
couple_2_children,0.419031
couple_3_children,0.191202


In [None]:
# New row labels for the child-weighted distributions, one set per parental situation.
single_parent_child_labels = {
    'single_parent_1_children': 'child_of_single_parent_1_children',
    'single_parent_2_children': 'child_of_single_parent_2_children',
    'single_parent_3_children': 'child_of_single_parent_3_children'
}
married_child_labels = {
    'couple_1_children': 'child_in_married_with_1_children',
    'couple_2_children': 'child_in_married_with_2_children',
    'couple_3_children': 'child_in_married_with_3_children'
}
non_married_child_labels = {
    'couple_1_children': 'child_in_non_married_with_1_children',
    'couple_2_children': 'child_in_non_married_with_2_children',
    'couple_3_children': 'child_in_non_married_with_3_children'
}

# Stack the three conditional distributions (each sums to 1) ...
df_n_children = pd.concat([
    df_n_children_in_single_parent_household.rename(index=single_parent_child_labels),
    df_n_children_in_couple_household.rename(index=married_child_labels),
    df_n_children_in_couple_household.rename(index=non_married_child_labels),
])

# ... and scale each of them by the share of its parental situation, so that
# the nine fractions together sum to 1.
msk_single_parent = df_n_children.index.str.contains('single_parent')
msk_non_married = df_n_children.index.str.contains('non_married')
msk_married = ~(msk_single_parent | msk_non_married)
for row_mask, parent_position in (
        (msk_single_parent, 'single_parent'),
        (msk_non_married, 'non_married_with_children'),
        (msk_married, 'married_with_children'),
):
    df_n_children.loc[row_mask, 'count'] *= df_single_vs_couple_distribution.loc[parent_position]
df_n_children

Unnamed: 0_level_0,count
household_type,Unnamed: 1_level_1
child_of_single_parent_1_children,0.07255
child_of_single_parent_2_children,0.066844
child_of_single_parent_3_children,0.037849
child_in_married_with_1_children,0.133045
child_in_married_with_2_children,0.286069
child_in_married_with_3_children,0.195798
child_in_non_married_with_1_children,0.04497
child_in_non_married_with_2_children,0.096694
child_in_non_married_with_3_children,0.066181


In [None]:
# The nine fractions should add up to 1.
df_n_children.sum(axis=0)

count    1.0
dtype: float64

Actually, we have to do the same for the `df_n_children_per_couple` frame, splitting the couples with children into married or unmarried

In [None]:
# Share of married versus non-married parents among all parents living as a couple.
couple_positions = ['married_with_children', 'non_married_with_children']
is_couple_parent = df_households_with_position['household_position'].isin(couple_positions)
couple_parents_per_position = df_households_with_position[is_couple_parent].groupby('household_position')[
    "count"].sum()
df_married_vs_not_distribution = couple_parents_per_position / couple_parents_per_position.sum()
df_married_vs_not_distribution

household_position
married_with_children        0.74738
non_married_with_children    0.25262
Name: count, dtype: float64

In [None]:
married_couple_labels = {
    'couple_1_children': 'married_with_1_children', 'couple_2_children': 'married_with_2_children',
    'couple_3_children': 'married_with_3_children'
}
non_married_couple_labels = {
    'couple_1_children': 'non_married_with_1_children', 'couple_2_children': 'non_married_with_2_children',
    'couple_3_children': 'non_married_with_3_children'
}

# Duplicate the per-couple distribution for married and non-married couples ...
df_n_couples = pd.concat([
    df_n_children_per_couple.rename(index=married_couple_labels),
    df_n_children_per_couple.rename(index=non_married_couple_labels),
])

# ... and scale each copy by the share of its marital status, so the six
# fractions together sum to 1.
msk = df_n_couples.index.str.contains('non_married')
for row_mask, couple_position in ((msk, 'non_married_with_children'), (~msk, 'married_with_children')):
    df_n_couples.loc[row_mask, 'count'] *= df_married_vs_not_distribution.loc[couple_position]
df_n_couples

Unnamed: 0_level_0,count
household_type,Unnamed: 1_level_1
married_with_1_children,0.291304
married_with_2_children,0.313176
married_with_3_children,0.1429
non_married_with_1_children,0.098463
non_married_with_2_children,0.105856
non_married_with_3_children,0.048301


In [None]:
# The six fractions should add up to 1.
df_n_couples.sum(axis=0)

count    1.0
dtype: float64

## Using these frequencies

### Children

In [None]:
# Cross join (via a constant helper key) of every 'child' row with the nine
# child fractions: count_x is the number of children, count_y the fraction.
child_rows = df_households_with_position[df_households_with_position['household_position'] == 'child']
child_fractions = df_n_children.reset_index()
df_households_with_position_children = (
    child_rows.assign(key=1)
    .merge(child_fractions.assign(key=1), on='key')
    .drop(columns='key')
)
df_households_with_position_children

Unnamed: 0,age_group,gender,household_position,household_type_x,count_x,household_type_y,count_y
0,0-5,male,child,in_hh_with_children,2688.296859,child_of_single_parent_1_children,0.072550
1,0-5,male,child,in_hh_with_children,2688.296859,child_of_single_parent_2_children,0.066844
2,0-5,male,child,in_hh_with_children,2688.296859,child_of_single_parent_3_children,0.037849
3,0-5,male,child,in_hh_with_children,2688.296859,child_in_married_with_1_children,0.133045
4,0-5,male,child,in_hh_with_children,2688.296859,child_in_married_with_2_children,0.286069
...,...,...,...,...,...,...,...
355,95+,female,child,in_hh_with_children,0.000000,child_in_married_with_2_children,0.286069
356,95+,female,child,in_hh_with_children,0.000000,child_in_married_with_3_children,0.195798
357,95+,female,child,in_hh_with_children,0.000000,child_in_non_married_with_1_children,0.044970
358,95+,female,child,in_hh_with_children,0.000000,child_in_non_married_with_2_children,0.096694


### Single Parents


In [None]:
# Split every 'single_parent' row over households with 1, 2 or 3 children.
#
# Each single-parent household contains exactly one single parent, so parents
# are distributed over the household sizes exactly like the households
# themselves: df_n_children_per_single_parent (h_c / sum h_c). The
# child-weighted distribution df_n_children_in_single_parent_household
# describes where *children* live and would over-assign parents to large
# households. Both distributions sum to 1, so totals are unaffected.
#
# The merge is a cross join via a constant helper key: count_x is the number
# of single parents, count_y the fraction for the household size.
single_parent_rows = df_households_with_position[
    df_households_with_position['household_position'] == 'single_parent']
single_parent_fractions = df_n_children_per_single_parent.reset_index()
df_households_with_position_single_parent = (
    single_parent_rows.assign(key=1)
    .merge(single_parent_fractions.assign(key=1), on='key')
    .drop(columns='key')
)
df_households_with_position_single_parent

Unnamed: 0,age_group,gender,household_position,household_type_x,count_x,household_type_y,count_y
0,0-5,male,single_parent,in_hh_with_children,0.000000,single_parent_1_children,0.409327
1,0-5,male,single_parent,in_hh_with_children,0.000000,single_parent_2_children,0.377129
2,0-5,male,single_parent,in_hh_with_children,0.000000,single_parent_3_children,0.213544
3,5-10,male,single_parent,in_hh_with_children,0.000000,single_parent_1_children,0.409327
4,5-10,male,single_parent,in_hh_with_children,0.000000,single_parent_2_children,0.377129
...,...,...,...,...,...,...,...
115,90-95,female,single_parent,in_hh_with_children,11.827615,single_parent_2_children,0.377129
116,90-95,female,single_parent,in_hh_with_children,11.827615,single_parent_3_children,0.213544
117,95+,female,single_parent,in_hh_with_children,2.742635,single_parent_1_children,0.409327
118,95+,female,single_parent,in_hh_with_children,2.742635,single_parent_2_children,0.377129


### Married couples

In [None]:
# Split every 'married_with_children' row over households with 1, 2 or 3 children.
#
# Every couple household contains the same number of parents regardless of its
# number of children, so parents are distributed over the household sizes
# exactly like the households themselves: df_n_children_per_couple
# (h_c / sum h_c). The child-weighted df_n_children_in_couple_household
# describes where *children* live and would over-assign parents to large
# households. Both distributions sum to 1, so totals are unaffected.
#
# The merge is a cross join via a constant helper key: count_x is the number
# of married parents, count_y the fraction for the household size.
married_parent_rows = df_households_with_position[
    df_households_with_position['household_position'] == 'married_with_children']
married_parent_fractions = df_n_children_per_couple.rename(index={
    'couple_1_children': 'married_with_1_children',
    'couple_2_children': 'married_with_2_children',
    'couple_3_children': 'married_with_3_children',
}).reset_index()
df_households_with_position_married_parents = (
    married_parent_rows.assign(key=1)
    .merge(married_parent_fractions.assign(key=1), on='key')
    .drop(columns='key')
)
df_households_with_position_married_parents

Unnamed: 0,age_group,gender,household_position,household_type_x,count_x,household_type_y,count_y
0,0-5,male,married_with_children,in_hh_with_children,0.000000,married_with_1_children,0.216365
1,0-5,male,married_with_children,in_hh_with_children,0.000000,married_with_2_children,0.465220
2,0-5,male,married_with_children,in_hh_with_children,0.000000,married_with_3_children,0.318416
3,5-10,male,married_with_children,in_hh_with_children,0.000000,married_with_1_children,0.216365
4,5-10,male,married_with_children,in_hh_with_children,0.000000,married_with_2_children,0.465220
...,...,...,...,...,...,...,...
115,90-95,female,married_with_children,in_hh_with_children,0.514244,married_with_2_children,0.465220
116,90-95,female,married_with_children,in_hh_with_children,0.514244,married_with_3_children,0.318416
117,95+,female,married_with_children,in_hh_with_children,0.000000,married_with_1_children,0.216365
118,95+,female,married_with_children,in_hh_with_children,0.000000,married_with_2_children,0.465220


In [None]:
# Split every 'non_married_with_children' row over households with 1, 2 or 3 children.
#
# Every couple household contains the same number of parents regardless of its
# number of children, so parents are distributed over the household sizes
# exactly like the households themselves: df_n_children_per_couple
# (h_c / sum h_c). The child-weighted df_n_children_in_couple_household
# describes where *children* live and would over-assign parents to large
# households. Both distributions sum to 1, so totals are unaffected.
#
# The merge is a cross join via a constant helper key: count_x is the number
# of non-married parents, count_y the fraction for the household size.
non_married_parent_rows = df_households_with_position[
    df_households_with_position['household_position'] == 'non_married_with_children']
non_married_parent_fractions = df_n_children_per_couple.rename(index={
    'couple_1_children': 'non_married_with_1_children',
    'couple_2_children': 'non_married_with_2_children',
    'couple_3_children': 'non_married_with_3_children',
}).reset_index()
df_households_with_position_non_married_parents = (
    non_married_parent_rows.assign(key=1)
    .merge(non_married_parent_fractions.assign(key=1), on='key')
    .drop(columns='key')
)
df_households_with_position_non_married_parents

Unnamed: 0,age_group,gender,household_position,household_type_x,count_x,household_type_y,count_y
0,0-5,male,non_married_with_children,in_hh_with_children,0.000000,non_married_with_1_children,0.216365
1,0-5,male,non_married_with_children,in_hh_with_children,0.000000,non_married_with_2_children,0.465220
2,0-5,male,non_married_with_children,in_hh_with_children,0.000000,non_married_with_3_children,0.318416
3,5-10,male,non_married_with_children,in_hh_with_children,0.000000,non_married_with_1_children,0.216365
4,5-10,male,non_married_with_children,in_hh_with_children,0.000000,non_married_with_2_children,0.465220
...,...,...,...,...,...,...,...
115,90-95,female,non_married_with_children,in_hh_with_children,0.000000,non_married_with_2_children,0.465220
116,90-95,female,non_married_with_children,in_hh_with_children,0.000000,non_married_with_3_children,0.318416
117,95+,female,non_married_with_children,in_hh_with_children,0.171415,non_married_with_1_children,0.216365
118,95+,female,non_married_with_children,in_hh_with_children,0.171415,non_married_with_2_children,0.465220


## Bringing it back together

In [None]:
# Stack the four cross-joined frames, turn (count_x, count_y) into an absolute
# count, and use the detailed label (household_type_y) as the new position.
df_households_with_children = (
    pd.concat([
        df_households_with_position_children,
        df_households_with_position_single_parent,
        df_households_with_position_married_parents,
        df_households_with_position_non_married_parents
    ])
    .assign(
        count=lambda frame: frame['count_x'] * frame['count_y'],
        household_position=lambda frame: frame['household_type_y'],
    )
    .rename(columns={'household_type_x': 'household_type'})
    .loc[:, ['age_group', 'gender', 'household_position', 'household_type', 'count']]
)
df_households_with_children

Unnamed: 0,age_group,gender,household_position,household_type,count
0,0-5,male,child_of_single_parent_1_children,in_hh_with_children,195.037014
1,0-5,male,child_of_single_parent_2_children,in_hh_with_children,179.695287
2,0-5,male,child_of_single_parent_3_children,in_hh_with_children,101.750041
3,0-5,male,child_in_married_with_1_children,in_hh_with_children,357.665315
4,0-5,male,child_in_married_with_2_children,in_hh_with_children,769.038737
...,...,...,...,...,...
115,90-95,female,non_married_with_2_children,in_hh_with_children,0.000000
116,90-95,female,non_married_with_3_children,in_hh_with_children,0.000000
117,95+,female,non_married_with_1_children,in_hh_with_children,0.037088
118,95+,female,non_married_with_2_children,in_hh_with_children,0.079745


In [None]:
# Positions that were not split by number of children are taken over unchanged
# and combined with the refined rows for households with children.
positions_without_children = ['single', 'non_married_no_children', 'married_no_children']
is_without_children = df_households_with_position['household_position'].isin(positions_without_children)
df_households_with_position_and_children = pd.concat([
    df_households_with_position[is_without_children],
    df_households_with_children
])
df_households_with_position_and_children

Unnamed: 0,age_group,gender,household_position,household_type,count
40,0-5,male,single,single_person,0.000000
41,5-10,male,single,single_person,0.000000
42,10-15,male,single,single_person,0.000000
43,15-20,male,single,single_person,181.759850
44,20-25,male,single,single_person,884.021375
...,...,...,...,...,...
115,90-95,female,non_married_with_2_children,in_hh_with_children,0.000000
116,90-95,female,non_married_with_3_children,in_hh_with_children,0.000000
117,95+,female,non_married_with_1_children,in_hh_with_children,0.037088
118,95+,female,non_married_with_2_children,in_hh_with_children,0.079745


In [None]:
# Total number of people before splitting by number of children.
df_households_with_position.loc[:, 'count'].sum()

84880.0

In [None]:
# Total number of people after splitting by number of children (should be unchanged).
df_households_with_position_and_children.loc[:, 'count'].sum()

84880.0

In [None]:
# Totals per household type before and after the split, side by side.
count_before_split = df_households_with_position.groupby('household_type')['count'].sum()
count_after_split = df_households_with_position_and_children.groupby('household_type')['count'].sum()
pd.concat([count_before_split, count_after_split], axis=1)

Unnamed: 0_level_0,count,count
household_type,Unnamed: 1_level_1,Unnamed: 2_level_1
in_hh_with_children,48750.0,48750.0
in_hh_without_children,15280.0,15280.0
single_person,20850.0,20850.0


The two data frames still look the same. We can start adding household position from the `df_households_with_position_and_children` frame

In [None]:
# Persist the final frame for use in later processing steps.
output_path = '../processed/df_households_with_position_and_children.pkl'
df_households_with_position_and_children.to_pickle(output_path)