In [None]:
import pandas as pd
from process_bulk import ProcessBulk
import evaluation_helpers
from process_geography import Ward, LocalAuthority
import os
from copy import deepcopy
import create_plots

In [None]:
pd.options.mode.copy_on_write = True
pd.set_option("future.no_silent_downcasting", True)

In [None]:
# --- Input locations ---------------------------------------------------------
# Bulk data: folder, sub-folder used for the tenure table and the index workbook
bulk_folder_name = "bulk_data"
ward_folder_name_tenure = "dc4201ew_htward"
index_sheet_name = "Cell Numbered DC Tables 3.2.xlsx"

# Confidence intervals: folder, workbook and the table read from it later on
cl_folder_name = "confidence_intervals"
cl_sheet_name = "2011censusconfidenceintervalsdec2013_tcm77-271638.xlsx"
cl_table_name = "95% CI widths (LA level)"

# Table analysed in this notebook:
# Tenure by ethnic group by age - Household Reference Persons
table_name_tenure = "DC4201EW"

# --- Output location ---------------------------------------------------------
# Folder name handed to the plotting helpers
fig_folder_name = "figs/population_analysis"

# --- Loaders (confidence-interval loader is still constructed first) ---------
p_cl = ProcessBulk(bulk_folder=cl_folder_name, index_sheet=cl_sheet_name)
p = ProcessBulk(bulk_folder=bulk_folder_name, index_sheet=index_sheet_name)

In [None]:
# Ethnicity labels used for filtering, organised as
# {broad group: [detailed categories inside that group]}.
_ethnic_categories = {
    "White": [
        "English/Welsh/Scottish/Northern Irish/British",
        "Irish",
        "Gypsy or Irish Traveller",
        "Other White",
    ],
    "Mixed/multiple ethnic group": [
        "White and Black Caribbean",
        "White and Black African",
        "White and Asian",
        "Other Mixed",
    ],
    "Asian/Asian British": [
        "Indian",
        "Pakistani",
        "Bangladeshi",
        "Chinese",
        "Other Asian",
    ],
    "Black/African/Caribbean/Black British": [
        "African",
        "Caribbean",
        "Other Black",
    ],
    "Other ethnic group": [
        "Arab",
        "Any other ethnic group",
    ],
}

# One "<group>: Total" label per broad group, in mapping order
general_ethnicities = [f"{group}: Total" for group in _ethnic_categories]

# One "<group>: <category>" label per detailed category, in mapping order
detailed_ethnicities = [
    f"{group}: {category}"
    for group, categories in _ethnic_categories.items()
    for category in categories
]

# Shortened labels, position-aligned with `detailed_ethnicities`. The
# shortening is not uniform (some entries keep their group prefix), so the
# list stays spelled out instead of being derived.
detailed_ethnicities_simplified = [
    "White: English/Welsh/Scottish/Northern Irish/British",
    "White: Irish",
    "White: Gypsy or Irish Traveller",
    "Other White",
    "White and Black Caribbean",
    "White and Black African",
    "White and Asian",
    "Mixed/multiple ethnic group: Other Mixed",
    "Indian",
    "Pakistani",
    "Bangladeshi",
    "Chinese",
    "Other Asian",
    "African",
    "Caribbean",
    "Other Black",
    "Arab",
    "Any other ethnic group",
]

# Tenure labels used for filtering.
# `all_tenure` lists the overall total first, then every broad group
# ("...: Total") followed by the detailed categories that belong to it.
all_tenure = [
    "All categories: Tenure",
    "Owned or shared ownership: Total",
    "Owned: Owned outright",
    "Owned: Owned with a mortgage or loan or shared ownership",
    "Social rented: Total",
    "Social rented: Rented from council (Local Authority)",
    "Social rented: Other social rented",
    "Private rented or living rent free: Total",
    "Private rented: Private landlord or letting agency",
    "Private rented: Other private rented or living rent free",
]

# Everything except the overall total, split into the ": Total" rows (broad
# groups) and the remaining rows (detailed categories); order is preserved.
_tenure_without_overall = all_tenure[1:]
general_tenure = [
    label for label in _tenure_without_overall if label.endswith(": Total")
]
detailed_tenure = [
    label for label in _tenure_without_overall if not label.endswith(": Total")
]

In [None]:
# Parameter grids: `mus` is handed to the KL-divergence measurement,
# `epsilons` to the DP measurement set-up.
mus = [1.0, 0.5, 0.1, 0.01, 0.001, 0.0001]
epsilons = [0.001, 0.01, 0.1, 1, 3, 5, 10]
# Shorter grid (epsilons up to and including 1) used for the tables and plots
epsilons_short = [eps for eps in epsilons if eps <= 1]

In [None]:
column_names = ["Age", "Tenure", "EthnicGroup", "Dataset"]

# Both set-ups read the same table from the same sub-folder; only `level`
# differs (6 for the ward frames, 5 for the local-authority frames).
shared_set_up_kwargs = dict(
    table_name=table_name_tenure,
    df_type="nested",
    column_names=column_names,
    num_nested_category=10,
    subfolder=ward_folder_name_tenure,
)

dfs_ward = p.set_up(level=6, **shared_set_up_kwargs)
dfs_la = p.set_up(level=5, **shared_set_up_kwargs)

In [None]:
# Helper objects from process_geography for the ward and local-authority levels
ward = Ward()
local_authority = LocalAuthority()
# geo lookup file is the same for region and ward: it is loaded once through
# the ward helper and passed to both get_local_authority and get_ward below
geo_lookup = ward.get_geo_lookup_ward()

In [None]:
# colorschemes from https://personal.sron.nl/~pault/#fig:scheme_bright

# Four-colour palette; the three-colour palette is its first three entries.
colors_4 = ["#004488", "#DDAA33", "#BB5566", "#1B7837"]
colors_3 = colors_4[:3]

colors_bright = [
    "#4477AA",
    "#EE6677",
    "#228833",
    "#CCBB44",
    "#66CCEE",
    "#AA3377",
    "#BBBBBB",
]
colors_vibrant = [
    "#EE7733",
    "#0077BB",
    "#33BBEE",
    "#EE3377",
    "#CC3311",
    "#009988",
    "#BBBBBB",
]
colors_muted = [
    "#CC6677",
    "#332288",
    "#DDCC77",
    "#117733",
    "#88CCEE",
    "#882255",
    "#44AA99",
    "#999933",
    "#AA4499",
]

# One red accent followed by three greys/dark tones (used for the bar plots)
colors_grey = ["#EF233C", "#B7BFCC", "#8894AA", "#2B2F42"]

In [None]:
# reading in the confidence intervals for all local authorities
# (the table to read is selected by `cl_table_name`)
sheet_cl = p_cl.read_cl(cl_table_name)

In [None]:
# Local-authority frame, built from the CSV data of the level-5 set-up and
# the geography lookup
df_la = local_authority.get_local_authority(dfs_la["csv_df"], geo_lookup)

In [None]:
# Ward frame, built from the CSV data of the level-6 set-up and the
# geography lookup
df_ward = ward.get_ward(dfs_ward["csv_df"], geo_lookup)

In [None]:
# Disabled: builds the filtered frames for *every* ward code in `df_ward`
# (not only the three wards analysed further down). It is kept as `#`
# comments rather than a bare string literal so that the cell no longer
# echoes the code as its output.
#
# filter_dict_ward = {
#     "Age": ["All categories: Age"],
#     "Tenure": ["All categories: Tenure"],
#     "EthnicGroup": detailed_ethnicities,
# }
#
# ward_codes = df_ward["GeographyCode"].tolist()
#
# wards = evaluation_helpers.get_filtered_df_ward_dict(
#     p, ward, df_ward, ward_codes, dfs_ward["lookup_df"], filter_dict_ward
# )

In [None]:
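# Disabled: derive the per-ward overview from a `wards` dict covering all wards.
# The next cell loads the overview from overview_wards.csv instead.
# overview_wards = evaluation_helpers.create_overview(wards)
# overview_wards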

In [None]:
# Load the ward overview table from the bulk-data folder
overview_wards_path = os.path.join(p.get_bulk_data_path(), "overview_wards.csv")
overview_wards = pd.read_csv(overview_wards_path)

In [None]:
# Largest wards first. `sort_values` returns a new frame; rebinding the name
# instead of sorting with inplace=True avoids mutating the frame the previous
# cell loaded and gives the same result every time the cell is re-run.
overview_wards = overview_wards.sort_values(by=["total"], ascending=False)

# Keep the rows whose area code starts with "E"
# (presumably the wards in England, as the variable name suggests — confirm).
overview_wards_england = overview_wards[overview_wards["area_code"].str.startswith("E")]
overview_wards_england

In [None]:
# Number of wards (non-null `area_name` entries) for each value of
# `number_minorities`
minorities_wards = overview_wards.groupby("number_minorities")["area_name"].count()
# minorities_wards

Choosing 3 wards with approximately the same total population but different levels of minority populations. Here I classify a ward as low diversity if its number of minority populations is 5 or fewer, as medium diversity if it is between 6 and 10, and as high diversity if it is above 10.

In [None]:
# Low diversity: at most 5 minority populations,
# total strictly between 1100 and 1200
low = overview_wards.loc[
    overview_wards["number_minorities"].le(5)
    & overview_wards["total"].between(1100, 1200, inclusive="neither")
]
low

In [None]:
# Medium diversity: more than 5 and at most 10 minority populations,
# total strictly between 1100 and 1200
medium = overview_wards.loc[
    overview_wards["number_minorities"].between(5, 10, inclusive="right")
    & overview_wards["total"].between(1100, 1200, inclusive="neither")
]
# To inspect a single ward: medium.loc[medium["area_code"] == "E36000439"]
medium

In [None]:
# All medium-diversity wards (more than 5 and at most 10 minority
# populations), regardless of size, ordered from smallest to largest total.
# The sort is chained instead of applied with inplace=True, so the frame
# derived via .loc is never mutated after it has been created.
medium_total = overview_wards.loc[
    (overview_wards["number_minorities"] <= 10)
    & (overview_wards["number_minorities"] > 5)
].sort_values(by=["total"], ascending=True)
medium_total

In [None]:
# High diversity: more than 10 minority populations,
# total strictly between 1100 and 1200
high = overview_wards.loc[
    overview_wards["number_minorities"].gt(10)
    & overview_wards["total"].between(1100, 1200, inclusive="neither")
]
high

Now I load the 3 wards I have chosen. 

In [None]:
# Keep the all-ages / all-tenures rows, broken down by detailed ethnic group
filter_dict_ward = dict(
    Age=["All categories: Age"],
    Tenure=["All categories: Tenure"],
    EthnicGroup=detailed_ethnicities,
)

# ward codes of the 3 wards I have chosen for analysis
ward_codes = ["E36002358", "E36000439", "E36003322"]

# Filtered data for the chosen wards only
wards = evaluation_helpers.get_filtered_df_ward_dict(
    p,
    ward,
    df_ward,
    ward_codes,
    dfs_ward["lookup_df"],
    filter_dict_ward,
)

For the following experiments, a fixed random state is used to achieve deterministic behaviour. This makes it possible to compare DP with different kinds of post-processing.
By using `random_state`, the same amount of DP noise is applied to the data error column and to the ground truth data.

In [None]:
# Geometric mechanism over the full epsilon grid, with clipping and without
# rounding. Returns the per-ward DP results and a metrics frame.
# random_state=1 is the same seed used by every other run in this notebook.
# NOTE(review): delta=0 and sensitivity=2 are passed unchanged to the helper;
# their exact interpretation lives in evaluation_helpers — confirm there.
wards_dp_geo_clip, metrics_df_geo_clip = evaluation_helpers.set_up_measurements_wards(
    wards,
    df_ward,
    sheet_cl,
    "geometric",
    epsilons,
    delta=0,
    sensitivity=2,
    clipping=True,
    rounding=False,
    random_state=1,
)

In [None]:
# Geometric mechanism again, identical settings except that clipping is off
# (no clipping, no rounding); same seed as the clipped run.
wards_dp_geo, metrics_df_geo = evaluation_helpers.set_up_measurements_wards(
    wards,
    df_ward,
    sheet_cl,
    "geometric",
    epsilons,
    delta=0,
    sensitivity=2,
    clipping=False,
    rounding=False,
    random_state=1,
)

In [None]:
# Restrict both geometric metric frames to the epsilons in `epsilons_short`
metrics_df_geo_clip_short = metrics_df_geo_clip.loc[
    metrics_df_geo_clip.index.isin(epsilons_short)
]
metrics_df_geo_short = metrics_df_geo.loc[metrics_df_geo.index.isin(epsilons_short)]

In [None]:
# Column names for the "significantly changed" tables. The data-error and
# data-error+DP variants are the DP column names with a suffix appended.
table_features_dp = ["significantly_decreased", "significantly_increased"]
table_features_data_error = [f"{name}_data_error" for name in table_features_dp]
table_features_data_error_dp = [
    f"{name}_data_error_dp" for name in table_features_dp
]

In [None]:
# Column names for the population tables: the base column and its
# DP / data-error / data-error+DP variants (base name plus a suffix).
table_features_pop = ["total"]
table_features_pop_dp = [f"{name}_dp" for name in table_features_pop]
table_features_pop_data_error = [
    f"{name}_data_error" for name in table_features_pop
]
table_features_pop_data_error_dp = [
    f"{name}_data_error_dp" for name in table_features_pop
]

In [None]:
# Population table for the geometric mechanism (clipped and unclipped runs).
# The unfiltered metric frames are passed together with `epsilons_short`;
# presumably the helper selects the epsilons itself — confirm in create_plots.
create_plots.create_table_pop(
    table_features_pop,
    table_features_pop_dp,
    table_features_pop_data_error,
    table_features_pop_data_error_dp,
    metrics_df_geo_clip,
    metrics_df_geo,
    epsilons_short,
)

In [None]:
# Second population table, called with exactly the same inputs as
# create_table_pop above.
# NOTE(review): how the two tables differ is defined in create_plots.
create_plots.create_table_pop_2(
    table_features_pop,
    table_features_pop_dp,
    table_features_pop_data_error,
    table_features_pop_data_error_dp,
    metrics_df_geo_clip,
    metrics_df_geo,
    epsilons_short,
)

In [None]:
# Table of the "significantly decreased / increased" DP columns for the
# clipped and unclipped geometric runs
create_plots.create_table_dp(
    table_features_dp, metrics_df_geo_clip, metrics_df_geo, epsilons_short
)

In [None]:
# Same table, additionally including the data-error columns
create_plots.create_table_dp_data_error(
    table_features_dp,
    table_features_data_error,
    metrics_df_geo_clip,
    metrics_df_geo,
    epsilons_short,
)

KL-divergence can only be measured if clipping is applied, since it is a measure defined on probability distributions. If clipping is not applied, values can be negative, which would lead to negative probabilities in the distribution; therefore I only measure KL-divergence if clipping is used.

In [None]:
# KL divergence for the clipped geometric run, over the full `epsilons`
# grid and the `mus` grid
measurement_kl_divergence_df_geo_clip = evaluation_helpers.measure_kl_divergence(
    wards_dp_geo_clip, epsilons, mus
)

In [None]:
# Laplace mechanism with clipping and without rounding; all other settings
# (delta, sensitivity, seed) match the geometric runs.
wards_dp_laplace_clip, metrics_df_laplace_clip = (
    evaluation_helpers.set_up_measurements_wards(
        wards,
        df_ward,
        sheet_cl,
        "laplace",
        epsilons,
        delta=0,
        sensitivity=2,
        clipping=True,
        rounding=False,
        random_state=1,
    )
)

# KL divergence for the clipped Laplace run (clipping is on, so it is defined)
measurement_kl_divergence_df_laplace_clip = evaluation_helpers.measure_kl_divergence(
    wards_dp_laplace_clip, epsilons, mus
)

In [None]:
# Laplace mechanism without clipping and without rounding
wards_dp_laplace, metrics_df_laplace = evaluation_helpers.set_up_measurements_wards(
    wards,
    df_ward,
    sheet_cl,
    "laplace",
    epsilons,
    delta=0,
    sensitivity=2,
    clipping=False,
    rounding=False,
    random_state=1,
)

In [None]:
# Laplace mechanism with rounding but without clipping
wards_dp_laplace_round, metrics_df_laplace_round = (
    evaluation_helpers.set_up_measurements_wards(
        wards,
        df_ward,
        sheet_cl,
        "laplace",
        epsilons,
        delta=0,
        sensitivity=2,
        clipping=False,
        rounding=True,
        random_state=1,
    )
)

In [None]:
# Laplace mechanism with both rounding and clipping
wards_dp_laplace_round_clip, metrics_df_laplace_round_clip = (
    evaluation_helpers.set_up_measurements_wards(
        wards,
        df_ward,
        sheet_cl,
        "laplace",
        epsilons,
        delta=0,
        sensitivity=2,
        clipping=True,
        rounding=True,
        random_state=1,
    )
)
# KL divergence for the rounded + clipped Laplace run
measurement_kl_divergence_df_laplace_round_clip = (
    evaluation_helpers.measure_kl_divergence(wards_dp_laplace_round_clip, epsilons, mus)
)

In [None]:
# Keep the rows whose first index level (epsilon) is in `epsilons_short`
measurement_kl_divergence_df_geo_clip_short = measurement_kl_divergence_df_geo_clip.loc[
    measurement_kl_divergence_df_geo_clip.index.isin(epsilons_short, level=0)
]

In [None]:
# Keep the rows whose first index level (epsilon) is in `epsilons_short`
measurement_kl_divergence_df_laplace_clip_short = (
    measurement_kl_divergence_df_laplace_clip.loc[
        measurement_kl_divergence_df_laplace_clip.index.isin(epsilons_short, level=0)
    ]
)

In [None]:
# `measurement_kl_divergence_df_laplace_round_clip` is already computed in the
# cell that creates `wards_dp_laplace_round_clip`. This cell used to call
# measure_kl_divergence again with the same arguments, which only repeated the
# work; it now just filters the existing frame.
# Keep the rows whose first index level (epsilon) is in `epsilons_short`.
measurement_kl_divergence_df_laplace_round_clip_short = (
    measurement_kl_divergence_df_laplace_round_clip[
        measurement_kl_divergence_df_laplace_round_clip.index.isin(
            epsilons_short, level=0
        )
    ]
)

In [None]:
# Restrict every Laplace metric frame to the epsilons in `epsilons_short`
metrics_df_laplace_clip_short = metrics_df_laplace_clip.loc[
    metrics_df_laplace_clip.index.isin(epsilons_short)
]
metrics_df_laplace_short = metrics_df_laplace.loc[
    metrics_df_laplace.index.isin(epsilons_short)
]
metrics_df_laplace_round_short = metrics_df_laplace_round.loc[
    metrics_df_laplace_round.index.isin(epsilons_short)
]
metrics_df_laplace_round_clip_short = metrics_df_laplace_round_clip.loc[
    metrics_df_laplace_round_clip.index.isin(epsilons_short)
]

In [None]:
# One plot label per chosen ward: "<area name> (<diversity> diversity)".
# Name and diversity are read from the first row of the KL-divergence frame
# that matches the ward code.
labels_wards = []
for code in ward_codes:
    is_ward = measurement_kl_divergence_df_geo_clip_short.area_code == code
    area_name, diversity = measurement_kl_divergence_df_geo_clip_short.loc[
        is_ward, ["area_name", "diversity"]
    ].values[0]
    labels_wards.append(f"{area_name} ({diversity} diversity)")

print(labels_wards)

In [None]:
# Same labels as `labels_wards`, tagged with the mechanism name. Deriving
# them from `labels_wards` replaces a copy of the per-ward lookup loop and
# yields exactly the same strings.
labels_geo = [f"{label}, Geometric" for label in labels_wards]

print(labels_geo)

In [None]:
# Same labels as `labels_wards`, tagged with the mechanism name. Deriving
# them from `labels_wards` replaces a copy of the per-ward lookup loop and
# yields exactly the same strings.
labels_laplace = [f"{label}, Laplace" for label in labels_wards]

print(labels_laplace)

In [None]:
# Same labels as `labels_wards`, tagged as data error. Deriving them from
# `labels_wards` replaces a copy of the per-ward lookup loop and yields
# exactly the same strings.
labels_data_error = [f"{label}, Data Error" for label in labels_wards]

print(labels_data_error)

In [None]:
# KL-divergence plot for the clipped geometric run, one colour per ward.
# "geo" and `fig_folder_name` are passed through to the plotting helper
# (presumably used to name and place the figure — confirm in create_plots).
create_plots.plot_kl_divergence(
    colors_3,
    measurement_kl_divergence_df_geo_clip_short,
    ward_codes,
    labels_wards,
    fig_folder_name,
    "geo",
)

In [None]:
# KL-divergence plot for the Laplace mechanism. The input is the
# rounded + clipped Laplace frame.
# NOTE(review): `measurement_kl_divergence_df_laplace_clip_short` (clipped,
# not rounded) is computed earlier but is not used in the cells visible here
# — confirm the rounded variant is the intended input.
create_plots.plot_kl_divergence(
    colors_3,
    measurement_kl_divergence_df_laplace_round_clip_short,
    ward_codes,
    labels_wards,
    fig_folder_name,
    "laplace",
)

In [None]:
# RMSE plot comparing five runs: geometric, geometric + clipping, Laplace,
# Laplace + clipping and Laplace + rounding + clipping.
# `metrics_df_laplace_round_short` (rounding without clipping) is not passed.
# NOTE(review): five metric frames are passed alongside the four-colour
# palette — check in create_plots how colours map to the runs.
create_plots.plot_rmse(
    colors_4,
    metrics_df_geo_short,
    metrics_df_geo_clip_short,
    metrics_df_laplace_short,
    metrics_df_laplace_clip_short,
    metrics_df_laplace_round_clip_short,
    ward_codes,
    labels_wards,
    fig_folder_name,
)

In [None]:
# Bar plots per ethnicity for the clipped geometric run, for every epsilon
# in `epsilons_short`; the simplified ethnicity names are passed as
# `ethnicities`.
# NOTE(review): `step`, `width` and `gap` are layout parameters of the
# plotting helper — their exact meaning is defined in create_plots.
create_plots.create_several_bar_plot_pop_dp_on_data_error(
    wards=wards_dp_geo_clip,
    ethnicities=detailed_ethnicities_simplified,
    epsilons=epsilons_short,
    colors=colors_grey,
    step=4,
    width=0.8,
    gap=0,
    fig_name="ethnicities_geometric_overview",
    fig_folder_name=fig_folder_name,
)

In [None]:
# Same overview for the geometric run without clipping (only `wards` and
# `fig_name` differ from the clipped call).
create_plots.create_several_bar_plot_pop_dp_on_data_error(
    wards=wards_dp_geo,
    ethnicities=detailed_ethnicities_simplified,
    epsilons=epsilons_short,
    colors=colors_grey,
    step=4,
    width=0.8,
    gap=0,
    fig_name="ethnicities_geometric_overview_no_clipping",
    fig_folder_name=fig_folder_name,
)

In [None]:
# Single bar plot for the clipped geometric run at epsilon = 0.1 only
create_plots.create_bar_plot_pop_dp_on_data_error(
    wards=wards_dp_geo_clip,
    ethnicities=detailed_ethnicities_simplified,
    epsilons=[0.1],
    colors=colors_grey,
    step=4,
    width=0.8,
    gap=0,
    fig_name="ethnicities_geometric",
    fig_folder_name=fig_folder_name,
)

In [None]:
# Bar plot from the clipped geometric metrics (short epsilon grid) for the
# three wards; judging by the helper's name it shows the "increased" counts.
# NOTE(review): the meaning of the first two positional arguments
# (0.05 and 0.5) is defined in create_plots — confirm there, and consider
# passing them by keyword.
create_plots.barplot_inc(
    0.05,
    0.5,
    epsilons_short,
    metrics_df_geo_clip_short,
    ward_codes,
    labels_wards,
    fig_folder_name,
)

In [None]:
# Counterpart of barplot_inc with identical arguments; judging by the
# helper's name it shows the "decreased" counts.
# NOTE(review): the meaning of the first two positional arguments
# (0.05 and 0.5) is defined in create_plots — confirm there, and consider
# passing them by keyword.
create_plots.barplot_dec(
    0.05,
    0.5,
    epsilons_short,
    metrics_df_geo_clip_short,
    ward_codes,
    labels_wards,
    fig_folder_name,
)