# Imports + settings

In [None]:
# Widen the notebook container so wide DataFrames are readable.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import joblib
from collections import defaultdict
import json
import datetime as dt
from pathlib import Path

# To display BSNs fully
pd.set_option("display.max_colwidth", 1000)

# For convenience
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
from wpi_onderzoekswaardigheid_aanvraag.project_paths import ARTIFACT_PATH, DATA_PATH, CONFIG_PATH, INFO_PATH
from wpi_onderzoekswaardigheid_aanvraag.model.manage_model_info import load_feature_list
from wpi_onderzoekswaardigheid_aanvraag.model.build_model import filter_application_handling
from wpi_onderzoekswaardigheid_aanvraag.settings.settings import WPISettings
from wpi_onderzoekswaardigheid_aanvraag.components import SocratesDienstPersoonJoin, SocratesAdresFeatures
from wpi_onderzoekswaardigheid_aanvraag.scorer import Scorer

WPISettings.set_from_yaml(CONFIG_PATH);

In [None]:
from bias_collection.bias_analyzer import BiasAnalyzer
from fraude_preventie.datasources.dbutils import db_url_from_config

# Load data

In [None]:
prepilot_df = pd.read_csv("20220523_data_for_bias_analysis.csv")

In [None]:
selected_for_prepilot = pd.read_csv("../tickets/WT9D-244-prepilot-scores/20220517_applications_to_investigate_for_prepilot.csv")

In [None]:
prepilot_df["selected_for_prepilot"] = prepilot_df["application_dienstnr"].isin(selected_for_prepilot["Aanvraagnummer"])

In [None]:
# Model artifact that was used to score the applications in the prepilot.
model_file = Path('20220523_model_used_in_prepilot.pkl')
# Alternative: use the artifacts from the project's artifact directory instead.
# model_file = ARTIFACT_PATH / "model.pkl"
# ppl_file = ARTIFACT_PATH / "pipeline.pkl"

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source.
model_dict = joblib.load(model_file)
# The artifact is a dict; the fitted pipeline is stored under the "model" key.
model = model_dict["model"]

In [None]:
prep = model[:-1]  # all but the last pipeline steps, hence all transformers, but not the model
clf = model[-1]  # the actual model

num_cols, cat_cols = load_feature_list()
label = "onderzoekswaardig"
# X_test = pd.read_csv(DATA_PATH / "BIAS_X_test.csv")
# y_test = pd.read_csv(DATA_PATH / "BIAS_y_test.csv")
X_test = prepilot_df
y_test = prepilot_df["onderzoekswaardig"]

# Prepare the analysis input

#### Start

In [None]:
# Note that we need to use X_test for this rather than the transformed data, because
# to do the joins correctly we need some columns that are not in the transformed data
# anymore.
X_test_enriched = X_test.reset_index()

In [None]:
connection_info = WPISettings.get_settings()["connections"]["basisinformatie_db"];
connection_info["options"] = ""

Get from Postgresql DB - WPI dump
- leeftijd (age)
- nationaliteit (nationality)
- geslacht (gender)
- postcode

Get from Postgresql DB - BRP dump
- geboorteland (country of birth)
- burgerlijke staat (civil status)

#### Add nationaliteit, leeftijd

In [None]:
# Person records joined with the reference table that translates the
# NATIONALITEIT1 code into its description.
sql_query = """with ref as (
    select attribuut_waarde, attribuut_waarde_omschrijving
    from wpi_hashed.socrates_ref
    where attribuut = 'NATIONALITEIT1'
)
select subjectnr, dtopvoer, dtafvoer, dtgeboortegba, nationaliteit1, attribuut_waarde_omschrijving as nationaliteit
from wpi_hashed_v2.socrates_persoon sp
left join ref on sp.nationaliteit1 = ref.attribuut_waarde"""

nationaliteit_df = pd.read_sql(sql_query, db_url_from_config(connection_info))
# Use pd.to_datetime rather than the unit-less .astype("datetime64"), which
# newer pandas versions reject; only the year of birth is kept.
nationaliteit_df["geboortejaar"] = pd.to_datetime(nationaliteit_df["dtgeboortegba"]).dt.year

In [None]:
X_test_enriched = SocratesDienstPersoonJoin.join_dienst_persoon(X_test_enriched, nationaliteit_df)

In [None]:
# Age as the difference in calendar years between application and birth (month/day are ignored).
# pd.to_datetime replaces the unit-less .astype("datetime64"), which newer pandas versions reject.
X_test_enriched["leeftijd"] = pd.to_datetime(X_test_enriched["dtaanvraag"]).dt.year - X_test_enriched["geboortejaar"]

In [None]:
sql_query = """select attribuut_waarde, attribuut_waarde_omschrijving
from wpi_hashed.socrates_ref
where attribuut = 'NATIONALITEIT1'
"""

nationaliteit_mapping = pd.read_sql(sql_query, db_url_from_config(connection_info)).set_index("attribuut_waarde")["attribuut_waarde_omschrijving"].to_dict()

#### Add postcode

In [None]:
from wpi_onderzoekswaardigheid_aanvraag.preprocessing.clean import WPICleanTransformer

In [None]:
sql_query = """select subjectnr, dtbegin, dteinde, dtopvoer, dtafvoer, postcodenum, geldig
from wpi_hashed_v2.socrates_adres sp
"""

postcode_df = pd.read_sql(sql_query, db_url_from_config(connection_info))

In [None]:
postcode_df = WPICleanTransformer(
    remove_invalidated_data=True,
    col_type_mapping=[
        ("dtbegin", "datetime64"),
        ("dteinde", "datetime64"),
        ("dtopvoer", "datetime64"),
        ("dtafvoer", "datetime64"),
    ],
    fix_no_end_date=["dteinde"],
).transform(postcode_df)

In [None]:
df_tmp = SocratesAdresFeatures.join_applications_adres(X_test_enriched, postcode_df)
df_tmp = SocratesAdresFeatures.filter_adres_relevant_to_application(df_tmp)
X_test_enriched = df_tmp.sort_values("dtbegin_adres").drop_duplicates(
    "application_dienstnr", keep="last"
)

#### Add BSN from WPI data in order to join with BRP data

In [None]:
sql_query = """select subjectnr, bsn, dtopvoer
from wpi_hashed_v2.socrates_persoon
where bsn != 'eb763221a7e6f47e6c8f5062f8fd1ad18a95264c7366928afc8ed92e7d1917a3'
"""

bsn_df = pd.read_sql(sql_query, db_url_from_config(connection_info))

In [None]:
# Filter BSNs on the subject numbers that we need, then remove fully identical rows.
relevant_bsns = bsn_df[bsn_df["subjectnr"].isin(X_test_enriched["subjectnr"].unique())].drop_duplicates()
shape_step1 = relevant_bsns.shape
# Keep one row per subject: the one with the most recent `dtopvoer`.
relevant_bsns = relevant_bsns.sort_values("dtopvoer", ascending=True).drop_duplicates("subjectnr", keep="last")
shape_step2 = relevant_bsns.shape

# A shape change means at least one subject had several rows before deduplication.
# NOTE(review): rows can also differ only in `dtopvoer` while carrying the same BSN,
# so this warning may fire without there being two distinct BSNs -- confirm.
if shape_step1 != shape_step2:
    print("Warning: There were people with more than 1 BSN, for them the last known BSN is used.")

In [None]:
old_shape = X_test_enriched.shape
X_test_enriched = X_test_enriched.merge(relevant_bsns, how="left", on="subjectnr")
new_shape = X_test_enriched.shape

# Assert that the number of rows didn't change. If it did, we have subject numbers with more than 1 BSN!
assert old_shape[0] == new_shape[0]

#### Add geboorteland

In [None]:
sql_query = """select bsn, geboorteland
from bias_analyse_wpi_pre_pilot.brp_rapport
"""

geboorteland_df = pd.read_sql(sql_query, db_url_from_config(connection_info))

In [None]:
geboorteland_df = geboorteland_df.drop_duplicates()

In [None]:
old_shape = X_test_enriched.shape
X_test_enriched = X_test_enriched.merge(geboorteland_df, how="left", on="bsn")
new_shape = X_test_enriched.shape

# Assert that the number of rows didn't change. If it did, we have BSNs with more than 1 geboorteland.
assert old_shape[0] == new_shape[0]

X_test_enriched["geboorteland"] = pd.Categorical(X_test_enriched['geboorteland'])
X_test_enriched["geboorteland_code"] = X_test_enriched['geboorteland'].cat.codes

geboorteland_mapping = dict(enumerate(X_test_enriched["geboorteland"].cat.categories))

#### Add burgerlijke staat

- H = huwelijk
- P = geregistreerd partnerschap

In [None]:
sql_query = """select bsn, soort_verbintenis, datum_sluiting, datum_ontbinding
from bias_analyse_wpi_pre_pilot.brp_rapport
"""

burg_staat_df = pd.read_sql(sql_query, db_url_from_config(connection_info))

In [None]:
# The date 1001-01-01 lies outside the range of pandas nanosecond timestamps, so it is
# replaced by pd.Timestamp.min before converting the columns to datetimes.
# NOTE(review): 1001-01-01 presumably is the dump's placeholder for an unknown date -- confirm.
burg_staat_df["datum_sluiting"] = pd.to_datetime(burg_staat_df["datum_sluiting"].replace(dt.date(1001, 1, 1), pd.Timestamp.min))
burg_staat_df["datum_ontbinding"] = pd.to_datetime(burg_staat_df["datum_ontbinding"].replace(dt.date(1001, 1, 1), pd.Timestamp.min))

In [None]:
def get_civil_status_at_date(civil_status_df, bsn, date):
    """Determine the civil status of a person on a given reference date.

    Parameters
    ----------
    civil_status_df : pd.DataFrame
        Partnership records with the columns ``bsn``, ``datum_sluiting`` (start of the
        marriage/partnership) and ``datum_ontbinding`` (dissolution); both date columns
        are expected to be datetime-like, with missing values where no date is known.
    bsn :
        Identifier of the person to look up in the ``bsn`` column.
    date :
        Reference date; anything ``pd.to_datetime`` accepts (Timestamp, date, string).

    Returns
    -------
    str
        ``"single"`` when the person is unknown or had no partnership that started on or
        before ``date``; ``"partnership_or_married"`` when the most recent such
        partnership was not yet dissolved on ``date``; otherwise
        ``"separated_or_divorced_or_widowed"``.
    """
    date = pd.to_datetime(date)
    person_records = civil_status_df[civil_status_df["bsn"] == bsn]

    if len(person_records) == 0:
        logger.warning(f"BSN not found in dataframe, assuming that civil status is 'single': {bsn}")
        return "single"

    # Keep records without a partnership plus partnerships that had started by `date`.
    started_before = person_records["datum_sluiting"] <= date
    person_records = person_records[person_records["datum_sluiting"].isna() | started_before]

    # `datum_sluiting` is always filled in our dump for marriage/partnership (H/P).
    # So if none is filled, there was no partnership/marriage in the BRP on `date`.
    # (The BSN itself is known here, so no "not found" warning is logged.)
    if person_records["datum_sluiting"].notna().sum() == 0:
        return "single"

    # Most recent partnership that started on or before `date` (missing dates sort last).
    latest = person_records.sort_values("datum_sluiting", ascending=False).iloc[0]
    dissolved_on = latest["datum_ontbinding"]

    # A dissolution that happened after `date` means the partnership still existed on `date`.
    if pd.isna(dissolved_on) or dissolved_on > date:
        return "partnership_or_married"

    return "separated_or_divorced_or_widowed"

In [None]:
# Civil status of every applicant on the application date, looked up row by row.
civil_status_per_application = []
for applicant_bsn, application_date in zip(X_test_enriched["bsn"], X_test_enriched["dtaanvraag"]):
    civil_status_per_application.append(
        get_civil_status_at_date(burg_staat_df, applicant_bsn, application_date)
    )
X_test_enriched["burgerlijke_staat"] = civil_status_per_application

# Encode the status as a categorical and keep its integer codes for the bias analysis.
X_test_enriched["burgerlijke_staat"] = pd.Categorical(X_test_enriched['burgerlijke_staat'])
X_test_enriched["burgerlijke_staat_code"] = X_test_enriched['burgerlijke_staat'].cat.codes

# Lookup from integer code back to the status label.
burgerlijke_staat_mapping = {
    code: status for code, status in enumerate(X_test_enriched["burgerlijke_staat"].cat.categories)
}

#### Prepare final dataframe

In [None]:
prepilot_resultaten = pd.read_excel("2022.05.23_Merged_pre-pilot_templates.xlsx", header=1, sheet_name="Template")
prepilot_removed_applications = pd.read_excel("2022.05.23_Merged_pre-pilot_templates.xlsx", header=1, sheet_name="Removed")

In [None]:
# Filter out the people that got removed during the prepilot due to being BD or other reasons.
X_test = X_test[~X_test["subjectnr"].isin(prepilot_removed_applications["Administratienummer"])]

In [None]:
# Add column with prepilot results to X_test.
prepilot_resultaten["result_prepilot"] = prepilot_resultaten["Label"].replace({"Onderzoekswaardig": 1, "Niet onderzoekswaardig": 0})
X_test = X_test.merge(prepilot_resultaten[["Dienstnummer", "result_prepilot"]], how="left", left_on="application_dienstnr", right_on="Dienstnummer")

# Check that all prepilot results got merged to X_test.
assert X_test["result_prepilot"].value_counts().sum() == len(prepilot_resultaten)

In [None]:
X_test["included_in_prepilot"] = X_test["application_dienstnr"].isin(prepilot_resultaten["Dienstnummer"])

In [None]:
# Selection flags: the model selects an application when its score is at least 0.63,
# the current process selects it when `is_onderzoek_hh` is set.
model_selects = X_test["model_prob"] >= 0.63
model_rejects = X_test["model_prob"] < 0.63
process_selects = X_test["is_onderzoek_hh"]

X_test["selected_by_model_not_process"] = model_selects & ~process_selects
X_test["selected_by_process_not_model"] = model_rejects & process_selects
X_test["selected_by_both"] = model_selects & process_selects

In [None]:
X_test_to_compare = X_test[X_test["dtaanvraag"] >= X_test.loc[X_test["included_in_prepilot"], "dtaanvraag"].min()]
# X_test_to_compare = X_test_to_compare[X_test_to_compare[["selected_by_model_not_process", "selected_by_process_not_model", "selected_by_both"]].sum(axis=1) == 1]

In [None]:
# An application cannot fall in more than 1 category (it may fall in none, when it was
# selected by neither the model nor the process), hence `<= 1` rather than `== 1`.
assert X_test_to_compare[["selected_by_model_not_process", "selected_by_process_not_model", "selected_by_both"]].sum(axis=1).max() <= 1


- Geselecteerd door model, niet door proces: model_prob >= 0.63 & heeft geen HH label
    * Hiervoor hebben we een label als het in de prepilot is onderzocht.
    * Ofwel model heeft het goed en proces fout, ofwel proces heeft het goed en model fout.
- Geselecteerd door proces, niet door model: heeft een HH label & model_prob < 0.63
    * Hiervoor hebben we een label uit het proces (= model label, oftewel kolom 'onderzoekswaardig').
    * Ofwel model heeft het goed en proces fout, ofwel proces heeft het goed en model fout.
- Geselecteerd door beide: heeft een HH label & model_prob >= 0.63
    * Hiervoor hebben we een label uit het proces (= model label, oftewel kolom 'onderzoekswaardig').
    * Model en proces hebben het of allebei goed, of allebei fout.

In [None]:
print(X_test_to_compare.shape)
X_test_to_compare = X_test_to_compare[X_test_to_compare["bijzondere_doelgroep_address"] != 1]
print(X_test_to_compare.shape)

In [None]:
# Give the joined-in columns the short names used in the bias analysis.
# NOTE: this rename is a no-op when the cell is re-run, because the old names are gone by then.
X_test_enriched = X_test_enriched.rename(columns={
    "nationaliteit1_persoon": "nationaliteit_code",
    "postcodenum_adres": "postcodenum",
    "geslacht_persoon": "geslacht",
})

# Columns from the enriched data that are added to the transformed data below.
external_bias_columns = [
    "geslacht",
    "leeftijd",
    "nationaliteit_code",
    "postcodenum",
    "geboorteland_code",
    "burgerlijke_staat_code",
]

# Run all pipeline steps except the final estimator, then index the result by application number.
# NOTE(review): assumes `prep.transform` returns a DataFrame with rows in input order -- confirm.
data_to_analyze = prep.transform(X_test_to_compare)
data_to_analyze.index = X_test_to_compare["application_dienstnr"]

# `.values` assigns positionally, bypassing index alignment with X_test_to_compare.
data_to_analyze["onderzoekswaardig"] = X_test_to_compare["onderzoekswaardig"].replace({True: 1, False: 0}).values
data_to_analyze["included_in_prepilot"] = X_test_to_compare["included_in_prepilot"].values

# For those applications where we have a result from the prepilot, replace the label with the result from the prepilot.
dienstnr_with_result_from_prepilot = X_test_to_compare.loc[~X_test_to_compare["result_prepilot"].isna(), "application_dienstnr"]
data_to_analyze.loc[dienstnr_with_result_from_prepilot, "onderzoekswaardig"] = X_test_to_compare.set_index("application_dienstnr").loc[dienstnr_with_result_from_prepilot, "result_prepilot"]

# Attach the external columns by application number; applications without a match get NaN.
data_to_analyze = data_to_analyze.merge(X_test_enriched.set_index("application_dienstnr")[external_bias_columns], left_index=True, right_index=True, how="left")
# Drop every row that has a missing value in ANY column (features, label or external columns).
data_to_analyze = data_to_analyze.dropna()

In [None]:
# Each entry is a pair (selection subset, rejection filter) that identifies one dataset to analyze.
types = [
    ("selected_by_model_not_process_and_in_prepilot", "afgewezen"),  # Selected *only* by the model AND investigated in the prepilot AND afgewezen by ICer
    ("selected_by_model_not_process_and_in_prepilot", "niet_afgewezen"),  # Selected *only* by the model AND investigated in the prepilot AND not afgewezen by ICer
    ("selected_by_model_not_process_overall", "afgewezen"),  # Selected *only* by the model AND afgewezen by ICer, regardless whether investigated in prepilot or not
    ("selected_by_model_not_process_overall", "niet_afgewezen"),  # Selected *only* by the model AND not afgewezen by ICer, regardless whether investigated in prepilot or not
    ("selected_by_process_not_model", "all"),  # Selected *only* by the current process, i.e. HH onderzoek has already been done, but model scores it <0.63
    ("selected_by_both", "all"),  # Selected by both the model and the current process
    ("all_data", "all"),  # All data that was selected either by model or process or both or neither
]

# Number of rows per dataset before the optional "included in prepilot" filter.
original_sizes = {}

# The dataset for every entry of `types`, keyed by that entry.
datasets_to_analyze = {}

for t in types:
    if t[0] == "all_data":
        # Select everything regardless of selection by model/process.
        condition1 = np.repeat(True, len(X_test_to_compare))
    else:
        relevant_col = t[0].replace("_and_in_prepilot", "").replace("_overall", "")  # Strip parts of the string to get to column name that's relevant for the selection.
        condition1 = X_test_to_compare[relevant_col]

    if t[1] == "afgewezen":
        condition2 = X_test_to_compare["afgewezen"]
    elif t[1] == "niet_afgewezen":
        condition2 = ~X_test_to_compare["afgewezen"]
    else:
        # Select everything regardless of afgewezen/niet afgewezen.
        condition2 = np.repeat(True, len(X_test_to_compare))

    idx_to_select = X_test_to_compare.loc[condition1 & condition2, "application_dienstnr"]

    # Select with a boolean mask instead of passing a `set` to `.loc`: set indexers are
    # rejected by newer pandas versions and give an arbitrary row order. The mask keeps
    # the rows in the order of `data_to_analyze` and ignores application numbers that
    # are no longer present there.
    datasets_to_analyze[t] = data_to_analyze.loc[data_to_analyze.index.isin(idx_to_select)]

    original_sizes[t] = len(datasets_to_analyze[t])

    if "_and_in_prepilot" in t[0]:
        datasets_to_analyze[t] = datasets_to_analyze[t].loc[datasets_to_analyze[t]["included_in_prepilot"]]

    # The helper column is only needed for the filter above.
    datasets_to_analyze[t] = datasets_to_analyze[t].drop("included_in_prepilot", axis=1)

    print(t)
    print(f"Definitive dataset size: {len(datasets_to_analyze[t])}")
    print(f"Original dataset size: {original_sizes[t]}")
    print("")

# Make groups

In [None]:
features_to_check = defaultdict(list)

## Direct

#### Sex

- 0 = unknown
- 1 = male
- 2 = female

Note that we only compare male vs. female, because we don't have enough samples with unknown gender.

In [None]:
data_to_analyze["geslacht"].value_counts()

In [None]:
# data_to_analyze = data_to_analyze[data_to_analyze["geslacht"] != 0]
features_to_check["geslacht"] = [[1], [2]]

#### Age

In [None]:
data_to_analyze["leeftijd"].describe()

In [None]:
data_to_analyze["leeftijd"].hist()

In [None]:
features_to_check["leeftijd_split1"] = [0, 100, 40, 1]
features_to_check["leeftijd_split2"] = [0, 100, 50, 1]

#### Nationality

In [None]:
with open("west-nonwest-nationalities.json", 'r') as j:
    west_nonwest_nationalities = json.loads(j.read())

In [None]:
# Print the number of applications per nationality code together with its description.
# The description is looked up per code rather than by zipping two separate
# value_counts() results, which can pair a code with the wrong description when counts
# tie or when several codes share a description. `.items()` replaces `.iteritems()`,
# which no longer exists in newer pandas versions.
for code, count in data_to_analyze["nationaliteit_code"].value_counts().items():
    nationality = nationaliteit_mapping.get(code, "<unmapped>")
    print(f"Count: {count:<5} Code: {int(code):<5} {str(nationality):<20}")

In [None]:
# Description -> code lookup, kept for reference. Note that it retains only ONE code
# per description, so it is not used for the west/non-west selection below.
flipped_mapping = {v: k for k,v in nationaliteit_mapping.items()}
# Iterate the code -> description mapping directly, so that every code is classified even
# when several codes share the same description.
west_codes = [code for code, country in nationaliteit_mapping.items() if country in west_nonwest_nationalities["west"]]
nonwest_codes = [code for code, country in nationaliteit_mapping.items() if country in west_nonwest_nationalities["nonwest"]]

In [None]:
data_to_analyze_no_unknown_nationality = data_to_analyze[(data_to_analyze["nationaliteit_code"] != 0)]

# Check that all nationality codes got assigned to west/nonwest except 0 = unknown.
assert (data_to_analyze_no_unknown_nationality["nationaliteit_code"].isin(west_codes) | data_to_analyze_no_unknown_nationality["nationaliteit_code"].isin(nonwest_codes)).all()

In [None]:
features_to_check["nationaliteit_code_split1"] = [west_codes, nonwest_codes]  # West vs. non-west
features_to_check["nationaliteit_code_split2"] = [
    [1], 
    [n for n in data_to_analyze["nationaliteit_code"].unique() if n not in [0, 1]]  # Dutch vs. non-Dutch
]

#### Country of birth

In [None]:
with open("west-nonwest-countries.json", 'r') as j:
    west_nonwest_countries = json.loads(j.read())

In [None]:
# Print the number of applications per country-of-birth code together with the country name.
# The name is looked up per code rather than by zipping two separate value_counts()
# results, which can pair a code with the wrong name when counts tie or when a code has
# no entry in the mapping. `.items()` replaces `.iteritems()`, which no longer exists in
# newer pandas versions.
for code, count in data_to_analyze["geboorteland_code"].value_counts().items():
    country = geboorteland_mapping.get(code, "<unmapped>")
    print(f"Count: {count:<5} Code: {int(code):<5} {str(country):<20}")

In [None]:
# Country name -> category code lookup.
flipped_mapping = dict((country, code) for code, country in geboorteland_mapping.items())

# Split the category codes into west / non-west according to the country lists from the JSON file.
west_countries = west_nonwest_countries["west"]
nonwest_countries = west_nonwest_countries["nonwest"]
west_codes = [flipped_mapping[country] for country in flipped_mapping if country in west_countries]
nonwest_codes = [flipped_mapping[country] for country in flipped_mapping if country in nonwest_countries]

In [None]:
# -1 = NaN, 127 = Onbekend
data_to_analyze_no_unknown_nationality = data_to_analyze[~data_to_analyze["geboorteland_code"].isin([-1, 127])]

# Check that all country codes got assigned to west/nonwest.
assert (data_to_analyze_no_unknown_nationality["geboorteland_code"].isin(west_codes) | data_to_analyze_no_unknown_nationality["geboorteland_code"].isin(nonwest_codes)).all()

In [None]:
features_to_check["geboorteland_code_split1"] = [west_codes, nonwest_codes]  # West vs. non-west
# NOTE(review): 114, -1 and 127 are hard-coded category codes. The codes are derived from
# the categories present in the data (see `geboorteland_mapping`), so they can shift when
# the data changes -- verify them against `geboorteland_mapping` before relying on this split.
features_to_check["geboorteland_code_split2"] = [
    [114], 
    [n for n in data_to_analyze["geboorteland_code"].unique() if n not in [114, -1, 127]]  # Dutch vs. non-Dutch
]

#### Civil status

In [None]:
# Print the number of applications per civil-status code together with its label.
# The label is looked up per code rather than by zipping two separate value_counts()
# results, which can pair a code with the wrong label when counts tie. `.items()`
# replaces `.iteritems()`, which no longer exists in newer pandas versions.
for code, count in data_to_analyze["burgerlijke_staat_code"].value_counts().items():
    burg_staat = burgerlijke_staat_mapping.get(code, "<unmapped>")
    print(f"Count: {count:<5} Code: {int(code):<5} {str(burg_staat):<20}")

In [None]:
# The codes are the category codes of `burgerlijke_staat` (see `burgerlijke_staat_mapping`).
# NOTE(review): the numbers below assume that all three statuses occur in the data, so that
# the codes are 0/1/2 as commented -- verify against `burgerlijke_staat_mapping`.
priv = [1, 2]  # single or separated_or_divorced_or_widowed
unpriv = [0]  # partnership_or_married
features_to_check["burgerlijke_staat_code"] = [priv, unpriv]

## Indirect

#### Feature: deelnames_started_percentage_last_year

In [None]:
data_to_analyze["deelnames_started_percentage_last_year"].hist()

In [None]:
data_to_analyze["deelnames_started_percentage_last_year"].value_counts()

In [None]:
# Add two 0/1 indicator columns, derived from the started-share feature, to every dataset.
for dataset_key in types:
    dataset = datasets_to_analyze[dataset_key]
    started_share = dataset["deelnames_started_percentage_last_year"]
    dataset["deelnames_started_percentage_last_year_equals_zero"] = (started_share == 0)*1
    dataset["deelnames_started_percentage_last_year_equals_one"] = (started_share == 1)*1

# Indicator "equals zero": people who started nothing last year (incl. those who weren't in
# the system last year) vs. people who started something or everything.
features_to_check["deelnames_started_percentage_last_year_equals_zero"] = [[0], [1]]
# Indicator "equals one": people who started everything last year vs. those who didn't start
# everything or who weren't in the system last year.
features_to_check["deelnames_started_percentage_last_year_equals_one"] = [[0], [1]]

#### Feature: at_least_one_address_in_amsterdam

In [None]:
data_to_analyze["at_least_one_address_in_amsterdam"].value_counts()

In [None]:
features_to_check["at_least_one_address_in_amsterdam"] = [
    [0], [1]
]

#### Feature: active_address_count

In [None]:
data_to_analyze["active_address_count"].value_counts()

In [None]:
features_to_check["active_address_count"] = [
    [1], [2, 3]
]

#### Feature: days_since_last_relocation

In [None]:
data_to_analyze["days_since_last_relocation"].describe()

In [None]:
data_to_analyze["days_since_last_relocation"].hist()

In [None]:
split_value = 365
features_to_check["days_since_last_relocation"] = [
    [n for n in data_to_analyze["days_since_last_relocation"].unique() if n > split_value],  # Same address for a long time
    [n for n in data_to_analyze["days_since_last_relocation"].unique() if n <= split_value]  # Moved in the past year
]

#### Feature: days_since_last_dienst_end

In [None]:
data_to_analyze["days_since_last_dienst_end"].hist()

In [None]:
features_to_check["days_since_last_dienst_end_split1"] = [
    [99999],  # No dienst last year
    [n for n in data_to_analyze["days_since_last_dienst_end"].unique() if n != 99999]  # Had a dienst last year
]

split_value = 60
features_to_check["days_since_last_dienst_end_split2"] = [
    [n for n in data_to_analyze["days_since_last_dienst_end"].unique() if n > split_value],  # Dienst longer than 60 days ago
    [n for n in data_to_analyze["days_since_last_dienst_end"].unique() if n <= split_value]  # Dienst within last 60 days
]

#### Feature: has_medebewoner

In [None]:
data_to_analyze["has_medebewoner"].value_counts()

In [None]:
features_to_check["has_medebewoner"] = [
    [0], [1]
]

#### Feature: avg_percentage_maatregel

In [None]:
data_to_analyze["avg_percentage_maatregel"].value_counts()

I think we have too few samples to say anything meaningful here.

#### Feature: total_vermogen

In [None]:
data_to_analyze["total_vermogen"].describe()

In [None]:
data_to_analyze["total_vermogen"].hist(bins=300, figsize=(15,5))

In [None]:
# Both splits compare against zero wealth and use the distinct values observed in the data.
split_value = 0
vermogen_values = data_to_analyze["total_vermogen"].unique()

non_negative_wealth = [n for n in vermogen_values if n >= split_value]
positive_wealth = [n for n in vermogen_values if n > split_value]

# Split 1: greater than or equal to zero wealth vs. negative wealth.
features_to_check["total_vermogen_split1"] = [
    non_negative_wealth,
    [n for n in vermogen_values if n < split_value],
]

# Split 2: strictly positive wealth vs. negative wealth (zero wealth is in neither group).
features_to_check["total_vermogen_split2"] = [
    positive_wealth,
    [n for n in vermogen_values if n < split_value],
]

#### Feature: afspraken_no_show_count_last_year

In [None]:
data_to_analyze["afspraken_no_show_count_last_year"].value_counts()

In [None]:
features_to_check["afspraken_no_show_count_last_year"] = [
    [0], [1, 2, 3]
]

#### Feature: has_partner

In [None]:
data_to_analyze["has_partner"].value_counts()

In [None]:
features_to_check["has_partner"] = [
    [0], [1]
]

#### Feature: sum_inkomen_bruto_was_mean_imputed

In [None]:
data_to_analyze["sum_inkomen_bruto_was_mean_imputed"].value_counts()

In [None]:
features_to_check["sum_inkomen_bruto_was_mean_imputed"] = [
    [0], [1]
]

#### Feature: applied_for_same_product_last_year

In [None]:
data_to_analyze["applied_for_same_product_last_year"].value_counts()

In [None]:
features_to_check["applied_for_same_product_last_year"] = [
    [0], [1]
]

#### Feature: received_same_product_last_year

In [None]:
data_to_analyze["received_same_product_last_year"].value_counts()

In [None]:
features_to_check["received_same_product_last_year"] = [
    [0], [1]
]

#### Feature: afspraken_no_contact_count_last_year

In [None]:
data_to_analyze["afspraken_no_contact_count_last_year"].value_counts()

In [None]:
features_to_check["afspraken_no_contact_count_last_year"] = [
    [0], [1, 2, 3, 4, 5]
]

#### Feature: sum_inkomen_bruto_value

In [None]:
data_to_analyze["sum_inkomen_bruto_value"].describe()

In [None]:
features_to_check["sum_inkomen_bruto_value"] = [ 
    [0],  # No income
    [n for n in data_to_analyze["sum_inkomen_bruto_value"].unique() if n > 0]    # Has non-zero income
]

# Do analysis

In [None]:
import logging
from collections import defaultdict
from itertools import compress
from string import digits
from typing import Any, Dict, List

import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric
from numpy.core.records import ndarray

from bias_collection.helpers import create_AIF360_groups

logger = logging.getLogger(__name__)


class BiasAnalyzer:

    # Human-readable explanation per metric, logged by `log_metrics` when explanations are
    # requested. The keys of this mapping are also the default metric list used by `__init__`
    # when no explicit list is passed. An empty string means "no explanation available".
    log_info_by_metric = {
        "balanced_accuracy": "",
        "accuracy_privileged": "",
        "accuracy_unprivileged": "",
        "average_odds_difference": (
            "Average odds difference is computed as average difference of false positive rate "
            "(false positives / negatives) and true positive rate (true positives / positives) "
            "between unprivileged and privileged groups. The ideal value of this metric is 0. A "
            "value of < 0 implies higher benefit for the privileged group and a value > 0 "
            "implies higher benefit for the unprivileged group. Fairness for this metric is "
            "between -0.1 and 0.1."
        ),
        "disparate_impact": (
            "Disparate impact is computed as the ratio of rate of favorable outcome for the unprivileged group to "
            "that of the privileged group. The ideal value of this metric is 1.0. A value < 1 implies higher benefit "
            "for the privileged group and a value > 1 implies a higher benefit for the unprivileged group. Fairness "
            "for this metric is between 0.8 and 1.25."
        ),
        "statistical_parity_difference": (
            "Statistical parity difference is computed as the difference of the rate of favorable outcomes received "
            "by the unprivileged group to the privileged group. The ideal value of this metric is 0. Fairness for "
            "this metric is between -0.1 and 0.1."
        ),
        "false_discovery_rate_difference": (
            "The false discovery rate difference expresses the difference in percentage points between the groups in "
            "how likely it is that a selected (by the model) group member should not have been selected in reality. "
            "The ideal value of this metric is 0. A value < 0 implies a higher FDR for the privileged group and a "
            "value > 0 implies a higher FDR for the unprivileged group."
        ),
        # Ratio metrics are centred on 1, so "higher for the unprivileged group" means > 1.
        "false_discovery_rate_ratio": (
            "The false discovery rate ratio expresses the ratio between the groups of how likely it is that a "
            "selected (by the model) group member should not have been selected in reality. The ideal value of this "
            "metric is 1. A value < 1 implies a higher FDR for the privileged group and a value > 1 implies a higher "
            "FDR for the unprivileged group."
        ),
        "false_positive_rate_difference": (
            "The false positive rate difference expresses the difference in percentage points between the groups in "
            "how likely it is that a group member who should not have been selected in reality is (wrongly) selected "
            "by the model. The ideal value of this metric is 0. A value < 0 implies a higher FPR for the privileged "
            "group and a value > 0 implies a higher FPR for the unprivileged group."
        ),
        "false_positive_rate_ratio": (
            "The false positive rate ratio expresses the ratio between the groups of how likely it is that a group "
            "member who should not have been selected in reality is (wrongly) selected by the model. The ideal value "
            "of this metric is 1. A value < 1 implies a higher FPR for the privileged group and a value > 1 implies "
            "a higher FPR for the unprivileged group."
        ),
        "generalized_entropy_index": "",
        "false_positive_group_size_difference": (
            "The false positive group size difference expresses the difference in percentage points between the "
            "groups of a random group member's chance to be wrongly selected by the model. The ideal value of this "
            "metric is 0. A value < 0 implies a higher FP-GZ for the privileged group and a value > 0 implies a "
            "higher FP-GZ for the unprivileged group."
        ),
        "false_positive_group_size_ratio": (
            "The false positive group size ratio expresses the ratio between the groups of a random group member's "
            "chance to be wrongly selected by the model. The ideal value of this metric is 1. A value < 1 implies a "
            "higher FP-GZ for the privileged group and a value > 1 implies a higher FP-GZ for the unprivileged group."
        ),
    }

    def __init__(self, metrics: List[str] = None):
        """Class to analyze bias in a model's predictions. It uses AIF360 framework.

        Parameters
        ----------
        metrics
            List with the names of the metrics that should be calculated. These
            metrics should exist as methods without an argument in the AIF360
            `ClassificationMetric` class, or as methods of this class itself if
            they are not in AIF360 or the AIF360 method requires an argument.
            Defaults to None, in which case all available metrics are calculated.
        """
        if metrics:
            self._validate_metrics_list(metrics)
            self.metrics = metrics
            # Balanced accuracy is always calculated to pick a best classification
            # threshold.
            if "balanced_accuracy" not in metrics:
                metrics.append("balanced_accuracy")
        else:
            self.metrics = list(self.log_info_by_metric.keys())

    def _validate_metrics_list(self, metrics):
        """Validate that all specified metrics have an implementation in either
        `ClassificationMetric` from AIF360 or as a method of this class.
        """
        metrics_not_implemented = [
            not (hasattr(ClassificationMetric, m) | hasattr(self, m)) for m in metrics
        ]
        if any(metrics_not_implemented):
            raise ValueError(
                f"The following metrics were specified, but are not implemented: "
                f"{list(compress(metrics, metrics_not_implemented))}"
            )

    def analyze_features(
        self,
        data_to_analyze: pd.DataFrame,
        model,
        sensitive_features: Dict,
        outpath: str,
        label_column_name: str,
        thresholds: list = [0.5],
        external_variables: list = None,
        print_metric_explanations: bool = False,
    ):
        """
        Runs a bias analysis over the specified features.

        Parameters
        ----------
        data_to_analyze
            Data to be analysed,
            including external variables that are not used during the scoring but they need to be measured.
        model
            Model that performs the classification
        sensitive_features
            Dictionary containing the features to be analyzed.
            The keys of the dictionary are the features and the value could be of three types:
            - a list containing:
                [lower_value, higher_value, split_value, step]
                The first group is from lower value to (but not including) the split value, the second from the split
                value to (but not including) the higher value.
                Example:
                {'feature': [0,4,2,1]}
                First group: [0,1]
                Second group: [2,3]

            - a list containing two lists with the privileged and unprivileged values:
                [[privileged values], [unprivileged values]]

            - a dictionary containing:
                privileged_groups: List[Dict]
                    List of dictionaries, each dictionary contains the name of the feature
                    and the value that is considered privileged.
                    [{'aantal_kamers': 0}, {'aantal_kamers': 1}, {'aantal_kamers': 2}, {'aantal_kamers': 3}]
                unprivileged_groups: List[Dict]
                    List of dictionaries, each dictionary contains the name of the feature
                    and the value that is considered unprivileged.
        outpath
            Path where to store the results of the analysis.
        label_column_name
            Name of the column containing the label (Needs to be present in data_to_analyze)
        thresholds
            List of different thresholds to define positive and negative predicted values from the score.
        external_variables
            List of features that are not used by the model but we want to analyze
        print_metric_explanations
            Whether or not to print explanations of the calculated metrics
        """
        # If `feature` ends with "_splitX" with X being some number, then multiple splits have been
        # specified for this feature and we have to get the actual feature name.
        # TODO: This is kind of hacky, it'd be nicer to have, for example, a dictionary with the
        #  feature name as the key and as value a dict or list with the multiple splits. This requires some
        #  restructuring of the code to detect if a feature has only one split specified, or multiple, and then to
        #  output the results in a good way.
        protected_attribute_names = [
            feature.rstrip(digits).replace("_split", "")
            for feature in sensitive_features.keys()
        ]

        binary_label_dataset_to_analyse = BinaryLabelDataset(
            df=data_to_analyze,
            label_names=[label_column_name],
            protected_attribute_names=protected_attribute_names,
        )

        if external_variables:
            binary_label_dataset_to_score = BinaryLabelDataset(
                df=data_to_analyze.drop(columns=external_variables),
                label_names=[label_column_name],
                protected_attribute_names=[
                    x for x in protected_attribute_names if x not in external_variables
                ],
            )
        else:
            binary_label_dataset_to_score = binary_label_dataset_to_analyse

        val_metrics, all_group_splits = self.calculate_metrics(
            input_dataframe=data_to_analyze,
            dataset=binary_label_dataset_to_analyse,
            dataset_to_score=binary_label_dataset_to_score,
            model=model,
            thresh_arr=thresholds,
            sensitive_features=sensitive_features,
            label_column_name=label_column_name,
        )

        self.log_metrics(
            val_metrics, thresholds, all_group_splits, print_metric_explanations
        )
        self.metrics_to_csv(val_metrics, all_group_splits, outpath)

        return val_metrics

    def calculate_metrics(
        self,
        input_dataframe: pd.DataFrame,
        dataset: BinaryLabelDataset,
        dataset_to_score: BinaryLabelDataset,
        model,
        thresh_arr: ndarray,
        sensitive_features: Dict,
        label_column_name: str,
    ):
        """
        Calculate bias metrics to understand if the features are biased or not.

        Parameters
        ----------
        input_dataframe
            Contains the data to be checked in DataFrame format
        dataset
            Contains the data to be checked
        dataset_to_score
            Contains the data to be used for scoring
        model
            Model used for the classification
        thresh_arr
            Array of possibles thresholds of the classification
        sensitive_features
            Dictionary containing the biased features to analyse
        label_column_name
            Name of the column containing the label (Needs to be present in data_to_analyze)

        Returns
        -------
        metric_arrs:
            Dictionary containing all the metrics
        """
        try:
            # sklearn classifier
            y_val_pred_prob = model.predict_proba(dataset_to_score.features)
            pos_ind = np.where(model.classes_ == dataset_to_score.favorable_label)[0][0]
        except AttributeError:
            # aif360 inprocessing algorithm
            y_val_pred_prob = model.predict(dataset_to_score).scores
            pos_ind = 0

        metric_arrs = defaultdict(dict)  # type: Dict[str, Dict[str, Any]]
        all_group_splits = defaultdict(dict)  # type: Dict[str, Dict[str, Any]]
        protected_attribute_names = list(sensitive_features.keys())
        for thresh in thresh_arr:
            for feature in protected_attribute_names:
                # If `feature` ends with "_splitX" with X being some number, then multiple splits have been
                # specified for this feature and we have to get the actual feature name.
                actual_feature_name = feature.rstrip(digits).replace("_split", "")
                if isinstance(sensitive_features[feature], list):
                    (privileged_groups, unprivileged_groups,) = create_AIF360_groups(
                        actual_feature_name, sensitive_features[feature]
                    )
                elif isinstance(sensitive_features[feature], dict):
                    privileged_groups = sensitive_features[feature]["privileged_groups"]
                    unprivileged_groups = sensitive_features[feature][
                        "unprivileged_groups"
                    ]
                else:
                    raise Exception(
                        f"""Wrong sensitive_features type, it must be list or dict, received:
                            {type(sensitive_features[feature])}"""
                    )

                dataset_pred = dataset.copy()
                y_val_pred = (y_val_pred_prob[:, pos_ind] > thresh).astype(np.float64)
                dataset_pred.labels = y_val_pred
                input_dataframe = input_dataframe.assign(y_val_pred=y_val_pred)

                all_group_splits[feature][
                    "privileged_values"
                ] = self.unpack_group_dictionary(privileged_groups, actual_feature_name)
                all_group_splits[feature][
                    "unprivileged_values"
                ] = self.unpack_group_dictionary(
                    unprivileged_groups, actual_feature_name
                )
                all_group_splits[feature]["n_privileged"] = (
                    input_dataframe[actual_feature_name]
                    .isin(all_group_splits[feature]["privileged_values"])
                    .sum()
                )
                all_group_splits[feature]["n_unprivileged"] = (
                    input_dataframe[actual_feature_name]
                    .isin(all_group_splits[feature]["unprivileged_values"])
                    .sum()
                )

                metric = ClassificationMetric(
                    dataset,
                    dataset_pred,
                    privileged_groups=privileged_groups,
                    unprivileged_groups=unprivileged_groups,
                )

                metric_arrs[feature] = defaultdict(list)

                for m in self.metrics:
                    try:
                        m_func = getattr(metric, m)
                        metric_arrs[feature][m].append(m_func())
                    except AttributeError:
                        m_func = getattr(self, m)
                        metric_arrs[feature][m].append(
                            m_func(
                                feature=actual_feature_name,
                                input_dataframe=input_dataframe,
                                privileged_groups=privileged_groups,
                                unprivileged_groups=unprivileged_groups,
                                label_column_name=label_column_name,
                                classification_metric=metric,
                            )
                        )
                        
#             for pred, true in zip(y_val_pred, dataset.labels):
#                 logger.info(f"{pred} - {true}")
            logger.warning(f"Nr prediction positive: {(y_val_pred == 1).sum()}")
            logger.warning(f"Nr prediction negative: {(y_val_pred == 0).sum()}")
        return metric_arrs, all_group_splits

    def log_metrics(
        self,
        metrics: Dict,
        thresh_arr: ndarray,
        all_group_splits: Dict,
        print_metric_explanations: bool = False,
    ):
        """Log the metrics.

        Parameters
        ----------
        metrics
            Dictionary containing the metrics
        thresh_arr
            Array of possible thresholds of the classification
        all_group_splits
            Dictionary containing the AIF360 group splits for all features
        print_metric_explanations
            Whether or not to print explanations of the calculated metrics

        Returns
        -------
        :
        """
        for feature, metric_values in metrics.items():
            logger.info(
                f"\n\n--------------- Bias Analysis {feature} ---------------\n"
            )

            logger.info(
                f"Privileged (n={all_group_splits[feature]['n_privileged']}): {all_group_splits[feature]['privileged_values']}"
            )
            logger.info(
                f"Unprivileged (n={all_group_splits[feature]['n_unprivileged']}): {all_group_splits[feature]['unprivileged_values']}"
            )

            best_ind = np.argmax(metric_values["balanced_accuracy"])
            logger.info(
                f"Threshold corresponding to best balanced accuracy: {thresh_arr[best_ind]:.3f}"
            )
            logger.info(
                f"Best balanced accuracy: {metric_values['balanced_accuracy'][best_ind]:.3f}"
            )
            logger.info("\n")

            for m_name, m_values in metric_values.items():
                if m_name == "balanced_accuracy":
                    continue
                logger.info(f"{m_name + ':': <40} {m_values[best_ind]:.3f}")
                if print_metric_explanations:
                    logger.info(f"{self.log_info_by_metric[m_name]}\n")

            logger.info("\n---------------------------------------------")

    def metrics_to_csv(self, metrics: Dict, group_splits: Dict, csv_path: str):
        data = []
        for feature, value in metrics.items():
            best_ind = np.argmax(value["balanced_accuracy"])
            data.append(feature)
            for metric in self.metrics:
                data.append(value[metric][best_ind])
        column_names = ["feature"]
        for metric in self.metrics:
            column_names.append(metric)

        metrics_df = pd.DataFrame(
            np.array(data).reshape(-1, len(column_names)), columns=column_names
        )

        group_splits_df = pd.DataFrame.from_dict(group_splits).T

        df = metrics_df.merge(group_splits_df, left_on="feature", right_index=True)

        outpath = f"{csv_path}/bias_results.csv"
        logger.info(f"Writing bias analysis results to {outpath}")
        df.to_csv(outpath, index=False)

    def false_positive_group_size_difference(
        self,
        feature,
        input_dataframe,
        privileged_groups,
        unprivileged_groups,
        label_column_name,
        *args,
        **kwargs,
    ):
        privileged_fp_group_size = self.calculate_fp_group_size(
            input_dataframe, privileged_groups, feature, label_column_name
        )
        unprivileged_fp_group_size = self.calculate_fp_group_size(
            input_dataframe, unprivileged_groups, feature, label_column_name
        )
        return unprivileged_fp_group_size - privileged_fp_group_size

    def false_positive_group_size_ratio(
        self,
        feature,
        input_dataframe,
        privileged_groups,
        unprivileged_groups,
        label_column_name,
        *args,
        **kwargs,
    ):
        privileged_fp_group_size = self.calculate_fp_group_size(
            input_dataframe, privileged_groups, feature, label_column_name
        )
        unprivileged_fp_group_size = self.calculate_fp_group_size(
            input_dataframe, unprivileged_groups, feature, label_column_name
        )
        return unprivileged_fp_group_size / privileged_fp_group_size

    def calculate_fp_group_size(
        self, data, group_to_use, feature_to_use, label_column_name
    ):
        """
        The false positive / group size has been discovered into the Aequitas bias audit toolkit.
        http://www.datasciencepublicpolicy.org/our-work/tools-guides/aequitas/

        The metric replies the following question: "What are your chances of being wrongly denied bailjust given your race?"

        data
            Contains the data to be checked
        group_to_use
            Contains the privileged/unprivileged dictionary determining how the groups should be splitted.
        feature_to_use
            Contains the column name of the feature we are analyzing.
        label_column_name
            Name of the column containing the label (Needs to be present in data_to_analyze)

        Returns
        -------
        fp_group_size:
            false positive / group size result.
        """
        group_values = self.unpack_group_dictionary(group_to_use, feature_to_use)
        group_filtered_df = data[data[feature_to_use].isin(group_values)]
        fp_group_size = (
            (group_filtered_df["y_val_pred"] == 1)
            & (group_filtered_df[label_column_name] == 0)
        ).mean()
        return fp_group_size

    @staticmethod
    def balanced_accuracy(classification_metric: ClassificationMetric, *args, **kwargs):
        return (
            classification_metric.true_positive_rate()
            + classification_metric.true_negative_rate()
        ) / 2

    @staticmethod
    def accuracy_privileged(
        classification_metric: ClassificationMetric, *args, **kwargs
    ):
        return classification_metric.accuracy(False)

    @staticmethod
    def accuracy_unprivileged(
        classification_metric: ClassificationMetric, *args, **kwargs
    ):
        return classification_metric.accuracy(False)
    
    @staticmethod
    def num_false_positives_privileged(
        classification_metric: ClassificationMetric, *args, **kwargs
    ):
        return classification_metric.num_false_positives(True)
    
    @staticmethod
    def num_false_positives_unprivileged(
        classification_metric: ClassificationMetric, *args, **kwargs
    ):
        return classification_metric.num_false_positives(False)
    
    @staticmethod
    def num_negatives_privileged(
        classification_metric: ClassificationMetric, *args, **kwargs
    ):
        return classification_metric.num_negatives(True)
    
    @staticmethod
    def num_negatives_unprivileged(
        classification_metric: ClassificationMetric, *args, **kwargs
    ):
        return classification_metric.num_negatives(False)

    @staticmethod
    def generalized_entropy_index(
        classification_metric: ClassificationMetric, *args, **kwargs
    ):
        return classification_metric.generalized_entropy_index(alpha=1)

    @staticmethod
    def unpack_group_dictionary(data: Dict, feature_to_unpack: str):
        result = []
        for group in data:
            result.append(group[feature_to_unpack])
        return sorted(result)


In [None]:
class AlwaysPositivePredictor():
    """Stand-in classifier that assigns every row to class 1.

    Since we've already split the datasets going into the `data_to_analyze` parameter of the BiasAnalyzer by
    whether or not something was selected by model and/or process, we don't have to make predictions anymore
    inside the BiasAnalyzer. In those cases, this class can be used instead of the model for the `model`
    parameter.
    """

    def __init__(self):
        # Class labels in the column order used by `predict_proba`.
        self.classes_ = np.array([0, 1])

    def predict_proba(self, X, y=None):
        """Return an array of shape (len(X), 2) with 0 for class 0 and 1 for class 1 in every row."""
        n_rows = len(X)
        return np.column_stack((np.repeat(0, n_rows), np.repeat(1, n_rows)))

In [None]:
# Columns that are analysed for bias but are removed from the data handed to the model
# (BiasAnalyzer.analyze_features drops `external_variables` before scoring).
external_variables = external_bias_columns + [
    "deelnames_started_percentage_last_year_equals_zero",
    "deelnames_started_percentage_last_year_equals_one",
]

# Counts per group plus the false-positive based fairness metrics.
metrics = [
    "num_false_positives_privileged",
    "num_false_positives_unprivileged",
    "num_negatives_privileged",
    "num_negatives_unprivileged",
    "false_positive_rate_difference",
    "false_positive_rate_ratio",
    "false_positive_group_size_difference",
    "false_positive_group_size_ratio",
]

# One analysis (and one output directory) per entry of `types`.
for t in types:
    logger.warning(t)

    outpath = f"20220523_bias_report_bugs_solved/{t[0]}/{t[1]}"
    Path(outpath).mkdir(parents=True, exist_ok=True)

    # Datasets that were already selected by the process count as positive for every row;
    # all other datasets are scored by the real classifier.
    model_for_bias_analyzer = (
        AlwaysPositivePredictor()
        if t[0] in ("selected_by_process_not_model", "selected_by_both")
        else clf
    )

    BiasAnalyzer(metrics).analyze_features(
        data_to_analyze=datasets_to_analyze[t],
        model=model_for_bias_analyzer,
        sensitive_features=features_to_check,
        outpath=outpath,
        # NOTE(review): threshold sits just below 0.63, presumably so that a score of exactly
        # 0.63 passes the strict `>` comparison inside the analyzer — confirm.
        thresholds=[0.63-0.000000000001],
        label_column_name=label,
        external_variables=external_variables,
    )



In [None]:
# Read one `bias_results.csv` per entry of `types`, keyed by that entry.
bias_dfs = {}
for t in types:
    outpath = f"20220523_bias_report_bugs_solved/{t[0]}/{t[1]}"
    inpath = "/".join((outpath, "bias_results.csv"))
    bias_dfs[t] = pd.read_csv(inpath)

In [None]:
# Start from the feature names, group sizes and group values of the complete dataset.
# NOTE(review): everything below combines frames positionally through `.values`, so every
# frame in `bias_dfs` is assumed to list the features in the same row order — confirm.
bias_calculations = bias_dfs[('all_data', 'all')][["feature", "n_privileged", "n_unprivileged", "privileged_values", "unprivileged_values"]]

# Calculate multiplier to bring the rejected (afgewezen) / not-rejected (niet-afgewezen) samples from the
# prepilot to the same proportion as it is in the overall dataset.
# NOTE(review): a group with zero prepilot rows yields a division by zero here — confirm this cannot occur.
multiplier_afgewezen = (
    bias_dfs[('selected_by_model_not_process_overall', 'afgewezen')][["n_privileged", "n_unprivileged"]].values /
    bias_dfs[('selected_by_model_not_process_and_in_prepilot', 'afgewezen')][["n_privileged", "n_unprivileged"]].values
)
multiplier_niet_afgewezen = (
    bias_dfs[('selected_by_model_not_process_overall', 'niet_afgewezen')][["n_privileged", "n_unprivileged"]].values /
    bias_dfs[('selected_by_model_not_process_and_in_prepilot', 'niet_afgewezen')][["n_privileged", "n_unprivileged"]].values
)

# Scale number of false positives in prepilot investigations to the same proportions as in the overall dataset.
scaled_num_false_positives_afgewezen = pd.DataFrame(
    (
        multiplier_afgewezen *
        bias_dfs[('selected_by_model_not_process_and_in_prepilot', 'afgewezen')][["num_false_positives_privileged", "num_false_positives_unprivileged"]].values
    ), 
    columns=[
        "scaled_num_false_positives_afgewezen_selected_by_model_privileged", 
        "scaled_num_false_positives_afgewezen_selected_by_model_unprivileged"
    ]
)

scaled_num_false_positives_niet_afgewezen = pd.DataFrame(
    (
        multiplier_niet_afgewezen *
        bias_dfs[('selected_by_model_not_process_and_in_prepilot', 'niet_afgewezen')][["num_false_positives_privileged", "num_false_positives_unprivileged"]].values
    ), 
    columns=[
        "scaled_num_false_positives_niet_afgewezen_selected_by_model_privileged", 
        "scaled_num_false_positives_niet_afgewezen_selected_by_model_unprivileged",
    ]
)

# Add the scaled counts as extra columns (rows are matched on the index).
bias_calculations = pd.concat([
    bias_calculations, 
    scaled_num_false_positives_afgewezen,
    scaled_num_false_positives_niet_afgewezen,
], axis=1)


# False positives attributed to the model: both scaled prepilot parts plus the applications
# selected by both model and process.
bias_calculations["num_false_positives_model_privileged"] = (
    scaled_num_false_positives_afgewezen["scaled_num_false_positives_afgewezen_selected_by_model_privileged"] +
    scaled_num_false_positives_niet_afgewezen["scaled_num_false_positives_niet_afgewezen_selected_by_model_privileged"] +
    bias_dfs[('selected_by_both', 'all')]["num_false_positives_privileged"]
)

bias_calculations["num_false_positives_model_unprivileged"] = (
    scaled_num_false_positives_afgewezen["scaled_num_false_positives_afgewezen_selected_by_model_unprivileged"] +
    scaled_num_false_positives_niet_afgewezen["scaled_num_false_positives_niet_afgewezen_selected_by_model_unprivileged"] +
    bias_dfs[('selected_by_both', 'all')]["num_false_positives_unprivileged"]
)

# False positives attributed to the process: selected by the process only plus selected by both.
bias_calculations["num_false_positives_process_privileged"] = (
    bias_dfs[('selected_by_process_not_model', 'all')]["num_false_positives_privileged"] +
    bias_dfs[('selected_by_both', 'all')]["num_false_positives_privileged"]
)

bias_calculations["num_false_positives_process_unprivileged"] = (
    bias_dfs[('selected_by_process_not_model', 'all')]["num_false_positives_unprivileged"] +
    bias_dfs[('selected_by_both', 'all')]["num_false_positives_unprivileged"]
)

# "False positives / group size" for the process, using the group sizes of the complete dataset.
bias_calculations["fp_group_size_process_privileged"] = bias_calculations["num_false_positives_process_privileged"] / bias_calculations["n_privileged"]
bias_calculations["fp_group_size_process_unprivileged"] = bias_calculations["num_false_positives_process_unprivileged"] / bias_calculations["n_unprivileged"]

# Difference and ratio are both "unprivileged versus privileged".
bias_calculations["fp_group_size_process_diff"] = bias_calculations["fp_group_size_process_unprivileged"] - bias_calculations["fp_group_size_process_privileged"]
bias_calculations["fp_group_size_process_ratio"] = bias_calculations["fp_group_size_process_unprivileged"] / bias_calculations["fp_group_size_process_privileged"]

# The same "false positives / group size" figures for the model.
bias_calculations["fp_group_size_model_privileged"] = bias_calculations["num_false_positives_model_privileged"] / bias_calculations["n_privileged"]
bias_calculations["fp_group_size_model_unprivileged"] = bias_calculations["num_false_positives_model_unprivileged"] / bias_calculations["n_unprivileged"]

bias_calculations["fp_group_size_model_diff"] = bias_calculations["fp_group_size_model_unprivileged"] - bias_calculations["fp_group_size_model_privileged"]
bias_calculations["fp_group_size_model_ratio"] = bias_calculations["fp_group_size_model_unprivileged"] / bias_calculations["fp_group_size_model_privileged"]


# Number of investigated applications per group behind the model figures.
# NOTE(review): these are the unscaled prepilot counts, whereas the false positives above are
# scaled to the overall dataset — confirm that this difference is intended.
n_model = pd.DataFrame(
    (
        bias_dfs[('selected_by_model_not_process_and_in_prepilot', 'afgewezen')][["n_privileged", "n_unprivileged"]].values + 
        bias_dfs[('selected_by_model_not_process_and_in_prepilot', 'niet_afgewezen')][["n_privileged", "n_unprivileged"]].values +
        bias_dfs[('selected_by_both', 'all')][["n_privileged", "n_unprivileged"]].values
    ),
    columns=["n_privileged_model", "n_unprivileged_model"]
)


bias_calculations = pd.concat([
    bias_calculations,
    n_model
], axis=1)

In [None]:
# Per feature: unscaled false positives of the model, summed over the two prepilot subsets
# and the applications selected by both model and process.
print("Unscaled false positives van het model per privileged/unprivileged groep")
pd.DataFrame(
    sum(
        bias_dfs[selection][["num_false_positives_privileged", "num_false_positives_unprivileged"]].values
        for selection in [
            ('selected_by_model_not_process_and_in_prepilot', 'afgewezen'),
            ('selected_by_model_not_process_and_in_prepilot', 'niet_afgewezen'),
            ('selected_by_both', 'all'),
        ]
    ),
    columns=["num_false_positives_privileged", "num_false_positives_unprivileged"],
    index=bias_calculations["feature"],
)

In [None]:
# Per feature: unscaled false positives of the process, summed over the applications selected
# by the process only and those selected by both model and process.
print("Unscaled false positives van het proces per privileged/unprivileged groep")
pd.DataFrame(
    sum(
        bias_dfs[selection][["num_false_positives_privileged", "num_false_positives_unprivileged"]].values
        for selection in [
            ('selected_by_process_not_model', 'all'),
            ('selected_by_both', 'all'),
        ]
    ),
    columns=["num_false_positives_privileged", "num_false_positives_unprivileged"],
    index=bias_calculations["feature"],
)

In [None]:
# NOTE(review): both percentages are hard-coded text, not computed in this notebook section —
# confirm they still match the data before reusing this report.
print("Percentage geselecteerd van alles door het huidige proces: 7%")
print("Percentage geselecteerd van alles door het model: 14%")

In [None]:
# Keep only the columns whose name contains one of the reporting markers: the feature name,
# the group values, the diff/ratio results and the model group sizes.
bias_results = bias_calculations[
    [
        c
        for c in bias_calculations.columns
        if any(
            marker in c
            for marker in ("ratio", "diff", "feature", "values", "privileged_model")
        )
    ]
]
bias_results

In [None]:
import ast

In [None]:
# Replace the coded privileged/unprivileged value lists of the categorical
# features with readable labels.
geslacht_mapping = {1: "M", 2: "V"}

# Feature name -> {code: label} lookup used for that feature's value lists.
# The keys double as the list of features to translate (in this order).
mappings = {
    "geslacht": geslacht_mapping,
    "nationaliteit_code_split1": nationaliteit_mapping,
    "nationaliteit_code_split2": nationaliteit_mapping,
    "geboorteland_code_split1": geboorteland_mapping,
    "geboorteland_code_split2": geboorteland_mapping,
    "burgerlijke_staat_code": burgerlijke_staat_mapping,
}

bias_results_mapped = bias_results.copy()

# `.at` is label-based, so look up each feature's row *label* once. The
# previous positional lookup (np.where) only hit the right row when the index
# happened to be a default 0..n-1 RangeIndex.
row_label_by_feature = {
    feature_name: bias_results_mapped.index[bias_results_mapped["feature"] == feature_name][0]
    for feature_name in mappings
}

for value_column in ["privileged_values", "unprivileged_values"]:
    for feature_name, code_to_label in mappings.items():
        # Progress output; also shows which feature was being handled if a code
        # is missing from its mapping (KeyError below).
        print(value_column, feature_name)

        row_label = row_label_by_feature[feature_name]
        # The cell holds the list of codes as a string literal, e.g. "[1, 2]".
        codes = ast.literal_eval(bias_results_mapped.at[row_label, value_column])
        results = [code_to_label[code] for code in codes]

        bias_results_mapped.at[row_label, value_column] = results

In [None]:
# Display the bias results with the value lists translated to readable labels.
bias_results_mapped

In [None]:
# Write the mapped bias results to an Excel file in the working directory,
# leaving out the DataFrame index.
bias_results_mapped.to_excel("20220525_bias_results_bugs_solved.xlsx", index=False)

# Which features are related to nationality?

In [None]:
# Work on a copy so the helper columns added below do not end up in data_to_analyze.
further_analysis_df = data_to_analyze.copy()

In [None]:
# Inspect the nationality codes that the group flags below are derived from.
further_analysis_df["nationaliteit_code"]

In [None]:
# Boolean membership flags per grouping. Each entry is
# (new flag column, code column, key in features_to_check, side), where side 0
# and side 1 select the two code lists stored under that key.
group_flag_specs = [
    ("nationality_west", "nationaliteit_code", "nationaliteit_code_split1", 0),
    ("nationality_nonwest", "nationaliteit_code", "nationaliteit_code_split1", 1),
    ("nationality_dutch", "nationaliteit_code", "nationaliteit_code_split2", 0),
    ("nationality_nondutch", "nationaliteit_code", "nationaliteit_code_split2", 1),
    ("geboorteland_west", "geboorteland_code", "geboorteland_code_split1", 0),
    ("geboorteland_nonwest", "geboorteland_code", "geboorteland_code_split1", 1),
    ("geboorteland_dutch", "geboorteland_code", "geboorteland_code_split2", 0),
    ("geboorteland_nondutch", "geboorteland_code", "geboorteland_code_split2", 1),
]

for flag_column, code_column, split_key, side in group_flag_specs:
    codes_in_group = features_to_check[split_key][side]
    further_analysis_df[flag_column] = further_analysis_df[code_column].isin(codes_in_group)

In [None]:
import scipy

In [None]:
# Sanity check of the point-biserial correlation on one grouping/feature pair.
# The pair is defined here so this cell runs on a fresh kernel instead of
# relying on `tmp_df`, `group` and `feature` left behind by the loop in a later
# cell (which raised NameError on Restart & Run All).
example_group = ("nationality_west", "nationality_nonwest")
# NOTE(review): assumes the first column is a numeric feature — confirm.
example_feature = further_analysis_df.columns[0]

# Only rows that belong to one of the two sides of the grouping.
example_df = further_analysis_df[
    further_analysis_df[example_group[0]] | further_analysis_df[example_group[1]]
]
scipy.stats.pointbiserialr(example_df[example_group[0]], example_df[example_feature])

In [None]:
# For every grouping (privileged flag, unprivileged flag) and every feature:
# the point-biserial correlation between membership of the first group and the
# feature, plus the feature mean within each group.
# NOTE(review): the first 15 columns are taken to be the model features —
# confirm whenever the column layout changes.
cols_to_print = further_analysis_df.columns[:15]  # Only the features

correlation_results = []

for group in [
    ("nationality_west", "nationality_nonwest"),
    ("nationality_dutch", "nationality_nondutch"),
    ("geboorteland_west", "geboorteland_nonwest"),
    ("geboorteland_dutch", "geboorteland_nondutch"),
]:
    in_group0 = further_analysis_df[group[0]]
    in_group1 = further_analysis_df[group[1]]

    means_group0 = further_analysis_df.loc[in_group0, cols_to_print].mean()
    means_group1 = further_analysis_df.loc[in_group1, cols_to_print].mean()

    # Rows on either side of this grouping. This does not depend on the
    # feature, so it is built once per grouping instead of once per feature.
    tmp_df = further_analysis_df[in_group0 | in_group1]

    for feature, m0 in means_group0.items():
        # Tuple unpacking works regardless of how the scipy version names the
        # attributes of the result object.
        correlation, p_value = scipy.stats.pointbiserialr(tmp_df[group[0]], tmp_df[feature])

        # Keep the numbers numeric. Formatting them to fixed-width strings
        # collapsed every p-value below 1e-5 to "0.00000", which made any later
        # ranking on p-value arbitrary among the most significant features.
        correlation_results.append({
            "grouping": group,
            "feature": feature,
            "correlation": correlation,
            "correlation_p_value": p_value,
            "group0_mean": m0,
            "group1_mean": means_group1[feature],  # paired by label, not position
        })

In [None]:
# One row per (grouping, feature) result dict collected in correlation_results.
correlation_results_df = pd.DataFrame(correlation_results)

In [None]:
# First three rows of every grouping after sorting on grouping and then on the
# correlation_p_value column (ascending).
(
    correlation_results_df
    .sort_values(["grouping", "correlation_p_value"], ascending=True)
    .groupby("grouping")
    .head(3)
)

Conclusies:

- Nationaliteit
    * `nationaliteit_code_split1` betekent westers is privileged, niet-westers is unprivileged.
    * `nationaliteit_code_split2` betekent Nederlands is privileged, niet-Nederlands is unprivileged.
    * Het model is biased tegen niet-westerse aanvragers, terwijl dat in het proces helemaal niet zo is.
    * Krijgen we bij 2 nationaliteiten altijd de niet-westerse nationaliteit? Nee, willekeurig.
    * West vs. non-west: Correlatie tussen nationaliteit en features. Wat hebben mensen die niet-westerse nationaliteit hebben en FP zijn met elkaar gemeen?
- Vermogen
    * `total_vermogen_split1` betekent vermogen >= 0 is privileged, vermogen < 0 is unprivileged.
    * `total_vermogen_split2` betekent vermogen > 0 is privileged, vermogen < 0 is unprivileged, dus vermogen = 0 wordt buiten beschouwing gelaten.
    * `total_vermogen_split1` heeft in het proces een veel sterkere bias tegenover unprivileged dan `total_vermogen_split2`. Dat duidt er op dat het proces positief kijkt naar aanvragen met nul vermogen. 
    * `total_vermogen_split2` is waarschijnlijk interessanter om naar te kijken, omdat nul vermogen heel 'straightforward' is, dus minder ruimte voor onderzoekswaardigheid. Terwijl vermogen < 0 schulden zijn en vermogen > 0 betekent dat je misschien boven de vermogensgrens uitkomt.
    
- Deelnames
    * `deelnames_started_percentage_last_year_equals_zero` betekent:
        - privileged: mensen die in het afgelopen jaar een of meer deelnames zijn gestart; people who started something or everything
        - unprivileged: mensen die in het afgelopen jaar geen enkele deelname zijn gestart, inclusief de mensen voor wie überhaupt geen deelnames in het systeem staan; people who started nothing last year (incl. those who weren't in the system last year)
    * `deelnames_started_percentage_last_year_equals_one` betekent:
        - privileged: mensen die in het afgelopen jaar niet alle deelnames zijn gestart die in het systeem staan of mensen die überhaupt niet in het systeem voorkwamen; people who didn't start everything or who weren't in the system last year
        - unprivileged: mensen die in het afgelopen jaar alle deelnames zijn gestart die in het systeem staan; people who started everything last year
        
- Partner
    * Geen partner is privileged, wel partner is unprivileged.
    * Verschil tussen de biases in `has_partner` en `burgerlijke_staat_code`: proces heeft bij burgerlijke staat een bias tegenover singles/gescheiden/weduwen, maar bij partner een bias tegenover mensen met partner > conclusie dat de bias zich vooral tegen ongetrouwde mensen met een partner richt.

Calculate FP group size for priv/unpriv and for proces/model.

For proces:
- Calculate nr. of false positives on everything selected only by proces, or by proces and by model.
    * This can be done by simply adding up the FPs for only proces and the FPs for proces and model.
- Divide by total number, regardless of whether selected or not.

For model:
- Calculate nr. of false positives on everything selected only by model, or by proces and by model.
    * This can be done by adding up the FPs for only model and the FPs for proces and model.
    * First we have to calculate the FPs for only model. We should scale them to represent the fraction of afgewezen/niet-afgewezen in the whole dataset, because the fraction afgewezen/niet-afgewezen is different in the only-model sample.
- Divide by total number, regardless of whether selected or not.


In [None]:
# Pretty-print the per-metric info stored on BiasAnalyzer.
import pprint

metric_info = BiasAnalyzer.log_info_by_metric
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(metric_info)

In [None]:
# Don't run the stuff below
# Deliberate stop: the assertion always fails, so "Run All" halts at this cell.
# (Assertions are skipped when Python runs with -O.)
assert False

- False discovery rate difference is positive (0.011), meaning that the female group is (barely) disadvantaged.
- False positive rate difference is negative (-0.089), meaning that the male group is disadvantaged.
- False positive/group size difference is negative (-0.054), meaning that the male group is disadvantaged.

#### False discovery rate difference interpretation
FDR male = 0.1
Out of all the males we investigate, 10% are actually innocent.

FDR female = 0.1 + 0.011 = 0.111
Out of all the females we investigate, 11.1% are actually innocent.

If we investigate a woman, she is 1.1 percentage points more likely to be innocent than a man we investigate.


#### False positive rate difference interpretation
FPR male = 0.1
If you are an innocent male, then you have a 10% chance of being investigated anyway.

FPR female = 0.1 - 0.089 = 0.011
If you are an innocent female, then you have a 1.1% chance of being investigated anyway.

The chance of being investigated as an innocent male is 8.9 percentage points higher than as an innocent female.


#### False positive/group size difference interpretation
FP/GS male = 0.089
A random man has an 8.9% chance to be wrongly investigated.

FP/GS female = 0.035
A random woman has a 3.5% chance to be wrongly investigated.

The chance of being wrongly investigated for a random man is 5.4 percentage points higher than for a random woman.