# Summary

This notebook provides basic functionality to analyze a preprocessed dataset and a trained model.

# Import + settings

In [1]:
from IPython.display import display, HTML
# Widen the notebook container to 90% so wide tables and figures fit.
wide_container_css = HTML("<style>.container { width:90% !important; }</style>")
display(wide_container_css)

In [2]:
import numpy as np
import pandas as pd
import datetime as dt
import joblib

import plotly.graph_objs as go
import plotly.offline as py

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import seaborn as sns

# One-time plotting setup; the two calls are independent of each other.
py.init_notebook_mode(connected=True)  # enable plotly rendering inside the notebook
plt.style.use('ggplot')                # matplotlib style used by every figure below

In [3]:
from wpi_onderzoekswaardigheid_aanvraag.project_paths import ARTIFACT_PATH, DATA_PATH, CONFIG_PATH
from wpi_onderzoekswaardigheid_aanvraag.model.manage_model_info import load_feature_list
from wpi_onderzoekswaardigheid_aanvraag.settings.settings import WPISettings

ModuleNotFoundError: No module named 'wpi_onderzoekswaardigheid_aanvraag'

# Load model + data

In [None]:
# Project settings have to be loaded first; later cells read them through
# WPISettings.get_settings().
WPISettings.set_from_yaml(CONFIG_PATH)

# Artifact locations. `model_file` is only read in the feature-importance and
# threshold sections further down.
model_file = ARTIFACT_PATH / 'model.pkl'
data_file = DATA_PATH / 'transformed_data.pkl'

# NOTE(review): unpickling executes arbitrary code, so only point this at
# files produced by a trusted pipeline run.
dataset = pd.read_pickle(data_file)

# Select relevant features

In [None]:
# Feature names used by the model, split into numerical and categorical.
num_cols, cat_cols = load_feature_list()

# Identifier, label and bookkeeping columns that are kept next to the features.
meta_cols = [
    "application_dienstnr",
    "onderzoekswaardig",
    "is_screening_ic",
    "is_screening_hh",
    "is_onderzoek_hh",
    "pro_id",
    "pro_startdatum",
    "pro_einddatum",
    "afgewezen",
]
relevant_cols = meta_cols + cat_cols + num_cols

# Keep only the relevant columns and encode booleans as 0/1 so that they can
# be averaged, correlated and plotted like any other numeric column.
df = dataset[relevant_cols].replace(False, 0).replace(True, 1)

# First look

In [None]:
# Preview the first five rows of the selected columns.
df.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes of the selected columns.
df.info()

# Screening vs. onderzoek

In [None]:
# Mean of each handling-type flag over the full (unfiltered) dataset.
handling_flag_cols = ["is_screening_ic", "is_screening_hh", "is_onderzoek_hh"]
dataset[handling_flag_cols].mean()

# Missing values

In [None]:
import missingno as msno

# Missing-value matrix over every selected column.
matrix_figsize = (20, 14)
matrix_color = (0.42, 0.1, 0.05)
msno.matrix(df=df, figsize=matrix_figsize, color=matrix_color)

# Label

In [None]:
import matplotlib as mpl

In [None]:
# Larger default font for the pie charts in this section.
mpl.rcParams.update({'font.size': 20})

In [None]:
# Class balance of the target over all applications in `df`.
labels = ['niet ondzw', 'wel ondzw']
explode = (0, 0.1)

# value_counts() orders its result by frequency, not by label value. Sort by
# the label (0 first, then 1) so that each wedge is guaranteed to receive the
# matching entry of `labels`, also when the positive class is the majority.
label_counts = df["onderzoekswaardig"].value_counts().sort_index()

plt.figure(figsize=(20,10))
plt.pie(label_counts.values, explode=explode, labels=labels,
        autopct='%1.1f%%', shadow=False, startangle=90)
plt.title("All types")
plt.show()

In [None]:
# Class balance of the target for applications that are not "screening IC".
labels = ['niet ondzw', 'wel ondzw']
explode = (0, 0.1)

not_screening_ic = ~df["is_screening_ic"].astype(bool)

# value_counts() orders its result by frequency, not by label value. Sort by
# the label (0 first, then 1) so that each wedge is guaranteed to receive the
# matching entry of `labels`, also when the positive class is the majority.
label_counts = df.loc[not_screening_ic, "onderzoekswaardig"].value_counts().sort_index()

plt.figure(figsize=(20,10))
plt.pie(label_counts.values, explode=explode, labels=labels,
        autopct='%1.1f%%', shadow=False, startangle=90)
plt.title("Only HH")
plt.show()

# Correlation heatmap

Note that, due to its size, the heatmap is best viewed outside the Jupyter notebook: open the saved `correlation_heatmap.png` in a browser or image viewer.

In [None]:
mpl.rcParams['font.size'] = 14

colormap = plt.cm.afmhot
plt.figure(figsize=(50, 50))
plt.title('Correlation of features')

# Correlate numeric/boolean columns only. `df` also carries identifier, date
# and categorical columns; recent pandas versions raise on those instead of
# silently dropping them.
corr = df.select_dtypes(include=["number", "bool"]).corr()

# Hide the diagonal and the redundant upper triangle with an explicit boolean
# mask. Passing np.triu(corr) itself would use the correlation *values* as
# truthiness, so a correlation of exactly 0 would stay visible.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

hm = sns.heatmap(corr, linewidths=0.1, vmax=1.0, square=True,
                 cmap=colormap, linecolor='white', annot=True, fmt=".2f",
                 annot_kws={"fontsize": 8}, mask=upper_triangle)

# Written to the working directory; the figure is too large to read inline.
fig = hm.get_figure()
fig.savefig("correlation_heatmap.png")

# Feature importance

In [None]:
# Load the persisted model artifact; the cells below read its
# 'feature_importance' and 'model' entries.
# NOTE(review): joblib.load unpickles, so only use it on trusted files.
model_dict = joblib.load(model_file)

In [None]:
# Rank the stored feature importances, most important first.
stored_importance = model_dict['feature_importance']
feature_importance = stored_importance.sort_values(by='f_imp', ascending=False)

# Keep a copy on disk (working directory) next to the displayed table.
feature_importance.to_csv("feature_importances.txt", index=False)

feature_importance

# Change classification threshold

In [None]:
from fraude_preventie.evaluation.evaluation import evaluate_performance
from wpi_onderzoekswaardigheid_aanvraag.entrypoints.train_model import _prepare_train_test_data, evaluate_model
from wpi_onderzoekswaardigheid_aanvraag.model.build_model import filter_application_handling

In [None]:
import pprint
# Pretty-printer for nested structures, indenting two spaces per level.
PP_INDENT = 2
pp = pprint.PrettyPrinter(indent=PP_INDENT)

In [None]:
# Reload the artifact and pull the fitted pipeline apart into its two steps.
model_dict = joblib.load(model_file)
model = model_dict["model"]

pipeline_steps = model.named_steps
prep, clf = pipeline_steps["prep"], pipeline_steps["clf"]

In [None]:
# Restrict the data to the handling types configured for the model.
model_settings = WPISettings.get_settings()["model"]
include_handling_types = model_settings["handling_types"]
df_hh = filter_application_handling(df, include_handling_types)

# Build the train/test split with the training entrypoint's own helper.
# NOTE(review): this rebinds `cat_cols`/`num_cols`, and unpacks them in the
# opposite order from the earlier `num_cols, cat_cols = load_feature_list()`
# cell - confirm the helper's return order.
(cat_cols, num_cols,
 X_train, y_train,
 X_test, y_test) = _prepare_train_test_data(df_hh, "onderzoekswaardig")

# Full feature matrix / label vector: train rows followed by test rows.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [None]:
# Sweep the decision threshold over 0.10 .. 0.90 in 81 evenly spaced steps
# (rounded to two decimals) and record, per threshold, the precision on the
# test set and the fraction of test rows that would be predicted positive.
thresholds = list(np.round(np.linspace(0.1, 0.9, num=81), 2))

predicted_probs = model.predict_proba(X_test)
positive_scores = predicted_probs[:, 1]  # column 1 is used as the score for class 1

plot_data = []
for thr in thresholds:
    # `preds` deliberately stays a notebook-level name: later cells read it.
    preds = (positive_scores >= thr).astype('int')
    perf = evaluate_performance(preds, positive_scores, y_test)
    frac_pos_pred = preds.mean()

    plot_data.append(dict(
        precision=perf["precision"],
        frac_pos_pred=frac_pos_pred,
        threshold=thr,
    ))

plot_df = pd.DataFrame(plot_data)

In [None]:
# Precision (left axis) and fraction of positive predictions (right axis) as
# a function of the score threshold, taken from the sweep stored in `plot_df`.
# The figure handle is bound to `fig` (it was misspelled `ig`), so `fig` refers
# to this figure rather than to an earlier one.
fig, ax = plt.subplots(1, 1, figsize=(10,7))
ax.plot(plot_df["threshold"], plot_df["precision"], color="blue", marker="o")
ax.grid()
ax.set_xlabel("Score threshold")
ax.set_ylabel("Hit rate/precision", color="blue")
# NOTE(review): the "baseline" line is drawn at precision 1 and its label is
# never shown (no legend) - confirm the intended baseline value.
ax.axhline(1, color="black", linestyle="--", label="baseline")

# Second y-axis sharing the threshold axis.
ax2 = ax.twinx()
ax2.plot(plot_df["threshold"], plot_df["frac_pos_pred"], color="orange", marker="o")
ax2.set_ylabel("Fraction sent to HH", color="orange")
ax.set_title("Precision vs. fraction sent to HH")
plt.show()

In [None]:
# Test features joined (on the index) with the "afgewezen" flag from df_hh.
# Built here, before its first use, so the notebook runs top to bottom on a
# fresh kernel instead of relying on a cell further down having been run.
X_tmp = pd.merge(X_test, df_hh["afgewezen"], left_index=True, right_index=True)

# Mean of the "afgewezen" flag over the test set.
X_tmp["afgewezen"].mean()

In [None]:
# Number of test rows with label 0 that have the "afgewezen" flag set and are
# predicted positive.
# NOTE(review): `preds` is whatever an earlier cell left behind; when run top
# to bottom that is the last threshold of the sweep - confirm this is intended.
label_negative = y_test == 0
predicted_positive = preds == 1
(label_negative & X_tmp["afgewezen"] & predicted_positive).sum()

In [None]:
# Operating point inspected in this cell.
SCORE_THRESHOLD = 0.54

predicted_probs = model.predict_proba(X_test)
preds = (predicted_probs[:, 1] >= SCORE_THRESHOLD).astype('int')

# `preds` is positional while X_tmp columns are indexed; the rows must line up
# one-to-one or the combinations below would silently mix up applications.
assert X_tmp.index.equals(X_test.index), "X_tmp and X_test are not row-aligned"

flagged = preds == 1
n_flagged = flagged.sum()

# Only the last expression of a cell is displayed, so both ratios are gathered
# into one Series instead of leaving the first one computed but never shown.
pd.Series({
    # Share of flagged rows whose label is 1.
    "precision": (flagged & (y_test == 1)).sum() / n_flagged,
    # Share of flagged rows that have the "afgewezen" flag set.
    "frac_afgewezen_among_flagged": (flagged & X_tmp["afgewezen"]).sum() / n_flagged,
})

In [None]:
# Precision of the current `preds`: flagged rows with label 1 / all flagged rows.
flagged = preds == 1
(flagged & (y_test == 1)).sum() / flagged.sum()

In [None]:
# Test features joined (on the index) with the "afgewezen" flag from df_hh.
X_tmp = X_test.merge(df_hh["afgewezen"], left_index=True, right_index=True)

# Rows where the flag is set and equals the label.
afgewezen_flag = X_tmp["afgewezen"]
X_tmp[afgewezen_flag & (afgewezen_flag == y_test)]

In [None]:
# Rows 40..59 of the sweep table, i.e. thresholds 0.50 through 0.69.
plot_df.iloc[40:60]

In [None]:
# Distribution of the class-1 scores over ten equal-width bins on [0, 1].
# The edges are spelled out literally because they are printed verbatim below.
score_bin_edges = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
bin_counts = np.histogram(predicted_probs[:, 1], bins=score_bin_edges)
counts, bins = bin_counts

# Fraction of all scored rows that falls into each bin.
perc_counts = counts / len(predicted_probs)

In [None]:
# One line per bin: its lower and upper edge and the share of rows it holds.
n_bins = len(bins) - 1
for i in range(n_bins):
    print(f"Score {bins[i]} - {bins[i+1]}: {perc_counts[i]*100:.2f}%")