In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [7]:
# Let's load the dataset
data = pd.read_csv('data/investigation_train_large_checked.csv')

# Let's specify the features and the target
y = data['checked']
X = data.drop(columns=['checked', 'Ja', 'Nee'])
X = X.astype(np.float32)

# Let's split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
# Select data based on variance (not the final version yet, for now just for testing)
selector = VarianceThreshold()

In [9]:
# Define a random forest classifier
# classifier = RandomForestClassifier(n_estimators=100, max_depth=1, random_state=0)

classifier = HistGradientBoostingClassifier(max_depth=None, learning_rate=0.05, max_iter=300)


In [10]:
# Create a pipeline object with our selector and classifier
# NOTE: You can create custom pipeline objects but they must be registered to onnx or it will not recognise them
# Because of this we recommend using the onnx known objects as defined in the documentation
# pipeline = Pipeline(steps=[('feature selection', selector), ('classification', classifier)])
pipeline = Pipeline(steps=[('classification', classifier)])

In [11]:
# Let's train a simple model
pipeline.fit(X_train, y_train)

# Let's evaluate the model
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of the original model: ', original_accuracy)

Accuracy of the original model:  0.954276923076923


In [93]:
# Let's convert the model to ONNX
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
    target_opset=12)

# Let's check the accuracy of the converted model
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx =  sess.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9537846153846153


In [94]:
# Let's save the model
onnx.save(onnx_model, "model/bad_model_histGradBoosting.onnx")
# onnx.save(onnx_model, "model/random_forest.onnx")

# Let's load the model
new_session = rt.InferenceSession("model/bad_model_histGradBoosting.onnx")
# new_session = rt.InferenceSession("model/random_forest.onnx")

# Let's predict the target
y_pred_onnx2 =  new_session.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)


Accuracy of the ONNX model:  0.9537846153846153


## Partition Test

In [1]:
from openpyxl import load_workbook
import pandas as pd
import numpy as np

def load_feature_groups(path):
    wb = load_workbook(path, data_only=True)
    ws = wb.active

    TARGET_COLOR = "FF5EB91E"  # green color used for good features

    def get_color(cell):
        if cell.fill.patternType is None:
            return None
        return cell.fill.fgColor.rgb

    good = []
    bad = []

    for row in ws.iter_rows(min_row=2, values_only=False):  # skip header
        feature_name = row[1].value      # column B = feature name in Dutch
        color = get_color(row[1])        

        if color == TARGET_COLOR:
            good.append(feature_name)
        else:
            bad.append(feature_name)

    return good, bad

good_features, bad_features = load_feature_groups("data/data_description_colored.xlsx")
good_features, bad_features


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


(['adres_aantal_brp_adres',
  'adres_aantal_verschillende_wijken',
  'adres_aantal_verzendadres',
  'adres_aantal_woonadres_handmatig',
  'adres_dagen_op_adres',
  'afspraak_aanmelding_afgesloten',
  'afspraak_afgelopen_jaar_afsprakenplan',
  'afspraak_afgelopen_jaar_ontheffing',
  'afspraak_afgelopen_jaar_plan_van_aanpak',
  'afspraak_afgelopen_jaar_signaal_voor_medewerker',
  'afspraak_afgelopen_jaar_vervolgmeting_matchbaarheid_werkzoekende_klant',
  'afspraak_afgelopen_jaar_voortgang_aanmelding_en_deelname',
  'afspraak_afsprakenplan',
  'afspraak_controle_aankondiging_maatregel',
  'afspraak_controle_verwijzing',
  'afspraak_deelname_compleet_uit_webapplicatie',
  'afspraak_inspanningsperiode',
  'afspraak_laatstejaar_resultaat_ingevuld',
  'afspraak_laatstejaar_resultaat_ingevuld_uniek',
  'afspraak_other',
  'afspraak_participatietrede_vervolgmeting',
  'afspraak_resultaat_ingevuld_uniek',
  'afspraak_signaal_van_aanbieder',
  'afspraak_signaal_voor_medewerker',
  'afspraak_toevo

In [2]:
# Split bad features into groups: address-related, appointment-related, medical-related, competence-related, 
# language-related, gender-related, age-related, personal qualities-related, relationship-related
def partition_bad_features(bad_features):
    address_mask = ["adres"]
    appointment_mask = ["afspraak"]
    medical_mask = ["lichamelijke", "psychische", "verslaving", "medische", ]
    competence_mask = ["competentie"]
    language_mask = ["taal", "_nl_"]
    gender_mask = ["geslacht"]
    age_mask = ["leeftijd"]
    relationship_mask = ["sociaal", "sociale", "relatie"]
    # personal qualities: others

    groups = {
        "address": [],
        "appointment": [],
        "medical": [],
        "competence": [],
        "language": [],
        "gender": [],
        "age": [],
        "relationship": [],
        "personal_qualities": [],
    }

    # Helper function to check if any keyword is in the feature name
    def contains_keyword(feature, keywords):
        return any(keyword in feature.lower() for keyword in keywords)
    
    # Iterate through bad features and assign them to groups
    for feature in bad_features:
        if contains_keyword(feature, address_mask):
            groups["address"].append(feature)
        elif contains_keyword(feature, appointment_mask):
            groups["appointment"].append(feature)
        elif contains_keyword(feature, medical_mask):
            groups["medical"].append(feature)
        elif contains_keyword(feature, competence_mask):
            groups["competence"].append(feature)
        elif contains_keyword(feature, language_mask):
            groups["language"].append(feature)
        elif contains_keyword(feature, gender_mask):
            groups["gender"].append(feature)
        elif contains_keyword(feature, age_mask):
            groups["age"].append(feature)
        elif contains_keyword(feature, relationship_mask):
            groups["relationship"].append(feature)
        else:
            groups["personal_qualities"].append(feature)

    return groups

In [3]:
grouped_bad_features = partition_bad_features(bad_features)
grouped_bad_features

{'address': ['adres_recentst_onderdeel_rdam',
  'adres_recentste_buurt_groot_ijsselmonde',
  'adres_recentste_buurt_nieuwe_westen',
  'adres_recentste_buurt_other',
  'adres_recentste_buurt_oude_noorden',
  'adres_recentste_buurt_vreewijk',
  'adres_recentste_plaats_other',
  'adres_recentste_plaats_rotterdam',
  'adres_recentste_wijk_charlois',
  'adres_recentste_wijk_delfshaven',
  'adres_recentste_wijk_feijenoord',
  'adres_recentste_wijk_ijsselmonde',
  'adres_recentste_wijk_kralingen_c',
  'adres_recentste_wijk_noord',
  'adres_recentste_wijk_other',
  'adres_recentste_wijk_prins_alexa',
  'adres_recentste_wijk_stadscentru',
  'adres_unieke_wijk_ratio'],
 'appointment': ['afspraak_aantal_woorden',
  'afspraak_afgelopen_jaar_monitoring_insp__wet_taaleis_na_12_mnd_n_a_v__taa04_____geen_maatregel',
  'afspraak_afgelopen_jaar_ontheffing_taaleis',
  'afspraak_galo_gesprek',
  'afspraak_gespr__einde_zoekt___galo_gesprek_',
  'afspraak_laatstejaar_aantal_woorden'],
 'medical': ['belemmer

In [13]:
# Check how many unique values each bad feature has, in each group
grouped_feature_unique_values = {}
for group, features in grouped_bad_features.items():
    feature_unique_values = {}
    for feature in features:
        unique_values = data[feature].nunique()
        feature_unique_values[feature] = unique_values
    grouped_feature_unique_values[group] = feature_unique_values

grouped_feature_unique_values

{'address': {'adres_recentst_onderdeel_rdam': 2,
  'adres_recentste_buurt_groot_ijsselmonde': 2,
  'adres_recentste_buurt_nieuwe_westen': 2,
  'adres_recentste_buurt_other': 2,
  'adres_recentste_buurt_oude_noorden': 2,
  'adres_recentste_buurt_vreewijk': 2,
  'adres_recentste_plaats_other': 2,
  'adres_recentste_plaats_rotterdam': 2,
  'adres_recentste_wijk_charlois': 2,
  'adres_recentste_wijk_delfshaven': 2,
  'adres_recentste_wijk_feijenoord': 2,
  'adres_recentste_wijk_ijsselmonde': 2,
  'adres_recentste_wijk_kralingen_c': 2,
  'adres_recentste_wijk_noord': 2,
  'adres_recentste_wijk_other': 2,
  'adres_recentste_wijk_prins_alexa': 2,
  'adres_recentste_wijk_stadscentru': 2,
  'adres_unieke_wijk_ratio': 2},
 'appointment': {'afspraak_aantal_woorden': 967,
  'afspraak_afgelopen_jaar_monitoring_insp__wet_taaleis_na_12_mnd_n_a_v__taa04_____geen_maatregel': 2,
  'afspraak_afgelopen_jaar_ontheffing_taaleis': 2,
  'afspraak_galo_gesprek': 4,
  'afspraak_gespr__einde_zoekt___galo_gesprek

In [14]:
# For each group, calculate the lowest number of unique values, it will be used as the number of partitions in each group
# Except for age, since age is continuous, we can set a fixed number of partitions, e.g. 2
partition_counts = {}
for group, feature_unique_values in grouped_feature_unique_values.items():
    if group == "age":
        partition_counts[group] = 2
    else:
        min_unique_values = min(feature_unique_values.values()) if feature_unique_values else 0
        partition_counts[group] = min_unique_values
partition_counts


{'address': 2,
 'appointment': 2,
 'medical': 2,
 'competence': 2,
 'language': 2,
 'gender': 2,
 'age': 2,
 'relationship': 2,
 'personal_qualities': 2}

In [15]:
# Create partitions on df data for every bad feature based on its group partition count
def create_partitions_for_bad_features(df, grouped_bad_features, partition_counts):
    partitions = {}
    for group, features in grouped_bad_features.items():
        partitions[group] = {}
        num_partitions = partition_counts[group]

        for feature in features:
            if feature not in df.columns:
                partitions[group][feature] = []
                continue

            unique_values = df[feature].nunique()

            # if the number of unique values is less than or equal to the number of partitions, use unique values as partitions
            if unique_values <= num_partitions:
                partition_values = sorted(df[feature].dropna().unique().tolist())

            else:
                # Try qcut to create partitions using quantiles
                bins = pd.qcut(df[feature], q=num_partitions, duplicates='drop')
                partition_values = sorted(bins.unique().tolist(), key=lambda x: str(x))

                # If qcut failed to produce enough partitions
                if len(partition_values) < num_partitions:
                    # Fallback to using value-based partitions
                    unique_values = sorted(df[feature].dropna().unique().tolist())

                    # If still not enough, use equal-width binning
                    if len(unique_values) < num_partitions:
                        # Force equal-width binning
                        partition_values = unique_values
                    else:
                        # Create equal-width bins
                        bins = pd.cut(df[feature], bins=num_partitions)
                        partition_values = bins.unique().tolist()

            partitions[group][feature] = partition_values
    return partitions

feature_partitions = create_partitions_for_bad_features(data, grouped_bad_features, partition_counts)
feature_partitions

{'address': {'adres_recentst_onderdeel_rdam': [0, 1],
  'adres_recentste_buurt_groot_ijsselmonde': [0, 1],
  'adres_recentste_buurt_nieuwe_westen': [0, 1],
  'adres_recentste_buurt_other': [0, 1],
  'adres_recentste_buurt_oude_noorden': [0, 1],
  'adres_recentste_buurt_vreewijk': [0, 1],
  'adres_recentste_plaats_other': [0, 1],
  'adres_recentste_plaats_rotterdam': [0, 1],
  'adres_recentste_wijk_charlois': [0, 1],
  'adres_recentste_wijk_delfshaven': [0, 1],
  'adres_recentste_wijk_feijenoord': [0, 1],
  'adres_recentste_wijk_ijsselmonde': [0, 1],
  'adres_recentste_wijk_kralingen_c': [0, 1],
  'adres_recentste_wijk_noord': [0, 1],
  'adres_recentste_wijk_other': [0, 1],
  'adres_recentste_wijk_prins_alexa': [0, 1],
  'adres_recentste_wijk_stadscentru': [0, 1],
  'adres_unieke_wijk_ratio': [0, 1]},
 'appointment': {'afspraak_aantal_woorden': [Interval(-0.001, 230.0, closed='right'),
   Interval(230.0, 1214.0, closed='right')],
  'afspraak_afgelopen_jaar_monitoring_insp__wet_taaleis_n

In [51]:
# Create row subsets for each feature partition
def create_row_subsets_for_partitions(df, feature_partitions):
    row_subsets = {}
    for group, features in feature_partitions.items():
        row_subsets[group] = {}
        for feature, partition_values in features.items():
            row_subsets[group][feature] = {}
            for pv in partition_values:
                # If pv is a simple numeric value (e.g., 0 or 1)
                if not hasattr(pv, 'left'):   # pv is NOT an Interval (not qcut)
                    subset = df[df[feature] == pv]

                else:
                    # pv is a qcut Interval (e.g., Interval(0.1, 0.4))
                    left = pv.left
                    right = pv.right

                    subset = df[(df[feature] >= left) & (df[feature] <= right)]

                # Store subset
                row_subsets[group][feature][str(pv)] = subset

    return row_subsets

row_subsets = create_row_subsets_for_partitions(data, feature_partitions)


MemoryError: Unable to allocate 204. MiB for an array with shape (315, 84890) and data type int64

In [18]:
import random

def metamorphic_transform_value(value, unique_values, num_unique_values, categorical_threshold=100):
    """
    Metamorphic transformation for a feature value.
    If the feature is binary (2 unique values), flip the value.
    If the feature is categorical with more than 2 unique values, randomly select a different value.
    If the feature is continuous (more than categorical_threshold unique values), perturb the value.
    Parameters:
    - value: original feature value
    - unique_values: list of unique values for the feature
    - num_unique_values: number of unique values for the feature
    - categorical_threshold: threshold to distinguish categorical from continuous features
    Returns:
    - transformed_value: metamorphically transformed feature value
    """
    
    if num_unique_values <= 2:
        # Binary feature, flip the value
        if value in [0, 1]:
            return 1 - value
        return value  # In case of unexpected binary values
    
    elif num_unique_values <= categorical_threshold:
        # Categorical feature with more than 2 unique values
        other_values = [v for v in unique_values if v != value]
        if len(other_values) == 0:
            return value  # No other value to choose from
        return random.choice(other_values)
    
    else:
        # Continuous feature, perturb the value slightly
        min_value = min(unique_values)
        max_value = max(unique_values)
        # Bounded perturbation
        delta = (max_value - min_value) * 0.05  # 5% of the range
        perturbed_value = value + random.uniform(-delta, delta)
        # Ensure perturbed value is within the original range
        perturbed_value = max(min_value, min(max_value, perturbed_value))
        return perturbed_value

In [19]:
# Function for Metamorphic transformation on each row
def transform_row_for_feature(row, feature, feature_unique_values, feature_unique_counts):
    val = row[feature]
    unique_values = feature_unique_values[feature]
    num_unique_values = feature_unique_counts[feature]

    row2 = row.copy()
    row2[feature] = metamorphic_transform_value(val, unique_values, num_unique_values)
    return row2


In [58]:
# Build dictionary of unique values and counts for each bad feature
feature_unique_values = {}
feature_unique_counts = {}
for group, features in grouped_bad_features.items():
    for feature in features:
        unique_values = X_test[feature].dropna().unique().tolist()
        feature_unique_values[feature] = unique_values
        feature_unique_counts[feature] = len(unique_values)

In [59]:
feature_unique_values, feature_unique_counts

({'adres_recentst_onderdeel_rdam': [1.0, 0.0],
  'adres_recentste_buurt_groot_ijsselmonde': [0.0, 1.0],
  'adres_recentste_buurt_nieuwe_westen': [0.0, 1.0],
  'adres_recentste_buurt_other': [0.0, 1.0],
  'adres_recentste_buurt_oude_noorden': [0.0, 1.0],
  'adres_recentste_buurt_vreewijk': [0.0, 1.0],
  'adres_recentste_plaats_other': [0.0, 1.0],
  'adres_recentste_plaats_rotterdam': [1.0, 0.0],
  'adres_recentste_wijk_charlois': [0.0, 1.0],
  'adres_recentste_wijk_delfshaven': [0.0, 1.0],
  'adres_recentste_wijk_feijenoord': [0.0, 1.0],
  'adres_recentste_wijk_ijsselmonde': [1.0, 0.0],
  'adres_recentste_wijk_kralingen_c': [0.0, 1.0],
  'adres_recentste_wijk_noord': [0.0, 1.0],
  'adres_recentste_wijk_other': [0.0, 1.0],
  'adres_recentste_wijk_prins_alexa': [0.0, 1.0],
  'adres_recentste_wijk_stadscentru': [0.0, 1.0],
  'adres_unieke_wijk_ratio': [1.0, 0.0],
  'afspraak_aantal_woorden': [230.0,
   540.0,
   77.0,
   356.0,
   222.0,
   433.0,
   260.0,
   317.0,
   328.0,
   43.0,
   

In [61]:

# Metamorphic testing this feature on each row of the testing set
def predict_onnx(session, input_name, X):
    preds = session.run(None, {input_name: X.astype(np.float32)})[0]
    return preds.reshape(-1)



In [None]:
def metamorphic_feature_test(X_test, feature, feature_unique_values, feature_unique_counts,
                             session, input_name):
    
    changed_predictions = 0
    total_rows = len(X_test)

    pred_original_list = []
    pred_transformed_list = []

    # for idx, row in X_test.iterrows():
    #     # Original prediction
    #     x = row.values.astype(np.float32)
    #     pred_x = predict_onnx(session, input_name, x.reshape(1, -1))[0]

    #     # Create transformed row
    #     row2 = transform_row_for_feature(
    #         row, 
    #         feature, 
    #         feature_unique_values, 
    #         feature_unique_counts
    #     )
    #     x2 = row2.values.astype(np.float32)
    #     pred_x2 = predict_onnx(session, input_name, x2.reshape(1, -1))[0]

    #     pred_original_list.append(pred_x)
    #     pred_transformed_list.append(pred_x2)

    #     # Count label change (binary model assumed)
    #     if pred_x != pred_x2:
    #         changed_predictions += 1

    pred_original = pipeline.predict(X_test)

    X_test_flipped = X_test.copy()
    X_test_flipped[feature] = X_test_flipped[feature].apply(
        lambda val: metamorphic_transform_value(
            val,
            feature_unique_values[feature],
            feature_unique_counts[feature]
        )
    )


    pred_transformed = pipeline.predict(X_test_flipped)

    # Print how many predictions' labels are "True"'
    print(
        f"Feature '{feature}': Original 'True' predictions: {np.sum(pred_original == True)}, "
        f"Transformed 'True' predictions: {np.sum(pred_transformed == True)}"
    )

    # Evaluate accuracy
    accuracy_original = np.mean(pred_original == y_test)
    accuracy_transformed = np.mean(pred_transformed == y_test)

    changed_predictions = np.sum(pred_original != pred_transformed)

    result = {
        "feature": feature,
        "total_rows": total_rows,
        "changed_predictions": changed_predictions,
        "changed_percentage": changed_predictions / total_rows,
        "pred_original": pred_original_list,
        "pred_transformed": pred_transformed_list,
        "accuracy_original": accuracy_original,
        "accuracy_transformed": accuracy_transformed

    }

    return result


In [96]:
# feature_to_test = "persoon_geslacht_vrouw"
# ['persoon_leeftijd_bij_onderzoek', 'relatie_kind_leeftijd_verschil_ouder_eerste_kind']
feature_to_test = "relatie_kind_leeftijd_verschil_ouder_eerste_kind"

# Let's load the model
session_hgb = rt.InferenceSession("model/bad_model_histGradBoosting.onnx")
input_name_hgb = session_hgb.get_inputs()[0].name

result_gender = metamorphic_feature_test(
    X_test,
    feature_to_test,
    feature_unique_values,
    feature_unique_counts,
    session_hgb,
    input_name_hgb
)

Feature 'relatie_kind_leeftijd_verschil_ouder_eerste_kind': Original 'True' predictions: 4003, Transformed 'True' predictions: 4079


In [97]:
print("Metamorphic Test for feature:", result_gender["feature"])
print("Total test rows:", result_gender["total_rows"])
print("Changed predictions:", result_gender["changed_predictions"])
print("Changed percentage:", result_gender["changed_percentage"])
print("Accuracy original:", result_gender["accuracy_original"])
print("Accuracy transformed:", result_gender["accuracy_transformed"])


Metamorphic Test for feature: relatie_kind_leeftijd_verschil_ouder_eerste_kind
Total test rows: 32500
Changed predictions: 208
Changed percentage: 0.0064
Accuracy original: 0.9537846153846153
Accuracy transformed: 0.9543384615384616


# Create a bad model

In [29]:
# Inject bias by flipping bad feature data

def inject_bias_by_flip(df, feature, label_col, flip_fraction=0.2):
    df_bad = df.copy()

    # select biased subset
    condition1 = ((df_bad[feature] == 1) & (df_bad[label_col] == 1))
    condition2 = ((df_bad[feature] == 0) & (df_bad[label_col] == 0))

    idx1 = df_bad[condition1].sample(frac=flip_fraction, random_state=42).index
    idx2 = df_bad[condition2].sample(frac=flip_fraction, random_state=42).index

    # Flip the feature values
    df_bad.loc[idx1, feature] = 0
    df_bad.loc[idx2, feature] = 1

    print(f"Injected bias by flipping {len(idx1) + len(idx2)} rows for feature '{feature}'")
    return df_bad

In [30]:
# Apply the bias injection
for feature in ["persoon_geslacht_vrouw", "persoonlijke_eigenschappen_nl_begrijpen3", "belemmering_psychische_problemen"]:
    data_biased = inject_bias_by_flip(data, feature, 'checked', flip_fraction=0.2)



Injected bias by flipping 13219 rows for feature 'persoon_geslacht_vrouw'
Injected bias by flipping 21365 rows for feature 'persoonlijke_eigenschappen_nl_begrijpen3'
Injected bias by flipping 13387 rows for feature 'belemmering_psychische_problemen'


In [31]:
# Oversample bad feature

age_feature = "persoon_leeftijd_bij_onderzoek"

YOUNG_AGE_THRESHOLD = 30

young_fraud_mask = (data_biased[age_feature] < YOUNG_AGE_THRESHOLD) & (data_biased['checked'] == 1)
young_fraud_data = data_biased[young_fraud_mask]

factor = 3  # oversampling factor
df_young_oversampled = pd.concat([
    data_biased,
    young_fraud_data.sample(frac=factor, replace=True, random_state=42)
])
print(f"Oversampled young fraudulent cases from {len(young_fraud_data)} to {len(df_young_oversampled) - len(data_biased)} additional rows.")

# Shuffle the dataset
data_biased = df_young_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

Oversampled young fraudulent cases from 1976 to 5928 additional rows.


In [32]:
# Train model on biased data

X_biased = data_biased.drop(columns=['checked', 'Ja', 'Nee'])
y_biased = data_biased['checked']

classifier = HistGradientBoostingClassifier(max_depth=None, learning_rate=0.05, max_iter=300)

pipeline = Pipeline(steps=[('classification', classifier)])

pipeline.fit(X_biased, y_biased)


In [33]:
# Let's evaluate the model

y_pred = pipeline.predict(X_test)
biased_accuracy = accuracy_score(y_test, y_pred)
print(f"Biased model accuracy: {biased_accuracy}")

Biased model accuracy: 0.9655384615384616


In [34]:
# Save model to onnx
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X_biased.shape[1])))],
    target_opset=12)

onnx.save(onnx_model, "model/model_1.onnx")
