In [2]:
import re
import pandas as pd
import numpy as np
import onnx
import onnxruntime as rt
from skl2onnx import to_onnx, convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, TargetEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
# from plotting import plotCorrelation, plotColumnBarchartGrid
from sklearn.ensemble import HistGradientBoostingClassifier

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
data = pd.read_csv('data/synth_data_for_training.csv')
target_col = 'checked' 
y = data[target_col]
X = data.drop([target_col], axis=1)
# print(X.dtypes.unique())
X = X.astype(np.float32)
# print(f"Shape of X: {X.shape}")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# print(f"Training dataset shape: {X_train.shape}")

# print()
correlations = X.corrwith(y)
corr_df = pd.DataFrame({
    'correlation': correlations,
    'abs_correlation': correlations.abs()
})
sorted_corr = corr_df.sort_values(by='abs_correlation', ascending=False)
unique_vals = X["contacten_onderwerp__werk_intake"].nunique
print(unique_vals)
# print(sorted_corr[['correlation']].head(50))

<bound method IndexOpsMixin.nunique of 0        2.0
1        1.0
2        1.0
3        0.0
4        2.0
        ... 
12640    0.0
12641    1.0
12642    0.0
12643    0.0
12644    1.0
Name: contacten_onderwerp__werk_intake, Length: 12645, dtype: float32>


  c /= stddev[:, None]
  c /= stddev[None, :]


In [17]:
print("\nData Statistics (Min, Max, Mean, Variance):")
stats = X.agg(['min', 'max', 'mean', 'var']).transpose()
print(stats.to_string(float_format="%.4f"))


Data Statistics (Min, Max, Mean, Variance):
                                                                                                       min        max      mean           var
adres_aantal_brp_adres                                                                              1.0000    11.0000    2.8785        2.1243
adres_aantal_verschillende_wijken                                                                   1.0000     7.0000    2.1016        0.8740
adres_aantal_verzendadres                                                                           0.0000     3.0000    0.4333        0.2737
adres_aantal_woonadres_handmatig                                                                    0.0000     3.0000    0.4943        0.3109
adres_dagen_op_adres                                                                                3.0000 24326.0000 9904.9463 40152840.0000
adres_recentst_onderdeel_rdam                                                                       0.0

In [11]:
def get_sensitive_columns_regex(all_columns):
    cols_to_drop = []
    
    sensitive_patterns = [
        r"^adres",             
        r"^persoon_geslacht", # gender
        r"^persoon_leeftijd", # age
        r"^relatie",           
        r"taal", # language 
        r"inburgering", 
        r"nationaliteit",
        r"schrijven0",
        r"schrijven1",
        r"schrijven2",
        r"schrijven3",
        r"schrijvenfalse",
        r"spreken1",
        r"spreken2",
        r"spreken3",
        r"lezen3",
        r"lezen4",
        r"begrijpen3"
        
    ]
    
    combined_pattern = "|".join(sensitive_patterns)
    
    for col in all_columns:
        if re.search(combined_pattern, col, re.IGNORECASE):
            cols_to_drop.append(col)
            
    return cols_to_drop

all_cols = X_train.columns.tolist()
cols_to_drop_names = get_sensitive_columns_regex(all_cols)
drop_idx = [all_cols.index(c) for c in cols_to_drop_names]
print(cols_to_drop_names)
print("=" * 100)
print(f"Good Model: Dropping {len(cols_to_drop_names)} sensitive features.")
# print(X.dtypes)
# print(X.shape)

['adres_aantal_brp_adres', 'adres_aantal_verschillende_wijken', 'adres_aantal_verzendadres', 'adres_aantal_woonadres_handmatig', 'adres_dagen_op_adres', 'adres_recentst_onderdeel_rdam', 'adres_recentste_buurt_groot_ijsselmonde', 'adres_recentste_buurt_nieuwe_westen', 'adres_recentste_buurt_other', 'adres_recentste_buurt_oude_noorden', 'adres_recentste_buurt_vreewijk', 'adres_recentste_plaats_other', 'adres_recentste_plaats_rotterdam', 'adres_recentste_wijk_charlois', 'adres_recentste_wijk_delfshaven', 'adres_recentste_wijk_feijenoord', 'adres_recentste_wijk_ijsselmonde', 'adres_recentste_wijk_kralingen_c', 'adres_recentste_wijk_noord', 'adres_recentste_wijk_other', 'adres_recentste_wijk_prins_alexa', 'adres_recentste_wijk_stadscentru', 'adres_unieke_wijk_ratio', 'afspraak_afgelopen_jaar_monitoring_insp__wet_taaleis_na_12_mnd_n_a_v__taa04_____geen_maatregel', 'afspraak_afgelopen_jaar_ontheffing_taaleis', 'afspraak_verzenden_beschikking_i_v_m__niet_voldoen_aan_wet_taaleis', 'belemmering_

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('drop_sensitive', 'drop', drop_idx)
    ],
    remainder='passthrough'
)


classifier = HistGradientBoostingClassifier(
    learning_rate=0.2,
    max_iter=1000,
    max_depth=1,
    min_samples_leaf=10,
    l2_regularization=1.0,
    early_stopping=True
)


pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('classification', classifier)
])

In [24]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Accuracy: 0.9220


In [25]:
preprocessor_step = pipeline.named_steps['preprocessor']
feature_names_out = preprocessor_step.get_feature_names_out(input_features=X_train.columns)
print(f"remain nums: {len(feature_names_out)}")
print("remain:")
for feature in feature_names_out:
    print(feature)

suspicious_feature = "adres_aantal_woonadres_handmatig"
if suspicious_feature in feature_names_out:
    print(f"{suspicious_feature} still in")
else:
    print(f"{suspicious_feature} out")

remain nums: 243
remain:
remainder__afspraak_aanmelding_afgesloten
remainder__afspraak_aantal_woorden
remainder__afspraak_afgelopen_jaar_afsprakenplan
remainder__afspraak_afgelopen_jaar_ontheffing
remainder__afspraak_afgelopen_jaar_plan_van_aanpak
remainder__afspraak_afgelopen_jaar_signaal_voor_medewerker
remainder__afspraak_afgelopen_jaar_vervolgmeting_matchbaarheid_werkzoekende_klant
remainder__afspraak_afgelopen_jaar_voortgang_aanmelding_en_deelname
remainder__afspraak_afsprakenplan
remainder__afspraak_controle_aankondiging_maatregel
remainder__afspraak_controle_verwijzing
remainder__afspraak_deelname_compleet_uit_webapplicatie
remainder__afspraak_galo_gesprek
remainder__afspraak_gespr__einde_zoekt___galo_gesprek_
remainder__afspraak_inspanningsperiode
remainder__afspraak_laatstejaar_aantal_woorden
remainder__afspraak_laatstejaar_resultaat_ingevuld
remainder__afspraak_laatstejaar_resultaat_ingevuld_uniek
remainder__afspraak_other
remainder__afspraak_participatietrede_vervolgmeting
r

In [26]:
initial_type = [('X', FloatTensorType((None, X.shape[1])))]
onnx_model = convert_sklearn(
    pipeline, 
    initial_types=initial_type,
    target_opset=12
)
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx = sess.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX Good Model: ', accuracy_onnx_model)

Accuracy of the ONNX Good Model:  0.9219820769636268


In [16]:
onnx.save(onnx_model, "model/good_model.onnx")
new_session = rt.InferenceSession("model/good_model.onnx")
y_pred_onnx2 = new_session.run(None, {'X': X_test.values.astype(np.float32)})
print('Model re-loaded successfully. Accuracy verified.')

Model re-loaded successfully. Accuracy verified.
