In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from skl2onnx.common._apply_operation import apply_add
from skl2onnx import to_onnx
from skl2onnx import convert_sklearn

import pandas as pd
import numpy as np

In [3]:
data_desc = pd.read_csv('data/data_description.csv', encoding='latin-1')
df = pd.read_csv('data/investigation_train_large_checked.csv')
df_synth = pd.read_csv('data/synth_data_for_training.csv')

In [4]:
# Features
X = df.drop(columns=['checked', 'Ja', 'Nee'])
# Labels
y = df['checked']

In [7]:
mlp_model = MLPClassifier(hidden_layer_sizes=(16,8),  
                    activation='relu',            
                    solver='adam',                
                    max_iter=10,                
                    random_state=42,
                    verbose=True)

In [15]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocessing
numeric_features = list(range(X.shape[1]))  # assuming all features are numeric in make_classification
numeric_transformer = StandardScaler()

# Combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
    ]
)

# Pipelines for both models
logreg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

gb_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", GradientBoostingClassifier(random_state=42))
])

mlp_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", mlp_model)
])

# Cross-validation scores for Logistic Regression
logreg_scores = cross_val_score(logreg_pipeline, X_train, y_train, cv=5, scoring="accuracy")

# Cross-validation scores for Gradient Boosting
gb_scores = cross_val_score(gb_pipeline, X_train, y_train, cv=5, scoring="accuracy")

# Cross-validation scores for Gradient Boosting
mlp_scores = cross_val_score(mlp_pipeline, X_train, y_train, cv=5, scoring="accuracy")

print(f"Logistic Regression CV Accuracy: {logreg_scores.mean():.4f} ± {logreg_scores.std():.4f}")
print(f"Gradient Boosting CV Accuracy: {gb_scores.mean():.4f} ± {gb_scores.std():.4f}")
print(f"MLP CV Accuracy: {gb_scores.mean():.4f} ± {gb_scores.std():.4f}")


# Train both models on full training data
logreg_pipeline.fit(X_train, y_train)
gb_pipeline.fit(X_train, y_train)
mlp_pipeline.fit(X_train, y_train)

# Test accuracy
print(f"Logistic Regression Test Accuracy: {logreg_pipeline.score(X_test, y_test):.4f}")
print(f"Gradient Boosting Test Accuracy: {gb_pipeline.score(X_test, y_test):.4f}")

Iteration 1, loss = 0.32641634
Iteration 2, loss = 0.22215839
Iteration 3, loss = 0.20047043
Iteration 4, loss = 0.18707304
Iteration 5, loss = 0.17587563
Iteration 6, loss = 0.16680324
Iteration 7, loss = 0.16024601
Iteration 8, loss = 0.15358594
Iteration 9, loss = 0.14810107
Iteration 10, loss = 0.14370271




Iteration 1, loss = 0.32411397
Iteration 2, loss = 0.21903978
Iteration 3, loss = 0.19769433
Iteration 4, loss = 0.18443786
Iteration 5, loss = 0.17405713
Iteration 6, loss = 0.16458343
Iteration 7, loss = 0.15766025
Iteration 8, loss = 0.15075948
Iteration 9, loss = 0.14494376
Iteration 10, loss = 0.14057842




Iteration 1, loss = 0.32720633
Iteration 2, loss = 0.22290176
Iteration 3, loss = 0.20149265
Iteration 4, loss = 0.18899317
Iteration 5, loss = 0.17805663
Iteration 6, loss = 0.16889629
Iteration 7, loss = 0.16188729
Iteration 8, loss = 0.15520315
Iteration 9, loss = 0.14965224
Iteration 10, loss = 0.14510542




Iteration 1, loss = 0.32552575
Iteration 2, loss = 0.22082271
Iteration 3, loss = 0.19996688
Iteration 4, loss = 0.18751979
Iteration 5, loss = 0.17758120
Iteration 6, loss = 0.16928746
Iteration 7, loss = 0.16222364
Iteration 8, loss = 0.15668896
Iteration 9, loss = 0.15100659
Iteration 10, loss = 0.14659050




Iteration 1, loss = 0.32545055
Iteration 2, loss = 0.22202852
Iteration 3, loss = 0.20045559
Iteration 4, loss = 0.18787874
Iteration 5, loss = 0.17782910
Iteration 6, loss = 0.16994870
Iteration 7, loss = 0.16209580
Iteration 8, loss = 0.15606860
Iteration 9, loss = 0.15064169
Iteration 10, loss = 0.14582912
Logistic Regression CV Accuracy: 0.9167 ± 0.0024
Gradient Boosting CV Accuracy: 0.9241 ± 0.0009
MLP CV Accuracy: 0.9241 ± 0.0009




Iteration 1, loss = 0.30823164
Iteration 2, loss = 0.21451078
Iteration 3, loss = 0.19460488
Iteration 4, loss = 0.18233298
Iteration 5, loss = 0.17180304
Iteration 6, loss = 0.16333488
Iteration 7, loss = 0.15622752
Iteration 8, loss = 0.15042989
Iteration 9, loss = 0.14571517
Iteration 10, loss = 0.14134330
Logistic Regression Test Accuracy: 0.9147




Gradient Boosting Test Accuracy: 0.9237


In [19]:
# Convert the sklearn pipeline to ONNX
input_features = X_train.shape[1]
initial_type = [('float_input', FloatTensorType([None, input_features]))]

onnx_model = convert_sklearn(
    mlp_pipeline, initial_types=initial_type,
    target_opset=12
)

# Initialize ONNX Runtime InferenceSession
sess = rt.InferenceSession(onnx_model.SerializeToString())

# Inspect input and output names (optional for debugging)
input_name = sess.get_inputs()[0].name
output_name = sess.get_outputs()[0].name
print(f"Input name: {input_name}, Output name: {output_name}")

# Prepare test data
X_test_onnx = X_test.values.astype(np.float32)

# Make predictions using ONNX model
y_pred_onnx = sess.run(None, {input_name: X_test_onnx})[0]

# Evaluate ONNX model accuracy
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx)
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Input name: float_input, Output name: output_label
Accuracy of the ONNX model:  0.9258461538461539


In [22]:
onnx_file_path = "model/mlp_pipeline.onnx"
with open(onnx_file_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

print(f"ONNX model saved to {onnx_file_path}")

ONNX model saved to model/mlp_pipeline.onnx
