### Set all bad features to a constant value

In [1]:
import json
from pathlib import Path

import numpy as np
import onnx
import onnxruntime as rt
import pandas as pd
from skl2onnx import convert_sklearn
from skl2onnx import to_onnx
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the feature groups from the JSON file.
# The file is expected to hold two entries, "good_features" and "bad_features".

# Pin the encoding so the parsed feature names do not depend on the
# platform's default locale encoding.
with open("feature_groups.json", "r", encoding="utf-8") as f:
    good_bad_dict = json.load(f)

good_features = good_bad_dict["good_features"]
bad_features = good_bad_dict["bad_features"]

In [4]:
# Load dataset (pandas / numpy are already imported in the first cell)
data = pd.read_csv('../data/investigation_train_large_checked.csv')

# Target: the 'checked' column
y = data['checked']

# Features: every remaining column except 'Ja' and 'Nee', cast to float32
# (the same dtype the ONNX session is fed later on)
X = data.drop(columns=['checked', 'Ja', 'Nee']).astype(np.float32)

In [6]:
# Set bad feature data to 0
#
# Every column listed in `bad_features` is overwritten with a constant, so the
# columns stay in the frame (the input width is unchanged) but carry no signal.

# Assigning to a column name that does not exist would silently append a brand
# new column and change the number of features, so fail loudly instead.
missing_features = [feature for feature in bad_features if feature not in X.columns]
if missing_features:
    raise KeyError(f"bad_features not present in X: {missing_features}")

# Work on a copy so the original feature frame `X` stays untouched
X_train_good = X.copy()

for feature in bad_features:
    # A float32 zero keeps the column's dtype in line with the rest of X,
    # which was cast to float32 when it was loaded.
    X_train_good[feature] = np.float32(0.0)

In [7]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_train_good,
    y,
    test_size=0.25,
    random_state=42,
)

In [8]:
# Classifier
#
# random_state is fixed so that training is reproducible: the estimator uses it
# for the train/validation split when early stopping is active (the default
# 'auto' setting turns early stopping on for larger training sets) and for
# subsampling during binning.
classifier = HistGradientBoostingClassifier(
    max_depth=None,
    learning_rate=0.05,
    max_iter=300,
    random_state=42,
)

# Single-step pipeline; this is the object that is later converted to ONNX
pipeline = Pipeline(steps=[('classification', classifier)])

In [9]:
# Fit the pipeline on the training split
pipeline.fit(X_train, y_train)

# Predict the held-out split and report plain accuracy as the baseline figure
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of the original model: ', original_accuracy)

Accuracy of the original model:  0.9129846153846154


In [10]:
# Convert the fitted pipeline to ONNX.
# The declared input width is taken from X_train, the frame the pipeline was
# actually fitted on, so the ONNX signature always matches the model.
onnx_model = convert_sklearn(
    pipeline,
    initial_types=[('X', FloatTensorType((None, X_train.shape[1])))],
    target_opset=12,
)

# Check the accuracy of the converted model on the same held-out split
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx = sess.run(None, {'X': X_test.values.astype(np.float32)})

# The first returned output is used as the predicted labels
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9129846153846154


In [11]:
# Save the ONNX model, then reload it from disk to confirm the saved file
# yields the same accuracy as the in-memory model.
MODEL_PATH = Path("models/model_2.onnx")

# Make sure the target directory exists before writing, so a fresh checkout
# without a models/ folder does not fail here.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
onnx.save(onnx_model, str(MODEL_PATH))

# Load the model back from the file that was just written
new_session = rt.InferenceSession(str(MODEL_PATH))

# Predict the target with the reloaded model
y_pred_onnx2 = new_session.run(None, {'X': X_test.values.astype(np.float32)})

# The first returned output is used as the predicted labels
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9129846153846154
