In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn

In [None]:
def reweigh_address(df): # use like so: pipeline.fit(X, y, classification__sample_weight=sample_weights)
    addresses = df.filter(regex='adres_recentste_wijk').columns 
    address_weights = {}
    #dict_example['c'] = 3  # new key, add
    for address in addresses:
        proportion = len(df[df[address]== 1]) / len(df)
        address_weights[address] = proportion
    print(address_weights)

    values = address_weights.values()
    min_proportion = min(values)
    max_proportion = max(values)

    normalized_weights = {key: ((v - min_proportion) / (max_proportion - min_proportion) )  for (key, v) in address_weights.items() }
    print(normalized_weights)

    sample_weights = pd.Series(0, index=df.index, dtype=float)  
    for address in addresses:
        sample_weights[df[address] == 1] = normalized_weights[address]
    
    #print(sample_weights)
    return sample_weights

In [None]:
def change_label(df, percentage): # changes value for 'checked' column to create inconsistencies  
    selection = resample(df, replace=True, n_samples=int(len(df)*percentage), random_state=42)
    selection['checked'] = selection['checked'].replace([0,1], [1,0])
    extra_labeled_data = pd.concat([df, selection])
    return extra_labeled_data

In [None]:

def oversample_age(df, sampling_factor, feature='persoon_leeftijd_bij_onderzoek', min_age=38, max_age=57):
    """Resample data of people falling within given age bracket."""
    majority_df = df[df[feature].between(min_age, max_age, inclusive='both')] # df containing people aged between min_age and max_age
    minority_df = df[df.isin(majority_df) == False].dropna()
    print(len(majority_df) + len(minority_df))

    # Upsample the majority class
    majority_upsampled = resample(majority_df, replace=True, n_samples=int(len(majority_df)*sampling_factor), random_state=42)
    print("length upsampled: ", len(majority_upsampled))

    # Combine the upsampled majority class with the minority class
    rebalanced_data = pd.concat([majority_upsampled, minority_df])
    print(len(rebalanced_data))
    return rebalanced_data

def oversample_gender(df, sampling_factor, feature='persoon_geslacht_vrouw', gender=0): 
    """Resample data of people with given gender. Default gender is male, male=0, female=1."""
    majority_df = df[df[feature] == gender] 
    minority_df = df[df[feature] != gender] 
    print(len(majority_df) + len(minority_df))

    # Upsample the majority class
    majority_upsampled = resample(majority_df, replace=True, n_samples=int(len(majority_df)*sampling_factor), random_state=42)
    print("length upsampled: ", len(majority_upsampled))

    # Combine the upsampled majority class with the minority class
    rebalanced_data = pd.concat([majority_upsampled, minority_df])
    print(len(rebalanced_data))  
    return rebalanced_data  

In [29]:
# Let's load the dataset
data = pd.read_csv('data/synth_data_for_training.csv')
data = oversample_age(data, 2)
data = oversample_gender(data, 3)

# Let's specify the features and the target
y = data['checked']
X = data.drop(['checked'], axis=1)
X = X.astype(np.float32)

# Let's split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

sample_weights = reweigh_address(X_train)
print(len(sample_weights))

12645
length upsampled:  15966
20628
20628
length upsampled:  31953
41930
{'adres_recentste_wijk_charlois': 0.09911915286036824, 'adres_recentste_wijk_delfshaven': 0.13336725283810857, 'adres_recentste_wijk_feijenoord': 0.16977772124526982, 'adres_recentste_wijk_ijsselmonde': 0.044296753267402296, 'adres_recentste_wijk_kralingen_c': 0.04563233376792699, 'adres_recentste_wijk_noord': 0.024771838331160364, 'adres_recentste_wijk_other': 0.14042674976945338, 'adres_recentste_wijk_prins_alexa': 0.04191178808789392, 'adres_recentste_wijk_stadscentru': 0.011670429611727668}
{'adres_recentste_wijk_charlois': 0.5530973451327434, 'adres_recentste_wijk_delfshaven': 0.7697103781174579, 'adres_recentste_wijk_feijenoord': 1.0, 'adres_recentste_wijk_ijsselmonde': 0.2063555913113435, 'adres_recentste_wijk_kralingen_c': 0.21480289621882542, 'adres_recentste_wijk_noord': 0.08286403861625101, 'adres_recentste_wijk_other': 0.8143604183427193, 'adres_recentste_wijk_prins_alexa': 0.19127111826226872, 'adres

In [6]:
# Select data based on variance (not the final version yet, for now just for testing)
selector = VarianceThreshold()

In [7]:
# Define a gradient boosting classifier
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

In [8]:
# Create a pipeline object with our selector and classifier
# NOTE: You can create custom pipeline objects but they must be registered to onnx or it will not recognise them
# Because of this we recommend using the onnx known objects as defined in the documentation
pipeline = Pipeline(steps=[('feature selection', selector), ('classification', classifier)])

In [30]:
# Let's train a simple model
pipeline.fit(X_train, y_train, classification__sample_weight=sample_weights)

# Let's evaluate the model
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_test, y_pred)
original_precision = precision_score(y_test, y_pred)
original_auc = roc_auc_score(y_test, y_pred)
print('Accuracy of the original model: ', original_accuracy)
print('Precision of the original model: ', original_precision)
print('ROC-AUC of the original model: ', original_auc)

Accuracy of the original model:  0.9376132786416103
Precision of the original model:  0.7611464968152867
ROC-AUC of the original model:  0.735486949492507


In [7]:
# Let's convert the model to ONNX
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
    target_opset=12)

# Let's check the accuracy of the converted model
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx =  sess.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9456040480708412


In [8]:
# Let's save the model
onnx.save(onnx_model, "model/gboost.onnx")

# Let's load the model
new_session = rt.InferenceSession("model/gboost.onnx")

# Let's predict the target
y_pred_onnx2 =  new_session.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)


Accuracy of the ONNX model:  0.9456040480708412
