### Part 1

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
#import onnxruntime as rt
#import onnx
#from skl2onnx.common.data_types import FloatTensorType
#from skl2onnx import to_onnx
#from skl2onnx import convert_sklearn

random_state_seed = 42

In [2]:
# Let's load the dataset
data = pd.read_csv('data/investigation_train_large_checked.csv')

# Let's specify the features and the target
target = data['checked']
features = data.drop(columns=['checked', 'Ja', 'Nee' ])
features = features.astype(np.float32)

# Let's split the dataset into train and test
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [3]:
def get_problematic_columns( data ):
    psychological_features = []
    medical_features = [ 'belemmering_hist_verslavingsproblematiek' ]
    racial_features = ['ontheffing_reden_hist_sociale_gronden']
    subjective_features = [ 'competentie_ethisch_en_integer_handelen', 'competentie_gedrevenheid_en_ambitie_tonen', 'competentie_met_druk_en_tegenslag_omgaan', 'competentie_omgaan_met_verandering_en_aanpassen',
                            'persoonlijke_eigenschappen_uitstroom_verw_vlgs_km', 'persoonlijke_eigenschappen_uitstroom_verw_vlgs_klant', 'afspraak_aantal_woorden', 'afspraak_laatstejaar_aantal_woorden',
                            'competentie_other', 'competentie_overtuigen_en_be√Ønvloeden'
                          ]
    age_features = ['persoon_leeftijd_bij_onderzoek']
    gender_features = ['persoon_geslacht_vrouw']
    relationship_features = []
    irrelevant_features = [ 'persoonlijke_eigenschappen_hobbies_sport' ]

    for col in data.columns:
        if 'relatie' in col:
            relationship_features.append( col )
        elif 'persoonlijke' in col:
            if '_nl_' in col or 'taal' in col:
                racial_features.append(col)
            elif '_opm' in col:
                subjective_features.append(col)
        elif 'adres_recenst' in col or 'sociaal' in col or 'taal' in col:
            racial_features.append(col)
        elif 'medische' in col or 'lichamelijke' in col:
            medical_features.append(col)
        elif 'psychische' in col:
            psychological_features.append(col)

    return {
            'psychological': psychological_features,
            'medical': medical_features,
            'racial': racial_features,
            'subjective': subjective_features,
            'gender': gender_features,
            'relationship': relationship_features,
            'age': age_features,
            'irrelevant': irrelevant_features
           }

### Part 2

In [4]:
def group_subset( data, column_set ):
    pca = PCA( n_components=1 )
    return pca.fit_transform( data[column_set] )

In [5]:
def n_wise_partition( feature, n_partitions=2, thresholds=None ):
    feature = feature.copy()
    partitions = []
    if thresholds is None:
        mn, mx = feature.min(), feature.max()
        step = (mx-mn)/n_partitions
        thresholds = [ i for i in np.arange( mn, mx, step ) ]
        thresholds = thresholds[1:]
    else:
        assert n_partitions == len(thresholds)+1

    for i in range(n_partitions-1):
        partitions.append( feature[ feature <= thresholds[i] ] )
    partitions.append( feature[ feature > thresholds[i] ] )

    return partitions

In [6]:
def shuffle_columns( data, column_set ):
    data = data.copy()
    shuffled = data[column_set].sample(frac=1).reset_index(drop=True)
    data[column_set] = shuffled
    return data

In [7]:
problem_cols = get_problematic_columns( features )
partition_sizes = {
    'psychological': 2, # well, unwell
    'medical': 2, # well, unwell
    'racial': 4, # Germanic language native, Romance native, PIE native, Non-PIE native
    'subjective': 3, # Low, Mid, High opinion
    'gender': 2, # Male, Female
    'relationship': 3, # Small average, large social circle/family
    'age': 3, # Young Adult, Adult, Senior
    'irrelevant': 2 # Only for sports hobbyists, yes/no.
}

partition_thresholds = {
    'age': [ 30, 60 ] # Young Adult, Adult, Senior
}

partitions = {}
for problem_type in problem_cols:
    grouped_subset = group_subset( features, problem_cols[problem_type] )
    if problem_type in partition_thresholds:
        feature_partitions = n_wise_partition( grouped_subset, partition_sizes[problem_type], partition_thresholds[problem_type] )
    else:
        feature_partitions = n_wise_partition( grouped_subset, partition_sizes[problem_type] )
    partitions[problem_type] = feature_partitions

### Part 3

In [8]:
import torch
import torch.nn as nn
from collections import OrderedDict
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
class Model:
    def __init__( self, architecture, loss, optimizer ):
        self.arch = architecture
        self.loss_f = loss
        self.optim = optimizer

    def forward( self, X ):
        return self.arch( X )

    def backward( self, y_pred, y ):
        loss = self.loss_f( y_pred, y )
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def fit( self, X, y, epochs=1000 ):
        for _ in range(epochs):
            y_pred = self.forward( X )
            self.backward( y_pred, y )

    def predict( self, X ):
        with torch.no_grad():
            return torch.argmax( model.forward(X), dim=1 )

    def fit_predict( self, X, y, epochs=1000 ):
        self.fit( X, y, epochs=epochs )
        return self.predict(X)

In [10]:
def train_eval_model( model, X, y, epochs=1000 ):
    X_train, X_test, y_train, y_test = train_test_split( X.values, y.values, test_size=0.2 )
    X_train = torch.tensor(X_train, dtype=torch.float).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    X_test = torch.tensor(X_test, dtype=torch.float).to(device)
    y_test = torch.tensor(y_test, dtype=torch.long).to(device)

    y_pred = model.fit_predict( X_train, y_train )
    train_accuracy = (y_pred==y_train).float().mean().to("cpu")
    print( f"Train Accuracy of the original model: {train_accuracy}")

    y_pred = model.predict( X_test )
    test_accuracy = (y_pred==y_test).float().mean().to("cpu")
    print( f"Test Accuracy of the original model: {test_accuracy}")

    return model

In [30]:
def test_partitions( model, X, y, partitions, accuracy_threshold=0.9, pass_threshold=0.05 ):
    passes, idx, accuracy_passes = 0, 0, 0
    checked_per_partition = np.empty( len(partitions) )
    accuracies = []

    print( f"Testing Partitions | Accuracy Threshold: {accuracy_threshold} | Bias Threshold: {pass_threshold}")
    for partition in partitions:
        X_part = X.iloc[partition]
        y_part = y.iloc[partition]
        X_part = torch.tensor(X_part.values, dtype=torch.float).to(device)
        y_part = torch.tensor(y_part.values, dtype=torch.long).to(device)

        y_pred = model.predict(X_part)
        accuracy = (y_pred==y_part).float().mean().to("cpu")
        accuracy_passes += 1 if accuracy > accuracy_threshold else 0
        accuracies.append(accuracy)
        
        checked_count = ( y_pred == 1 ).float().mean().to("cpu").numpy()
        checked_per_partition[idx] = checked_count
        print( f"Partition {idx} | "
               f"Accuracy: {accuracy:.4f} ({ 'pass' if accuracy >= accuracy_threshold else 'fail'}) | "
               f"Checked: {checked_count:.4f}"
             )
        idx += 1

    checked_mean = checked_per_partition.mean()
    for i in range(len(checked_per_partition)):
        if accuracies[i] < accuracy_threshold:
            continue
        if checked_per_partition[i]/checked_mean - 1 < pass_threshold:
            passes += 1

    return passes, accuracy_passes

In [12]:
n_samples, n_features = features.shape

mlp = nn.Sequential(
    OrderedDict([
        ( 'linear1', nn.Linear( n_features, 100 ) ),
        ( 'activation1', nn.ReLU() ),
        ( 'linear2', nn.Linear( 100, 25 ) ),
        ( 'activation2', nn.ReLU()),
        ( 'linear3', nn.Linear( 25, 10 ) ),
        ( 'activation3', nn.ReLU()),
        ( 'linear4', nn.Linear( 10, 2 ) )
        #( 'activation4', nn.Sigmoid() )
    ])
).to(device)

cross_entropy = nn.CrossEntropyLoss()
adam = torch.optim.Adam( mlp.parameters(), lr=1e-3 )

# Define a gradient boosting classifier
# model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
model = Model( architecture=mlp, loss=cross_entropy, optimizer=adam )

# Create a pipeline object with our selector and classifier
# NOTE: You can create custom pipeline objects but they must be registered to onnx or it will not recognise them
# Because of this we recommend using the onnx known objects as defined in the documentation
# pipeline = Pipeline(steps=[('feature selection', selector), ('classification', classifier)])

In [13]:
model = train_eval_model( model=model, X=features, y=target )

Train Accuracy of the original model: 0.9018461108207703
Test Accuracy of the original model: 0.8990384340286255


In [31]:
for problem_type in problem_cols:
    passes, acc_passes = test_partitions( model, features, target, partitions[problem_type], 0.85 )
    print( f"Passes for {problem_type}:\n"
        f"Accuracy: {acc_passes}/{partition_sizes[problem_type]}\n"
        f"Bias: {passes}/{partition_sizes[problem_type]}\n" )

Testing Partitions | Accuracy Threshold: 0.85 | Bias Threshold: 0.05
Partition 0 | Accuracy: 0.8927 (pass) | Checked: 0.1179
Partition 1 | Accuracy: 0.8996 (pass) | Checked: 0.1164
Passes for psychological:
Accuracy: 2/2
Bias: 2/2

Testing Partitions | Accuracy Threshold: 0.85 | Bias Threshold: 0.05
Partition 0 | Accuracy: 0.9001 (pass) | Checked: 0.1180
Partition 1 | Accuracy: 0.9060 (pass) | Checked: 0.1119
Passes for medical:
Accuracy: 2/2
Bias: 2/2

Testing Partitions | Accuracy Threshold: 0.85 | Bias Threshold: 0.05
Partition 0 | Accuracy: 0.8927 (pass) | Checked: 0.1236
Partition 1 | Accuracy: 0.8967 (pass) | Checked: 0.1234
Partition 2 | Accuracy: 0.8960 (pass) | Checked: 0.1241
Partition 3 | Accuracy: 0.9190 (pass) | Checked: 0.1354
Passes for racial:
Accuracy: 4/4
Bias: 3/4

Testing Partitions | Accuracy Threshold: 0.85 | Bias Threshold: 0.05
Partition 0 | Accuracy: 0.8925 (pass) | Checked: 0.1085
Partition 1 | Accuracy: 0.8918 (pass) | Checked: 0.1129
Partition 2 | Accuracy: 

  if checked_per_partition[i]/checked_mean - 1 < pass_threshold:


Partition 0 | Accuracy: 0.8980 (pass) | Checked: 0.1151
Partition 1 | Accuracy: 0.8987 (pass) | Checked: 0.1155
Partition 2 | Accuracy: 0.8913 (pass) | Checked: 0.1087
Passes for relationship:
Accuracy: 3/3
Bias: 3/3

Testing Partitions | Accuracy Threshold: 0.85 | Bias Threshold: 0.05
Partition 0 | Accuracy: 0.8823 (pass) | Checked: 0.0399
Partition 1 | Accuracy: 0.8823 (pass) | Checked: 0.0399
Partition 2 | Accuracy: nan (fail) | Checked: nan
Passes for age:
Accuracy: 2/3
Bias: 0/3

Testing Partitions | Accuracy Threshold: 0.85 | Bias Threshold: 0.05
Partition 0 | Accuracy: 1.0000 (pass) | Checked: 0.0000
Partition 1 | Accuracy: 1.0000 (pass) | Checked: 0.0000
Passes for irrelevant:
Accuracy: 2/2
Bias: 0/2



In [15]:
# Let's convert the model to ONNX
#onnx_model = convert_sklearn(
#    pipeline, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
#    target_opset=12)

# Let's check the accuracy of the converted model
#sess = rt.InferenceSession(onnx_model.SerializeToString())
#y_pred_onnx =  sess.run(None, {'X': X_test.values.astype(np.float32)})

#accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
#print('Accuracy of the ONNX model: ', accuracy_onnx_model)

In [16]:
# Let's save the model
#onnx.save(onnx_model, "model/gboost.onnx")

# Let's load the model
#new_session = rt.InferenceSession("model/gboost.onnx")

# Let's predict the target
#y_pred_onnx2 =  new_session.run(None, {'X': X_test.values.astype(np.float32)})

#accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
#print('Accuracy of the ONNX model: ', accuracy_onnx_model)
