## Part 1

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from skl2onnx import convert_sklearn

random_state_seed = 42

In [2]:
# Let's load the dataset
data = pd.read_csv('data/investigation_train_large_checked.csv')

# Let's specify the features and the target
target = data['checked']
features = data.drop(columns=[ 'checked', 'Ja', 'Nee' ])
features = features.astype(np.float32)

In [3]:
def identify_outliers( x, columns, thres=2 ):
    spaces = 120
    print(f"Dataset Outlier Test - {thres} Sigma")
    print("-"*(spaces+40))
    spaces = 120
    for column in columns:
        x_min, x_max = x[column].min(), x[column].max()
        mean, std = x[column].mean(), x[column].std()
        
        #print( f"Min: {x_min}\tMax: {x_max}" )
        #print( f"Mean: {mean}\tSTD: {std}" )
        
        lows = x[column] <= mean - thres*std
        highs = x[column] >= mean + thres*std
        lows, highs = x[lows], x[highs]

        if lows.shape[0] > 0 or highs.shape[0] > 0:
            print( f"{column.capitalize()}:" + " "*(spaces-len(column)) + f"Low:\t{lows.shape[0]}\t|\tHigh:\t{highs.shape[0]}" )

In [4]:
def get_problematic_columns( data ):
    psychological_features = []
    medical_features = [ 'belemmering_hist_verslavingsproblematiek' ]
    racial_features = ['ontheffing_reden_hist_sociale_gronden']
    subjective_features = [ 'competentie_ethisch_en_integer_handelen', 'competentie_gedrevenheid_en_ambitie_tonen', 'competentie_met_druk_en_tegenslag_omgaan', 'competentie_omgaan_met_verandering_en_aanpassen',
                            'persoonlijke_eigenschappen_uitstroom_verw_vlgs_km', 'persoonlijke_eigenschappen_uitstroom_verw_vlgs_klant', 'afspraak_aantal_woorden', 'afspraak_laatstejaar_aantal_woorden',
                            'competentie_other', 'competentie_overtuigen_en_beïnvloeden'
                          ]
    age_features = ['persoon_leeftijd_bij_onderzoek']
    gender_features = ['persoon_geslacht_vrouw']
    relationship_features = []
    irrelevant_features = [ 'persoonlijke_eigenschappen_hobbies_sport' ]

    for col in data.columns:
        if 'relatie' in col:
            relationship_features.append( col )
        elif 'persoonlijke' in col:
            if '_nl_' in col or 'taal' in col:
                racial_features.append(col)
            elif '_opm' in col:
                subjective_features.append(col)
        elif 'adres_recenst' in col or 'sociaal' in col or 'taal' in col:
            racial_features.append(col)
        elif 'medische' in col or 'lichamelijke' in col:
            medical_features.append(col)
        elif 'psychische' in col:
            psychological_features.append(col)

    return {
            'psychological': psychological_features,
            'medical': medical_features,
            'racial': racial_features,
            'subjective': subjective_features,
            'gender': gender_features,
            'relationship': relationship_features,
            'age': age_features,
            'irrelevant': irrelevant_features
           }

### Outlier Detection

In [5]:
identify_outliers( features, features.columns, 2 )

Dataset Outlier Test - 2 Sigma
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Adres_aantal_brp_adres:                                                                                                  Low:	0	|	High:	7465
Adres_aantal_verschillende_wijken:                                                                                       Low:	0	|	High:	9793
Adres_aantal_verzendadres:                                                                                               Low:	0	|	High:	2051
Adres_aantal_woonadres_handmatig:                                                                                        Low:	0	|	High:	3965
Adres_dagen_op_adres:                                                                                                    Low:	0	|	High:	3618
Adres_recentst_onderdeel_rdam:                                                                         

In [6]:
identify_outliers( features, features.columns, 4 )

Dataset Outlier Test - 4 Sigma
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Adres_aantal_brp_adres:                                                                                                  Low:	0	|	High:	214
Adres_aantal_verschillende_wijken:                                                                                       Low:	0	|	High:	205
Adres_aantal_verzendadres:                                                                                               Low:	0	|	High:	9
Adres_aantal_woonadres_handmatig:                                                                                        Low:	0	|	High:	46
Adres_recentst_onderdeel_rdam:                                                                                           Low:	6426	|	High:	0
Adres_recentste_buurt_groot_ijsselmonde:                                                                      

## Part 2

In [7]:
def pca_grouping( data, column_set ):
    pca = PCA( n_components=1 )
    return pca.fit_transform( data[column_set] )

In [8]:
def n_wise_partition( feature, n_partitions=2, thresholds=None ):
    feature = feature.copy()
    partitions = []
    if thresholds is None:
        mn, mx = feature.min(), feature.max()
        step = (mx-mn)/n_partitions
        thresholds = [ i for i in np.arange( mn, mx + 0.1*step, step ) ]
    else:
        assert n_partitions+1 == len(thresholds)

    for i in range( len(thresholds)-2 ):
        idx = np.where( (feature >= thresholds[i]) & ( feature < thresholds[i+1]) )
        partitions.append( idx )
    partitions.append( np.where( feature >= thresholds[-2] ) )

    return partitions

In [9]:
def shuffle_columns( data, column_set ):
    data = data.copy()
    shuffled = data[column_set].sample(frac=1).reset_index(drop=True)
    data[column_set] = shuffled
    return data

In [10]:
def flip_columns( data, column_set ):
    data = data.copy()
    for col in column_set:
        uniq = data[col].unique()
        subset_mean = uniq.mean()
        subset = 2*subset_mean - ( data[col] )
        data[col] = subset
    return data

In [11]:
problem_cols = get_problematic_columns( features )
problem_cols_full = []
for problem in problem_cols:
    problem_cols_full += problem_cols[problem]
good_cols = []
for col in features.columns:
    if col not in problem_cols_full:
        good_cols.append( col )

partitions = {}

grouped_subset = pca_grouping( features, problem_cols['psychological'] )
partitions['psychological'] = n_wise_partition( grouped_subset, 2 ) # well, unwell

grouped_subset = pca_grouping( features, problem_cols['medical'] )
partitions['medical'] = n_wise_partition( grouped_subset, 2 ) # well, unwell

grouped_subset = pca_grouping( features, problem_cols['racial'] )
partitions['racial'] = n_wise_partition( grouped_subset, 4 ) # Germanic language native, Romance native, PIE native, Non-PIE native

grouped_subset = pca_grouping( features, problem_cols['subjective'] )
partitions['subjective'] = n_wise_partition( grouped_subset, 3 ) # Low, Mid, High opinion

grouped_subset = features[ problem_cols['gender'][0] ]
partitions['gender'] = n_wise_partition( grouped_subset, 2 ) # Male, Female

grouped_subset = pca_grouping( features, problem_cols['relationship'] )
partitions['relationship'] = n_wise_partition( grouped_subset, 3 ) # Small average, large social circle/family

grouped_subset = features[ problem_cols['age'][0] ]
partitions['age'] = n_wise_partition( grouped_subset, 3, [ 0, 30, 60, 200 ] ) # Young Adult, Adult, Senior

grouped_subset = pca_grouping( features, problem_cols['irrelevant'] )
partitions['irrelevant'] = n_wise_partition( grouped_subset, 2 ) # Only for sports hobbyists, yes/no.

## Part 3

In [12]:
import torch
import torch.nn as nn
from collections import OrderedDict
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Models

In [13]:
class Model(nn.Module):
    def __init__( self, architecture, loss, optimizer, cols_to_avoid, device="cpu", l=0, train_epochs=1000 ):
        super().__init__()
        self.arch = architecture
        self.loss_f = loss
        self.optim = optimizer
        self.device = device
        self.to_avoid = cols_to_avoid
        self.l = l
        self.epochs = train_epochs
        self.onnx_path = None
        self.rt_session = None

    def to_tensor( self, X, dtype=torch.float ):
        if isinstance( X, pd.DataFrame ):
            X = X.values
        return torch.tensor( X, dtype=dtype ).to(self.device)

    def forward( self, X ):
        return self.arch( X )

    def backward( self, y_pred, y ):
        loss = self.loss_f( y_pred, y )
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def fit( self, X, y ):
        self.train()
        mask = torch.ones_like( self.arch[0].weight )
        for col in self.to_avoid:
            idx = X.columns.get_loc(col)
            mask[:, idx] = self.l

        X = self.to_tensor( X, dtype=torch.float )
        y = self.to_tensor( y, dtype=torch.long )
        for _ in range(self.epochs):
            y_pred = self.forward( X )
            self.backward( y_pred, y )

            if self.l == 0:
                self.arch[0].weight.data *= mask
            elif self.l != 1:
                with torch.no_grad():
                    self.arch[0].weight.grad *= mask

    def predict( self, X, use_onnx=True ):
        self.eval()
        if use_onnx:
            X_np = X.values.astype(np.float32)
            rt_in = { self.rt_session.get_inputs()[0].name: X_np}
            rt_out = self.rt_session.run(None, rt_in)[0]
            return np.argmax( rt_out, axis=1)
        else:
            X = self.to_tensor( X, dtype=torch.float )
            with torch.no_grad():
                return torch.argmax( self.forward(X), dim=1 ).to("cpu").numpy()

    def fit_predict( self, X, y ):
        self.fit( X, y )
        return self.predict( X, False )

    def to_onnx( self, X, onnx_path="models/model1_1.onnx" ):
        self.onnx_path = onnx_path
        self.arch.eval()
        X_np = X.values.astype(np.float32)
        X_tn = torch.tensor(X_np[:1], dtype=torch.float32).to(self.device)

        torch.onnx.export(
            self,
            X_tn,
            self.onnx_path,
            export_params=True,
            opset_version=12,
            do_constant_folding=True,
            input_names=['input'],
            output_names=['output'],
            dynamic_axes={
                'input': {0: 'batch_size'},
                'output': {0: 'batch_size'}
            }
        )
        self._load_onnx_session()

    def _load_onnx_session(self):
        self.rt_session = rt.InferenceSession( self.onnx_path, providers=["CUDAExecutionProvider"] )

### Training

In [14]:
def train_eval_model( model, X, y, epochs=1000, model_path="models/model1_1.onnx" ):
    X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2 )

    #model.fit( X_train, y_train )
    #y_pred = model.predict( X_train )
    y_pred = model.fit_predict( X_train, y_train )
    train_accuracy = (y_pred==y_train).mean()

    model.to_onnx( X_train, onnx_path=model_path )
    y_pred = model.predict( X_test )
    test_accuracy = (y_pred==y_test).mean()

    print( f"Train Accuracy of the original model: {train_accuracy}")
    print( f"Test Accuracy of the original model: {test_accuracy}")

    return model

In [15]:
n_samples, n_features = features.shape

mlp = nn.Sequential(
    OrderedDict([
        ( 'linear1', nn.Linear( n_features, 100 ) ),
        ( 'activation1', nn.ReLU() ),
        ( 'linear2', nn.Linear( 100, 25 ) ),
        ( 'activation2', nn.ReLU()),
        ( 'linear3', nn.Linear( 25, 10 ) ),
        ( 'activation3', nn.ReLU()),
        ( 'linear4', nn.Linear( 10, 2 ) )
        #( 'activation4', nn.Sigmoid() )
    ])
).to(device)

cross_entropy = nn.CrossEntropyLoss()
adam = torch.optim.Adam( mlp.parameters(), lr=1e-3 )

# Define a gradient boosting classifier
# model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
# model = BadModel( architecture=mlp, loss=cross_entropy, optimizer=adam, device=device )
bad_model = Model( architecture=mlp, loss=cross_entropy, optimizer=adam, device=device, cols_to_avoid=problem_cols_full, l=10 )
good_model = Model( architecture=mlp, loss=cross_entropy, optimizer=adam, device=device, cols_to_avoid=problem_cols_full, l=0 )

# Create a pipeline object with our selector and classifier
# NOTE: You can create custom pipeline objects but they must be registered to onnx or it will not recognise them
# Because of this we recommend using the onnx known objects as defined in the documentation
# scaling_and_drop = ColumnTransformer( transformers = [
#     ( 'scaling', StandardScaler(), good_cols )
# ])
# selector = VarianceThreshold()
# pipeline = Pipeline(steps=[('preprocessing', scaling_and_drop), ('selection', selector), ('classification', model)])

In [16]:
good_model = train_eval_model( model=good_model, X=features, y=target, model_path="models/model1_1.onnx" )
bad_model = train_eval_model( model=bad_model, X=features, y=target, model_path="models/model1_2.onnx" )

  torch.onnx.export(
W0131 11:29:25.791000 21686 site-packages/torch/onnx/_internal/exporter/_compat.py:125] Setting ONNX exporter to use operator set version 18 because the requested opset_version 12 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features


TypeError: Expecting a type not f<class 'typing.Union'> for typeinfo.

### Partition Testing

In [None]:
def test_partitions( model, X, y, partitions, title, accuracy_threshold=0.9, bias_threshold=0.05 ):
    passes, accuracy_passes = 0, 0
    checked_per_partition = np.empty( len(partitions) )

    print( f"=================================================================================================")
    print( f"= Partition Testing {title} | Accuracy Threshold: {accuracy_threshold} | Bias Threshold: {bias_threshold} =")
    print( f"=================================================================================================")
    for idx, partition in enumerate( partitions ):
        X_part = X.iloc[partition[0]]
        y_part = y.iloc[partition[0]]

        y_pred = model.predict(X_part)
        accuracy = (y_pred==y_part).mean()
        if accuracy > accuracy_threshold:
            accuracy_passes += 1

        checked_count = ( y_pred == 1 ).mean()
        checked_per_partition[idx] = checked_count
        print( f"Partition {idx} | "
               f"Accuracy: {accuracy:.4f} ({ 'pass' if accuracy >= accuracy_threshold else 'fail'}) | "
               f"Checked: {checked_count:.4f}"
             )

    checked_mean = checked_per_partition.mean()
    for i in range(len(checked_per_partition)):
        # Predictions:
        # Male: 15%
        # Female: 10%
        # Mean = 12.5%
        # For male:
        # # Is 15%/12.5% - 1 < 10%?
        # # Is 15% - 12.5% < 15% * 10%?
        # For female:
        # # Is 10%/12.5% - 1 < 10%?
        # # Is |10% - 12.5%| < 15% * 10%?
        if np.abs( checked_per_partition[i]/checked_mean - 1 ) < bias_threshold:
            passes += 1


    print( f"Total Passes |"
        f" Accuracy: {accuracy_passes}/{len(partitions)} |"
        f" Bias passes: {passes}/{len(partitions)}\n" )

    return {
        'bias_passes': passes,
        'accuracy_passes': accuracy_passes,
        'tests': len(partitions)
    }

In [None]:
for problem_type in problem_cols:
    good_results = test_partitions( model=good_model, X=features, y=target, partitions=partitions[problem_type], title=problem_type, accuracy_threshold=0.8, bias_threshold=0.1 )
    bad_results = test_partitions( model=bad_model, X=features, y=target, partitions=partitions[problem_type], title=problem_type, accuracy_threshold=0.8, bias_threshold=0.1 )

### Shuffle Testing

In [55]:
def shuffle_testing( model, X, y, columns, title, tries=5, accuracy_threshold=0.9, bias_threshold=0.05 ):
    passes, accuracy_passes = 0, 0
    checked_per_try = np.empty( tries )
    y_pred_orig = model.predict( X )

    print( f"=================================================================================================")
    print( f"= Shuffle Testing {title} | Accuracy Threshold: {accuracy_threshold} | Bias Threshold: {bias_threshold} =")
    print( f"=================================================================================================")
    for idx in range(tries):
        X_alt = shuffle_columns( X, columns )
        y_pred = model.predict( X_alt )

        accuracy = (y_pred==y).mean()
        changed_count = ( y_pred != y_pred_orig ).mean()

        if accuracy >= accuracy_threshold:
            accuracy_passes += 1
        if changed_count < bias_threshold:
            passes += 1

        print( f"Test {idx} | "
               f"Accuracy: {accuracy:.4f} ({ 'pass' if accuracy >= accuracy_threshold else 'fail'}) | "
               f"Changed: {changed_count:.4f} ({ 'pass' if changed_count < bias_threshold else 'fail'})"
             )

    print( f"Total Passes |"
        f" Accuracy: {accuracy_passes}/{tries} |"
        f" Bias passes: {passes}/{tries}\n" )

    return {
        'bias_passes': passes,
        'accuracy_passes': accuracy_passes,
        'tests': tries
    }

In [56]:
for problem_type in problem_cols:
    good_results = shuffle_testing( model=good_model, X=features, y=target, columns=problem_cols[problem_type], title=problem_type, tries=5, accuracy_threshold=0.85, bias_threshold=0.05 )
    bad_results = shuffle_testing( model=bad_model, X=features, y=target, columns=problem_cols[problem_type], title=problem_type, tries=5, accuracy_threshold=0.85, bias_threshold=0.05 )

AttributeError: 'NoneType' object has no attribute 'get_inputs'

### Flip Testing

In [18]:
def flip_testing( model, X, y, columns, title, tries=5, accuracy_threshold=0.9, bias_threshold=0.05 ):
    passes, accuracy_passes = 0, 0
    checked_per_try = np.empty( tries )
    accuracies = []
    y_pred_orig = model.predict( X )

    print( f"=================================================================================================")
    print( f"= Flip Testing {title} | Accuracy Threshold: {accuracy_threshold} | Bias Threshold: {bias_threshold} =")
    print( f"=================================================================================================")
    X_alt = flip_columns( X, columns )
    y_pred = model.predict( X_alt )
    accuracy = (y_pred==y).mean()
    accuracy_passes = 1 if accuracy > accuracy_threshold else 0
    accuracies.append(accuracy)

    changed_count = ( y_pred != y_pred_orig ).mean()
    print( f"Result | "
           f"Accuracy: {accuracy:.4f} ({ 'pass' if accuracy >= accuracy_threshold else 'fail'}) | "
           f"Changed: {changed_count:.4f} ({ 'pass' if changed_count < bias_threshold else 'fail'})\n"
         )

    return {
        'bias_passes': passes,
        'accuracy_passes': accuracy_passes,
        'tests': 1
    }

In [19]:
for problem_type in problem_cols:
    good_results = flip_testing( model=good_model, X=features, y=target, columns=problem_cols[problem_type], title=problem_type, tries=5, accuracy_threshold=0.85, bias_threshold=0.05 )
    bad_results = flip_testing( model=bad_model, X=features, y=target, columns=problem_cols[problem_type], title=problem_type, tries=5, accuracy_threshold=0.85, bias_threshold=0.05 )

= Flip Testing psychological | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8809 (pass) | Changed: 0.0000 (pass)

= Flip Testing psychological | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8955 (pass) | Changed: 0.0238 (pass)

= Flip Testing medical | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8809 (pass) | Changed: 0.0000 (pass)

= Flip Testing medical | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8603 (pass) | Changed: 0.0617 (fail)

= Flip Testing racial | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8809 (pass) | Changed: 0.0000 (pass)

= Flip Testing racial | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8554 (pass) | Changed: 0.0650 (fail)

= Flip Testing subjective | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8809 (pass) | Changed: 0.0000 (pass)

= Flip Testing subjective | Accuracy Threshold: 0

### Group 2 Tests

In [20]:
class SklearnModel:
    def __init__( self, filename ):
        self.session = rt.InferenceSession(filename)

    def predict( self, X ):
        return self.session.run(None, {'X': X.values.astype(np.float32)})[0]

model1 = SklearnModel("models/model2_1.onnx")
model2 = SklearnModel("models/model2_2.onnx")

In [21]:
for problem_type in problem_cols:
    good_results = test_partitions( model=model1, X=features, y=target, partitions=partitions[problem_type], title=problem_type, accuracy_threshold=0.8, bias_threshold=0.1 )
    bad_results = test_partitions( model=model2, X=features, y=target, partitions=partitions[problem_type], title=problem_type, accuracy_threshold=0.8, bias_threshold=0.1 )

= Partition Testing psychological | Accuracy Threshold: 0.8 | Bias Threshold: 0.1 =
Partition 0 | Accuracy: 0.8316 (pass) | Checked: 0.0723
Partition 1 | Accuracy: 0.8593 (pass) | Checked: 0.0671
Total Passes | Accuracy: 2/2 | Bias passes: 2/2

= Partition Testing psychological | Accuracy Threshold: 0.8 | Bias Threshold: 0.1 =
Partition 0 | Accuracy: 0.8413 (pass) | Checked: 0.0372
Partition 1 | Accuracy: 0.8736 (pass) | Checked: 0.0314
Total Passes | Accuracy: 2/2 | Bias passes: 2/2

= Partition Testing medical | Accuracy Threshold: 0.8 | Bias Threshold: 0.1 =
Partition 0 | Accuracy: 0.8522 (pass) | Checked: 0.0732
Partition 1 | Accuracy: 0.8935 (pass) | Checked: 0.0322
Total Passes | Accuracy: 2/2 | Bias passes: 0/2

= Partition Testing medical | Accuracy Threshold: 0.8 | Bias Threshold: 0.1 =
Partition 0 | Accuracy: 0.8670 (pass) | Checked: 0.0343
Partition 1 | Accuracy: 0.9032 (pass) | Checked: 0.0162
Total Passes | Accuracy: 2/2 | Bias passes: 0/2

= Partition Testing racial | Acc

In [22]:
for problem_type in problem_cols:
    good_results = flip_testing( model=model1, X=features, y=target, columns=problem_cols[problem_type], title=problem_type, tries=5, accuracy_threshold=0.85, bias_threshold=0.05 )
    bad_results = flip_testing( model=model2, X=features, y=target, columns=problem_cols[problem_type], title=problem_type, tries=5, accuracy_threshold=0.85, bias_threshold=0.05 )

= Flip Testing psychological | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8597 (pass) | Changed: 0.0221 (pass)

= Flip Testing psychological | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8724 (pass) | Changed: 0.0067 (pass)

= Flip Testing medical | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8529 (pass) | Changed: 0.0512 (fail)

= Flip Testing medical | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8545 (pass) | Changed: 0.0277 (pass)

= Flip Testing racial | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8507 (pass) | Changed: 0.0673 (fail)

= Flip Testing racial | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8723 (pass) | Changed: 0.0000 (pass)

= Flip Testing subjective | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Result | Accuracy: 0.8620 (pass) | Changed: 0.0581 (fail)

= Flip Testing subjective | Accuracy Threshold: 0

In [23]:
for problem_type in problem_cols:
    good_results = shuffle_testing( model=model1, X=features, y=target, columns=problem_cols[problem_type], title=problem_type, tries=5, accuracy_threshold=0.85, bias_threshold=0.05 )
    bad_results = shuffle_testing( model=model2, X=features, y=target, columns=problem_cols[problem_type], title=problem_type, tries=5, accuracy_threshold=0.85, bias_threshold=0.05 )

= Shuffle Testing psychological | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Test 0 | Accuracy: 0.8581 (pass) | Changed: 0.0173 (pass)
Test 1 | Accuracy: 0.8577 (pass) | Changed: 0.0176 (pass)
Test 2 | Accuracy: 0.8575 (pass) | Changed: 0.0171 (pass)
Test 3 | Accuracy: 0.8581 (pass) | Changed: 0.0170 (pass)
Test 4 | Accuracy: 0.8574 (pass) | Changed: 0.0178 (pass)
Total Passes | Accuracy: 5/5 | Bias passes: 5/5

= Shuffle Testing psychological | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Test 0 | Accuracy: 0.8724 (pass) | Changed: 0.0035 (pass)
Test 1 | Accuracy: 0.8723 (pass) | Changed: 0.0034 (pass)
Test 2 | Accuracy: 0.8725 (pass) | Changed: 0.0033 (pass)
Test 3 | Accuracy: 0.8725 (pass) | Changed: 0.0035 (pass)
Test 4 | Accuracy: 0.8724 (pass) | Changed: 0.0034 (pass)
Total Passes | Accuracy: 5/5 | Bias passes: 5/5

= Shuffle Testing medical | Accuracy Threshold: 0.85 | Bias Threshold: 0.05 =
Test 0 | Accuracy: 0.8530 (pass) | Changed: 0.0373 (pass)
Test 1 | Accuracy: