# Part 1 & 2

In [1]:
from data_handling import get_testing_data
from outlier_detection import identify_outliers
from prediction_models import get_trained_models, SklearnModel

from metamorphic_suite import (
	MetamorphicSuite, shuffle_columns, flip_columns, add_noise_to_columns,
	scale_columns, shift_columns, permute_within_quantiles, quantize_columns
)
from partition_suite import PartitionSuite
from boundary_suite import BoundarySuite
from consistency_suite import ConsistencySuite
from monotonicity_suite import MonotonicitySuite
import os

from metrics import (
	STANDARD_PERFORMANCE_METRICS, FAIRNESS_METRICS, ROBUSTNESS_METRICS
)

from term_styling import style, fg, bg

# Create the output directories that the cells below write their JSON
# results into. `os.makedirs(..., exist_ok=True)` creates the parent
# `results/` directory as needed and is a no-op when a directory already
# exists, so the cell is safe to re-run and has no check-then-create race.
for results_dir in ('results/group1', 'results/group2'):
	os.makedirs(results_dir, exist_ok=True)

# Verbosity level passed to the outlier helper and to every test suite below.
VERBOSITY = 1

# Load the data used by all tests in this notebook.
# `features` exposes `.columns`; `problem_cols` is indexed by group name
# (e.g. 'full') and handed to the suites as their problem-column groups.
# NOTE(review): exact types/schema come from `data_handling` — confirm there.
features, target, problem_cols = get_testing_data()

## Outlier Detection

In [2]:
# Outlier scan over every feature column with threshold argument 2
# (reported as "2 Sigma" in the output banner).
identify_outliers( features, features.columns, 2, VERBOSITY )

[01mDataset Outlier Test[0m - 2 Sigma
----------------------------------------------------------------------------------------------------------------------------------------------------------------
[01mFinal Result[0m [31m76[39m/[32m315[39m features are found to have [31msignificant amounts of[35m[01m outliers[0m.


In [3]:
# Same outlier scan with threshold argument 3
# (reported as "3 Sigma" in the output banner).
identify_outliers( features, features.columns, 3, VERBOSITY )

[01mDataset Outlier Test[0m - 3 Sigma
----------------------------------------------------------------------------------------------------------------------------------------------------------------
[01mFinal Result[0m [33m17[39m/[32m315[39m features are found to have [31msignificant amounts of[35m[01m outliers[0m.


In [4]:
# Same outlier scan with threshold argument 4
# (reported as "4 Sigma" in the output banner).
identify_outliers( features, features.columns, 4, VERBOSITY )

[01mDataset Outlier Test[0m - 4 Sigma
----------------------------------------------------------------------------------------------------------------------------------------------------------------
[01mFinal Result[0m [93m4[39m/[32m315[39m features are found to have [31msignificant amounts of[35m[01m outliers[0m.


# Part 3

## Training

In [5]:
# Coloured display titles used to label each model in the suite reports.
GOOD_TITLE = fg.green + "Good Model" + fg.reset
BAD_TITLE = fg.red + "Bad Model" + fg.reset

# Obtain the trained models; the 'full' entry of `problem_cols` is passed along.
# NOTE(review): `titles` is paired with `models` by position, so this assumes
# `get_trained_models` returns the models in [good, bad] order — confirm.
models = get_trained_models(features, target, problem_cols['full'])
titles = [GOOD_TITLE, BAD_TITLE]

  torch.onnx.export(
W0202 09:59:16.070000 2029 torch/onnx/_internal/exporter/_compat.py:125] Setting ONNX exporter to use operator set version 18 because the requested opset_version 12 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features
W0202 09:59:16.517000 2029 torch/onnx/_internal/exporter/_registration.py:110] torchvision is not installed. Skipping torchvision::nms


[torch.onnx] Obtain model graph for `Model([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `Model([...]` with `torch.export.export(..., strict=False)`... ✅
[torch.onnx] Run decomposition...


  return cls.__new__(cls, *args)
The model version conversion is not supported by the onnxscript version converter and fallback is enabled. The model will be converted using the onnx C API (target version: 12).
Failed to convert the model to the target version 12 using the ONNX C API. The model was not modified
Traceback (most recent call last):
  File "/home/johnario/Education/Formal/TuDelft/DSAIT 4015 - Software Engineering and Testing for AI Systems/Assignment 1/Group 2/venv/lib/python3.11/site-packages/onnxscript/version_converter/__init__.py", line 127, in call
    converted_proto = _c_api_utils.call_onnx_api(
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/johnario/Education/Formal/TuDelft/DSAIT 4015 - Software Engineering and Testing for AI Systems/Assignment 1/Group 2/venv/lib/python3.11/site-packages/onnxscript/version_converter/_c_api_utils.py", line 65, in call_onnx_api
    result = func(proto)
             ^^^^^^^^^^^
  File "/home/johnario/Education/Formal/

[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅


Train Accuracy of the original model: 0.8836346153846154
Test Accuracy of the original model: 0.8808461538461538




  torch.onnx.export(
W0202 09:59:20.482000 2029 torch/onnx/_internal/exporter/_compat.py:125] Setting ONNX exporter to use operator set version 18 because the requested opset_version 12 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features
W0202 09:59:20.718000 2029 torch/onnx/_internal/exporter/_registration.py:110] torchvision is not installed. Skipping torchvision::nms


[torch.onnx] Obtain model graph for `Model([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `Model([...]` with `torch.export.export(..., strict=False)`... ✅
[torch.onnx] Run decomposition...


  return cls.__new__(cls, *args)
The model version conversion is not supported by the onnxscript version converter and fallback is enabled. The model will be converted using the onnx C API (target version: 12).
Failed to convert the model to the target version 12 using the ONNX C API. The model was not modified
Traceback (most recent call last):
  File "/home/johnario/Education/Formal/TuDelft/DSAIT 4015 - Software Engineering and Testing for AI Systems/Assignment 1/Group 2/venv/lib/python3.11/site-packages/onnxscript/version_converter/__init__.py", line 127, in call
    converted_proto = _c_api_utils.call_onnx_api(
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/johnario/Education/Formal/TuDelft/DSAIT 4015 - Software Engineering and Testing for AI Systems/Assignment 1/Group 2/venv/lib/python3.11/site-packages/onnxscript/version_converter/_c_api_utils.py", line 65, in call_onnx_api
    result = func(proto)
             ^^^^^^^^^^^
  File "/home/johnario/Education/Formal/

[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅


Train Accuracy of the original model: 0.8964134615384616
Test Accuracy of the original model: 0.8955384615384615




## Testing

In [6]:
# Short aliases for the imported metric collections; the suites below take
# `classical_metrics` plus either the fairness or the robustness set.
classical_metrics, fairness_metrics, robustness_metrics = (
	STANDARD_PERFORMANCE_METRICS,
	FAIRNESS_METRICS,
	ROBUSTNESS_METRICS,
)

### Partition Testing

In [7]:
# Partition testing: configure the suite with the problem-column groups,
# the classical metrics, and the fairness metrics as its test metrics.
partition_suite = PartitionSuite(
	problem_cols,
	classical_metrics=classical_metrics,
	test_metrics=fairness_metrics,
	verbosity=VERBOSITY
)

# Run against the Good/Bad models and save the results under results/group2.
partition_results = partition_suite.run(models, titles, features, target)
partition_suite.save_json(partition_results, 'results/group2/results_partition.json')

[36mPartition[39m Testing Results [32mGood Model[39m [37mpsychological[39m | Accuracy: 2/2 | Precision: 0/2 | Recall: 0/2 | F1 Score: 0/2 | Mean Div: 2/2 | Disp Impact: 0/2 | Calibration: 0/2 |
[36mPartition[39m Testing Results [32mGood Model[39m [37mmedical[39m      | Accuracy: 2/2 | Precision: 0/2 | Recall: 0/2 | F1 Score: 0/2 | Mean Div: 0/2 | Disp Impact: 0/2 | Calibration: 1/2 |
[36mPartition[39m Testing Results [32mGood Model[39m [37mracial[39m       | Accuracy: 4/4 | Precision: 1/4 | Recall: 0/4 | F1 Score: 0/4 | Mean Div: 1/4 | Disp Impact: 0/4 | Calibration: 2/4 |
[36mPartition[39m Testing Results [32mGood Model[39m [37msubjective[39m   | Accuracy: 3/3 | Precision: 1/3 | Recall: 0/3 | F1 Score: 0/3 | Mean Div: 1/3 | Disp Impact: 0/3 | Calibration: 1/3 |
[36mPartition[39m Testing Results [32mGood Model[39m [37mgender[39m       | Accuracy: 2/2 | Precision: 1/2 | Recall: 0/2 | F1 Score: 0/2 | Mean Div: 0/2 | Disp Impact: 0/2 | Calibration: 1/2 |
[36

### Shuffle Testing

In [8]:
# Metamorphic "Shuffle" test: uses the `shuffle_columns` transformation and
# the robustness metrics as test metrics.
# NOTE(review): tries=5 presumably repeats the transformation five times
# (the report shows x/5 per metric) — confirm in MetamorphicSuite.
shuffle_suite = MetamorphicSuite(
	shuffle_columns,
	"Shuffle",
	tries=5,
	problem_columns=problem_cols,
	classical_metrics=classical_metrics,
	test_metrics=robustness_metrics,
	verbosity=VERBOSITY
)

# Run against the Good/Bad models and save the results under results/group2.
shuffle_results = shuffle_suite.run(models, titles, features, target)
shuffle_suite.save_json(shuffle_results, 'results/group2/results_shuffle.json')

[36mShuffle[39m Testing Results [32mGood Model[39m [37mpsychological[39m  | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShuffle[39m Testing Results [32mGood Model[39m [37mmedical[39m        | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShuffle[39m Testing Results [32mGood Model[39m [37mracial[39m         | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShuffle[39m Testing Results [32mGood Model[39m [37msubjective[39m     | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShuffle[39m Testing Results [32mGood Model[39m [37mgender[39m         | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5

### Flip Testing

In [9]:
# Metamorphic "Flip" test: uses the `flip_columns` transformation and the
# robustness metrics as test metrics. A single try is requested.
flip_suite = MetamorphicSuite(
	flip_columns,
	"Flip",
	tries=1,
	problem_columns=problem_cols,
	classical_metrics=classical_metrics,
	test_metrics=robustness_metrics,
	verbosity=VERBOSITY
)

# Run against the Good/Bad models and save the results under results/group2.
flip_results = flip_suite.run(models, titles, features, target)
flip_suite.save_json(flip_results, 'results/group2/results_flip.json')

[36mFlip[39m Testing Results [32mGood Model[39m [37mpsychological[39m     | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mFlip[39m Testing Results [32mGood Model[39m [37mmedical[39m           | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mFlip[39m Testing Results [32mGood Model[39m [37mracial[39m            | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mFlip[39m Testing Results [32mGood Model[39m [37msubjective[39m        | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mFlip[39m Testing Results [32mGood Model[39m [37mgender[39m            | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1

### Noise Test

In [10]:
# Metamorphic "Noise" test: uses the `add_noise_to_columns` transformation
# and the robustness metrics as test metrics, with five tries.
# NOTE(review): `noise_scale=2.0` is presumably forwarded to the
# transformation; its unit (absolute vs. per-column std) is not visible here.
noise_suite = MetamorphicSuite(
	add_noise_to_columns,
	"Noise",
	tries=5,
	problem_columns=problem_cols,
	classical_metrics=classical_metrics,
	test_metrics=robustness_metrics,
	verbosity=VERBOSITY,
	noise_scale=2.0
)

# Run against the Good/Bad models and save the results under results/group2.
noise_results = noise_suite.run(models, titles, features, target)
noise_suite.save_json(noise_results, 'results/group2/results_noise.json')

[36mNoise[39m Testing Results [32mGood Model[39m [37mpsychological[39m    | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mNoise[39m Testing Results [32mGood Model[39m [37mmedical[39m          | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mNoise[39m Testing Results [32mGood Model[39m [37mracial[39m           | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mNoise[39m Testing Results [32mGood Model[39m [37msubjective[39m       | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mNoise[39m Testing Results [32mGood Model[39m [37mgender[39m           | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5

### Scale Test

In [11]:
# Metamorphic "Scale" test: uses the `scale_columns` transformation and the
# robustness metrics as test metrics. A single try is requested.
# NOTE(review): `scale_factor=1.5` is presumably forwarded to the
# transformation as the multiplier — confirm in metamorphic_suite.
scale_suite = MetamorphicSuite(
	scale_columns,
	"Scale",
	tries=1,
	problem_columns=problem_cols,
	classical_metrics=classical_metrics,
	test_metrics=robustness_metrics,
	verbosity=VERBOSITY,
	scale_factor=1.5
)

# Run against the Good/Bad models and save the results under results/group2.
scale_results = scale_suite.run(models, titles, features, target)
scale_suite.save_json(scale_results, 'results/group2/results_scale.json')

[36mScale[39m Testing Results [32mGood Model[39m [37mpsychological[39m    | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mScale[39m Testing Results [32mGood Model[39m [37mmedical[39m          | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mScale[39m Testing Results [32mGood Model[39m [37mracial[39m           | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mScale[39m Testing Results [32mGood Model[39m [37msubjective[39m       | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mScale[39m Testing Results [32mGood Model[39m [37mgender[39m           | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1

### Shift Test

In [12]:
# Metamorphic "Shift" test: uses the `shift_columns` transformation and the
# robustness metrics as test metrics, with five tries.
# NOTE(review): no shift magnitude is passed, so the transformation's own
# default applies — confirm the value in metamorphic_suite.
shift_suite = MetamorphicSuite(
	shift_columns,
	"Shift",
	tries=5,
	problem_columns=problem_cols,
	classical_metrics=classical_metrics,
	test_metrics=robustness_metrics,
	verbosity=VERBOSITY
)

# Run against the Good/Bad models and save the results under results/group2.
shift_results = shift_suite.run(models, titles, features, target)
shift_suite.save_json(shift_results, 'results/group2/results_shift.json')

[36mShift[39m Testing Results [32mGood Model[39m [37mpsychological[39m    | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShift[39m Testing Results [32mGood Model[39m [37mmedical[39m          | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShift[39m Testing Results [32mGood Model[39m [37mracial[39m           | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShift[39m Testing Results [32mGood Model[39m [37msubjective[39m       | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShift[39m Testing Results [32mGood Model[39m [37mgender[39m           | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5

### Quantization Test

In [13]:
# Metamorphic "Quantization" test: uses the `quantize_columns` transformation
# and the robustness metrics as test metrics. A single try is requested.
quantization_suite = MetamorphicSuite(
	quantize_columns,
	"Quantization",
	tries=1,
	problem_columns=problem_cols,
	classical_metrics=classical_metrics,
	test_metrics=robustness_metrics,
	verbosity=VERBOSITY
)

# Run against the Good/Bad models and save the results under results/group2.
quantization_results = quantization_suite.run(models, titles, features, target)
quantization_suite.save_json(quantization_results, 'results/group2/results_quantization.json')

[36mQuantization[39m Testing Results [32mGood Model[39m [37mpsychological[39m | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mQuantization[39m Testing Results [32mGood Model[39m [37mmedical[39m   | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mQuantization[39m Testing Results [32mGood Model[39m [37mracial[39m    | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mQuantization[39m Testing Results [32mGood Model[39m [37msubjective[39m | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mQuantization[39m Testing Results [32mGood Model[39m [37mgender[39m    | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibrati

### Consistency Test

In [14]:
# Consistency test configured with 10 trials, a sample size of 100 and a
# pass threshold of 1.0.
# NOTE(review): a threshold of 1.0 presumably requires every sampled
# prediction to agree across all trials — confirm in consistency_suite.
consistency_suite = ConsistencySuite(
	n_trials=10,
	sample_size=100,
	consistency_threshold=1.0,
	verbosity=VERBOSITY
)

# Run against the Good/Bad models and save the results under results/group2.
consistency_results = consistency_suite.run(models, titles, features, target)
consistency_suite.save_json(consistency_results, 'results/group2/results_consistency.json')

Consistency Test [32mGood Model[39m:
  Consistency Rate: 1.0000 ([32mPASS[39m)
  Prediction Variance: 0.000000
  Unanimous Predictions: 100/100
Consistency Test [31mBad Model[39m:
  Consistency Rate: 1.0000 ([32mPASS[39m)
  Prediction Variance: 0.000000
  Unanimous Predictions: 100/100
= Consistency | [32mGood Model[39m | [31mBad Model[39m =
Rate		  | 1.0000 ([32mPASS[39m) | 1.0000 ([32mPASS[39m)
Variance	  | 0.000000 | 0.000000



### Boundary Test

In [15]:
# Boundary testing is requested for every feature column.
features_to_test = features.columns

# Low/high percentile settings are 0.05 and 0.95.
# NOTE(review): presumably these select the per-feature boundary values that
# are probed — confirm in boundary_suite.
boundary_suite = BoundarySuite(
	features_to_test=features_to_test,
	classical_metrics=classical_metrics,
	percentile_low=0.05,
	percentile_high=0.95,
	verbosity=VERBOSITY
)

# Run against the Good/Bad models and save the results under results/group2.
boundary_results = boundary_suite.run(models, titles, features, target)
boundary_suite.save_json(boundary_results, 'results/group2/results_boundary.json')

[36mBoundary[39m Testing [32mGood Model[39m: 734/2520 passed

[36mBoundary[39m Testing [31mBad Model[39m: 1249/2520 passed



### Monotonicity Test

In [17]:
# Expected relationship between each listed feature and the prediction.
# NOTE(review): the rationale for 'decreasing' on the last entry is not
# documented here — presumably a higher competence score should lower the
# predicted risk; confirm with the dataset description.
monotonicity_specs = {
	'persoon_leeftijd_bij_onderzoek': 'none', # Age should not affect predictions (fairness)
	'persoon_geslacht_vrouw': 'none', # Gender should not affect predictions (fairness)
	'competentie_ethisch_en_integer_handelen': 'decreasing'
}

# Suite configured with a violation threshold of 0.10 and 100 samples.
monotonicity_suite = MonotonicitySuite(
	monotonicity_specs=monotonicity_specs,
	violation_threshold=0.10,
	n_samples=100,
	verbosity=VERBOSITY
)

# Run against the Good/Bad models and save the results under results/group2.
monotonicity_results = monotonicity_suite.run(models, titles, features, target)
monotonicity_suite.save_json(monotonicity_results, 'results/group2/results_monotonicity.json')

[36mMonotonicity[39m Testing [32mGood Model[39m: 3/3 features passed

[36mMonotonicity[39m Testing [31mBad Model[39m: 3/3 features passed

= Monotonicity | [32mGood Model[39m | [31mBad Model[39m =
competentie_ethisch_ | 0.0000 [32mPASS[39m | 0.0000 [32mPASS[39m
persoon_geslacht_vro | 0.0000 [32mPASS[39m | 0.0000 [32mPASS[39m
persoon_leeftijd_bij | 0.0000 [32mPASS[39m | 0.0100 [32mPASS[39m



### Group 2 Tests on Group 1 Models

In [18]:
# Load two ONNX models from models/ through the SklearnModel wrapper.
# NOTE(review): presumably the models supplied by Group 1 (cf. the `g1_`
# prefix and the results/group1 output paths below) — confirm.
model1 = SklearnModel("models/model1_1.onnx")
model2 = SklearnModel("models/model1_2.onnx")

# Coloured display titles for the reports. These end with `style.reset`,
# whereas the Good/Bad titles end with `fg.reset`.
M1_TITLE = fg.cyan + "Model A" + style.reset
M2_TITLE = fg.purple + "Model B" + style.reset

# Models and titles are paired by position when passed to `suite.run`.
g1_models = [model1, model2]
g1_titles = [M1_TITLE, M2_TITLE]

In [19]:
# Run the partition suite on the second model pair and save *that* run.
# The return value must be captured: saving `partition_results` here would
# write the earlier Good/Bad-model results into the group1 folder.
g1_partition_results = partition_suite.run(g1_models, g1_titles, features, target)
partition_suite.save_json(g1_partition_results, 'results/group1/results_partition.json')


[36mPartition[39m Testing Results [36mModel A[0m [37mpsychological[39m    | Accuracy: 2/2 | Precision: 0/2 | Recall: 0/2 | F1 Score: 0/2 | Mean Div: 2/2 | Disp Impact: 0/2 | Calibration: 0/2 |
[36mPartition[39m Testing Results [36mModel A[0m [37mmedical[39m          | Accuracy: 2/2 | Precision: 0/2 | Recall: 0/2 | F1 Score: 0/2 | Mean Div: 0/2 | Disp Impact: 0/2 | Calibration: 0/2 |
[36mPartition[39m Testing Results [36mModel A[0m [37mracial[39m           | Accuracy: 4/4 | Precision: 1/4 | Recall: 0/4 | F1 Score: 0/4 | Mean Div: 0/4 | Disp Impact: 0/4 | Calibration: 0/4 |
[36mPartition[39m Testing Results [36mModel A[0m [37msubjective[39m       | Accuracy: 3/3 | Precision: 0/3 | Recall: 0/3 | F1 Score: 0/3 | Mean Div: 0/3 | Disp Impact: 0/3 | Calibration: 0/3 |
[36mPartition[39m Testing Results [36mModel A[0m [37mgender[39m           | Accuracy: 2/2 | Precision: 0/2 | Recall: 0/2 | F1 Score: 0/2 | Mean Div: 0/2 | Disp Impact: 0/2 | Calibration: 0/2 |
[36m

In [20]:
# Run the shuffle suite on the second model pair and save *that* run.
# The return value must be captured: saving `shuffle_results` here would
# write the earlier Good/Bad-model results into the group1 folder.
g1_shuffle_results = shuffle_suite.run(g1_models, g1_titles, features, target)
shuffle_suite.save_json(g1_shuffle_results, 'results/group1/results_shuffle.json')


[36mShuffle[39m Testing Results [36mModel A[0m [37mpsychological[39m      | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShuffle[39m Testing Results [36mModel A[0m [37mmedical[39m            | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShuffle[39m Testing Results [36mModel A[0m [37mracial[39m             | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 0/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShuffle[39m Testing Results [36mModel A[0m [37msubjective[39m         | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShuffle[39m Testing Results [36mModel A[0m [37mgender[39m             | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5

In [21]:
# Run the flip suite on the second model pair and save *that* run.
# The return value must be captured: saving `flip_results` here would
# write the earlier Good/Bad-model results into the group1 folder.
g1_flip_results = flip_suite.run(g1_models, g1_titles, features, target)
flip_suite.save_json(g1_flip_results, 'results/group1/results_flip.json')


[36mFlip[39m Testing Results [36mModel A[0m [37mpsychological[39m         | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mFlip[39m Testing Results [36mModel A[0m [37mmedical[39m               | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 0/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mFlip[39m Testing Results [36mModel A[0m [37mracial[39m                | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 0/1 | Dist Shift: 1/1 | Calibration: 0/1 |
[36mFlip[39m Testing Results [36mModel A[0m [37msubjective[39m            | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 0/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mFlip[39m Testing Results [36mModel A[0m [37mgender[39m                | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1

In [22]:
# Run the noise suite on the second model pair and save *that* run.
# The return value must be captured: saving `noise_results` here would
# write the earlier Good/Bad-model results into the group1 folder.
g1_noise_results = noise_suite.run(g1_models, g1_titles, features, target)
noise_suite.save_json(g1_noise_results, 'results/group1/results_noise.json')


[36mNoise[39m Testing Results [36mModel A[0m [37mpsychological[39m        | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mNoise[39m Testing Results [36mModel A[0m [37mmedical[39m              | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mNoise[39m Testing Results [36mModel A[0m [37mracial[39m               | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 0/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mNoise[39m Testing Results [36mModel A[0m [37msubjective[39m           | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mNoise[39m Testing Results [36mModel A[0m [37mgender[39m               | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5

In [23]:
# Run the scale suite on the second model pair and save the results under
# results/group1.
# NOTE(review): this rebinds `scale_results`, which previously held the
# Good/Bad-model results; re-running the earlier save cell afterwards would
# write this run's data into results/group2.
scale_results = scale_suite.run(g1_models, g1_titles, features, target)
scale_suite.save_json(scale_results, 'results/group1/results_scale.json')


[36mScale[39m Testing Results [36mModel A[0m [37mpsychological[39m        | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mScale[39m Testing Results [36mModel A[0m [37mmedical[39m              | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mScale[39m Testing Results [36mModel A[0m [37mracial[39m               | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mScale[39m Testing Results [36mModel A[0m [37msubjective[39m           | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mScale[39m Testing Results [36mModel A[0m [37mgender[39m               | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1

In [24]:
# Run the shift suite on the second model pair and save the results under
# results/group1.
# NOTE(review): this rebinds `shift_results`, which previously held the
# Good/Bad-model results; re-running the earlier save cell afterwards would
# write this run's data into results/group2.
shift_results = shift_suite.run(g1_models, g1_titles, features, target)
shift_suite.save_json(shift_results, 'results/group1/results_shift.json')


[36mShift[39m Testing Results [36mModel A[0m [37mpsychological[39m        | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShift[39m Testing Results [36mModel A[0m [37mmedical[39m              | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShift[39m Testing Results [36mModel A[0m [37mracial[39m               | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShift[39m Testing Results [36mModel A[0m [37msubjective[39m           | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5/5 |
[36mShift[39m Testing Results [36mModel A[0m [37mgender[39m               | Accuracy: 5/5 | Precision: 0/5 | Recall: 0/5 | F1 Score: 0/5 | Change Rate: 5/5 | Dist Shift: 5/5 | Calibration: 5

In [25]:
# Run the quantization suite on the second model pair and save the results
# under results/group1.
# NOTE(review): this rebinds `quantization_results`, which previously held
# the Good/Bad-model results; re-running the earlier save cell afterwards
# would write this run's data into results/group2.
quantization_results = quantization_suite.run(g1_models, g1_titles, features, target)
quantization_suite.save_json(quantization_results, 'results/group1/results_quantization.json')


[36mQuantization[39m Testing Results [36mModel A[0m [37mpsychological[39m | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mQuantization[39m Testing Results [36mModel A[0m [37mmedical[39m       | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mQuantization[39m Testing Results [36mModel A[0m [37mracial[39m        | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 0/1 | Dist Shift: 1/1 | Calibration: 0/1 |
[36mQuantization[39m Testing Results [36mModel A[0m [37msubjective[39m    | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1/1 |
[36mQuantization[39m Testing Results [36mModel A[0m [37mgender[39m        | Accuracy: 1/1 | Precision: 0/1 | Recall: 0/1 | F1 Score: 0/1 | Change Rate: 1/1 | Dist Shift: 1/1 | Calibration: 1

In [26]:
# Run the consistency suite on the second model pair and save *that* run.
# The return value must be captured: saving `consistency_results` here would
# write the earlier Good/Bad-model results into the group1 folder.
g1_consistency_results = consistency_suite.run(g1_models, g1_titles, features, target)
consistency_suite.save_json(g1_consistency_results, 'results/group1/results_consistency.json')


Consistency Test [36mModel A[0m:
  Consistency Rate: 1.0000 ([32mPASS[39m)
  Prediction Variance: 0.000000
  Unanimous Predictions: 100/100
Consistency Test [35mModel B[0m:
  Consistency Rate: 1.0000 ([32mPASS[39m)
  Prediction Variance: 0.000000
  Unanimous Predictions: 100/100
= Consistency | [36mModel A[0m | [35mModel B[0m =
Rate		  | 1.0000 ([32mPASS[39m) | 1.0000 ([32mPASS[39m)
Variance	  | 0.000000 | 0.000000



In [27]:
# Run the boundary suite on the second model pair and save *that* run.
# The return value must be captured: saving `boundary_results` here would
# write the earlier Good/Bad-model results into the group1 folder.
g1_boundary_results = boundary_suite.run(g1_models, g1_titles, features, target)
boundary_suite.save_json(g1_boundary_results, 'results/group1/results_boundary.json')


[36mBoundary[39m Testing [36mModel A[0m: 626/2520 passed

[36mBoundary[39m Testing [35mModel B[0m: 1242/2520 passed



In [28]:
# Run the monotonicity suite on the second model pair and save *that* run.
# The return value must be captured: saving `monotonicity_results` here would
# write the earlier Good/Bad-model results into the group1 folder.
g1_monotonicity_results = monotonicity_suite.run(g1_models, g1_titles, features, target)
monotonicity_suite.save_json(g1_monotonicity_results, 'results/group1/results_monotonicity.json')


[36mMonotonicity[39m Testing [36mModel A[0m: 3/3 features passed

[36mMonotonicity[39m Testing [35mModel B[0m: 3/3 features passed

= Monotonicity | [36mModel A[0m | [35mModel B[0m =
competentie_ethisch_ | 0.0000 [32mPASS[39m | 0.0000 [32mPASS[39m
persoon_geslacht_vro | 0.0000 [32mPASS[39m | 0.0000 [32mPASS[39m
persoon_leeftijd_bij | 0.0100 [32mPASS[39m | 0.0000 [32mPASS[39m

