In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = pd.read_csv('data/investigation_train_large_checked.csv')
y = data['checked']
X = data.drop(['checked'], axis=1)
X = X.astype(np.float32)

# Let's split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

This notebook contains the testing part of the assignment

## Mutation Testing
Adding noise to the input only influences one model here

In [None]:
# Fit your Logistic Regression model
bad_model = LogisticRegression(random_state=42)
bad_model.fit(X_train, y_train)

# Get the coefficients (weights) of the model
coefficients = bad_model.coef_[0]

# Create a DataFrame to display the coefficients and their corresponding feature names
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,  # The feature names
    'Coefficient': coefficients  # The coefficients
})

# Sort the features by the absolute value of the coefficients
feature_importance['AbsCoefficient'] = np.abs(feature_importance['Coefficient'])
feature_importance = feature_importance.sort_values(by='AbsCoefficient', ascending=False)

# Display the most important features
print(feature_importance[['Feature', 'Coefficient', 'AbsCoefficient']])

                                              Feature   Coefficient  \
165               contacten_soort_document__uitgaand_  2.424756e-03   
24                            afspraak_aantal_woorden -1.984911e-03   
216                    persoon_leeftijd_bij_onderzoek -1.476125e-03   
286  relatie_kind_leeftijd_verschil_ouder_eerste_kind  1.421791e-03   
244             persoonlijke_eigenschappen_spreektaal -1.183518e-03   
..                                                ...           ...   
21                   adres_recentste_wijk_stadscentru -5.309478e-07   
64                   belemmering_niet_computervaardig  2.201608e-07   
7                 adres_recentste_buurt_nieuwe_westen -1.695506e-07   
10                     adres_recentste_buurt_vreewijk -1.427809e-07   
9                  adres_recentste_buurt_oude_noorden -1.295613e-07   

     AbsCoefficient  
165    2.424756e-03  
24     1.984911e-03  
216    1.476125e-03  
286    1.421791e-03  
244    1.183518e-03  
..             

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
good_accuracies = []
bad_accuracies = []
good_mutated_accuracies = []
bad_mutated_accuracies = []

for run in range(10):
    print(f"\nRun {run+1}/{10}")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=run)
    # Create a original model
    good_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=1, random_state=run)        
    good_model.fit(X_train, y_train)
    y_pred_good = good_model.predict(X_test)
    good_acc = accuracy_score(y_test,y_pred_good)
    good_accuracies.append(good_acc)
    
    bad_model = LogisticRegression(random_state=run)                     
    bad_model.fit(X_train, y_train)
    y_pred_bad = bad_model.predict(X_test)
    bad_acc = accuracy_score(y_test,y_pred_bad)
    bad_accuracies.append(bad_acc)

    print(f"Good Accuracy: {good_acc:.4f}, bad Accuracy: {bad_acc:.4f}")

    noise_test = np.random.normal(0, 10, size=X_test.shape)
    X_test_noisy = X_test + noise_test  

    y_pred_good_altered = good_model.predict(X_test_noisy)
    good_mutated_acc = accuracy_score(y_test,y_pred_good_altered)
    good_mutated_accuracies.append(good_mutated_acc)

    y_pred_bad_altered = bad_model.predict(X_test_noisy)
    bad_mutated_acc = accuracy_score(y_test,y_pred_bad_altered)
    bad_mutated_accuracies.append(bad_mutated_acc)

    print(f"Good Accuracy altered: {good_mutated_acc:.4f}, bad Accuracy altered: {bad_mutated_acc:.4f}")



In [8]:
import plotly.express as px
import pandas as pd

# Prepare data for Plotly
data = pd.DataFrame({
    'Accuracy': good_accuracies + good_mutated_accuracies + bad_accuracies + bad_mutated_accuracies,
    'Model': ['Bad model'] * len(good_accuracies) + ['Bad model mutated'] * len(good_mutated_accuracies) + ['Good model'] * len(bad_accuracies) + ['Good model mutated'] * len(bad_mutated_accuracies)
})

# Create the boxplot
fig = px.box(data, x='Model', y='Accuracy', title='Accuracy Comparison', labels={'Accuracy': 'Accuracy', 'Model': 'Model Type'})
fig.show()


## Input output diversity 
Idea: equivalence partitioning

In [None]:
# Define equivalent partitions
partitions = [
    {"name": "No Children", "condition": lambda df: df['relatie_kind_huidige_aantal'] < 1},
    {"name": "1 Child", "condition": lambda df: (df['relatie_kind_huidige_aantal'] >= 1) & (
                df['relatie_kind_huidige_aantal'] <= 1)},
    {"name": "2 Children", "condition": lambda df: (df['relatie_kind_huidige_aantal'] >= 2) & (
                df['relatie_kind_huidige_aantal'] <= 2)},
    {"name": "3+ Children", "condition": lambda df: df['relatie_kind_huidige_aantal'] > 2},
]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(random_state=run,solver='sag')  
model.fit(X_train,y_train) 
# Apply equivalent partitioning
for partition in partitions:
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index  # Get the indices of the partition
    partition_labels = y_test.loc[partition_indices]  # Get the actual labels for the partition

    if not partition_data.empty:
        # Predictions using the model
        predictions = model.predict(partition_data)
        # Calculate accuracy for this partition
        accuracy = accuracy_score(partition_labels, predictions)
        tn, fp, fn, tp = confusion_matrix(partition_labels, predictions).ravel()

        print(f"Partition: {partition['name']}")
        print(f"Number of data points: {len(partition_data)}")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"True Positives (TP): {tp}")
        print(f"False Positives (FP): {fp}")
        print(f"True Negatives (TN): {tn}")
        print(f"False Negatives (FN): {fn}")

print("For other model:")
model =  GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=1, random_state=42) 
model.fit(X_train,y_train) 
# Apply equivalent partitioning
for partition in partitions:
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index  # Get the indices of the partition
    partition_labels = y_test.loc[partition_indices]  # Get the actual labels for the partition

    if not partition_data.empty:
        # Predictions using the model
        predictions = model.predict(partition_data)
        # Calculate accuracy for this partition
        accuracy = accuracy_score(partition_labels, predictions)
        tn, fp, fn, tp = confusion_matrix(partition_labels, predictions).ravel()

        print(f"Partition: {partition['name']}")
        print(f"Number of data points: {len(partition_data)}")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"True Positives (TP): {tp}")
        print(f"False Positives (FP): {fp}")
        print(f"True Negatives (TN): {tn}")
        print(f"False Negatives (FN): {fn}")


The max_iter was reached which means the coef_ did not converge



Partition: No Children
Number of data points: 9096
Accuracy: 0.90
True Positives (TP): 11
False Positives (FP): 6
True Negatives (TN): 8145
False Negatives (FN): 934
Partition: 1 Child
Number of data points: 12658
Accuracy: 0.84
True Positives (TP): 28
False Positives (FP): 13
True Negatives (TN): 10580
False Negatives (FN): 2037
Partition: 2 Children
Number of data points: 3782
Accuracy: 0.79
True Positives (TP): 12
False Positives (FP): 9
True Negatives (TN): 2994
False Negatives (FN): 767
Partition: 3+ Children
Number of data points: 464
Accuracy: 0.80
True Positives (TP): 3
False Positives (FP): 3
True Negatives (TN): 368
False Negatives (FN): 90
For other model:
Partition: No Children
Number of data points: 9096
Accuracy: 1.00
True Positives (TP): 945
False Positives (FP): 0
True Negatives (TN): 8151
False Negatives (FN): 0
Partition: 1 Child
Number of data points: 12658
Accuracy: 1.00
True Positives (TP): 2065
False Positives (FP): 0
True Negatives (TN): 10593
False Negatives (FN

## Differentiation testing
outliers

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("This is before introducing outliers\n")

modelOne = LogisticRegression(random_state=42,solver='sag')  
modelOne.fit(X_train,y_train) 

predictionsOne = modelOne.predict(X_test)
accuracyOne = accuracy_score(y_test, predictionsOne)
tno, fpo, fno, tpo = confusion_matrix(y_test, predictionsOne).ravel()

print(f"Accuracy: {accuracyOne:.2f}")
print(f"True Positives (TP): {tpo}")
print(f"False Positives (FP): {fpo}")
print(f"True Negatives (TN): {tno}")
print(f"False Negatives (FN): {fno} \n")
print("For other model:")
modelTwo =  GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=1, random_state=42) 
modelTwo.fit(X_train,y_train) 

predictionsTwo = modelTwo.predict(X_test)
accuracyTwo = accuracy_score(y_test, predictionsTwo)
tnt, fpt, fnt, tpt = confusion_matrix(y_test, predictionsTwo).ravel()

print(f"Accuracy: {accuracyTwo:.2f}")
print(f"True Positives (TP): {tpt}")
print(f"False Positives (FP): {fpt}")
print(f"True Negatives (TN): {tnt}")
print(f"False Negatives (FN): {fnt}")

print("This is after introducing outliers: \n")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

outlier_percentage = 0.20  
num_outliers = int(outlier_percentage * len(X_test))
# features_to_modify = [np.random.choice(X_test.columns, size=10, replace=False)] 
features_to_modify = ['contacten_soort_document__uitgaand_' , 'afspraak_aantal_woorden' ,'persoon_leeftijd_bij_onderzoek','relatie_kind_leeftijd_verschil_ouder_eerste_kind','persoonlijke_eigenschappen_spreektaal']
print("Adding outliers to: ", features_to_modify)
outlier_indices = np.random.choice(X_test.index, size=num_outliers, replace=False)

for feature in features_to_modify:
    X_test.loc[outlier_indices, feature] += np.random.choice([-100, 100], size=num_outliers)


modelOne = LogisticRegression(random_state=42,solver='sag')  
modelOne.fit(X_train,y_train) 

predictionsOne = modelOne.predict(X_test)
accuracyOne = accuracy_score(y_test, predictionsOne)
tno, fpo, fno, tpo = confusion_matrix(y_test, predictionsOne).ravel()

print(f"Accuracy: {accuracyOne:.2f}")
print(f"True Positives (TP): {tpo}")
print(f"False Positives (FP): {fpo}")
print(f"True Negatives (TN): {tno}")
print(f"False Negatives (FN): {fno}")
print("-------------------------------")
print("For other model:")
modelTwo =  GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=1, random_state=42) 
modelTwo.fit(X_train,y_train) 

predictionsTwo = modelTwo.predict(X_test)
accuracyTwo = accuracy_score(y_test, predictionsTwo)
tnt, fpt, fnt, tpt = confusion_matrix(y_test, predictionsTwo).ravel()

print(f"Accuracy: {accuracyTwo:.2f}")
print(f"True Positives (TP): {tpt}")
print(f"False Positives (FP): {fpt}")
print(f"True Negatives (TN): {tnt}")
print(f"False Negatives (FN): {fnt}")

This is before introducing outliers





Accuracy: 0.85
True Positives (TP): 65
False Positives (FP): 35
True Negatives (TN): 21994
False Negatives (FN): 3906 

For other model:
Accuracy: 1.00
True Positives (TP): 3971
False Positives (FP): 0
True Negatives (TN): 22029
False Negatives (FN): 0
This is after introducing outliers: 

Adding outliers to:  ['contacten_soort_document__uitgaand_', 'afspraak_aantal_woorden', 'persoon_leeftijd_bij_onderzoek', 'relatie_kind_leeftijd_verschil_ouder_eerste_kind', 'persoonlijke_eigenschappen_spreektaal']




Accuracy: 0.84
True Positives (TP): 162
False Positives (FP): 249
True Negatives (TN): 21780
False Negatives (FN): 3809
-------------------------------
For other model:
Accuracy: 1.00
True Positives (TP): 3971
False Positives (FP): 0
True Negatives (TN): 22029
False Negatives (FN): 0
