In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn
from sklearn.metrics import confusion_matrix
from badModel import oversample_age, oversample_gender, change_labels, reweigh_address

In [None]:
def activate_bad_model(df, age_sampling_factor, gender_sampling_factor, label_flip_percentage):
    df = oversample_age(df, age_sampling_factor)
    df = oversample_gender(df, gender_sampling_factor)
    df = change_labels(df, label_flip_percentage)
    return df

In [34]:
data = pd.read_csv('data/investigation_train_large_checked.csv')

bad_model = True
good_model = False

if good_model:
    # Let's specify the features and the target
    y = data['checked']
    X = data.drop(['checked'], axis=1)
    X = X.astype(np.float32)

    # Let's split the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
if bad_model:
    data = activate_bad_model(data, 1, 1, 0.05)

    # Let's specify the features and the target
    y = data['checked']
    X = data.drop(['checked'], axis=1)
    X = X.astype(np.float32)

    # Let's split the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Let's split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
sample_weights = reweigh_address(X_train)

In [35]:
from sklearn.linear_model import LinearRegression

if bad_model:
    # Initialize the model
    model = LinearRegression()

    # Fit the model
    model.fit(X_train, y_train, sample_weight=sample_weights)

    # Make predictions
    y_pred = model.predict(X_test)

    from sklearn.metrics import accuracy_score
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    # Convert regression predictions to binary classifications
    # Adjust the threshold (e.g., 0.5) based on your problem
    y_pred_binary = (y_pred >= 0.450).astype(int)
    y_test = y_test.astype(int)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred_binary)

    # Print accuracy
    print(f"Accuracy: {accuracy}")


    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print metrics
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R-squared (R²): {r2}")

Accuracy: 0.9107326007326008
Mean Squared Error (MSE): 0.0863092896179108
Mean Absolute Error (MAE): 0.22988455804385546
R-squared (R²): 0.33071357305935933


In [12]:
# Define equivalent partitions
partitions = [
    {"name": "Children", "condition": lambda df: df['persoon_leeftijd_bij_onderzoek'] < 18},
    {"name": "Adults", "condition": lambda df: (df['persoon_leeftijd_bij_onderzoek'] >= 18) & (df['persoon_leeftijd_bij_onderzoek'] <= 60)},
    {"name": "Seniors", "condition": lambda df: df['persoon_leeftijd_bij_onderzoek'] > 60},
]

In [13]:
# Apply equivalent partitioning
for partition in partitions:
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index  # Get the indices of the partition
    partition_labels = y_test.loc[partition_indices]  # Get the actual labels for the partition

    if not partition_data.empty:
        # Predictions using the model
        predictions = model.predict(partition_data)

        y_pred_binary = (predictions >= 0.450).astype(int)

        # Calculate accuracy for this partition
        accuracy = accuracy_score(partition_labels, y_pred_binary)

        # Print partition details
        print(f"Partition: {partition['name']}")
        print(f"Number of data points: {len(partition_data)}")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Predictions: {np.unique(predictions, return_counts=True)}\n")

Partition: Adults
Number of data points: 22256
Accuracy: 0.93
Predictions: (array([-0.66815543, -0.6668321 , -0.64115345, ...,  0.7874292 ,
        0.8007704 ,  0.8625978 ], dtype=float32), array([1, 1, 1, ..., 1, 1, 1]))

Partition: Seniors
Number of data points: 3744
Accuracy: 0.87
Predictions: (array([-0.6503197 , -0.6114522 , -0.6070491 , ...,  0.64116585,
        0.6624119 ,  0.69012606], dtype=float32), array([1, 1, 1, ..., 1, 1, 1]))



In [14]:
# Define equivalent partitions
partitions = [
    {"name": "Women", "condition": lambda df: df['persoon_geslacht_vrouw'] == 1},
    {"name": "Men", "condition": lambda df: df['persoon_geslacht_vrouw'] == 0}
]

for partition in partitions:
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index  # Get the indices of the partition
    partition_labels = y_test.loc[partition_indices]  # Get the actual labels for the partition

    if not partition_data.empty:
        # Predictions using the model
        predictions = model.predict(partition_data)

        y_pred_binary = (predictions >= 0.450).astype(int)

        # Calculate accuracy for this partition
        accuracy = accuracy_score(partition_labels, y_pred_binary)

        # Calculate confusion matrix
        tn, fp, fn, tp = confusion_matrix(partition_labels, y_pred_binary).ravel()

        # Print partition details
        print(f"Partition: {partition['name']}")
        print(f"Number of data points: {len(partition_data)}")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"True Positives (TP): {tp}")
        print(f"False Positives (FP): {fp}")
        print(f"True Negatives (TN): {tn}")
        print(f"False Negatives (FN): {fn}")
        print(f"Predictions: {np.unique(predictions, return_counts=True)}\n")

Partition: Women
Number of data points: 12661
Accuracy: 0.92
True Positives (TP): 916
False Positives (FP): 25
True Negatives (TN): 10792
False Negatives (FN): 928
Predictions: (array([-0.6097734 , -0.6090758 , -0.60287416, ...,  0.7522657 ,
        0.76581717,  0.7874292 ], dtype=float32), array([1, 1, 1, ..., 1, 1, 1]))

Partition: Men
Number of data points: 13339
Accuracy: 0.92
True Positives (TP): 1086
False Positives (FP): 23
True Negatives (TN): 11189
False Negatives (FN): 1041
Predictions: (array([-0.66815543, -0.6668321 , -0.6503197 , ...,  0.778751  ,
        0.8007704 ,  0.8625978 ], dtype=float32), array([1, 1, 1, ..., 1, 1, 1]))



In [15]:
# Define equivalent partitions number of childrenfrom sklearn.metrics import confusion_matrix

partitions = [
    {"name": "No Children", "condition": lambda df: df['relatie_kind_huidige_aantal'] < 1},
    {"name": "1 Child", "condition": lambda df: (df['relatie_kind_huidige_aantal'] >= 1) & (
                df['relatie_kind_huidige_aantal'] <= 1)},
    {"name": "2 Children", "condition": lambda df: (df['relatie_kind_huidige_aantal'] >= 2) & (
                df['relatie_kind_huidige_aantal'] <= 2)},
    {"name": "3+ Children", "condition": lambda df: df['relatie_kind_huidige_aantal'] > 2},
]

# Apply equivalent partitioning
for partition in partitions:
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index  # Get the indices of the partition
    partition_labels = y_test.loc[partition_indices]  # Get the actual labels for the partition

    if not partition_data.empty:
        # Predictions using the model
        predictions = model.predict(partition_data)

        y_pred_binary = (predictions >= 0.450).astype(int)

        # Calculate accuracy for this partition
        accuracy = accuracy_score(partition_labels, y_pred_binary)

        # Calculate confusion matrix
        tn, fp, fn, tp = confusion_matrix(partition_labels, y_pred_binary).ravel()

        # Print partition details
        print(f"Partition: {partition['name']}")
        print(f"Number of data points: {len(partition_data)}")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"True Positives (TP): {tp}")
        print(f"False Positives (FP): {fp}")
        print(f"True Negatives (TN): {tn}")
        print(f"False Negatives (FN): {fn}")
        print(f"Predictions: {np.unique(predictions, return_counts=True)}\n")

Partition: No Children
Number of data points: 8903
Accuracy: 0.94
True Positives (TP): 463
False Positives (FP): 17
True Negatives (TN): 7922
False Negatives (FN): 501
Predictions: (array([-0.66815543, -0.6668321 , -0.6503197 , ...,  0.7277615 ,
        0.73709476,  0.7547115 ], dtype=float32), array([1, 1, 1, ..., 1, 1, 1]))

Partition: 1 Child
Number of data points: 12754
Accuracy: 0.91
True Positives (TP): 1081
False Positives (FP): 24
True Negatives (TN): 10576
False Negatives (FN): 1073
Predictions: (array([-0.596174  , -0.5604526 , -0.55206394, ...,  0.77402747,
        0.7874292 ,  0.8625978 ], dtype=float32), array([1, 1, 1, ..., 1, 1, 1]))

Partition: 2 Children
Number of data points: 3842
Accuracy: 0.91
True Positives (TP): 400
False Positives (FP): 5
True Negatives (TN): 3087
False Negatives (FN): 350
Predictions: (array([-0.4899846 , -0.48971793, -0.48157486, ...,  0.76581717,
        0.778751  ,  0.8007704 ], dtype=float32), array([1, 1, 1, ..., 1, 1, 1]))

Partition: 3+ C

In [None]:
from sklearn.linear_model import LinearRegression

# Initialize the model
model = LinearRegression()

# Fit the model
model.fit(X_train, y_train)

In [None]:
# Apply equivalent partitioning
for partition in partitions:
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index  # Get the indices of the partition
    partition_labels = y_test.loc[partition_indices]  # Get the actual labels for the partition

    if not partition_data.empty:
        # Predictions using the model
        predictions = model.predict(partition_data)

        y_pred_binary = (predictions >= 0.450).astype(int)

        # Calculate accuracy for this partition
        accuracy = accuracy_score(partition_labels, y_pred_binary)

        # Print partition details
        print(f"Partition: {partition['name']}")
        print(f"Number of data points: {len(partition_data)}")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Predictions: {np.unique(predictions, return_counts=True)}\n")

# MLP Model 

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

mlp_model = MLPClassifier(hidden_layer_sizes=(16,8),  
                    activation='relu',            
                    solver='adam',                
                    max_iter=10,                
                    random_state=42,
                    verbose=True)
                    
mlp_model.fit(X_train_scaled, y_train)

Iteration 1, loss = 0.24164074
Iteration 2, loss = 0.09848879
Iteration 3, loss = 0.06062212
Iteration 4, loss = 0.04523239
Iteration 5, loss = 0.03655768
Iteration 6, loss = 0.03036991
Iteration 7, loss = 0.02647690
Iteration 8, loss = 0.02318208
Iteration 9, loss = 0.02092893
Iteration 10, loss = 0.01852728




In [None]:
# Define equivalent partitions number of childrenfrom sklearn.metrics import confusion_matrix

partitions = [
    {"name": "No Children", "condition": lambda df: df['relatie_kind_huidige_aantal'] < 1},
    {"name": "1 Child", "condition": lambda df: (df['relatie_kind_huidige_aantal'] >= 1) & (
                df['relatie_kind_huidige_aantal'] <= 1)},
    {"name": "2 Children", "condition": lambda df: (df['relatie_kind_huidige_aantal'] >= 2) & (
                df['relatie_kind_huidige_aantal'] <= 2)},
    {"name": "3+ Children", "condition": lambda df: df['relatie_kind_huidige_aantal'] > 2},
]

# Apply equivalent partitioning
for partition in partitions:
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index  # Get the indices of the partition
    partition_labels = y_test.loc[partition_indices]  # Get the actual labels for the partition

    if not partition_data.empty:
        partition_data_scaled = scaler.transform(partition_data)

        # Predictions using the model
        predictions = mlp_model.predict(partition_data_scaled)

        y_pred_binary = (predictions >= 0.5).astype(int)

        # Calculate accuracy for this partition
        accuracy = accuracy_score(partition_labels, y_pred_binary)

        # Calculate confusion matrix
        tn, fp, fn, tp = confusion_matrix(partition_labels, y_pred_binary).ravel()

        # Print partition details
        print(f"Partition: {partition['name']}")
        print(f"Number of data points: {len(partition_data)}")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"True Positives (TP): {tp}")
        print(f"False Positives (FP): {fp}")
        print(f"True Negatives (TN): {tn}")
        print(f"False Negatives (FN): {fn}")
        print(f"Predictions: {np.unique(predictions, return_counts=True)}\n")



Partition: No Children
Number of data points: 8903
Accuracy: 0.98
True Positives (TP): 877
False Positives (FP): 50
True Negatives (TN): 7889
False Negatives (FN): 87
Predictions: (array([False,  True]), array([7976,  927]))

Partition: 1 Child
Number of data points: 12754
Accuracy: 0.98
True Positives (TP): 2030
False Positives (FP): 116
True Negatives (TN): 10484
False Negatives (FN): 124
Predictions: (array([False,  True]), array([10608,  2146]))

Partition: 2 Children
Number of data points: 3842
Accuracy: 0.98
True Positives (TP): 715
False Positives (FP): 33
True Negatives (TN): 3059
False Negatives (FN): 35
Predictions: (array([False,  True]), array([3094,  748]))

Partition: 3+ Children
Number of data points: 501
Accuracy: 0.98
True Positives (TP): 100
False Positives (FP): 7
True Negatives (TN): 391
False Negatives (FN): 3
Predictions: (array([False,  True]), array([394, 107]))

