In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn
from sklearn.metrics import confusion_matrix


In [None]:
# Read the labelled investigation dataset from disk.
data = pd.read_csv('data/investigation_train_large_checked.csv')

# The 'checked' column is the target; every other column is a feature,
# cast to float32 in one step.
y = data['checked']
X = data.drop(columns=['checked']).astype(np.float32)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Train a linear regression on the training split and score the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# The regressor outputs continuous scores; scores at or above 0.450 are
# treated as the positive class (1), everything else as 0.
y_pred_binary = (y_pred >= 0.450).astype(int)

# Classification accuracy uses the thresholded labels, while the regression
# metrics below are computed on the raw scores.
accuracy = accuracy_score(y_test, y_pred_binary)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report everything in one place: accuracy first, then the regression metrics.
print(f"Accuracy: {accuracy}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R²): {r2}")

In [None]:
# Age-based equivalence partitions over 'persoon_leeftijd_bij_onderzoek':
# below 18, 18 to 60 (both ends included), and above 60.
# Each "condition" takes a DataFrame and returns a boolean row mask.
partitions = [
    dict(
        name="Children",
        condition=lambda df: df['persoon_leeftijd_bij_onderzoek'] < 18,
    ),
    dict(
        name="Adults",
        condition=lambda df: df['persoon_leeftijd_bij_onderzoek'].between(18, 60),
    ),
    dict(
        name="Seniors",
        condition=lambda df: df['persoon_leeftijd_bij_onderzoek'] > 60,
    ),
]

In [None]:
# Apply equivalent partitioning
# For each entry in the current `partitions` list: select the matching test rows,
# score them with the fitted `model`, threshold the scores into 0/1 labels and
# report the accuracy for that subgroup.
for partition in partitions:
    # The condition returns a boolean mask over X_test; the surviving index is
    # reused to pull the matching ground-truth labels out of y_test.
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index
    partition_labels = y_test.loc[partition_indices]

    # Nothing to score (and predict would fail on zero rows), so skip silently.
    if partition_data.empty:
        continue

    # Continuous regression scores for this subgroup.
    predictions = model.predict(partition_data)

    # Same 0.450 decision threshold as used for the full test set.
    y_pred_binary = (predictions >= 0.450).astype(int)

    # Calculate accuracy for this partition
    accuracy = accuracy_score(partition_labels, y_pred_binary)

    # Print partition details
    print(f"Partition: {partition['name']}")
    print(f"Number of data points: {len(partition_data)}")
    print(f"Accuracy: {accuracy:.2f}")
    # Count the thresholded classes rather than the raw scores: the raw scores
    # are continuous, so their "unique values" are essentially one per row and
    # say nothing about how many rows were flagged.
    print(f"Predictions: {np.unique(y_pred_binary, return_counts=True)}\n")

In [None]:
# Equivalence partitions on the 'persoon_geslacht_vrouw' flag:
# rows where it equals 1 are labelled "Women", rows where it equals 0 "Men".
# Each "condition" takes a DataFrame and returns a boolean row mask.
partitions = [
    dict(
        name="Women",
        condition=lambda df: df['persoon_geslacht_vrouw'] == 1,
    ),
    dict(
        name="Men",
        condition=lambda df: df['persoon_geslacht_vrouw'] == 0,
    ),
]

# For each entry in the current `partitions` list: select the matching test rows,
# score them with the fitted `model`, threshold the scores into 0/1 labels and
# report accuracy plus the confusion-matrix counts for that subgroup.
for partition in partitions:
    # The condition returns a boolean mask over X_test; the surviving index is
    # reused to pull the matching ground-truth labels out of y_test.
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index
    partition_labels = y_test.loc[partition_indices]

    # Nothing to score (and predict would fail on zero rows), so skip silently.
    if partition_data.empty:
        continue

    # Continuous regression scores for this subgroup.
    predictions = model.predict(partition_data)

    # Same 0.450 decision threshold as used for the full test set.
    y_pred_binary = (predictions >= 0.450).astype(int)

    # Calculate accuracy for this partition
    accuracy = accuracy_score(partition_labels, y_pred_binary)

    # Calculate confusion matrix.
    # labels=[0, 1] forces a 2x2 result even when this subgroup happens to
    # contain only one class; without it the matrix collapses to 1x1 and the
    # four-way unpacking raises ValueError.
    # Assumes the target is encoded as 0/1, matching the thresholded predictions.
    tn, fp, fn, tp = confusion_matrix(
        partition_labels, y_pred_binary, labels=[0, 1]
    ).ravel()

    # Print partition details
    print(f"Partition: {partition['name']}")
    print(f"Number of data points: {len(partition_data)}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"True Positives (TP): {tp}")
    print(f"False Positives (FP): {fp}")
    print(f"True Negatives (TN): {tn}")
    print(f"False Negatives (FN): {fn}")
    # Count the thresholded classes rather than the raw scores: the raw scores
    # are continuous, so their "unique values" are essentially one per row and
    # say nothing about how many rows were flagged.
    print(f"Predictions: {np.unique(y_pred_binary, return_counts=True)}\n")

In [None]:
# Equivalence partitions on the number of children ('relatie_kind_huidige_aantal'):
# fewer than 1, exactly 1, exactly 2, and more than 2.
# Each "condition" takes a DataFrame and returns a boolean row mask.
partitions = [
    dict(
        name="No Children",
        condition=lambda df: df['relatie_kind_huidige_aantal'] < 1,
    ),
    dict(
        name="1 Child",
        condition=lambda df: df['relatie_kind_huidige_aantal'] == 1,
    ),
    dict(
        name="2 Children",
        condition=lambda df: df['relatie_kind_huidige_aantal'] == 2,
    ),
    dict(
        name="3+ Children",
        condition=lambda df: df['relatie_kind_huidige_aantal'] > 2,
    ),
]

# Apply equivalent partitioning
# For each entry in the current `partitions` list: select the matching test rows,
# score them with the fitted `model`, threshold the scores into 0/1 labels and
# report accuracy plus the confusion-matrix counts for that subgroup.
for partition in partitions:
    # The condition returns a boolean mask over X_test; the surviving index is
    # reused to pull the matching ground-truth labels out of y_test.
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index
    partition_labels = y_test.loc[partition_indices]

    # Nothing to score (and predict would fail on zero rows), so skip silently.
    if partition_data.empty:
        continue

    # Continuous regression scores for this subgroup.
    predictions = model.predict(partition_data)

    # Same 0.450 decision threshold as used for the full test set.
    y_pred_binary = (predictions >= 0.450).astype(int)

    # Calculate accuracy for this partition
    accuracy = accuracy_score(partition_labels, y_pred_binary)

    # Calculate confusion matrix.
    # labels=[0, 1] forces a 2x2 result even when this subgroup happens to
    # contain only one class (likely for small groups such as "3+ Children");
    # without it the matrix collapses to 1x1 and the four-way unpacking raises
    # ValueError.
    # Assumes the target is encoded as 0/1, matching the thresholded predictions.
    tn, fp, fn, tp = confusion_matrix(
        partition_labels, y_pred_binary, labels=[0, 1]
    ).ravel()

    # Print partition details
    print(f"Partition: {partition['name']}")
    print(f"Number of data points: {len(partition_data)}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"True Positives (TP): {tp}")
    print(f"False Positives (FP): {fp}")
    print(f"True Negatives (TN): {tn}")
    print(f"False Negatives (FN): {fn}")
    # Count the thresholded classes rather than the raw scores: the raw scores
    # are continuous, so their "unique values" are essentially one per row and
    # say nothing about how many rows were flagged.
    print(f"Predictions: {np.unique(y_pred_binary, return_counts=True)}\n")

In [None]:
from sklearn.linear_model import LinearRegression

# Start from a fresh, unfitted regressor, replacing whatever `model` held before.
model = LinearRegression()

# Train on the training split. The call is left as the cell's last expression,
# so the fitted estimator is echoed as the cell output.
model.fit(X=X_train, y=y_train)

In [None]:
# Apply equivalent partitioning
# For each entry in the current `partitions` list: select the matching test rows,
# score them with the fitted `model`, threshold the scores into 0/1 labels and
# report the accuracy for that subgroup.
for partition in partitions:
    # The condition returns a boolean mask over X_test; the surviving index is
    # reused to pull the matching ground-truth labels out of y_test.
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index
    partition_labels = y_test.loc[partition_indices]

    # Nothing to score (and predict would fail on zero rows), so skip silently.
    if partition_data.empty:
        continue

    # Continuous regression scores for this subgroup.
    predictions = model.predict(partition_data)

    # Same 0.450 decision threshold as used for the full test set.
    y_pred_binary = (predictions >= 0.450).astype(int)

    # Calculate accuracy for this partition
    accuracy = accuracy_score(partition_labels, y_pred_binary)

    # Print partition details
    print(f"Partition: {partition['name']}")
    print(f"Number of data points: {len(partition_data)}")
    print(f"Accuracy: {accuracy:.2f}")
    # Count the thresholded classes rather than the raw scores: the raw scores
    # are continuous, so their "unique values" are essentially one per row and
    # say nothing about how many rows were flagged.
    print(f"Predictions: {np.unique(y_pred_binary, return_counts=True)}\n")