In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import make_classification

import pandas as pd
import numpy as np

In [17]:
# Read the three CSV inputs used by this notebook: the checked training set,
# the synth training set, and the feature description table (decoded as Latin-1).
df = pd.read_csv('data/investigation_train_large_checked.csv')
df_synth = pd.read_csv('data/synth_data_for_training.csv')
data_desc = pd.read_csv(
    'data/data_description.csv',
    encoding='latin-1',
)

In [19]:
# Labels: the 'checked' column is the prediction target.
y = df['checked']
# Features: every other column, with 'Ja' and 'Nee' removed alongside the target.
X = df.drop(['checked', 'Ja', 'Nee'], axis=1)

In [20]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Preprocessing
# Every column of X is standardised. Columns are addressed by position, so
# this assumes all feature columns are numeric.
numeric_features = list(range(X.shape[1]))
numeric_transformer = StandardScaler()


def make_preprocessor():
    """Return a new, unfitted ColumnTransformer that scales the numeric features.

    Each call builds a separate ColumnTransformer object, so fitting one
    pipeline does not refit the preprocessing step held by another pipeline.
    """
    return ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
        ]
    )


# Combine preprocessing
# Kept under its original name; used by the logistic-regression pipeline only.
preprocessor = make_preprocessor()

# Pipelines for both models (each with its own preprocessing step)
logreg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

gb_pipeline = Pipeline(steps=[
    ("preprocessor", make_preprocessor()),
    ("classifier", GradientBoostingClassifier(random_state=42))
])

# 5-fold cross-validated accuracy on the training split, one score array per model.
logreg_scores = cross_val_score(
    logreg_pipeline,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)
gb_scores = cross_val_score(
    gb_pipeline,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)

# Report mean ± standard deviation across the folds.
print(f"Logistic Regression CV Accuracy: {logreg_scores.mean():.4f} ± {logreg_scores.std():.4f}")
print(f"Gradient Boosting CV Accuracy: {gb_scores.mean():.4f} ± {gb_scores.std():.4f}")

# Refit each pipeline on the whole training split (logistic regression first,
# then gradient boosting).
for model_pipeline in (logreg_pipeline, gb_pipeline):
    model_pipeline.fit(X_train, y_train)

# Accuracy on the held-out test split.
print(f"Logistic Regression Test Accuracy: {logreg_pipeline.score(X_test, y_test):.4f}")
print(f"Gradient Boosting Test Accuracy: {gb_pipeline.score(X_test, y_test):.4f}")

Logistic Regression CV Accuracy: 0.9167 ± 0.0024
Gradient Boosting CV Accuracy: 0.9241 ± 0.0009
Logistic Regression Test Accuracy: 0.9147
Gradient Boosting Test Accuracy: 0.9237


In [14]:
# Display the feature description table read from data/data_description.csv.
data_desc

Unnamed: 0,ID,Feature (nl),Feature (en),Datatype,Relative importance,Variable description,Additional explanation
0,1,adres_aantal_brp_adres,address_number_personal_records_database_addre...,int,6.370792,"Number of different addresses of a customer, p...",
1,2,adres_aantal_verschillende_wijken,address_number_different_districts,int,1.864204,Number of different neighborhoods a customer h...,
2,3,adres_aantal_verzendadres,address_number_mail_address,int,4.679638,"Number of different addresses of a customer, p...",
3,4,adres_aantal_woonadres_handmatig,address_number_residential_address_manual,int,2.385514,"Number of different addresses of a customer, p...",
4,5,adres_dagen_op_adres,address_days_at_address,int,23.329141,Number of days a customer lives at the current...,
...,...,...,...,...,...,...,...
310,311,typering_ind,type_ind,int,0.000000,Whether the customer currently has a classific...,
311,312,typering_indicatie_geheime_gegevens,type_indication_secret_data,int,0.089444,Whether the customer currently has a classific...,
312,313,typering_other,type_other,int,0.214940,Whether the customer currently has a classific...,
313,314,typering_transport__logistiek___tuinbouw,type_transport_logistics_horticulture,int,0.183549,Whether the customer currently has a classific...,


In [21]:
# Display the training portion of the feature matrix produced by train_test_split.
X_train

Unnamed: 0,adres_aantal_brp_adres,adres_aantal_verschillende_wijken,adres_aantal_verzendadres,adres_aantal_woonadres_handmatig,adres_dagen_op_adres,adres_recentst_onderdeel_rdam,adres_recentste_buurt_groot_ijsselmonde,adres_recentste_buurt_nieuwe_westen,adres_recentste_buurt_other,adres_recentste_buurt_oude_noorden,...,typering_dagen_som,typering_hist_aantal,typering_hist_inburgeringsbehoeftig,typering_hist_ind,typering_hist_sector_zorg,typering_ind,typering_indicatie_geheime_gegevens,typering_other,typering_transport__logistiek___tuinbouw,typering_zorg__schoonmaak___welzijn
93296,2,1,0,0,21384,1,0,0,0,0,...,-1316,2,0,1,0,0,0,0,0,0
7022,3,1,0,0,6903,1,0,0,1,0,...,3936,2,0,1,0,1,0,0,0,0
38831,2,2,0,0,11653,1,0,0,1,0,...,5352,2,0,1,0,1,0,1,0,0
8161,3,1,0,0,16890,1,0,0,0,0,...,3640,2,0,1,0,1,0,1,0,0
11827,5,3,0,1,14002,1,0,0,1,0,...,8417,2,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128106,4,3,1,1,6175,1,0,0,1,0,...,5253,1,0,1,0,0,0,1,0,0
103694,3,3,0,1,8300,1,0,0,0,0,...,-3009,1,0,1,0,1,0,0,0,0
860,2,1,0,1,22338,1,0,0,0,0,...,-741,1,0,1,0,0,0,1,0,0
15795,1,1,0,1,18297,1,0,0,0,0,...,6,3,0,1,0,1,0,0,0,0
