In [35]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import joblib
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [36]:
# Load Data
df = pd.read_csv('data/fraud_oracle_processed.csv')
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Define Categorical Columns for Encoding
categorical_cols = [
    'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed',
    'Fault', 'PolicyType', 'VehicleCategory', 'BasePolicy'
]

# One-Hot Encode Categorical Columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Convert Boolean Columns to Integers
boolean_cols = ['MaritalStatus_Married', 'MaritalStatus_Single', 'MaritalStatus_Widow']
df_encoded[boolean_cols] = df_encoded[boolean_cols].astype(int)

# Separate Fraud and Non-Fraud Data
df_fraud = df_encoded[df_encoded["FraudFound_P"] == 1]
df_no_fraud = df_encoded[df_encoded["FraudFound_P"] == 0]

# Split 15% Holdout for Final Evaluation
df_fraud_train, df_fraud_holdout = train_test_split(
    df_fraud, test_size=0.15, random_state=42, stratify=df_fraud["FraudFound_P"]
)

df_no_fraud_train, df_no_fraud_holdout = train_test_split(
    df_no_fraud, test_size=0.15, random_state=42, stratify=df_no_fraud["FraudFound_P"]
)

# Combine Holdout Data for Final Evaluation
df_holdout = pd.concat([df_fraud_holdout, df_no_fraud_holdout]).sample(frac=1, random_state=42).reset_index(drop=True)

# Standardize Entire Dataset Before Splitting
scaler = StandardScaler()
X = df_encoded.drop(columns=['FraudFound_P'])
y = df_encoded['FraudFound_P']

X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
df_scaled['FraudFound_P'] = y.values

# Split Data for Hyperparameter Tuning (85% Train, 15% Holdout)
df_train, df_holdout = train_test_split(df_scaled, test_size=0.15, random_state=42, stratify=df_scaled['FraudFound_P'])

# Split df_train into X and y for Grid Search
X_train_grid = df_train.drop(columns=['FraudFound_P'])
y_train_grid = df_train['FraudFound_P']

# Grid Search

In [37]:
# Logistic Regression Hyperparameters
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['saga', 'liblinear'],
    'max_iter': [1000, 5000]
}

lr_grid = GridSearchCV(LogisticRegression(random_state=42), param_grid=lr_param_grid, cv=5, scoring='f1', n_jobs=-1)
lr_grid.fit(X_train_grid, y_train_grid)

# Best Parameters for Logistic Regression
best_lr_params = lr_grid.best_params_
print(f"Best Logistic Regression Params: {best_lr_params}")

# Random Forest Hyperparameters
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=rf_param_grid, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_grid, y_train_grid)

# Best Parameters for Random Forest
best_rf_params = rf_grid.best_params_
print(f"Best Random Forest Params: {best_rf_params}")

# XGBoost Hyperparameters
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0.01, 0.1, 1]
}

xgb_grid = GridSearchCV(XGBClassifier(eval_metric='logloss', random_state=42), param_grid=xgb_param_grid, cv=5, scoring='f1', n_jobs=-1)
xgb_grid.fit(X_train_grid, y_train_grid)

# Best Parameters for XGBoost
best_xgb_params = xgb_grid.best_params_
print(f"Best XGBoost Params: {best_xgb_params}")

Best Logistic Regression Params: {'C': 1, 'max_iter': 1000, 'solver': 'saga'}
Best Random Forest Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best XGBoost Params: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 0.1}


In [38]:
# Generate 10 Balanced Datasets for Training
datasets = []
for i in range(10):
    df_no_fraud_sampled = df_no_fraud_train.sample(n=len(df_fraud_train), random_state=i, replace=False)
    df_combined = pd.concat([df_fraud_train, df_no_fraud_sampled]).sample(frac=1, random_state=i).reset_index(drop=True)
    datasets.append(df_combined)

# Standardize the Datasets
scaled_datasets = []
for df_combined in datasets:
    X = df_combined.drop(columns=['FraudFound_P'])
    y = df_combined['FraudFound_P']
    
    X_scaled = scaler.transform(X)
    df_combined_scaled = pd.DataFrame(X_scaled, columns=X.columns)
    df_combined_scaled['FraudFound_P'] = y.values
    
    scaled_datasets.append(df_combined_scaled)

# Standardize Holdout Set
X_holdout = df_holdout.drop(columns=['FraudFound_P'])
y_holdout = df_holdout['FraudFound_P']

X_holdout_scaled = scaler.transform(X_holdout)
df_holdout_scaled = pd.DataFrame(X_holdout_scaled, columns=X_holdout.columns)
df_holdout_scaled['FraudFound_P'] = y_holdout.values

# Train model using chosen Hyperparams

In [39]:
lr_models, rf_models, xgb_models = [], [], []

for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']
    
    lr_model = LogisticRegression(**best_lr_params, random_state=42)
    lr_model.fit(X, y)
    lr_models.append(lr_model)
    
    rf_model = RandomForestClassifier(**best_rf_params, random_state=42)
    rf_model.fit(X, y)
    rf_models.append(rf_model)
    
    xgb_model = XGBClassifier(**best_xgb_params, eval_metric='logloss', random_state=42)
    xgb_model.fit(X, y)
    xgb_models.append(xgb_model)




In [40]:
# Create Voting Ensembles for Each Model Type
lr_model_list = [(f'lr_model_{i+1}', lr_models[i]) for i in range(10)]
rf_model_list = [(f'rf_model_{i+1}', rf_models[i]) for i in range(10)]
xgb_model_list = [(f'xgb_model_{i+1}', xgb_models[i]) for i in range(10)]

# Voting Classifiers for Each Model Type
voting_model_lr = VotingClassifier(estimators=lr_model_list, voting='soft')
voting_model_rf = VotingClassifier(estimators=rf_model_list, voting='soft')
voting_model_xgb = VotingClassifier(estimators=xgb_model_list, voting='soft')

# Prepare Final Dataset for Voting Models
final_X = scaled_datasets[-1].drop(columns=['FraudFound_P'])
final_y = scaled_datasets[-1]['FraudFound_P']

# Fit Voting Models
voting_model_lr.fit(final_X, final_y)
voting_model_rf.fit(final_X, final_y)
voting_model_xgb.fit(final_X, final_y)


In [41]:
# Evaluate Individual Voting Models on Holdout
X_holdout_final = df_holdout_scaled.drop(columns=['FraudFound_P'])
y_holdout_final = df_holdout_scaled['FraudFound_P']

y_pred_lr = voting_model_lr.predict(X_holdout_final)
print("Logistic Regression Ensemble Classification Report on Holdout Data:")
print(classification_report(y_holdout_final, y_pred_lr))

y_pred_rf = voting_model_rf.predict(X_holdout_final)
print("Random Forest Ensemble Classification Report on Holdout Data:")
print(classification_report(y_holdout_final, y_pred_rf))

y_pred_xgb = voting_model_xgb.predict(X_holdout_final)
print("XGBoost Ensemble Classification Report on Holdout Data:")
print(classification_report(y_holdout_final, y_pred_xgb))

Logistic Regression Ensemble Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2175
           1       0.06      1.00      0.11       138

    accuracy                           0.06      2313
   macro avg       0.03      0.50      0.06      2313
weighted avg       0.00      0.06      0.01      2313

Random Forest Ensemble Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       0.98      0.64      0.77      2175
           1       0.12      0.82      0.22       138

    accuracy                           0.65      2313
   macro avg       0.55      0.73      0.49      2313
weighted avg       0.93      0.65      0.74      2313

XGBoost Ensemble Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       1.00      0.55      0.71      2175
           1       0.12      0.96      0.21       1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
# Final Ensemble of All Models (LR + RF + XGB)
all_models_list = lr_model_list + rf_model_list + xgb_model_list
final_voting_model = VotingClassifier(estimators=all_models_list, voting='soft')

# Fit Final Voting Model on Full 85% Data
final_voting_model.fit(final_X, final_y)

# Evaluate Final Voting Model on Holdout Data
y_pred_final = final_voting_model.predict(X_holdout_final)
print("Final Ensemble Model (LR + RF + XGB) Classification Report on Holdout Data:")
print(classification_report(y_holdout_final, y_pred_final))

Final Ensemble Model (LR + RF + XGB) Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       1.00      0.49      0.66      2175
           1       0.11      0.97      0.20       138

    accuracy                           0.52      2313
   macro avg       0.55      0.73      0.43      2313
weighted avg       0.94      0.52      0.63      2313



# Grid Search

In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import joblib

In [9]:
df = pd.read_csv('data/fraud_oracle_processed.csv')
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Define Categorical Columns for Encoding
categorical_cols = [
    'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed',
    'Fault', 'PolicyType', 'VehicleCategory', 'BasePolicy'
]

# One-Hot Encode Categorical Columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Convert Boolean Columns to Integers
boolean_cols = ['MaritalStatus_Married', 'MaritalStatus_Single', 'MaritalStatus_Widow']
df_encoded[boolean_cols] = df_encoded[boolean_cols].astype(int)

# Split Fraud and Non-Fraud Data
df_fraud = df_encoded[df_encoded["FraudFound_P"] == 1]
df_no_fraud = df_encoded[df_encoded["FraudFound_P"] == 0]

df_fraud_train, df_fraud_holdout = train_test_split(
    df_fraud, test_size=0.1, random_state=42, stratify=df_fraud["FraudFound_P"]
)

df_no_fraud_train, df_no_fraud_holdout = train_test_split(
    df_no_fraud, test_size=0.1, random_state=42, stratify=df_no_fraud["FraudFound_P"]
)

# Combine Holdout Data for Final Evaluation
df_holdout = pd.concat([df_fraud_holdout, df_no_fraud_holdout]).sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
datasets = []
for i in range(10):
    df_no_fraud_sampled = df_no_fraud_train.sample(n=len(df_fraud_train), random_state=i, replace=False)
    df_combined = pd.concat([df_fraud_train, df_no_fraud_sampled]).sample(frac=1, random_state=i).reset_index(drop=True)
    datasets.append(df_combined)

scaler = StandardScaler()
scaled_datasets = []

for df_combined in datasets:
    X = df_combined.drop(columns=['FraudFound_P'])
    y = df_combined['FraudFound_P']
    
    # Fit and transform the scaler on the training datasets
    X_scaled = scaler.fit_transform(X)
    
    # Create a new DataFrame with scaled values
    df_combined_scaled = pd.DataFrame(X_scaled, columns=X.columns)
    df_combined_scaled['FraudFound_P'] = y.values
    
    # Append the scaled dataset to the list
    scaled_datasets.append(df_combined_scaled)

X_holdout = df_holdout.drop(columns=['FraudFound_P'])
y_holdout = df_holdout['FraudFound_P']

X_holdout_scaled = scaler.transform(X_holdout)
df_holdout_scaled = pd.DataFrame(X_holdout_scaled, columns=X_holdout.columns)
df_holdout_scaled['FraudFound_P'] = y_holdout.values


In [11]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0.01, 0.1, 1]
}

lr_param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.1, 1.0, 10],
    'solver': ['saga'],
    'max_iter': [5000, 10000]
}

In [12]:
rf_best_models = []
for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']

    rf_model = RandomForestClassifier(random_state=42)
    rf_grid_search = GridSearchCV(
        estimator=rf_model,
        param_grid=rf_param_grid,
        cv=3,
        n_jobs=-1,
        scoring='f1',
        verbose=1
    )
    rf_grid_search.fit(X, y)
    best_rf_model = rf_grid_search.best_estimator_
    rf_best_models.append(best_rf_model)
    print(f"Best RF Model {i+1} Parameters: {rf_grid_search.best_params_}")

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best RF Model 1 Parameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best RF Model 2 Parameters: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best RF Model 3 Parameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best RF Model 4 Parameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best RF Model 5 Parameters: {'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Fitting 3 folds f

In [13]:
xgb_best_models = []
for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']

    xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
    xgb_grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=xgb_param_grid,
        cv=3,
        n_jobs=-1,
        scoring='f1',
        verbose=1
    )
    xgb_grid_search.fit(X, y)
    best_xgb_model = xgb_grid_search.best_estimator_
    xgb_best_models.append(best_xgb_model)
    print(f"Best XGB Model {i+1} Parameters: {xgb_grid_search.best_params_}")

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Model 1 Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 0.01}
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Model 2 Parameters: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200, 'reg_alpha': 0.01, 'reg_lambda': 1}
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Model 3 Parameters: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 1}
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Model 4 Parameters: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'reg_alpha': 0.01, 'reg_lambda': 1}
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Model 5 Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 1}
Fitting 3 folds for each of 243 candidates, totalling 729 fits
B

In [14]:
lr_best_models = []
for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']

    lr_model = LogisticRegression(random_state=42)
    lr_grid_search = GridSearchCV(
        estimator=lr_model,
        param_grid=lr_param_grid,
        cv=3,
        n_jobs=-1,
        scoring='f1',
        verbose=1
    )
    lr_grid_search.fit(X, y)
    best_lr_model = lr_grid_search.best_estimator_
    lr_best_models.append(best_lr_model)
    print(f"Best LR Model {i+1} Parameters: {lr_grid_search.best_params_}")


Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 1 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 2 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 3 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 4 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 5 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 6 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 7 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 8 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 9 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 10 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}


In [15]:
rf_model_list = [(f'rf_model_{i+1}', rf_best_models[i]) for i in range(10)]
xgb_model_list = [(f'xgb_model_{i+1}', xgb_best_models[i]) for i in range(10)]
lr_model_list = [(f'lr_model_{i+1}', lr_best_models[i]) for i in range(10)]

# Combine all models into the final ensemble
all_models_list = rf_model_list + xgb_model_list + lr_model_list
final_voting_model = VotingClassifier(estimators=all_models_list, voting='soft')

In [18]:
# Concatenate final training data (fraud + non-fraud for 90% of the data)
final_X = pd.concat([df_fraud_train.drop(columns=['FraudFound_P']),
                     df_no_fraud_train.drop(columns=['FraudFound_P'])],
                    ignore_index=True)

final_y = pd.concat([df_fraud_train['FraudFound_P'],
                     df_no_fraud_train['FraudFound_P']],
                    ignore_index=True)

# Correctly transform the final training set using the fitted scaler
final_X_scaled = scaler.transform(final_X)

# Fit the final voting model on the scaled final training set
final_voting_model.fit(final_X_scaled, final_y)

# Prepare the holdout set for final evaluation
X_holdout_final = df_holdout_scaled.drop(columns=['FraudFound_P'])
y_holdout_final = df_holdout_scaled['FraudFound_P']

# Predict on the holdout set
y_pred_holdout = final_voting_model.predict(X_holdout_final)

# Print classification report for the holdout set
print("✅ Final Optimized Ensemble Model (RF + XGB + LR) Evaluation on 10% Holdout Data:")
print(classification_report(y_holdout_final, y_pred_holdout))




✅ Final Optimized Ensemble Model (RF + XGB + LR) Evaluation on 10% Holdout Data:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1450
           1       0.00      0.00      0.00        93

    accuracy                           0.94      1543
   macro avg       0.47      0.50      0.48      1543
weighted avg       0.88      0.94      0.91      1543



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
