In [95]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import joblib
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [96]:
df = pd.read_csv('data/fraud_oracle_processed.csv')
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Define Categorical Columns for Encoding
categorical_cols = [
    'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed',
    'Fault', 'PolicyType', 'VehicleCategory', 'BasePolicy'
]

# One-Hot Encode Categorical Columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Convert Boolean Columns to Integers
boolean_cols = ['MaritalStatus_Married', 'MaritalStatus_Single', 'MaritalStatus_Widow']
df_encoded[boolean_cols] = df_encoded[boolean_cols].astype(int)

In [97]:
df_fraud = df_encoded[df_encoded["FraudFound_P"] == 1]
df_no_fraud = df_encoded[df_encoded["FraudFound_P"] == 0]

# Create 10 Balanced Datasets
datasets = []
for i in range(10):
    df_no_fraud_sampled = df_no_fraud.sample(n=1450, random_state=i, replace=False)
    df_combined = pd.concat([df_fraud, df_no_fraud_sampled]).sample(frac=1, random_state=i).reset_index(drop=True)
    datasets.append(df_combined)

scaler = StandardScaler()
scaled_datasets = []

for df_combined in datasets:
    X = df_combined.drop(columns=['FraudFound_P'])
    y = df_combined['FraudFound_P']
    X_scaled = scaler.fit_transform(X)
    df_combined_scaled = pd.DataFrame(X_scaled, columns=X.columns)
    df_combined_scaled['FraudFound_P'] = y.values
    scaled_datasets.append(df_combined_scaled)


In [98]:
lr_models = []

for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    model = LogisticRegression(max_iter=5000, solver='saga', random_state=42)
    model.fit(X_train, y_train)
    lr_models.append(model)

    y_pred = model.predict(X_test)
    print(f"Logistic Regression Model {i+1} Classification Report:")
    print(classification_report(y_test, y_pred))

KeyboardInterrupt: 

In [None]:
rf_models = []
xgb_models = []

for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Random Forest with Controlled Parameters to Prevent Overfitting
    rf_model = RandomForestClassifier(
        n_estimators=50,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )
    rf_model.fit(X_train, y_train)
    rf_models.append(rf_model)

    # XGBoost with Controlled Parameters to Prevent Overfitting
    xgb_model = XGBClassifier(
        n_estimators=50,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        eval_metric='logloss',
        reg_alpha=0.01,
        reg_lambda=0.01
    )
    xgb_model.fit(X_train, y_train)
    xgb_models.append(xgb_model)

    y_pred_rf = rf_model.predict(X_test)
    print(f"Random Forest Model {i+1} Classification Report:")
    print(classification_report(y_test, y_pred_rf))

    y_pred_xgb = xgb_model.predict(X_test)
    print(f"XGBoost Model {i+1} Classification Report:")
    print(classification_report(y_test, y_pred_xgb))

Random Forest Model 1 Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.75      0.79       290
           1       0.66      0.75      0.70       185

    accuracy                           0.75       475
   macro avg       0.74      0.75      0.74       475
weighted avg       0.76      0.75      0.75       475

XGBoost Model 1 Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.76      0.82       290
           1       0.69      0.86      0.77       185

    accuracy                           0.80       475
   macro avg       0.80      0.81      0.79       475
weighted avg       0.82      0.80      0.80       475

Random Forest Model 2 Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.80      0.81       290
           1       0.70      0.74      0.72       185

    accuracy                           0.77       475

In [None]:
rf_model_list = [(f'rf_model_{i+1}', rf_models[i]) for i in range(10)]
xgb_model_list = [(f'xgb_model_{i+1}', xgb_models[i]) for i in range(10)]
lr_model_list = [(f'lr_model_{i+1}', lr_models[i]) for i in range(10)]

voting_model_rf = VotingClassifier(estimators=rf_model_list, voting='soft')
voting_model_xgb = VotingClassifier(estimators=xgb_model_list, voting='soft')
voting_model_lr = VotingClassifier(estimators=lr_model_list, voting='soft')

final_X = scaled_datasets[-1].drop(columns=['FraudFound_P'])
final_y = scaled_datasets[-1]['FraudFound_P']

voting_model_rf.fit(final_X, final_y)
voting_model_xgb.fit(final_X, final_y)
voting_model_lr.fit(final_X, final_y)

# Evaluate Random Forest and XGBoost Ensembles
X_train, X_test, y_train, y_test = train_test_split(final_X, final_y, test_size=0.2, random_state=42, stratify=final_y)

y_pred_rf = voting_model_rf.predict(X_test)
print("Random Forest Ensemble Model Classification Report:")
print(classification_report(y_test, y_pred_rf))

y_pred_xgb = voting_model_xgb.predict(X_test)
print("XGBoost Ensemble Model Classification Report:")
print(classification_report(y_test, y_pred_xgb))

KeyboardInterrupt: 

In [None]:
all_models_list = rf_model_list + xgb_model_list + lr_model_list
final_voting_model = VotingClassifier(estimators=all_models_list, voting='soft')
final_voting_model.fit(final_X, final_y)

y_pred_final = final_voting_model.predict(X_test)
print("Final Ensemble Model (RF + XGB + LR) Classification Report:")
print(classification_report(y_test, y_pred_final))


Final Ensemble Model (RF + XGB + LR) Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.90      0.92       290
           1       0.85      0.92      0.89       185

    accuracy                           0.91       475
   macro avg       0.90      0.91      0.90       475
weighted avg       0.91      0.91      0.91       475



# Grid Search

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import joblib

In [None]:
df = pd.read_csv('data/fraud_oracle_processed.csv')
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Define Categorical Columns for Encoding
categorical_cols = [
    'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed',
    'Fault', 'PolicyType', 'VehicleCategory', 'BasePolicy'
]

# One-Hot Encode Categorical Columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Convert Boolean Columns to Integers
boolean_cols = ['MaritalStatus_Married', 'MaritalStatus_Single', 'MaritalStatus_Widow']
df_encoded[boolean_cols] = df_encoded[boolean_cols].astype(int)

# Split Fraud and Non-Fraud Data
df_fraud = df_encoded[df_encoded["FraudFound_P"] == 1]
df_no_fraud = df_encoded[df_encoded["FraudFound_P"] == 0]

df_fraud_train, df_fraud_holdout = train_test_split(
    df_fraud, test_size=0.1, random_state=42, stratify=df_fraud["FraudFound_P"]
)

df_no_fraud_train, df_no_fraud_holdout = train_test_split(
    df_no_fraud, test_size=0.1, random_state=42, stratify=df_no_fraud["FraudFound_P"]
)

# Combine Holdout Data for Final Evaluation
df_holdout = pd.concat([df_fraud_holdout, df_no_fraud_holdout]).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
datasets = []
for i in range(10):
    df_no_fraud_sampled = df_no_fraud_train.sample(n=len(df_fraud_train), random_state=i, replace=False)
    df_combined = pd.concat([df_fraud_train, df_no_fraud_sampled]).sample(frac=1, random_state=i).reset_index(drop=True)
    datasets.append(df_combined)

scaler = StandardScaler()
scaled_datasets = []

for df_combined in datasets:
    X = df_combined.drop(columns=['FraudFound_P'])
    y = df_combined['FraudFound_P']
    
    # Fit and transform the scaler on the training datasets
    X_scaled = scaler.fit_transform(X)
    
    # Create a new DataFrame with scaled values
    df_combined_scaled = pd.DataFrame(X_scaled, columns=X.columns)
    df_combined_scaled['FraudFound_P'] = y.values
    
    # Append the scaled dataset to the list
    scaled_datasets.append(df_combined_scaled)

X_holdout = df_holdout.drop(columns=['FraudFound_P'])
y_holdout = df_holdout['FraudFound_P']

X_holdout_scaled = scaler.transform(X_holdout)
df_holdout_scaled = pd.DataFrame(X_holdout_scaled, columns=X_holdout.columns)
df_holdout_scaled['FraudFound_P'] = y_holdout.values


In [None]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0.01, 0.1, 1]
}

lr_param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.1, 1.0, 10],
    'solver': ['saga'],
    'max_iter': [5000, 10000]
}

In [None]:
rf_best_models = []
for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']

    rf_model = RandomForestClassifier(random_state=42)
    rf_grid_search = GridSearchCV(
        estimator=rf_model,
        param_grid=rf_param_grid,
        cv=3,
        n_jobs=-1,
        scoring='f1',
        verbose=1
    )
    rf_grid_search.fit(X, y)
    best_rf_model = rf_grid_search.best_estimator_
    rf_best_models.append(best_rf_model)
    print(f"Best RF Model {i+1} Parameters: {rf_grid_search.best_params_}")

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best RF Model 1 Parameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best RF Model 2 Parameters: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best RF Model 3 Parameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best RF Model 4 Parameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best RF Model 5 Parameters: {'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Fitting 3 folds f

In [None]:
xgb_best_models = []
for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']

    xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
    xgb_grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=xgb_param_grid,
        cv=3,
        n_jobs=-1,
        scoring='f1',
        verbose=1
    )
    xgb_grid_search.fit(X, y)
    best_xgb_model = xgb_grid_search.best_estimator_
    xgb_best_models.append(best_xgb_model)
    print(f"Best XGB Model {i+1} Parameters: {xgb_grid_search.best_params_}")

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Model 1 Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 0.01}
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Model 2 Parameters: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200, 'reg_alpha': 0.01, 'reg_lambda': 1}
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Model 3 Parameters: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 1}
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Model 4 Parameters: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'reg_alpha': 0.01, 'reg_lambda': 1}
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Model 5 Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 1}
Fitting 3 folds for each of 243 candidates, totalling 729 fits
B

In [None]:
lr_best_models = []
for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']

    lr_model = LogisticRegression(random_state=42)
    lr_grid_search = GridSearchCV(
        estimator=lr_model,
        param_grid=lr_param_grid,
        cv=3,
        n_jobs=-1,
        scoring='f1',
        verbose=1
    )
    lr_grid_search.fit(X, y)
    best_lr_model = lr_grid_search.best_estimator_
    lr_best_models.append(best_lr_model)
    print(f"Best LR Model {i+1} Parameters: {lr_grid_search.best_params_}")


Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 1 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 2 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 3 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 4 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 5 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 6 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 7 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 8 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 9 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\xutia\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1203, in fit
    raise ValueError("l1_ratio must be specified when penalty i

Best LR Model 10 Parameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}


In [None]:
rf_model_list = [(f'rf_model_{i+1}', rf_best_models[i]) for i in range(10)]
xgb_model_list = [(f'xgb_model_{i+1}', xgb_best_models[i]) for i in range(10)]
lr_model_list = [(f'lr_model_{i+1}', lr_best_models[i]) for i in range(10)]

# Combine all models into the final ensemble
all_models_list = rf_model_list + xgb_model_list + lr_model_list
final_voting_model = VotingClassifier(estimators=all_models_list, voting='soft')

In [99]:
# Concatenate final training data (fraud + non-fraud for 90% of the data)
final_X = pd.concat([df_fraud_train.drop(columns=['FraudFound_P']),
                     df_no_fraud_train.drop(columns=['FraudFound_P'])],
                    ignore_index=True)

final_y = pd.concat([df_fraud_train['FraudFound_P'],
                     df_no_fraud_train['FraudFound_P']],
                    ignore_index=True)

# Scale the final training set using the same scaler fitted on datasets
final_X_scaled = scaler.transform(final_X)


# Fit the final voting model on the scaled final training set
final_voting_model.fit(final_X_scaled, final_y)

# Prepare the holdout set for final evaluation
X_holdout_final = df_holdout_scaled.drop(columns=['FraudFound_P'])
y_holdout_final = df_holdout_scaled['FraudFound_P']

# Predict on holdout set
y_pred_holdout = final_voting_model.predict(X_holdout_final)

# Print classification report for the holdout set
print("✅ Final Optimized Ensemble Model (RF + XGB + LR) Evaluation on 10% Holdout Data:")
print(classification_report(y_holdout_final, y_pred_holdout))



✅ Final Optimized Ensemble Model (RF + XGB + LR) Evaluation on 10% Holdout Data:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1450
           1       1.00      0.01      0.02        93

    accuracy                           0.94      1543
   macro avg       0.97      0.51      0.50      1543
weighted avg       0.94      0.94      0.91      1543



