In [8]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # For Logistic Regression model
from sklearn.tree import DecisionTreeClassifier      # For Decision Tree model
from sklearn.model_selection import GridSearchCV      # For hyperparameter tuning
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score  # For evaluation metrics
import numpy as np                                   # For numerical operations
import pandas as pd

In [6]:
df = pd.read_csv("dataset/bank-additional-full.csv", sep = ";")
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [11]:
X = df.drop(columns='y')
y = df['y']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, stratify=y, random_state=42)

In [21]:
# Define parameter grid for Logistic Regression
lr_param_grid = {
    'C': np.logspace(-3, 3, 10),                     # Regularization strength
    'penalty': ['l1', 'l2'],                          # Regularization type
    'solver': ['liblinear'],                          # Solver option
    'fit_intercept': [True, False]                   # Whether to fit the intercept
}

# Define parameter grid for Decision Tree
dt_param_grid = {
    'max_depth': np.arange(1, 20),                   # Maximum depth of the tree
    'min_samples_split': [2, 5, 10, 20],             # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4, 8],                # Minimum samples at a leaf node
    'max_features': ['log2']                          # Number of features to consider at each split
}

In [22]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier()
}


In [23]:
# Sample datasets (to be replaced with finalized datasets after EDA)
datasets = [
    (X_train, y_train)                 # Final training dataset
    # (X_train_sample1_final, y_train_sample1_final), # Sample 1
    # (X_train_sample2_final, y_train_sample2_final), # Sample 2
    # (X_train_sample3_final, y_train_sample3_final)  # Sample 3
]


In [24]:
# Loop through each dataset and train models
for X, y in datasets:
    print(f"\nTraining on dataset with {X.shape[0]} samples")

    # Train Models with Grid Search for Logistic Regression
    lr_grid = GridSearchCV(estimator=models["Logistic Regression"], param_grid=lr_param_grid, cv=3, scoring='roc_auc', return_train_score=True)
    lr_grid.fit(X, y)

    # Train Models with Grid Search for Decision Tree
    dt_grid = GridSearchCV(estimator=models["Decision Tree"], param_grid=dt_param_grid, cv=3, scoring='recall', return_train_score=True)
    dt_grid.fit(X, y)

    # Predictions
    # Predict on the dataset using the best Logistic Regression model
    lr_pred = lr_grid.predict(X)

    # Predict on the dataset using the best Decision Tree model
    dt_pred = dt_grid.predict(X)

    # Report Metrics for Logistic Regression
    lr_accuracy = accuracy_score(y, lr_pred)
    lr_precision = classification_report(y, lr_pred, output_dict=True)['1']['precision']
    lr_recall = classification_report(y, lr_pred, output_dict=True)['1']['recall']
    lr_f1 = classification_report(y, lr_pred, output_dict=True)['1']['f1-score']
    lr_auc = roc_auc_score(y, lr_grid.predict_proba(X)[:, 1])

    # Report Metrics for Decision Tree
    dt_accuracy = accuracy_score(y, dt_pred)
    dt_precision = classification_report(y, dt_pred, output_dict=True)['1']['precision']
    dt_recall = classification_report(y, dt_pred, output_dict=True)['1']['recall']
    dt_f1 = classification_report(y, dt_pred, output_dict=True)['1']['f1-score']
    dt_auc = roc_auc_score(y, dt_grid.predict_proba(X)[:, 1])

    # Print results for Logistic Regression
    print("Logistic Regression Metrics:")
    print(f"Accuracy: {lr_accuracy:.4f}")
    print(f"Precision: {lr_precision:.4f}")
    print(f"Recall: {lr_recall:.4f}")
    print(f"F1-Score: {lr_f1:.4f}")
    print(f"AUC: {lr_auc:.4f}")

    # Print results for Decision Tree
    print("\nDecision Tree Metrics:")
    print(f"Accuracy: {dt_accuracy:.4f}")
    print(f"Precision: {dt_precision:.4f}")
    print(f"Recall: {dt_recall:.4f}")
    print(f"F1-Score: {dt_f1:.4f}")
    print(f"AUC: {dt_auc:.4f}")


Training on dataset with 21623 samples


ValueError: 
All the 120 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1223, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_array_api.py", line 751, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'blue-collar'

--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1223, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_array_api.py", line 751, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'admin.'


## Placeholders to Replace

### Finalized Datasets:
- Replace `X_train_final` with the finalized training feature set.
- Replace `X_train_sample1_final`, `X_train_sample2_final`, `X_train_sample3_final` with the finalized feature sets for the sample datasets.
- Replace `y_train_final` with the finalized target variable for the training dataset.
- Replace `y_train_sample1_final`, `y_train_sample2_final`, `y_train_sample3_final` with the finalized target variables for the sample datasets.

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # For Logistic Regression model
from sklearn.tree import DecisionTreeClassifier      # For Decision Tree model
from sklearn.model_selection import GridSearchCV      # For hyperparameter tuning
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score  # For evaluation metrics
from sklearn.preprocessing import OneHotEncoder      # For encoding categorical variables
from sklearn.compose import ColumnTransformer        # For transforming columns
from sklearn.pipeline import Pipeline                 # For creating pipelines
import numpy as np                                   # For numerical operations
import pandas as pd

# Load dataset
df = pd.read_csv("dataset/bank-additional-full.csv", sep=";")

# Separate features and target
X = df.drop(columns='y')
y = df['y']

# One-hot encoding for categorical variables
# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Create a transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols)  # One-hot encode categorical features
    ],
    remainder='passthrough'  # Keep the remaining columns (like numerical features)
)

# Create a pipeline for Logistic Regression
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))  # Using solver for compatibility with l1 penalty
])

# Create a pipeline for Decision Tree
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Define parameter grid for Logistic Regression
lr_param_grid = {
    'classifier__C': np.logspace(-3, 3, 10),                     # Regularization strength
    'classifier__penalty': ['l1', 'l2'],                          # Regularization type
    'classifier__fit_intercept': [True, False]                   # Whether to fit the intercept
}

# Define parameter grid for Decision Tree
dt_param_grid = {
    'classifier__max_depth': np.arange(1, 20),                   # Maximum depth of the tree
    'classifier__min_samples_split': [2, 5, 10, 20],             # Minimum samples to split a node
    'classifier__min_samples_leaf': [1, 2, 4, 8],                # Minimum samples at a leaf node
    'classifier__max_features': ['log2']                          # Number of features to consider at each split
}

# Initialize models
models = {
    "Logistic Regression": lr_pipeline,
    "Decision Tree": dt_pipeline
}

# Sample datasets (to be replaced with finalized datasets after EDA)
datasets = [
    (X_train, y_train)                 # Final training dataset
    # (X_train_sample1_final, y_train_sample1_final), # Sample 1
    # (X_train_sample2_final, y_train_sample2_final), # Sample 2
    # (X_train_sample3_final, y_train_sample3_final)  # Sample 3
]

# Loop through each dataset and train models
for X, y in datasets:
    print(f"\nTraining on dataset with {X.shape[0]} samples")

    # Train Models with Grid Search for Logistic Regression
    lr_grid = GridSearchCV(estimator=models["Logistic Regression"], param_grid=lr_param_grid, scoring='roc_auc', cv=3, return_train_score=True)
    lr_grid.fit(X, y)

    # Train Models with Grid Search for Decision Tree
    dt_grid = GridSearchCV(estimator=models["Decision Tree"], param_grid=dt_param_grid, scoring='recall', cv=3, return_train_score=True)
    dt_grid.fit(X, y)

    # Predictions
    lr_pred = lr_grid.predict(X)
    dt_pred = dt_grid.predict(X)

    # Report Metrics for Logistic Regression
    lr_accuracy = accuracy_score(y, lr_pred)
    lr_precision = classification_report(y, lr_pred, output_dict=True)['yes']['precision']
    lr_recall = classification_report(y, lr_pred, output_dict=True)['yes']['recall']
    lr_f1 = classification_report(y, lr_pred, output_dict=True)['yes']['f1-score']
    lr_auc = roc_auc_score(y, lr_grid.predict_proba(X)[:, 1])

    # Report Metrics for Decision Tree
    dt_accuracy = accuracy_score(y, dt_pred)
    dt_precision = classification_report(y, dt_pred, output_dict=True)['yes']['precision']
    dt_recall = classification_report(y, dt_pred, output_dict=True)['yes']['recall']
    dt_f1 = classification_report(y, dt_pred, output_dict=True)['yes']['f1-score']
    dt_auc = roc_auc_score(y, dt_grid.predict_proba(X)[:, 1])

    # Print results for Logistic Regression
    print("Logistic Regression Metrics:")
    print(f"Accuracy: {lr_accuracy:.4f}")
    print(f"Precision: {lr_precision:.4f}")
    print(f"Recall: {lr_recall:.4f}")
    print(f"F1-Score: {lr_f1:.4f}")
    print(f"AUC: {lr_auc:.4f}")

    # Print results for Decision Tree
    print("\nDecision Tree Metrics:")
    print(f"Accuracy: {dt_accuracy:.4f}")
    print(f"Precision: {dt_precision:.4f}")
    print(f"Recall: {dt_recall:.4f}")
    print(f"F1-Score: {dt_f1:.4f}")
    print(f"AUC: {dt_auc:.4f}")


Training on dataset with 28831 samples


