In [57]:
# # Install scikit-learn for all the core metrics and modeling
# %pip install scikit-learn

# # Install imbalanced-learn for handling imbalanced datasets (SMOTE)
# %pip install imbalanced-learn

# import sys
# !{sys.executable} -m pip install -q scikit-learn imbalanced-learn joblib

# 1: Project Setup and Data Acquisition

In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report, roc_curve)
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.pipeline import Pipeline as ImbPipeline 


# Plot defaults for the whole notebook: seaborn "whitegrid" theme and a
# 10x6 inch default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

# Load the bank-marketing data (semicolon-delimited CSV). When the file is
# missing, fall back to an empty DataFrame so that every `if not df.empty`
# guard in this notebook skips its work instead of failing on an undefined name.
try:
    df = pd.read_csv('bank-additional.csv', sep=';')
    print("Data loaded.")
except FileNotFoundError:
    print("Error: bank-additional.csv not found. Please ensure the file is in the correct path and try again.")
    df = pd.DataFrame()

if not df.empty:

    print("INITIAL DATA OVERVIEW")
    print("\nFirst 5 rows:")
    print(df.head())

    # Summary statistics are printed once (numeric and categorical columns).
    print("\n= BASIC STATISTICS =")
    print(df.describe(include='all'))

    # Show the dtypes the header announces, then any columns with nulls.
    print("\nData Types and Missing Values:")
    print(df.dtypes)
    missing = df.isnull().sum()
    if missing.sum() == 0:
        print("✓ No missing values found")
    else:
        print(missing[missing > 0])

    # Check duplicates
    duplicates = df.duplicated().sum()
    print("\n== DUPLICATES ==")
    print(f"Found {duplicates} duplicate rows")

    # Target variable distribution: count once and reuse the result.
    print("TARGET VARIABLE DISTRIBUTION")
    target_counts = df['y'].value_counts()
    print(target_counts)

    # .get() avoids a KeyError when one class is absent; the guard avoids
    # dividing by zero when there are no positive ('yes') rows.
    n_no = target_counts.get('no', 0)
    n_yes = target_counts.get('yes', 0)
    if n_yes > 0:
        print(f"\nClass Imbalance Ratio: {n_no / n_yes:.2f}:1")
    else:
        print("\nClass Imbalance Ratio: undefined (no 'yes' rows found)")


Data loaded.
INITIAL DATA OVERVIEW

First 5 rows:
   age          job  marital          education default  housing     loan  \
0   30  blue-collar  married           basic.9y      no      yes       no   
1   39     services   single        high.school      no       no       no   
2   25     services  married        high.school      no      yes       no   
3   38     services  married           basic.9y      no  unknown  unknown   
4   47       admin.  married  university.degree      no      yes       no   

     contact month day_of_week  ...  campaign  pdays  previous     poutcome  \
0   cellular   may         fri  ...         2    999         0  nonexistent   
1  telephone   may         fri  ...         4    999         0  nonexistent   
2  telephone   jun         wed  ...         1    999         0  nonexistent   
3  telephone   jun         fri  ...         3    999         0  nonexistent   
4   cellular   nov         mon  ...         1    999         0  nonexistent   

  emp.var.ra

# 2: Data Cleaning and Preparation 

In [None]:

if not df.empty:

    # 1. Data Cleaning: Target Variable Encoding
    # Encode only while 'y' still holds the raw 'yes'/'no' strings. Mapping an
    # already-encoded 0/1 column would turn every value into NaN, and the NaN
    # clean-up below would then drop every row if this cell were re-run.
    if not pd.api.types.is_numeric_dtype(df['y']):
        df['y'] = df['y'].map({'yes': 1, 'no': 0})
        print("✓ Target variable 'y' successfully encoded (yes=1, no=0).")
    else:
        print("Target variable 'y' is already numeric; skipping re-encoding.")

    # Labels other than 'yes'/'no' become NaN in the mapping above. Count them
    # BEFORE dropping so the message reports the real number of removed rows.
    n_missing_target = int(df['y'].isnull().sum())
    if n_missing_target > 0:
        df.dropna(subset=['y'], inplace=True)
        print(f" Removed {n_missing_target} rows with NaN in target.")

    # A column that contained NaN is float; store the cleaned labels as ints.
    df['y'] = df['y'].astype(int)

    # 2. Check for 'unknown' values and duplicates
    print("\n'Unknown' values in categorical features:")
    n_rows = len(df)
    for col in df.select_dtypes(include='object').columns:
        unknown_count = (df[col] == 'unknown').sum()
        if unknown_count > 0:
            print(f"  • {col}: {unknown_count} ({unknown_count/n_rows*100:.1f}%)")

    duplicates = df.duplicated().sum()
    if duplicates > 0:
        df.drop_duplicates(inplace=True)
        print(f" Removed {duplicates} duplicate rows.")
    else:
        print(" No duplicate rows found.")

    # 3. Feature engineering (vectorized comparisons instead of row-wise apply)

    # **Feature 1: Has been contacted before?** (0 if pdays=999, 1 otherwise)
    df['was_previously_contacted'] = (df['pdays'] != 999).astype(int)

    # **Feature 2: Campaign Efficiency Indicator** (1 if less than 5 contacts, 0 otherwise)
    df['campaign_successful'] = (df['campaign'] < 5).astype(int)

    # **Feature 3: Simplify 'poutcome'** (1 for success, 0 for anything else).
    # A direct comparison keeps the column free of NaN even if 'poutcome'
    # contains a category other than success/failure/nonexistent.
    df['poutcome_success'] = (df['poutcome'] == 'success').astype(int)
    print("\n✓ 3 New features created: 'was_previously_contacted', 'campaign_successful', 'poutcome_success'.")

    # Final check of the target distribution after cleaning/encoding
    print("\nFinal Target Distribution:")
    print(df['y'].value_counts(normalize=True))

✓ Target variable 'y' successfully encoded (yes=1, no=0).

'Unknown' values in categorical features:
  • job: 39 (0.9%)
  • marital: 11 (0.3%)
  • education: 167 (4.1%)
  • default: 803 (19.5%)
  • housing: 105 (2.5%)
  • loan: 105 (2.5%)
 No duplicate rows found.

✓ 3 New features created: 'was_previously_contacted', 'campaign_successful', 'poutcome_success'.

Final Target Distribution:
y
0    0.890507
1    0.109493
Name: proportion, dtype: float64


# 4: Data Preprocessing and Creating Pipeline


In [82]:

if not df.empty:
    # Separate features (X) and target (y)
    X = df.drop('y', axis=1)
    y = df['y']

    # Stratified 80/20 split so both sets keep the original class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set size: {X_train.shape[0]} samples")
    print(f"Testing set size: {X_test.shape[0]} samples")

    # 1. Continuous features (standardized)
    features_to_scale = ['age', 'campaign', 'previous', 'emp.var.rate',
                         'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

    # 2. Engineered/Binary Features (passed through without scaling)
    engineered_features = ['was_previously_contacted', 'campaign_successful', 'poutcome_success']

    # 3. Categorical features (one-hot encoded)
    categorical_features = ['job', 'marital', 'education', 'default', 'housing',
                            'loan', 'contact', 'month', 'day_of_week', 'poutcome']

    # 'duration' is removed for realistic modeling, 'pdays' is replaced by
    # 'was_previously_contacted'. Make sure neither they nor the binary
    # engineered columns end up in the scaling list.
    excluded_from_scaling = set(engineered_features) | {'pdays', 'duration'}
    features_to_scale = [col for col in features_to_scale if col not in excluded_from_scaling]

    # All numeric model inputs: scaled columns plus the binary engineered ones.
    numerical_features = features_to_scale + engineered_features

    # Create a preprocessor using ColumnTransformer. It is built exactly once,
    # at cell level (not inside a loop), so it always exists after this cell.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), features_to_scale),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
            ('bin', 'passthrough', engineered_features)
        ],
        # Drop 'pdays' and 'duration'. Only explicitly listed features are kept.
        remainder='drop'
    )
    print("\nColumnTransformer (Preprocessor) created for scaling and encoding.")

    # --- SMOTE Application (for reporting, showing the effect of balancing) ---

    # Fit the preprocessor on the training split only, then transform it.
    X_train_processed = preprocessor.fit_transform(X_train)

    # Apply SMOTE to address class imbalance.
    # NOTE(review): SMOTE interpolates between samples, so the one-hot and
    # binary columns of synthetic rows can take fractional values. That is
    # acceptable for this size report; confirm before training on this data.
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_processed, y_train)

    print(f"\nTraining set size after SMOTE: {X_train_smote.shape[0]} samples (Balanced)")
    print(f"Target distribution after SMOTE: \n{y_train_smote.value_counts()}")

    # NOTE: The modeling cells that follow fit on the base X_train/y_train, not
    # on the SMOTE-resampled arrays; the logistic-regression and random-forest
    # pipelines handle imbalance through `class_weight='balanced'` instead.

Training set size: 3295 samples
TestingF set size: 824 samples

ColumnTransformer (Preprocessor) created for scaling and encoding.

Training set size after SMOTE: 5868 samples (Balanced)
Target distribution after SMOTE: 
y
1    2934
0    2934
Name: count, dtype: int64


# 5: Feature Selection and Model Selection

In [None]:

if not df.empty:

    # Column groups used by the preprocessing step of every candidate pipeline.
    features_to_scale = ['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
    categorical_features = X_train.select_dtypes(include='object').columns.tolist()
    engineered_passthrough = ['was_previously_contacted', 'campaign_successful', 'poutcome_success']

    def _build_preprocessor():
        """Return a fresh, unfitted ColumnTransformer: scale, one-hot encode, pass through."""
        return ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), features_to_scale),
                ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
                ('bin', 'passthrough', engineered_passthrough)
            ],
            remainder='drop'
        )

    # Kept under its original name for any code that refers to it.
    preprocessor_fixed = _build_preprocessor()

    # --- Define Pipelines for Model Comparison (Using class_weight='balanced' for imbalance) ---
    # Each pipeline gets its OWN preprocessor built in this cell, so the
    # comparison does not depend on, or refit, a transformer object that was
    # created and fitted in an earlier cell and shared between models.

    logreg_pipe = Pipeline(steps=[('preprocessor', _build_preprocessor()),
                                  ('classifier', LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced'))])

    rf_pipe = Pipeline(steps=[('preprocessor', _build_preprocessor()),
                              ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))])

    # MLPClassifier has no class_weight parameter, so it is trained unweighted.
    mlp_pipe = Pipeline(steps=[('preprocessor', _build_preprocessor()),
                               ('classifier', MLPClassifier(random_state=42, max_iter=300))])

    models = {'Logistic Regression': logreg_pipe, 'Random Forest': rf_pipe, 'MLP Classifier': mlp_pipe}

    results = {}
    print("\nTraining and Evaluating Models (using binary labels and class_weight='balanced')...")

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Probability of the positive class (column 1) for the ROC AUC.
        y_proba = model.predict_proba(X_test)[:, 1]

        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_proba)

        results[name] = {'F1-Score': f1, 'Recall': recall, 'ROC AUC': roc_auc}
        print(f"--- {name} ---")
        print(f"F1-Score: {f1:.4f}, Recall: {recall:.4f}, ROC AUC: {roc_auc:.4f}")

    # NOTE(review): the model is chosen by its F1 on the held-out test set,
    # which makes the later test-set evaluation optimistic. Consider selecting
    # with cross-validation on the training split instead.
    best_model_name = max(results, key=lambda model_name: results[model_name]['F1-Score'])
    best_model_pipe = models[best_model_name]
    print(f"\nSelected Model for Tuning (based on F1-Score): **{best_model_name}**")


Training and Evaluating Models (using binary labels and class_weight='balanced')...
--- Logistic Regression ---
F1-Score: 0.6000, Recall: 0.8333, ROC AUC: 0.9423
--- Random Forest ---
F1-Score: 0.3871, Recall: 0.2667, ROC AUC: 0.9422
--- MLP Classifier ---
F1-Score: 0.6087, Recall: 0.7000, ROC AUC: 0.9333

Selected Model for Tuning (based on F1-Score): **MLP Classifier**


# 6: Hyperparameter Tuning and Feature Selection

In [81]:

from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.neural_network import MLPClassifier # Ensure MLP is imported

if not df.empty:

    # The pipeline below is built around the MLP, so the name is fixed here.
    best_model_name = 'MLP Classifier'

    # 1. Univariate feature selector scored with f_classif; the number of
    #    features to keep (k) is chosen by the grid search.
    feature_selector = SelectKBest(score_func=f_classif)

    # 2. FINAL PIPELINE: Preprocessor -> Feature Selector -> Classifier
    tuning_steps = [
        ('preprocessor', preprocessor),
        ('feature_selection', feature_selector),
        # max_iter raised to 500 for tuning stability
        ('classifier', MLPClassifier(random_state=42, max_iter=500)),
    ]
    final_pipeline = Pipeline(steps=tuning_steps)

    # 3. Search space, keyed as "<step name>__<parameter>"
    param_grid = {
        # Number of features kept by the selector
        'feature_selection__k': [20, 30, 'all'],

        # Classifier hyperparameters. MLP does not natively support
        # class_weight, so imbalance is not handled through the classifier.
        'classifier__hidden_layer_sizes': [(50,), (100, 50,), (100,)],
        'classifier__alpha': [0.0001, 0.001],  # L2 regularization term
        'classifier__learning_rate_init': [0.001, 0.01],
    }

    # 5-fold grid search optimizing ROC AUC, using all available cores
    search_settings = dict(cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
    grid_search = GridSearchCV(final_pipeline, param_grid, **search_settings)

    print(f"\nStarting GridSearchCV for {best_model_name} with Feature Selection...")

    grid_search.fit(X_train, y_train)

    # best_estimator_ is the pipeline refit with the best parameter combination.
    final_model = grid_search.best_estimator_
    print("\nBest Parameters found by GridSearchCV:")
    print(grid_search.best_params_)

# if not df.empty:


#     # Define parameter grid based on best model
#     if best_model_name == 'Random Forest':
#         param_grid = {
#             'classifier__n_estimators': [100, 200],
#             'classifier__max_depth': [10, 20, None],
#             'classifier__min_samples_split': [2, 5],
#             'classifier__class_weight': ['balanced', 'balanced_subsample']
#         }
#         base_model = RandomForestClassifier(random_state=42, n_jobs=-1)
#     elif best_model_name == 'Logistic Regression':
#         param_grid = {
#             'classifier__C': [0.1, 1, 10],
#             'classifier__penalty': ['l1', 'l2'],
#             'classifier__class_weight': ['balanced', None]
#         }
#         base_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)
#     else:  # Neural Network
#         param_grid = {
#             'classifier__hidden_layer_sizes': [(100,), (100, 50), (50, 50)],
#             'classifier__alpha': [0.0001, 0.001],
#             'classifier__learning_rate': ['constant', 'adaptive']
#         }
#         base_model = MLPClassifier(random_state=42, max_iter=500, early_stopping=True)
    
#     # Create pipeline with feature selection
#     final_pipeline = Pipeline([
#         ('preprocessor', preprocessor),
#         ('feature_selection', SelectFromModel(
#             RandomForestClassifier(n_estimators=100, random_state=42),
#             threshold='median'
#         )),
#         ('classifier', base_model)
#     ])
    
#     # Grid search
#     grid_search = GridSearchCV(
#         final_pipeline,
#         param_grid,
#         cv=5,
#         scoring='f1',
#         n_jobs=-1,
#         verbose=1
#     )
    
#     print("Starting Grid Search...")
#     grid_search.fit(X_train, y_train)
    
#     print("\n✓ Grid Search completed")
#     print(f"\nBest Parameters: {grid_search.best_params_}")
#     print(f"Best CV F1-Score: {grid_search.best_score_:.4f}")
    
#     # Get best model
#     final_model = grid_search.best_estimator_



Starting GridSearchCV for MLP Classifier with Feature Selection...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Best Parameters found by GridSearchCV:
{'classifier__alpha': 0.0001, 'classifier__hidden_layer_sizes': (50,), 'classifier__learning_rate_init': 0.001, 'feature_selection__k': 'all'}


# 7: Evaluation of the Final Model

In [None]:
# project.ipynb (Cell for Section 7: Evaluation of the Final Model)

if not df.empty:
    # Hard predictions and positive-class probabilities on the held-out test set.
    y_test_pred = final_model.predict(X_test)
    y_test_proba = final_model.predict_proba(X_test)[:, 1]

    final_accuracy = accuracy_score(y_test, y_test_pred)
    final_recall = recall_score(y_test, y_test_pred)
    final_precision = precision_score(y_test, y_test_pred)
    final_f1 = f1_score(y_test, y_test_pred)
    final_roc_auc = roc_auc_score(y_test, y_test_proba)

    print("\n*** Final Model Performance on Test Set ***")
    print(f"Accuracy: {final_accuracy:.4f}")
    print(f"Recall (Subscription 'yes'): {final_recall:.4f}")
    print(f"Precision (Subscription 'yes'): {final_precision:.4f}")
    print(f"F1-Score: {final_f1:.4f}")
    print(f"ROC AUC: {final_roc_auc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))

    # Confusion Matrix Plot (rows = actual label, columns = predicted label)
    cm = confusion_matrix(y_test, y_test_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['No (0)', 'Yes (1)'],
                yticklabels=['No (0)', 'Yes (1)'],
                ax=ax)
    ax.set_title('Confusion Matrix - Final Model (MLP)')
    ax.set_ylabel('Actual Label')
    ax.set_xlabel('Predicted Label')
    plt.show()

    # Feature Selection Output.
    # SelectKBest exposes the kept columns through get_support() (it has no
    # `k_` attribute), and it operates on the preprocessor's output, so the
    # matching names come from the fitted preprocessor rather than from X_test.
    try:
        selector = final_model.named_steps['feature_selection']
        kept_mask = selector.get_support()
        print("\nFeatures Selected by SelectKBest:")
        print(f"Number of features kept: {int(kept_mask.sum())} of {kept_mask.size}")

        encoded_names = final_model.named_steps['preprocessor'].get_feature_names_out()
        print(f"Selected features: {list(encoded_names[kept_mask])}")
    except (KeyError, AttributeError, ValueError) as e:
        # Best-effort report: the pipeline may lack these steps or not be fitted.
        print(f"\nCould not display feature selection details. Error: {e}")