In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import train_test_split, GroupShuffleSplit, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, RocCurveDisplay
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

%load_ext autoreload
%autoreload 2

In [None]:
# --- Data locations ---------------------------------------------------------
# The data directory can be overridden with the BANK_DATA_DIR environment
# variable so the notebook runs on other machines; when the variable is not
# set, the original local path is used unchanged.
import os

all_bank_records = "bank-full.csv"
sampled_bank_records = "bank.csv"
data_folder = os.environ.get(
    "BANK_DATA_DIR",
    "C:/Users/maran/OneDrive/Documents/Git Profile/Data-Projects/capgemini/data",
)
path_to_bank_data = f"{data_folder}/{all_bank_records}"
path_to_sampled_bank_data = f"{data_folder}/{sampled_bank_records}"

# --- Train/test split strategy ----------------------------------------------
# Option to use GroupShuffleSplit or normal train_test_split
use_group_split = False  # Set to True to use GroupShuffleSplit
group_col = 'job'
# Columns whose value combinations define the groups for GroupShuffleSplit.
# NOTE(review): this includes the target 'y'; confirm that is intended.
group_split = ['job', 'y']

GOAL: Predict whether a client will subscribe to a term deposit, based on data from past marketing campaigns.

In [None]:
# Read the sampled bank records; the file is semicolon-delimited.
raw_bank_df = pd.read_csv(
    path_to_sampled_bank_data,
    sep=';',
)

# Exploration

Missing values: the source text file contains no nulls.

In [None]:
# Number of null entries in each column.
raw_bank_df.isna().sum()

In [None]:
# Mean / min / max of age, balance and duration for each value of the target `y`.
raw_bank_df.groupby('y').agg(
    dict.fromkeys(['age', 'balance', 'duration'], ['mean', 'min', 'max'])
)

In [None]:
# Number of distinct categories seen in each categorical column, per value of `y`.
raw_bank_df.groupby('y').agg(
    dict.fromkeys(
        ['job', 'marital', 'education', 'default', 'housing',
         'loan', 'contact', 'month', 'poutcome'],
        'nunique',
    )
)

In [None]:
# Number of non-null entries in each categorical column, per value of `y`.
raw_bank_df.groupby('y').agg(
    dict.fromkeys(
        ['job', 'marital', 'education', 'default', 'housing',
         'loan', 'contact', 'month', 'poutcome'],
        'count',
    )
)

In [None]:
# Summarise placeholder ("missing") values per column as a count and a percentage.
#
# 'unknown' is matched in every column. -1 is only treated as a placeholder in
# `pdays`: comparing the whole frame with -1 would also count ordinary numeric
# values, e.g. an account whose `balance` happens to be exactly -1.
total_rows = len(raw_bank_df)

placeholder_mask = raw_bank_df.eq('unknown')
for sentinel_col in ('pdays',):
    # Guarded so the cell still runs if the column is absent from the frame.
    if sentinel_col in raw_bank_df.columns:
        placeholder_mask[sentinel_col] |= raw_bank_df[sentinel_col].eq(-1)

unknown_or_minus1_count = placeholder_mask.sum()
unknown_or_minus1_pct = (unknown_or_minus1_count / total_rows) * 100
# Already a Series; the name is kept so any later reference keeps working.
unknown_or_minus1_pct_series = unknown_or_minus1_pct
unknown_df = pd.DataFrame({
    'count': unknown_or_minus1_count,
    'percentage': unknown_or_minus1_pct_series
})
unknown_df

In [None]:
# Horizontal bars: number of rows for every (job, outcome) combination.
(
    raw_bank_df
    .groupby(['job', 'y'])
    .agg({'age': 'count'})
    .plot.barh(figsize=(12, 6), title='Count by Job Type')
)

In [None]:
# Horizontal bars: number of rows for every (education, outcome) combination.
(
    raw_bank_df
    .groupby(['education', 'y'])
    .agg({'age': 'count'})
    .plot.barh(figsize=(12, 6), title='Count of Deposit by Education Level')
)

In [None]:
# Overlaid, semi-transparent histograms of `balance`, one per value of `y`.
(
    raw_bank_df
    .groupby('y')['balance']
    .hist(bins=1000, alpha=0.5, legend=True, figsize=(10, 6))
)

In [None]:
# Summary statistics for the contact-related numeric columns.
raw_bank_df.loc[:, ['day', 'duration', 'campaign', 'previous']].describe()

In [None]:
# Summary statistics of `pdays`, leaving out rows where it equals -1.
raw_bank_df.loc[raw_bank_df['pdays'].ne(-1), 'pdays'].describe()

In [None]:
# Display the raw frame as the cell output (how many rows are rendered depends
# on the pandas display options).
raw_bank_df

# Preprocessing

In [None]:
# Build the modelling frame `df` from the raw records and add engineered features.
#
# .copy() makes `df` an independent frame, so the column assignments below
# never write into a slice of `raw_bank_df` (avoids SettingWithCopyWarning)
# and re-running this cell always starts from the same raw input.
df = raw_bank_df.drop_duplicates().copy()

# Age bucket; the bins are right-closed: (0, 30], (30, 45], (45, 60], (60, 100].
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 45, 60, 100],
                         labels=['young', 'adult', 'middle', 'senior'])

# Balance bucket; everything <= 0 falls into 'negative'.
df['balance_level'] = pd.cut(df['balance'],
                             bins=[-float('inf'), 0, 1000, 10000, float('inf')],
                             labels=['negative', 'low', 'medium', 'high'])

# Month abbreviation -> season.
month_to_season = {
    'jan': 'winter', 'feb': 'winter', 'mar': 'spring',
    'apr': 'spring', 'may': 'spring', 'jun': 'summer',
    'jul': 'summer', 'aug': 'summer', 'sep': 'autumn',
    'oct': 'autumn', 'nov': 'autumn', 'dec': 'winter'
}
df['season'] = df['month'].map(month_to_season)

# Call-length bucket. include_lowest=True puts a duration of exactly 0 into
# 'very_short'; without it the first bin is (0, 60] and 0 would become NaN.
df['call_length'] = pd.cut(df['duration'],
                           bins=[0, 60, 180, 600, float('inf')],
                           labels=['very_short', 'short', 'medium', 'long'],
                           include_lowest=True)

# 1 when the client has a housing loan or a personal loan.
df['has_debt'] = ((df['housing'] == 'yes') | (df['loan'] == 'yes')).astype(int)

# Hand-assigned ordinal score per job category.
job_stability_map = {
    'student': 0, 'unemployed': 0,
    'blue-collar': 1, 'services': 1, 'housemaid': 1,
    'admin.': 2, 'technician': 2, 'self-employed': 2,
    'management': 3, 'entrepreneur': 3, 'retired': 3
}
# Jobs missing from the map (e.g. 'unknown') get the explicit sentinel -1
# instead of NaN, so this numeric feature never feeds NaN into the model.
df['job_stability'] = df['job'].map(job_stability_map).fillna(-1).astype(int)

# Previous contacts relative to contacts in this campaign; +1 avoids division by zero.
df['contact_ratio'] = df['previous'] / (df['campaign'] + 1)

# 1 for management/entrepreneur clients with a balance above 5000.
df['high_value'] = ((df['balance'] > 5000) &
                    (df['job'].isin(['management', 'entrepreneur']))).astype(int)

# Number of 'yes' flags among default, housing and loan (0-3).
df['risk_score'] = (df['default'] == 'yes').astype(int) + \
                   (df['housing'] == 'yes').astype(int) + \
                   (df['loan'] == 'yes').astype(int)

# NOTE(review): `day` appears to be the day of the month, so this flags
# contacts made after the 25th rather than weekends. The column name is kept
# because the modelling cell references it; confirm the intent.
df['is_weekend'] = (df['day'] > 25).astype(int)

# Model Development

In [None]:
# Train a random-forest pipeline on the engineered frame `df`, report
# cross-validated accuracy on the training split, and evaluate on the held-out
# test split.

# Set up features and target
target_column_name = 'y'
all_features_list = df.drop(columns=[target_column_name]).columns.tolist()

if use_group_split:
    # Each distinct combination of the `group_split` columns is one group;
    # GroupShuffleSplit keeps a whole group on one side of the split.
    group_split_list = group_split  # ['job', 'y']
    group_indices_series = df.groupby(group_split_list).ngroup()
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # n_splits=1, so take the single split directly instead of looping.
    train_idx, test_idx = next(gss.split(df, groups=group_indices_series))
    train_split_df = df.iloc[train_idx]
    test_split_df = df.iloc[test_idx]
    X_train = train_split_df[all_features_list]
    y_train = train_split_df[target_column_name]
    X_test = test_split_df[all_features_list]
    y_test = test_split_df[target_column_name]
else:
    # Stratified random split so both parts keep the same class proportions.
    X = df[all_features_list]
    y = df[target_column_name]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

# Define preprocessing for numeric and categorical columns
# NOTE(review): the dataset documentation advises excluding `duration` (and so
# the derived `call_length`) from realistic models, because it is only known
# after the call has ended; confirm whether it should stay in.
numeric_cols = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
    'job_stability', 'contact_ratio', 'risk_score'
]

# The binary 0/1 flags are one-hot encoded together with the string categoricals.
categorical_cols = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome',
    'age_group', 'balance_level', 'season', 'call_length'
] + ['high_value', 'has_debt', 'is_weekend']

# Columns not listed in either group are dropped by the ColumnTransformer default.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Build pipeline
clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Cross-validated score
cv_scores = cross_val_score(clf_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validated accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Fit and evaluate
clf_pipeline.fit(X_train, y_train)
y_pred = clf_pipeline.predict(X_test)

# Pick the probability column of the positive class by label rather than
# assuming it sits at index 1.
positive_label = 'yes'
fitted_classifier = clf_pipeline.named_steps['classifier']
if hasattr(fitted_classifier, "predict_proba"):
    positive_col = list(fitted_classifier.classes_).index(positive_label)
    y_pred_proba = clf_pipeline.predict_proba(X_test)[:, positive_col]
else:
    y_pred_proba = None

acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=positive_label)
recall = recall_score(y_test, y_pred, pos_label=positive_label)
f1 = f1_score(y_test, y_pred, pos_label=positive_label)
roc_auc = roc_auc_score((y_test == positive_label).astype(int), y_pred_proba) if y_pred_proba is not None else None

print(f"Test Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {f1:.4f}")
if roc_auc is not None:
    print(f"ROC AUC: {roc_auc:.4f}")

In [None]:
# Feature importance visualization
# Output column names: numeric columns first, then the one-hot encoder's
# expanded names, matching the order of the ColumnTransformer's transformers.
fitted_preprocessor = clf_pipeline.named_steps['preprocessor']
onehot_encoder = fitted_preprocessor.transformers_[1][1]
feature_names = numeric_cols + list(onehot_encoder.get_feature_names_out(categorical_cols))
importances = clf_pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]  # most important first

bar_positions = list(range(len(importances)))
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importances")
ax.bar(bar_positions, importances[indices], align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(np.array(feature_names)[indices], rotation=90)
fig.tight_layout()
plt.show()

# ROC curve
if y_pred_proba is not None:
    y_test_binary = (y_test == 'yes').astype(int)
    fpr, tpr, thresholds = roc_curve(y_test_binary, y_pred_proba)
    RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
    plt.show()

# Confusion matrix
class_labels = ['no', 'yes']
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.show()

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline: always predict the most frequent class, scored with the same
# 5-fold accuracy protocol as the main pipeline.
dummy_clf = DummyClassifier(strategy="most_frequent", random_state=42)
dummy_cv_scores = cross_val_score(
    estimator=dummy_clf,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='accuracy',
)
baseline_mean, baseline_std = dummy_cv_scores.mean(), dummy_cv_scores.std()
print(f"DummyClassifier (most frequent) CV accuracy: {baseline_mean:.4f} ± {baseline_std:.4f}")
