In [6]:
# Bank Marketing - Full EDA Notebook
# ---------------------------------
# This script is written as a Jupyter-style notebook but saved as a single Python file.
# It performs a full Exploratory Data Analysis for the UCI Bank Marketing dataset.
# How to use:
# - Place this file in your project (e.g., in notebooks/)
# - Make sure PROJECT_ROOT is set or automatically discovered (see detection section)
# - Run it in a Jupyter environment or convert to a .ipynb using jupytext if desired

# %%
"""## 0. Imports and config"""
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='whitegrid')
plt.rcParams['figure.dpi'] = 120

# %%
"""## 1. Project root detection and paths
We attempt automatic detection first; fallback to a hard-coded path if needed.
Replace the hard-coded path below with your project root if auto-detection fails.
"""
# Option A: automatic detection (works if this file is in <root>/notebooks or similar)
try:
    PROJECT_ROOT = Path().resolve()
    # if running from notebooks folder, assume parent is project root
    if PROJECT_ROOT.name.lower() in ("notebooks", "notebook"):
        PROJECT_ROOT = PROJECT_ROOT.parent
except Exception:
    PROJECT_ROOT = None

# Option B: fallback (uncomment and edit if auto-detect is not correct)
if PROJECT_ROOT is None or not (PROJECT_ROOT.exists()):
    PROJECT_ROOT = Path(r"D:\development\ML Dev projects\Bank-Marketing-ML-Dev")

DATA_RAW = PROJECT_ROOT / "data" / "raw"
REPORTS_DIR = PROJECT_ROOT / "reports" / "figures"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_RAW:", DATA_RAW)
print("REPORTS_DIR:", REPORTS_DIR)

# %%
"""## 2. Load the data
The UCI Bank Marketing dataset uses semicolons as separators and double quotes for quoted fields.
"""
# Replace the filename if different
csv_path = DATA_RAW / "bank-full.csv"
if not csv_path.exists():
    # try common alternative names
    for alt in ["bank.csv", "bank_marketing.csv", "bank-full.csv"]:
        p = DATA_RAW / alt
        if p.exists():
            csv_path = p
            break

print('Reading:', csv_path)

# Read using pandas with proper separator and quoting
data = pd.read_csv(csv_path, sep=';', quotechar='"')

# Quick info
print('\nData shape:', data.shape)
print('\nColumns:\n', data.columns.tolist())

# %%
"""## 3. Quick data sanity checks"""
# Show first rows
display(data.head())

# dtypes and missing values
print('\nData types:')
print(data.dtypes)

print('\nMissing values per column:')
print(data.isnull().sum())

# Unique counts for each column
print('\nUnique value counts:')
print(data.nunique())

# %%
"""## 4. Column typing (numerical / categorical / binary)
Re-using your earlier logic: a column is categorical if dtype is object/categorical or nunique < 10.
"""
from pandas.api.types import is_numeric_dtype, is_object_dtype, is_categorical_dtype

cat_threshold = 10
categorical_cols = []
numerical_cols = []
binary_cols = []

for col in data.columns:
    unique_vals = data[col].nunique()
    if unique_vals == 2:
        binary_cols.append(col)
        continue
    if is_object_dtype(data[col]) or is_categorical_dtype(data[col]) or unique_vals < cat_threshold:
        categorical_cols.append(col)
        continue
    if is_numeric_dtype(data[col]):
        numerical_cols.append(col)

print('Categorical:', categorical_cols)
print('Numerical:', numerical_cols)
print('Binary   :', binary_cols)

# %%
"""## 5. Target distribution (y)
Check class balance and save a plot."""

plt.figure(figsize=(6,4))
sns.countplot(data=data, x='y')
plt.title('Target distribution (y)')
plt.tight_layout()
plt.savefig(REPORTS_DIR / 'target_distribution.png')
plt.show()

print('\nProportions:')
print(data['y'].value_counts(normalize=True))

# %%
"""## 6. Numerical variable exploration
- Distribution (histogram + KDE)
- Boxplot split by target
"""
for col in numerical_cols:
    fig, axes = plt.subplots(1, 2, figsize=(10,3.5))
    sns.histplot(data=data, x=col, kde=True, ax=axes[0])
    axes[0].set_title(f'Distribution of {col}')

    sns.boxplot(data=data, x='y', y=col, ax=axes[1])
    axes[1].set_title(f'{col} by target (y)')

    plt.tight_layout()
    fname = REPORTS_DIR / f"num_{col}.png"
    plt.savefig(fname)
    plt.show()

# %%
"""## 7. Categorical variable exploration
- Countplots split by target
- Conversion rates per category
"""
for col in categorical_cols:
    plt.figure(figsize=(10,4))
    sns.countplot(data=data, x=col, hue='y')
    plt.title(f'{col} counts by target')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(REPORTS_DIR / f"cat_{col}_counts.png")
    plt.show()

    # conversion rates
    conv = data.groupby(col)['y'].apply(lambda x: (x=="yes").mean()).sort_values(ascending=False)
    print(f"\nConversion rate by {col}:\n", conv)

# %%
"""## 8. Binary features effect on conversion
Plot mean(y==yes) for each binary feature."""
for col in binary_cols:
    if col == 'y':
        continue
    plt.figure(figsize=(6,4))
    rate = data.groupby(col)['y'].apply(lambda x: (x=="yes").mean())
    rate = rate.reset_index()
    sns.barplot(data=rate, x=col, y='y')
    plt.ylabel('Conversion rate (mean of y==yes)')
    plt.title(f'Conversion rate by {col}')
    plt.tight_layout()
    plt.savefig(REPORTS_DIR / f"bin_{col}_conversion.png")
    plt.show()

# %%
"""## 9. Correlation analysis for numerical columns
Correlation matrix + heatmap. Note: 'y' should be numeric for correlation; encode temporarily.
"""
corr_df = data[numerical_cols].copy()
# temporary numeric encoding of y
corr_with_y = data[numerical_cols].assign(y_numeric=data['y'].map({'no':0,'yes':1}))

plt.figure(figsize=(10,8))
sns.heatmap(corr_with_y.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation matrix (numerical + y)')
plt.tight_layout()
plt.savefig(REPORTS_DIR / 'corr_heatmap.png')
plt.show()

# %%
"""## 10. Feature engineering suggestions and notes
- `duration` is highly predictive but *leaks* because it is available only after contact; exclude for live prediction.
- `balance` is skewed: consider log1p transform for algorithms sensitive to distribution.
- `campaign` (number of contacts) shows diminishing returns: consider bucketing.
- `pdays` uses 999 to indicate not previously contacted; create an indicator `pdays_not_contacted`.

Let's create these candidate engineered features (for EDA only).
"""
# create engineered features for analysis
edata = data.copy()
# encode pdays_not_contacted
if 'pdays' in edata.columns:
    edata['pdays_not_contacted'] = (edata['pdays'] == 999).astype(int)

# log transform balance for visualization
edata['balance_log1p'] = np.log1p(edata['balance'] - edata['balance'].min()+1)

# bucket campaign
edata['campaign_bucket'] = pd.cut(edata['campaign'], bins=[-1,0,1,2,4,10,100], labels=['0','1','2','3-4','5-10','10+'])

# show quick comparison
plt.figure(figsize=(6,4))
sns.boxplot(data=edata, x='y', y='balance_log1p')
plt.title('Log-transformed balance by target')
plt.tight_layout()
plt.savefig(REPORTS_DIR / 'balance_log1p_by_y.png')
plt.show()

# %%
"""## 11. Missing values and rare categories handling"""
print('Missing values per column:\n', data.isnull().sum())

# Rare categories
for col in categorical_cols:
    vc = data[col].value_counts(normalize=True)
    rare = vc[vc < 0.01]
    if not rare.empty:
        print(f"\nColumn {col} has rare categories (<1%):\n", rare.index.tolist())

# %%
"""## 12. Prepare a quick modeling-ready dataset (for prototyping)
- Drop `duration` for real predictive models (but keep for experimentation)
- One-hot encode categoricals (drop_first to avoid collinearity for linear models)
- Keep a copy with and without duration
"""
from sklearn.model_selection import train_test_split

# encode target
data_model = data.copy()
data_model['y_bin'] = data_model['y'].map({'no':0, 'yes':1})

# drop duration for final model candidate
X = pd.get_dummies(data_model.drop(columns=['y','y_bin','duration']), columns=categorical_cols+binary_cols, drop_first=True)
y = data_model['y_bin']

print('\nModel matrix shape (without duration):', X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print('Train/test split:', X_train.shape, X_test.shape)

# %%
"""## 13. Simple baseline model (Logistic Regression) - for sanity check
We will run a quick logistic regression and output classification report.
"""
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:,1]

print('\nClassification report (Logistic Regression):')
print(classification_report(y_test, y_pred))
print('\nROC AUC:', roc_auc_score(y_test, y_prob))

# %%
"""## 14. Save a short EDA report (CSV summary tables)"""
# conversion rates per categorical
conv_tables = {}
for col in categorical_cols:
    conv = data.groupby(col)['y'].apply(lambda x: (x=="yes").mean()).sort_values(ascending=False)
    conv_tables[col] = conv
    conv.to_csv(REPORTS_DIR / f'conv_rate_{col}.csv')

# numerical summary
data[numerical_cols].describe().to_csv(REPORTS_DIR / 'numerical_summary.csv')

print('Saved conversion tables and numerical summary to', REPORTS_DIR)

# %%
"""## 15. Next steps (suggested)
- Build and compare tree-based models (RandomForest, XGBoost, LightGBM).
- Use calibration and probability-based ranking for marketing (lift charts).
- Address imbalance with class weights or resampling (SMOTE) and evaluate with precision-recall.
- Create a simple FastAPI endpoint to serve model predictions (and exclude `duration` from features).
- Add dataset and model versioning using DVC and a small CI pipeline for retraining.

You can convert this script into a Jupyter notebook using jupytext, or copy cells into a notebook for interactivity.

"""

# %%
"""## 16. Profiling Reports (YData & Pandas)
This section generates profiling reports using **ydata-profiling** (the modern name) and the legacy **pandas-profiling** import alias.
It saves HTML reports into `reports/profiling/` and shows them in-notebook when possible.
"""

output_profile_dir = REPORTS_DIR.parent / "profiling"
output_profile_dir.mkdir(parents=True, exist_ok=True)

# --- YData profiling ---
try:
    from ydata_profiling import ProfileReport
    profile = ProfileReport(
        data,
        title="Bank Marketing Dataset - YData Profiling Report",
        explorative=True
    )
    ydata_path = output_profile_dir / "bank_marketing_ydata_profiling.html"
    profile.to_file(ydata_path)
    print('Saved YData profiling report to', ydata_path)
except Exception as e:
    print('Could not run ydata_profiling:', e)

# --- Legacy pandas-profiling (alias) ---
try:
    from pandas_profiling import ProfileReport as PandasProfile
    profile_old = PandasProfile(data, title="Bank Marketing Dataset - Pandas Profiling Report")
    pandas_path = output_profile_dir / "bank_marketing_pandas_profiling.html"
    profile_old.to_file(pandas_path)
    print('Saved pandas-profiling report to', pandas_path)
except Exception as e:
    print('Could not run pandas_profiling:', e)

# %%
"""## 17. Automated profiling summary extraction
Create compact CSV summaries from the profiling output for quick review in CI or dashboards.
We derive: missing values table, top correlations with target, and column type summary.
"""

# Missing values summary
missing_df = data.isnull().sum().rename('missing_count').to_frame()
missing_df['missing_pct'] = missing_df['missing_count'] / len(data)
missing_df.to_csv(output_profile_dir / 'missing_summary.csv')

# Column types & unique counts
col_summary = pd.DataFrame({
    'dtype': data.dtypes.astype(str),
    'nunique': data.nunique(),
})
col_summary.to_csv(output_profile_dir / 'columns_summary.csv')

# Correlation with target (numeric only)
if 'y' in data.columns:
    numeric_for_corr = data.select_dtypes(include=[np.number]).copy()
    numeric_for_corr['y_num'] = data['y'].map({'no':0,'yes':1})
    corr_with_target = numeric_for_corr.corr()['y_num'].sort_values(ascending=False)
    corr_with_target.to_csv(output_profile_dir / 'corr_with_target.csv')

print('Saved automated profiling summaries to', output_profile_dir)

# %%
"""## 18. Full ML pipeline: preprocessing, modeling, evaluation, and exports
This section builds a reproducible pipeline including preprocessing (with ColumnTransformer), model comparison (logistic, random forest, lightgbm/xgboost if available), grid search, and evaluation.
It intentionally excludes `duration` from features for production-ready models.
"""

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib

# 18.1 Prepare modeling dataset
model_data = data.copy()
model_data['y_bin'] = model_data['y'].map({'no':0,'yes':1})

# Drop duration for production model
features = [c for c in model_data.columns if c not in ['y','y_bin','duration']]

# Recompute categorical/numeric lists (safer to base on dtype)
categorical = [c for c in features if model_data[c].dtype == 'object']
numeric = [c for c in features if np.issubdtype(model_data[c].dtype, np.number)]

print('Model features:', len(features))
print('Categorical:', categorical)
print('Numeric:', numeric)

X = model_data[features]
y = model_data['y_bin']

# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# 18.2 Preprocessing pipeline
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric),
    ('cat', cat_pipeline, categorical)
], remainder='drop')

# 18.3 Models to compare
models = {
    'logistic': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'rf': RandomForestClassifier(n_jobs=-1, class_weight='balanced')
}

print("everything is OK till this point")
# Try to add LightGBM and XGBoost if installed
try:
    import lightgbm as lgb
    models['lgb'] = lgb.LGBMClassifier(n_jobs=-1)
except Exception:
    print('LightGBM not available')

try:
    import xgboost as xgb
    models['xgb'] = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
except Exception:
    print('XGBoost not available')

# 18.4 Helper to fit and evaluate
results = {}
for name, estimator in models.items():
    print('Training pipeline for', name)
    pipe = Pipeline([
        ('preproc', preprocessor),
        ('clf', estimator)
    ])

    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    if hasattr(pipe, 'predict_proba'):
        y_prob = pipe.predict_proba(X_test)[:,1]
    else:
        # some xgboost wrappers might not have predict_proba in older versions
        y_prob = pipe.named_steps['clf'].predict_proba(preprocessor.transform(X_test))[:,1]

    print('Model:', name)
    print(classification_report(y_test, y_pred))
    print('ROC AUC:', roc_auc_score(y_test, y_prob))

    # Precision-recall AUC
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    pr_auc = auc(recall, precision)
    print('PR AUC:', pr_auc)

    # Save
    model_path = PROJECT_ROOT / 'models'
    model_path.mkdir(exist_ok=True, parents=True)
    joblib.dump(pipe, model_path / f'{name}_pipeline.joblib')
    print('Saved pipeline to', model_path / f'{name}_pipeline.joblib')

    results[name] = {
        'roc_auc': roc_auc_score(y_test, y_prob),
        'pr_auc': pr_auc
    }

print('Model comparison results:', results)

# %%
"""## 19. Simple hyperparameter tuning example (RandomForest)
A GridSearchCV example with a small parameter grid. For larger searches consider RandomizedSearchCV or Optuna.
"""

param_grid = {
    'clf__n_estimators': [100, 300],
    'clf__max_depth': [None, 6, 12]
}

rf_pipe = Pipeline([
    ('preproc', preprocessor),
    ('clf', RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=42))
])

grid = GridSearchCV(rf_pipe, param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print('Best params:', grid.best_params_)
print('Best CV score:', grid.best_score_)

# Save best model
joblib.dump(grid.best_estimator_, PROJECT_ROOT / 'models' / 'rf_grid_best.joblib')
print('Saved best RF grid model')

# %%
"""## 20. Model explainability hints
- Use SHAP (shap.Explainer) on tree-based models to extract feature importances and local explanations.
- For logistic regression, coefficients + odds ratios are useful.

# Quick feature importance for the saved RF grid model
"""
try:
    best = grid.best_estimator_
    # Extract feature names after preprocessing
    ohe_cols = []
    if isinstance(best.named_steps['preproc'].named_transformers_.get('cat').named_steps['onehot'], OneHotEncoder):
        ohe = best.named_steps['preproc'].named_transformers_['cat'].named_steps['onehot']
        ohe_cols = list(ohe.get_feature_names_out(categorical))

    feature_names = numeric + ohe_cols
    importances = best.named_steps['clf'].feature_importances_
    fi = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    print('
Top features from RF:')
    print(fi.head(20))
    fi.to_csv(PROJECT_ROOT / 'models' / 'rf_feature_importances.csv')
except Exception as e:
    print('Could not extract feature importances:', e)

# %%
"""## 21. Export: model, preprocessing objects and README
Save a small README for using the trained model(s).
"""
readme = PROJECT_ROOT / 'models' / 'README.md'
readme.write_text('# Trained models

Pipelines saved: logistic_pipeline.joblib, rf_pipeline.joblib, lgb_pipeline.joblib (if present)

To load and predict:

```python
import joblib
pipe = joblib.load("rf_pipeline.joblib")
proba = pipe.predict_proba(X_new)[:,1]
```')

print('Wrote model README at', readme)

# %%
"""## 22. Next automation steps (suggested)
- Add this notebook/script to CI and run profiling as a nightly job (save artifacts to storage).
- Add DVC to version datasets and models.
- Wrap the best pipeline in a FastAPI service endpoint for production scoring.
- Create a small dashboard for campaign uplift and decile analysis to prioritize prospects.

End of extended notebook additions.
"""


SyntaxError: unterminated string literal (detected at line 516) (2861340117.py, line 516)