Step 1. Import all libraries

In [29]:
# Data Handling & Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration
pd.options.display.max_columns = None  # None = no cap on the number of columns shown
sns.set(style="whitegrid")             # seaborn theme applied to all subsequent plots


Step 2. Load Dataset and Inspect

In [None]:

import pandas as pd
import numpy as np

# Load, clean, encode, and prune the dataset.
# Names defined by this cell: `df`, `cat_cols`, `df_encoded`, `corr_matrix`,
# `mask_upper`, `high_corr`, `df_final`.

# 1. Load original raw CSV (semicolon-separated)
# 2. Replace any "unknown" with NaN so it is treated as a missing value.
#    `replace` returns a new frame, so no in-place mutation is needed.
df = pd.read_csv("data/bank-additional-full.csv", sep=";").replace("unknown", np.nan)

# 3. Impute all categorical missing values with mode
#    NOTE(review): the mode is computed on the full dataset, i.e. before the
#    train/test split performed later — confirm this leakage is acceptable.
for col in ['job', 'marital', 'education', 'default', 'housing', 'loan']:
    df[col] = df[col].fillna(df[col].mode()[0])

# 4. One-hot encode ALL categorical columns (including month, day_of_week, contact, poutcome)
#    and drop the first level to avoid redundant columns.
#    Leave 'y' untouched for now.
cat_cols = ['job', 'marital', 'education', 'default',
            'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# 5. Encode target column 'y' to 0/1
#    `Series.map` turns every label missing from the dict into NaN; fail here
#    with a clear message instead of carrying NaN targets into later cells.
y_encoded = df['y'].map({'no': 0, 'yes': 1})
if y_encoded.isna().any():
    bad_labels = df.loc[y_encoded.isna(), 'y'].unique().tolist()
    raise ValueError(
        f"Target column 'y' contains values other than 'no'/'yes': {bad_labels}"
    )
df_encoded['y'] = y_encoded

# 6. Confirm there are no object dtypes remaining:
print("➡️ dtypes after encoding:", df_encoded.dtypes.value_counts())

# 7. (Same as before) Drop highly correlated features if necessary:
#    Compute correlation matrix and drop 'emp.var.rate' and 'nr.employed'
#    The matrix is built from `df_encoded`, which still contains the target
#    'y', so feature–target pairs can show up in the listing as well.
corr_matrix = df_encoded.corr(numeric_only=True)
# Keep only the strict upper triangle (k=1) so each pair is listed once and
# the diagonal of self-correlations is excluded.
mask_upper = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
high_corr = (corr_matrix.where(mask_upper)
             .stack()
             .reset_index()
             .rename(columns={'level_0': 'f1', 'level_1': 'f2', 0: 'corr'}))
high_corr = high_corr.loc[high_corr['corr'].abs() > 0.85]

print("➡️ Highly correlated pairs (|corr|>0.85):")
print(high_corr)

# If the same three remain, drop 'emp.var.rate' and 'nr.employed'
# The drop list is hard-coded rather than derived from `high_corr`, and
# errors='ignore' skips a listed column silently if it is not present.
# NOTE(review): if this file is the UCI Bank Marketing data, its description
# advises excluding `duration` from realistic predictive models — confirm
# whether that column should be dropped here too.
df_final = df_encoded.drop(columns=['emp.var.rate', 'nr.employed'], errors='ignore')

print("➡️ Final DataFrame shape:", df_final.shape)
print("➡️ dtypes after dropping:", df_final.dtypes.value_counts())


Train/Test Splitting

In [None]:
# Cell: Re-Split & Save Pickle Splits (Member 1)

from sklearn.model_selection import train_test_split
import pickle

# Target is the 'y' column; the features are every other column of df_final.
y = df_final["y"]
X = df_final.drop(columns="y")

print("➡️ X shape:", X.shape, "| y shape:", y.shape)

# 80/20 split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("➡️ X_train:", X_train.shape, "| y_train:", y_train.shape)
print("➡️ X_test :", X_test.shape,  "| y_test :", y_test.shape)

# Pickle each split to its own file in the current working directory.
splits_by_file = {
    "X_train.pkl": X_train,
    "X_test.pkl": X_test,
    "y_train.pkl": y_train,
    "y_test.pkl": y_test,
}
for filename, split in splits_by_file.items():
    with open(filename, "wb") as handle:
        pickle.dump(split, handle)

print("✅ Saved new pickle splits.")
