# Data Retrieval


In [1]:
import os

import pandas as pd
from ucimlrepo import fetch_ucirepo

In [2]:
folder_path = os.path.join("..", "dataset")
data_path = os.path.join(folder_path, "data.csv")
labels_path = os.path.join(folder_path, "labels.csv")

In [3]:
def check_if_dataset_exists():
    data_exists = os.path.exists(data_path)
    labels_exists = os.path.exists(labels_path)
    return data_exists and labels_exists

In [4]:
def read_in_from_csv():
    x = pd.read_csv(data_path, sep=";")
    y = pd.read_csv(labels_path, sep=";")
    return x, y

In [5]:
def save_to_csv(x, y):
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    os.makedirs(os.path.dirname(labels_path), exist_ok=True)
    x.to_csv(data_path, index=False, sep=";")
    y.to_csv(labels_path, index=False, sep=";")

In [6]:
def fetch_dataset():
    print("Fetching dataset...")
    bank_marketing = fetch_ucirepo(id=222)
    x = bank_marketing.data.features
    y = bank_marketing.data.targets
    save_to_csv(x, y)

    os.makedirs(os.path.dirname(folder_path), exist_ok=True)
    bank_marketing.variables.to_csv(
        os.path.join(folder_path, "variables.csv"), index=False
    )
    return x, y, bank_marketing.variables

In [7]:
dataset_exists = check_if_dataset_exists()

if dataset_exists:
    print("Dataset already exists.")
    x, y = read_in_from_csv()
    variables = pd.read_csv(os.path.join(folder_path, "variables.csv"))
else:
    x, y, variables = fetch_dataset()

Dataset already exists.


# Split data to prevent leakage

# Data Transforms


## Binarization


In [8]:
# Binary encode yes/no columns
for col in ["default", "housing", "loan"]:
    x[col] = x[col].map({"yes": 1, "no": 0})

In [9]:
y["y"] = y["y"].map({"yes": 1, "no": 0})

In [10]:
y

Unnamed: 0,y
0,0
1,0
2,0
3,0
4,0
...,...
45206,1
45207,1
45208,1
45209,0


## One-hot encoding


In [11]:
x = pd.get_dummies(x, columns=["job", "marital", "contact", "month"], prefix_sep="_")
x

Unnamed: 0,age,education,default,balance,housing,loan,day_of_week,duration,campaign,pdays,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,58,tertiary,0,2143,1,0,5,261,1,-1,...,False,False,False,False,False,False,True,False,False,False
1,44,secondary,0,29,1,0,5,151,1,-1,...,False,False,False,False,False,False,True,False,False,False
2,33,secondary,0,2,1,1,5,76,1,-1,...,False,False,False,False,False,False,True,False,False,False
3,47,,0,1506,1,0,5,92,1,-1,...,False,False,False,False,False,False,True,False,False,False
4,33,,0,1,0,0,5,198,1,-1,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,tertiary,0,825,0,0,17,977,3,-1,...,False,False,False,False,False,False,False,True,False,False
45207,71,primary,0,1729,0,0,17,456,2,-1,...,False,False,False,False,False,False,False,True,False,False
45208,72,secondary,0,5715,0,0,17,1127,5,184,...,False,False,False,False,False,False,False,True,False,False
45209,57,secondary,0,668,0,0,17,508,4,-1,...,False,False,False,False,False,False,False,True,False,False


## Ordinal Categorization


In [12]:
# ordinal encode 'education' and 'poutcome'

edu_order = ["unknown", "primary", "secondary", "tertiary"]
pout_order = ["unknown", "failure", "other", "success"]
x["education"] = pd.Categorical(
    x["education"], categories=edu_order, ordered=True
).codes
x["poutcome"] = pd.Categorical(x["poutcome"], categories=pout_order, ordered=True).codes
x

Unnamed: 0,age,education,default,balance,housing,loan,day_of_week,duration,campaign,pdays,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,58,3,0,2143,1,0,5,261,1,-1,...,False,False,False,False,False,False,True,False,False,False
1,44,2,0,29,1,0,5,151,1,-1,...,False,False,False,False,False,False,True,False,False,False
2,33,2,0,2,1,1,5,76,1,-1,...,False,False,False,False,False,False,True,False,False,False
3,47,-1,0,1506,1,0,5,92,1,-1,...,False,False,False,False,False,False,True,False,False,False
4,33,-1,0,1,0,0,5,198,1,-1,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,3,0,825,0,0,17,977,3,-1,...,False,False,False,False,False,False,False,True,False,False
45207,71,1,0,1729,0,0,17,456,2,-1,...,False,False,False,False,False,False,False,True,False,False
45208,72,2,0,5715,0,0,17,1127,5,184,...,False,False,False,False,False,False,False,True,False,False
45209,57,2,0,668,0,0,17,508,4,-1,...,False,False,False,False,False,False,False,True,False,False


## Drop columns


In [13]:
x = x.drop(columns=["duration"])
x

Unnamed: 0,age,education,default,balance,housing,loan,day_of_week,campaign,pdays,previous,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,58,3,0,2143,1,0,5,1,-1,0,...,False,False,False,False,False,False,True,False,False,False
1,44,2,0,29,1,0,5,1,-1,0,...,False,False,False,False,False,False,True,False,False,False
2,33,2,0,2,1,1,5,1,-1,0,...,False,False,False,False,False,False,True,False,False,False
3,47,-1,0,1506,1,0,5,1,-1,0,...,False,False,False,False,False,False,True,False,False,False
4,33,-1,0,1,0,0,5,1,-1,0,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,3,0,825,0,0,17,3,-1,0,...,False,False,False,False,False,False,False,True,False,False
45207,71,1,0,1729,0,0,17,2,-1,0,...,False,False,False,False,False,False,False,True,False,False
45208,72,2,0,5715,0,0,17,5,184,3,...,False,False,False,False,False,False,False,True,False,False
45209,57,2,0,668,0,0,17,4,-1,0,...,False,False,False,False,False,False,False,True,False,False


## Scaling

In [14]:
# Split data before scaling 
from sklearn.model_selection import train_test_split

# Assuming X and y are your feature and target DataFrames/Series:

# 1. Split off 20% for test
X_train_val, X_test, y_train_val, y_test = train_test_split(
    x, y,
    test_size=0.20,
    stratify=y,        # maintain class distribution
    random_state=42
)

# 2. From the remaining 80%, split 12.5% (0.125) → 10% of overall for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.125,   # 0.125 * 0.80 = 0.10 overall
    stratify=y_train_val,
    random_state=42
)
# 3. Verify the splits
print(f"Train set:      {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape},   {y_val.shape}")
print(f"Test set:       {X_test.shape},  {y_test.shape}")


Train set:      (31647, 39), (31647, 1)
Validation set: (4521, 39),   (4521, 1)
Test set:       (9043, 39),  (9043, 1)


In [15]:

# Why
# 1. ensures all features constribute equally to penalty if using L1/L2 regularization
# 2. faster convergence
# 3. avoid dominance of on term over others if creating interaction vars

from sklearn.preprocessing import MinMaxScaler

def scale_independent(df, cols):
    """Fit & transform a new MinMaxScaler on df[cols]."""
    scaler = MinMaxScaler()
    df[cols] = scaler.fit_transform(df[cols])
    return df

numeric_cols = ["age", "balance", "campaign"]
X_train = scale_independent(X_train, numeric_cols)
X_val   = scale_independent(X_val,   numeric_cols)
X_test  = scale_independent(X_test,  numeric_cols)


In [16]:
X_test

Unnamed: 0,age,education,default,balance,housing,loan,day_of_week,campaign,pdays,previous,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
1392,0.293333,1,0,0.036442,1,1,8,0.018519,-1,0,...,False,False,False,False,False,False,True,False,False,False
7518,0.346667,2,0,0.033861,1,0,30,0.018519,-1,0,...,False,False,False,False,False,False,True,False,False,False
12007,0.173333,2,0,0.033644,1,0,20,0.074074,-1,0,...,False,False,False,False,True,False,False,False,False,False
5536,0.240000,1,0,0.036590,1,0,23,0.055556,-1,0,...,False,False,False,False,False,False,True,False,False,False
29816,0.213333,2,0,0.049066,1,0,4,0.000000,-1,0,...,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12636,0.106667,2,0,0.026854,0,1,4,0.000000,-1,0,...,False,False,False,True,False,False,False,False,False,False
13364,0.413333,2,0,0.038847,1,0,8,0.037037,-1,0,...,False,False,False,True,False,False,False,False,False,False
16102,0.186667,3,0,0.039635,1,0,22,0.018519,-1,0,...,False,False,False,True,False,False,False,False,False,False
42097,0.026667,1,0,0.030136,0,0,2,0.055556,182,4,...,False,False,False,False,False,False,False,True,False,False


# Save dataset


In [17]:
# Combine train
combined_train = pd.concat([X_train, y_train], axis=1)

# Combine validation
combined_val   = pd.concat([X_val,   y_val],   axis=1)

# Combine test
combined_test  = pd.concat([X_test,  y_test],  axis=1)

# Inspect
print(combined_train.shape, combined_val.shape, combined_test.shape)

(31647, 40) (4521, 40) (9043, 40)


In [18]:
combined_train.to_csv("../dataset/train.csv", index=False, encoding="utf-8")
combined_val.to_csv("../dataset/validate.csv",   index=False, encoding="utf-8")
combined_test.to_csv("../dataset/test.csv",  index=False, encoding="utf-8")