# Data Retrieval


In [1]:
import os

import pandas as pd
from ucimlrepo import fetch_ucirepo

In [2]:
folder_path = os.path.join("..", "dataset")
data_path = os.path.join(folder_path, "data.csv")
labels_path = os.path.join(folder_path, "labels.csv")

In [3]:
def check_if_dataset_exists():
    data_exists = os.path.exists(data_path)
    labels_exists = os.path.exists(labels_path)
    return data_exists and labels_exists

In [4]:
def read_in_from_csv():
    x = pd.read_csv(data_path, sep=";")
    y = pd.read_csv(labels_path, sep=";")
    return x, y

In [5]:
def save_to_csv(x, y):
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    os.makedirs(os.path.dirname(labels_path), exist_ok=True)
    x.to_csv(data_path, index=False, sep=";")
    y.to_csv(labels_path, index=False, sep=";")

In [6]:
def fetch_dataset():
    print("Fetching dataset...")
    bank_marketing = fetch_ucirepo(id=222)
    x = bank_marketing.data.features
    y = bank_marketing.data.targets
    save_to_csv(x, y)

    os.makedirs(os.path.dirname(folder_path), exist_ok=True)
    bank_marketing.variables.to_csv(
        os.path.join(folder_path, "variables.csv"), index=False
    )
    return x, y, bank_marketing.variables

In [7]:
dataset_exists = check_if_dataset_exists()

if dataset_exists:
    print("Dataset already exists.")
    x, y = read_in_from_csv()
    variables = pd.read_csv(os.path.join(folder_path, "variables.csv"))
else:
    x, y, variables = fetch_dataset()

Dataset already exists.


# Data Transforms


## Binarization


In [8]:
# Binary encode yes/no columns
for col in ["default", "housing", "loan"]:
    x[col] = x[col].map({"yes": 1, "no": 0})

In [9]:
y["y"] = y["y"].map({"yes": 1, "no": 0})

In [10]:
y

Unnamed: 0,y
0,0
1,0
2,0
3,0
4,0
...,...
45206,1
45207,1
45208,1
45209,0


## Scaling


In [11]:
# Why
# 1. ensures all features constribute equally to penalty if using L1/L2 regularization
# 2. faster convergence
# 3. avoid dominance of on term over others if creating interaction vars

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

x[["age", "balance", "campaign"]] = scaler.fit_transform(
    x[["age", "balance", "campaign"]]
)
x

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,0.519481,management,married,tertiary,0,0.092259,1,0,,5,may,261,0.000000,-1,0,
1,0.337662,technician,single,secondary,0,0.073067,1,0,,5,may,151,0.000000,-1,0,
2,0.194805,entrepreneur,married,secondary,0,0.072822,1,1,,5,may,76,0.000000,-1,0,
3,0.376623,blue-collar,married,,0,0.086476,1,0,,5,may,92,0.000000,-1,0,
4,0.194805,,single,,0,0.072812,0,0,,5,may,198,0.000000,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.428571,technician,married,tertiary,0,0.080293,0,0,cellular,17,nov,977,0.032258,-1,0,
45207,0.688312,retired,divorced,primary,0,0.088501,0,0,cellular,17,nov,456,0.016129,-1,0,
45208,0.701299,retired,married,secondary,0,0.124689,0,0,cellular,17,nov,1127,0.064516,184,3,success
45209,0.506494,blue-collar,married,secondary,0,0.078868,0,0,telephone,17,nov,508,0.048387,-1,0,


## One-hot encoding


In [12]:
x = pd.get_dummies(x, columns=["job", "marital", "contact", "month"], prefix_sep="_")
x

Unnamed: 0,age,education,default,balance,housing,loan,day_of_week,duration,campaign,pdays,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,0.519481,tertiary,0,0.092259,1,0,5,261,0.000000,-1,...,False,False,False,False,False,False,True,False,False,False
1,0.337662,secondary,0,0.073067,1,0,5,151,0.000000,-1,...,False,False,False,False,False,False,True,False,False,False
2,0.194805,secondary,0,0.072822,1,1,5,76,0.000000,-1,...,False,False,False,False,False,False,True,False,False,False
3,0.376623,,0,0.086476,1,0,5,92,0.000000,-1,...,False,False,False,False,False,False,True,False,False,False
4,0.194805,,0,0.072812,0,0,5,198,0.000000,-1,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.428571,tertiary,0,0.080293,0,0,17,977,0.032258,-1,...,False,False,False,False,False,False,False,True,False,False
45207,0.688312,primary,0,0.088501,0,0,17,456,0.016129,-1,...,False,False,False,False,False,False,False,True,False,False
45208,0.701299,secondary,0,0.124689,0,0,17,1127,0.064516,184,...,False,False,False,False,False,False,False,True,False,False
45209,0.506494,secondary,0,0.078868,0,0,17,508,0.048387,-1,...,False,False,False,False,False,False,False,True,False,False


## Ordinal Categorization


In [13]:
# ordinal encode 'education' and 'poutcome'

edu_order = ["unknown", "primary", "secondary", "tertiary"]
pout_order = ["unknown", "failure", "other", "success"]
x["education"] = pd.Categorical(
    x["education"], categories=edu_order, ordered=True
).codes
x["poutcome"] = pd.Categorical(x["poutcome"], categories=pout_order, ordered=True).codes
x

Unnamed: 0,age,education,default,balance,housing,loan,day_of_week,duration,campaign,pdays,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,0.519481,3,0,0.092259,1,0,5,261,0.000000,-1,...,False,False,False,False,False,False,True,False,False,False
1,0.337662,2,0,0.073067,1,0,5,151,0.000000,-1,...,False,False,False,False,False,False,True,False,False,False
2,0.194805,2,0,0.072822,1,1,5,76,0.000000,-1,...,False,False,False,False,False,False,True,False,False,False
3,0.376623,-1,0,0.086476,1,0,5,92,0.000000,-1,...,False,False,False,False,False,False,True,False,False,False
4,0.194805,-1,0,0.072812,0,0,5,198,0.000000,-1,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.428571,3,0,0.080293,0,0,17,977,0.032258,-1,...,False,False,False,False,False,False,False,True,False,False
45207,0.688312,1,0,0.088501,0,0,17,456,0.016129,-1,...,False,False,False,False,False,False,False,True,False,False
45208,0.701299,2,0,0.124689,0,0,17,1127,0.064516,184,...,False,False,False,False,False,False,False,True,False,False
45209,0.506494,2,0,0.078868,0,0,17,508,0.048387,-1,...,False,False,False,False,False,False,False,True,False,False


## Drop columns


In [14]:
x = x.drop(columns=["duration"])
x

Unnamed: 0,age,education,default,balance,housing,loan,day_of_week,campaign,pdays,previous,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,0.519481,3,0,0.092259,1,0,5,0.000000,-1,0,...,False,False,False,False,False,False,True,False,False,False
1,0.337662,2,0,0.073067,1,0,5,0.000000,-1,0,...,False,False,False,False,False,False,True,False,False,False
2,0.194805,2,0,0.072822,1,1,5,0.000000,-1,0,...,False,False,False,False,False,False,True,False,False,False
3,0.376623,-1,0,0.086476,1,0,5,0.000000,-1,0,...,False,False,False,False,False,False,True,False,False,False
4,0.194805,-1,0,0.072812,0,0,5,0.000000,-1,0,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.428571,3,0,0.080293,0,0,17,0.032258,-1,0,...,False,False,False,False,False,False,False,True,False,False
45207,0.688312,1,0,0.088501,0,0,17,0.016129,-1,0,...,False,False,False,False,False,False,False,True,False,False
45208,0.701299,2,0,0.124689,0,0,17,0.064516,184,3,...,False,False,False,False,False,False,False,True,False,False
45209,0.506494,2,0,0.078868,0,0,17,0.048387,-1,0,...,False,False,False,False,False,False,False,True,False,False


# Save dataset


In [15]:
combined = pd.concat([x, y], axis=1)
combined

Unnamed: 0,age,education,default,balance,housing,loan,day_of_week,campaign,pdays,previous,...,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,y
0,0.519481,3,0,0.092259,1,0,5,0.000000,-1,0,...,False,False,False,False,False,True,False,False,False,0
1,0.337662,2,0,0.073067,1,0,5,0.000000,-1,0,...,False,False,False,False,False,True,False,False,False,0
2,0.194805,2,0,0.072822,1,1,5,0.000000,-1,0,...,False,False,False,False,False,True,False,False,False,0
3,0.376623,-1,0,0.086476,1,0,5,0.000000,-1,0,...,False,False,False,False,False,True,False,False,False,0
4,0.194805,-1,0,0.072812,0,0,5,0.000000,-1,0,...,False,False,False,False,False,True,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.428571,3,0,0.080293,0,0,17,0.032258,-1,0,...,False,False,False,False,False,False,True,False,False,1
45207,0.688312,1,0,0.088501,0,0,17,0.016129,-1,0,...,False,False,False,False,False,False,True,False,False,1
45208,0.701299,2,0,0.124689,0,0,17,0.064516,184,3,...,False,False,False,False,False,False,True,False,False,1
45209,0.506494,2,0,0.078868,0,0,17,0.048387,-1,0,...,False,False,False,False,False,False,True,False,False,0


In [16]:
combined.to_csv("../dataset/dataset_prepped.csv", index=False, encoding="utf-8")