In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score

In [7]:
# Load the raw dataset; fields in the file are separated by semicolons.
df = pd.read_csv('bank_marketing_data.csv', sep=';')
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


## 1. Cleanup

In [9]:
# Columns removed from the frame before any further processing.
obsolete = ['default', 'duration', 'pdays', 'poutcome']
df = df.drop(columns=obsolete)

In [10]:
# Keep every row whose education level is anything other than 'illiterate'.
not_illiterate = df['education'].ne('illiterate')
df = df.loc[not_illiterate, :]

In [11]:
# Value recodings for individual columns; `mapping` at the bottom groups them
# by the column each one applies to.

# Collapse the detailed schooling levels into three broader groups.
education_mapping = {
    **dict.fromkeys(("basic.4y", "basic.6y", "basic.9y"), "primary"),
    **dict.fromkeys(("high.school", "professional.course"), "secondary"),
    "university.degree": "tertiary",
}

# Binary code for the contact channel.
contact_mapping = dict(telephone=0, cellular=1)

# Month abbreviation -> calendar position (jan=1 ... dec=12).
month_mapping = {
    name: number
    for number, name in enumerate(
        ("jan", "feb", "mar", "apr", "may", "jun",
         "jul", "aug", "sep", "oct", "nov", "dec"),
        start=1,
    )
}

# Weekday abbreviation -> position in the working week (mon=1 ... fri=5).
day_of_week_mapping = {
    name: number
    for number, name in enumerate(("mon", "tue", "wed", "thu", "fri"), start=1)
}

# Target labels as 0/1.
y_mapping = dict(no=0, yes=1)

mapping = dict(
    education=education_mapping,
    contact=contact_mapping,
    month=month_mapping,
    day_of_week=day_of_week_mapping,
    y=y_mapping,
)

In [12]:
# Recode the raw values column by column using the nested `mapping` dict.
df = df.replace(mapping)

# `replace` does not guarantee a numeric dtype for columns whose strings were
# swapped for integers: pandas >= 2.2 deprecates the implicit object -> int
# downcast (FutureWarning), after which such columns would stay `object`.
# Cast the integer-coded columns explicitly so that dtype-based feature
# selection and the 0/1 target do not depend on that behaviour.
# The cast raises if any value in those columns was left unmapped.
integer_coded_columns = [
    column
    for column, codes in mapping.items()
    if all(isinstance(code, int) for code in codes.values())
]
df = df.astype({column: 'int64' for column in integer_coded_columns})

In [13]:
# Separate the target column from the feature columns.
y = df['y']
df = df.drop(columns='y')

## 2. Split

In [14]:
# Hold out 25% of the rows for testing; the split is stratified on the target
# and seeded so it can be reproduced.
splits = train_test_split(df, y, stratify=y, test_size=0.25, random_state=0)
df_train, df_test, y_train, y_test = splits

In [15]:
# Partition the columns by dtype: object columns are treated as categorical,
# every other column as numerical.
object_frame = df.select_dtypes(include=["object"])
non_object_frame = df.select_dtypes(exclude=["object"])

categorical_features = object_frame.columns
numerical_features = non_object_frame.columns

## 3. Transformation

In [16]:
# One-hot encode the categorical columns (categories unseen at fit time are
# ignored) and standardise the numerical ones.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
standard_scaler = StandardScaler()

column_transformer = ColumnTransformer(
    transformers=[
        ('categorial_encoding', one_hot_encoder, categorical_features),
        ('numerical_standarization', standard_scaler, numerical_features),
    ]
)

In [18]:
# Learn the encoding and scaling from the training split only, then apply the
# fitted transformer to both splits. Note that `df_train` / `df_test` are
# rebound to the transformed arrays here.
df_train = column_transformer.fit_transform(df_train, y_train)
df_test = column_transformer.transform(df_test)

## 4. Training

### Helpers

In [23]:
def calculate_score(y_true, y_pred):
    """Return ``(recall, precision, f1)`` for the given labels and predictions.

    Each metric is computed with ``average='binary'``.
    """
    metrics = (recall_score, precision_score, f1_score)
    return tuple(metric(y_true, y_pred, average='binary') for metric in metrics)

def print_stats(title, recall, precision, f1):
    """Print a titled report with one tab-indented line per metric.

    The title line is followed by the recall, precision and f1 values,
    in that order.
    """
    report = [f"{title}:"]
    for label, value in (("recall", recall), ("precision", precision), ("f1", f1)):
        report.append(f"\t{label}: {value}")
    print("\n".join(report))

def assess_logistic_regression_model(model, X_train, X_test, y_train, y_test):
    """Print recall, precision and F1 of a fitted model on both splits.

    Predictions are made for the train and test features, scored against the
    matching labels with ``calculate_score``, and reported with
    ``print_stats`` under the titles "Train score" and "Test score".
    """
    # Predict and score both splits first, then print, so nothing is
    # reported unless both evaluations succeed.
    predictions = [model.predict(features) for features in (X_train, X_test)]
    scores = [
        calculate_score(labels, predicted)
        for labels, predicted in zip((y_train, y_test), predictions)
    ]

    for title, (recall, precision, f1) in zip(("Train score", "Test score"), scores):
        print_stats(title, recall, precision, f1)

### 4.1. Classic model

In [19]:
# Baseline: logistic regression with no penalty and no intercept term,
# using 'balanced' class weights.
model = LogisticRegression(fit_intercept=False, class_weight='balanced', penalty=None)

In [21]:
# Fit the baseline model on the transformed training split.
model.fit(X=df_train, y=y_train)

In [24]:
# Report train/test recall, precision and F1 for the baseline model.
assess_logistic_regression_model(
    model=model,
    X_train=df_train,
    X_test=df_test,
    y_train=y_train,
    y_test=y_test,
)

Train score:
	recall: 0.6833477135461605
	precision: 0.26883910386965376
	f1: 0.38587088915956147
Test score:
	recall: 0.6686798964624676
	precision: 0.263784887678693
	f1: 0.37832560410056143


### 4.2. L2 regularized model

In [27]:
# Cross-validated logistic regression with the penalty left at the estimator's
# default: Cs=100, 5 folds, candidates scored by F1.
l2_search_options = {
    'Cs': 100,
    'cv': 5,
    'scoring': 'f1',
    'class_weight': 'balanced',
    'fit_intercept': False,
    'random_state': 0,
    'n_jobs': -1,
}
l2_model = LogisticRegressionCV(**l2_search_options)

In [29]:
# Run the cross-validated search on the training split.
l2_model.fit(X=df_train, y=y_train)

In [30]:
# Report train/test recall, precision and F1 for the L2 model.
assess_logistic_regression_model(
    model=l2_model,
    X_train=df_train,
    X_test=df_test,
    y_train=y_train,
    y_test=y_test,
)

Train score:
	recall: 0.6833477135461605
	precision: 0.26883910386965376
	f1: 0.38587088915956147
Test score:
	recall: 0.6686798964624676
	precision: 0.263784887678693
	f1: 0.37832560410056143


### 4.3. L1 regularized model

In [31]:
# Cross-validated logistic regression with an L1 penalty, fitted with the
# 'saga' solver: Cs=100, 5 folds, candidates scored by F1.
l1_search_options = {
    'penalty': 'l1',
    'solver': 'saga',
    'Cs': 100,
    'cv': 5,
    'scoring': 'f1',
    'class_weight': 'balanced',
    'fit_intercept': False,
    'random_state': 0,
    'n_jobs': -1,
}
l1_model = LogisticRegressionCV(**l1_search_options)

In [32]:
# Run the cross-validated search on the training split.
l1_model.fit(X=df_train, y=y_train)

In [33]:
# Report train/test recall, precision and F1 for the L1 model.
assess_logistic_regression_model(
    model=l1_model,
    X_train=df_train,
    X_test=df_test,
    y_train=y_train,
    y_test=y_test,
)

Train score:
	recall: 0.6833477135461605
	precision: 0.26883910386965376
	f1: 0.38587088915956147
Test score:
	recall: 0.6686798964624676
	precision: 0.263784887678693
	f1: 0.37832560410056143


### 4.4. Polynomial enriched model

In [34]:
# Expand both splits with degree-2 polynomial terms (no bias column). The
# expansion is fitted on the training split; `df_train` / `df_test` are
# rebound to the expanded arrays, so cells run after this one see them.
poly_features = PolynomialFeatures(degree=2, include_bias=False)

df_train = poly_features.fit_transform(df_train, y_train)
df_test = poly_features.transform(df_test)

In [42]:
# Same unpenalised, intercept-free, class-balanced setup as the baseline,
# with the iteration cap set to 10000.
poly_model_options = {
    'penalty': None,
    'fit_intercept': False,
    'class_weight': 'balanced',
    'max_iter': 10000,
    'n_jobs': -1,
}
poly_model = LogisticRegression(**poly_model_options)

In [43]:
# Fit on the polynomially expanded training split.
poly_model.fit(X=df_train, y=y_train)

In [44]:
# Report train/test recall, precision and F1 for the polynomial model.
assess_logistic_regression_model(
    model=poly_model,
    X_train=df_train,
    X_test=df_test,
    y_train=y_train,
    y_test=y_test,
)

Train score:
	recall: 0.6594765602530918
	precision: 0.341932597673725
	f1: 0.4503584405381519
Test score:
	recall: 0.634167385677308
	precision: 0.33093201260693383
	f1: 0.43491124260355035
