In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): no category/module filter is given, so library warnings
# (e.g. solver convergence or deprecation notices) are hidden as well —
# consider narrowing this once the notebook is stable.
warnings.filterwarnings("ignore")

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [3]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
import optuna as op

In [4]:
# Read the semicolon-delimited CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="bank-full.csv", delimiter=";")

In [5]:
# Peek at the first five records to sanity-check the parse.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [6]:
# (n_rows, n_columns) of the raw data, before any encoding is applied.
df.shape

(45211, 17)

In [7]:
# Class balance of the target column "y".
df.loc[:, "y"].value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [8]:
# Minority-to-majority class ratio of the target.
# Derived from the data instead of retyping the counts shown by the previous
# cell, so the figure cannot drift if the input file changes. Using min/max
# keeps the expression independent of how the labels are encoded
# ("yes"/"no" here, 1/0 after the encoding cells have run).
class_counts = df["y"].value_counts()
float(class_counts.min() / class_counts.max())

0.1324833425179099

In [9]:
# Shape of the array of distinct month labels, i.e. how many months occur.
pd.unique(df["month"]).shape

(12,)

In [10]:
# Month abbreviations grouped under the season label they are collapsed to.
_season_months = {
    "Spring": ("mar", "apr", "may"),
    "Summer": ("jun", "jul", "aug"),
    "Autumn": ("sep", "oct", "nov"),
    "Winter": ("dec", "jan", "feb"),
}

# Per-column value mappings, keyed by column name:
#   education -> integer rank, with "unknown" placed at 0
#   month     -> one of four season labels (still a string category)
#   y         -> binary target, "yes" = 1 / "no" = 0
ordinal_cate = {
    "education": {"unknown": 0, "primary": 1, "secondary": 2, "tertiary": 3},
    "month": {
        month: season
        for season, months in _season_months.items()
        for month in months
    },
    "y": {"yes": 1, "no": 0},
}


In [11]:
# 1) Apply the per-column encodings from `ordinal_cate`.
# 2) Relabel every remaining "unknown" value as "AA" so that it becomes the
#    first level of its column and is the one removed by the
#    `drop_first=True` one-hot encoding in the next cell.
df = (
    df
    .replace(ordinal_cate)
    .replace({"unknown": "AA"})
)

In [12]:
# One-hot encode the remaining string columns, leaving out the first level of
# each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)

# Report the widened shape, then preview the encoded frame.
print(df.shape)
df.head(n=5)

(45211, 33)


Unnamed: 0,age,education,balance,day,duration,campaign,pdays,previous,y,job_admin.,...,housing_yes,loan_yes,contact_cellular,contact_telephone,month_Spring,month_Summer,month_Winter,poutcome_failure,poutcome_other,poutcome_success
0,58,3,2143,5,261,1,-1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1,44,2,29,5,151,1,-1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,33,2,2,5,76,1,-1,0,0,0,...,1,1,0,0,1,0,0,0,0,0
3,47,0,1506,5,92,1,-1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,33,0,1,5,198,1,-1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [13]:
# List the column labels after one-hot encoding to check which dummy levels
# were kept.
df.columns

Index(['age', 'education', 'balance', 'day', 'duration', 'campaign', 'pdays',
       'previous', 'y', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'marital_married', 'marital_single', 'default_yes', 'housing_yes',
       'loan_yes', 'contact_cellular', 'contact_telephone', 'month_Spring',
       'month_Summer', 'month_Winter', 'poutcome_failure', 'poutcome_other',
       'poutcome_success'],
      dtype='object')

In [14]:
# Separate the predictors from the binary target column "y".
X = df.drop(columns=["y"])
y = df["y"]

# Preview the feature matrix.
X.head(n=5)

Unnamed: 0,age,education,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,...,housing_yes,loan_yes,contact_cellular,contact_telephone,month_Spring,month_Summer,month_Winter,poutcome_failure,poutcome_other,poutcome_success
0,58,3,2143,5,261,1,-1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1,44,2,29,5,151,1,-1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,33,2,2,5,76,1,-1,0,0,0,...,1,1,0,0,1,0,0,0,0,0
3,47,0,1506,5,92,1,-1,0,0,1,...,1,0,0,0,1,0,0,0,0,0
4,33,0,1,5,198,1,-1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [15]:
# Seed passed to the stochastic steps of this notebook (the train/test split
# and the SMOTE resampler) so that their results are repeatable.
SEED: int = 2

In [17]:
# Hold out 30% of the rows for testing; the split is seeded for repeatability.
# NOTE(review): the split is not stratified on `y` — with a minority class of
# roughly 12% consider `stratify=y` if exact class proportions matter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SEED,
)

# Shapes of the two feature partitions.
tuple(part.shape for part in (X_train, X_test))

((31647, 32), (13564, 32))

In [18]:
# Label of the resampling strategy used in this run.
SAMPLING = "SMOTE"

# Oversample the training partition only; the test partition is left untouched.
# The resampled data is bound back to X_train / y_train, replacing the
# original split.
# NOTE(review): SMOTENC is the imbalanced-learn variant intended for data
# that mixes numeric and categorical (dummy) columns — confirm plain SMOTE is
# intended here.
smote = SMOTE(random_state=SEED)
X_train, y_train = smote.fit_resample(X=X_train, y=y_train)

# Class counts after resampling.
y_train.value_counts()

0    27923
1    27923
Name: y, dtype: int64

In [19]:
def _report_split(model, features, target):
    """Score ``model`` on one data split.

    Prints the text classification report with 5 digits and returns the
    predictions together with the same report as a dict computed with
    ``digits=8``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    features : inputs of the split.
    target : true labels of the split.

    Returns
    -------
    tuple
        ``(predictions, report_dict)``.
    """
    predictions = model.predict(features)
    print(classification_report(target, predictions, digits=5))
    return predictions, classification_report(
        target, predictions, digits=8, output_dict=True
    )


# Logistic regression without a penalty term, fitted on the current
# (resampled) training data.
# NOTE(review): newer scikit-learn releases deprecate the string 'none' in
# favour of `penalty=None`; switch when the environment is upgraded.
# NOTE(review): the features are not standardised and warnings are silenced
# globally in the first cell, so a solver convergence warning would go
# unnoticed — verify that the fit converges.
logr = LogisticRegression(penalty='none')
logr.fit(X_train, y_train)

# Hold-out split first, then the training split (same print order as before).
y_pred_test, report_test = _report_split(logr, X_test, y_test)
y_pred_train, report_train = _report_split(logr, X_train, y_train)

# Previously `y_pred` and `report` were reused for both splits, so the
# test-set values were overwritten before they could be used. Both results
# are now kept under distinct names; the old names stay bound to the
# training-split results, exactly as they were at the end of the original cell.
y_pred, report = y_pred_train, report_train



              precision    recall  f1-score   support

           0    0.94226   0.88407   0.91224     11999
           1    0.39679   0.58466   0.47275      1565

    accuracy                        0.84953     13564
   macro avg    0.66953   0.73437   0.69249     13564
weighted avg    0.87933   0.84953   0.86153     13564

              precision    recall  f1-score   support

           0    0.86669   0.88501   0.87575     27923
           1    0.88252   0.86388   0.87310     27923

    accuracy                        0.87444     55846
   macro avg    0.87461   0.87444   0.87443     55846
weighted avg    0.87461   0.87444   0.87443     55846

