In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file under the Kaggle input directory, then print one full path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Data processing

In [2]:
# Read bank-full.csv from the Kaggle input directory and preview its first rows.
DATA_PATH = '/kaggle/input/bankfullcsv/bank-full.csv'
df = pd.read_csv(DATA_PATH)
df.head()

There are no nulls in the data; however, some values are missing and are recorded with the status 'unknown'. Let's take a look at them.

In [3]:
# Turn the 'unknown' placeholder in every non-numeric column into NaN so that
# pandas' missing-value tooling (isna, fillna, ...) can see it.
non_numeric = df.select_dtypes(exclude=['number'])
cat_cols = non_numeric.columns
df[cat_cols] = non_numeric.replace('unknown', np.nan)
df.head()

Percentage of missing values per column:

In [4]:
# The mean of the boolean NaN mask is the fraction of missing rows per column; scale it to percent.
df.isna().mean() * 100

In [5]:
# Keep only the first 1/20 of the rows of each target class (presumably to keep the
# later model searches fast -- confirm), then shuffle the result.
# NOTE: this cell rebinds `df`; re-running it without reloading the CSV shrinks the data again.
no = df[df.Target == 'no']
yes = df[df.Target == 'yes']
df = pd.concat([no.head(len(no.index)//20), yes.head(len(yes.index)//20)], axis=0)
# DataFrame.sample returns a new frame, so the result has to be assigned back; the
# original call discarded it and the rows stayed grouped by class. The seed keeps the
# shuffle reproducible across runs.
df = df.sample(frac=1, random_state=42)
df.head()

In [6]:
# Column dtypes and non-null counts of the down-sampled frame.
df.info()

There are too many missing values in 'poutcome' and 'contact', so we'll drop these columns for now.

We can fill the missing values in the remaining columns by taking the most common value.

In [7]:
# Remove both mostly-missing columns from `df` in a single in-place drop.
df.drop(columns=['contact', 'poutcome'], inplace=True)

In [8]:
# Fill the missing entries of each column with that column's own most frequent value.
# The original code filled 'education' with the most frequent *job*, which injected a
# job title into the education column.
for col in ('job', 'education'):
    df[col] = df[col].fillna(df[col].value_counts().idxmax())
df.info()

In [9]:
# Summary statistics of the frame after imputation.
df.describe()

Change yes/no to 1/0:

In [10]:
# Encode the binary yes/no columns as integers 1/0.
# A single mapping replaces both values in one pass. The explicit cast guarantees
# integer columns instead of relying on pandas silently downcasting object columns
# inside `replace` (deprecated behaviour). The cast also fails loudly if a column
# contains anything other than yes/no (or 1/0 on a re-run).
yes_no_cols = ['default', 'housing', 'loan', 'Target']
df[yes_no_cols] = df[yes_no_cols].replace({'yes': 1, 'no': 0}).astype('int64')
df.head()

Change month names to numbers:

In [11]:
# Build the month-abbreviation -> month-number lookup (jan=1 ... dec=12) from an ordered
# list, then apply it to the 'month' column.
month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
dictionary = {name: number for number, name in enumerate(month_names, start=1)}
df['month'] = df['month'].map(dictionary)
df.head()

In [12]:
# Replace each remaining text column by the integer codes of its pandas Categorical.
for col in ('job', 'marital', 'education'):
    df[col] = pd.Categorical(df[col]).codes
print(df)

In [13]:
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
# Count of rows per value of 'Target', i.e. the class balance of the label.
sns.countplot(x='Target', data=df)

In [15]:
def _plot_count_and_target(column):
    """Show two figures for `column`: its count plot, then a bar plot of 'Target' against it."""
    sns.countplot(x=column, data=df)
    plt.show()
    sns.barplot(x=column, y='Target', data=df)
    plt.show()


interesting_cols = ['marital', 'education', 'job', 'loan']

for col in interesting_cols:
    _plot_count_and_target(col)

## 2. Train test split

In [16]:
from sklearn.model_selection import train_test_split
# Separate features and label *before* splitting. The original split the full frame and
# then dropped 'Target' in place from the resulting slices, which triggers pandas'
# chained-assignment warning and mutates the split outputs.
features = df.drop(columns='Target')
target = df['Target']
# Stratify on the label so the train and test sets keep the same class ratio, in line
# with the StratifiedKFold used for cross-validation later in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.20, random_state=42, stratify=target
)
y_train.head()

In [17]:
# Preview the training feature matrix (it no longer contains the 'Target' column).
X_train.head()

## 3. Preparing models

In [18]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified splitter, passed as `cv` to every GridSearchCV in this notebook.
# No shuffle or random_state is given, so scikit-learn's defaults apply.
kfold = StratifiedKFold(n_splits=5)

In [19]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

### SVC linear

In [20]:
# Standardise the features, then fit a linear-kernel SVM; the regularisation strength C
# is chosen by grid search over the shared `kfold` splitter.
pipe = Pipeline(steps=[
    ('preprocessing', StandardScaler()),
    ('classifier', SVC(kernel='linear')),
])
param_grid = {'classifier__C': [0.01, 0.1, 1, 10]}

# GridSearchCV.fit returns the fitted search object itself.
grid_1 = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=kfold).fit(X_train, y_train)
print(grid_1.best_params_, grid_1.best_score_, sep='\n')

### SVC poly

In [21]:
# Same scaling + SVM pipeline as for the linear case, but with a polynomial kernel and a
# wider range of C values.
scaler_step = ('preprocessing', StandardScaler())
svc_poly_step = ('classifier', SVC(kernel='poly'))
pipe = Pipeline([scaler_step, svc_poly_step])

param_grid = dict(classifier__C=[0.01, 0.1, 1, 10, 100])

grid_2 = GridSearchCV(pipe, param_grid, cv=kfold)
grid_2.fit(X_train, y_train)
for fitted_attribute in (grid_2.best_params_, grid_2.best_score_):
    print(fitted_attribute)

### Logistic Regression

In [22]:
# Scaled logistic regression; C is tuned with the shared `kfold` splitter.
pipe = Pipeline(steps=[
    ('preprocessing', StandardScaler()),
    ('classifier', LogisticRegression()),
])
param_grid = {'classifier__C': [0.01, 0.1, 1, 10]}

grid_3 = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=kfold)
grid_3.fit(X_train, y_train)
print(grid_3.best_params_, grid_3.best_score_, sep='\n')

### Random Forests

In [23]:
# Random forest (seeded with 420) behind the same scaling step; the search covers the
# number of trees and the maximum tree depth.
forest = RandomForestClassifier(random_state=420)
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', forest)])

param_grid = dict(
    classifier__n_estimators=[100, 200, 400, 500],
    classifier__max_depth=[10, 15, 20],
)

grid_4 = GridSearchCV(pipe, param_grid, cv=kfold).fit(X_train, y_train)
print(grid_4.best_params_)
print(grid_4.best_score_)

In [24]:
from sklearn.neural_network import MLPClassifier

# Scaled multi-layer perceptron. The network is seeded (same seed as the random forest
# and the TensorFlow model in this notebook) so that weight initialisation, and hence the
# selected hyper-parameters and scores, are reproducible between runs.
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', MLPClassifier(random_state=420))])

# One fixed architecture and iteration budget; only the learning rate and batch size vary.
param_grid = {
            'classifier__hidden_layer_sizes': [(20,10)],
            'classifier__learning_rate_init': [0.001, 0.01, 0.1],
            'classifier__max_iter': [100],
            'classifier__batch_size': [8, 16, 32],
}

# return_train_score=True additionally records the training-fold scores in cv_results_.
grid_5 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)

grid_5.fit(X_train, y_train)
print(grid_5.best_params_)
print(grid_5.best_score_)

In [25]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import BatchNormalization
from keras.layers import Normalization
from keras.layers import Activation

# Feed-forward binary classifier: input normalisation, three Dense -> BatchNorm ->
# activation stages (200, 100, 20 units), and a single output unit.
tf.random.set_seed(420)
model = Sequential()
norm = Normalization()
X_t, y_t = X_train, y_train
# The normalisation statistics are computed from the training features only.
norm.adapt(X_t)
model.add(norm)
# The Dense layers carry no bias because each is followed directly by BatchNormalization.
model.add(Dense(200, use_bias=False , input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dense(100, use_bias=False ))
model.add(BatchNormalization())
model.add(Activation('elu'))
model.add(Dense(20, use_bias=False ))
model.add(BatchNormalization())
model.add(Activation('elu'))
# The output is a probability in [0, 1]. The previous 'elu' output was unbounded, so
# rounding its predictions could yield labels other than 0/1.
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy is the matching loss for a sigmoid output on a 0/1 target
# (the original trained the classifier with mean squared error).
model.compile(loss=tf.losses.BinaryCrossentropy(), optimizer='Adam', metrics=["accuracy"])
# NOTE(review): the held-out test set doubles as validation data here; it is only used
# for monitoring (no early stopping or model selection is configured in this cell).
history=model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=16, epochs=100)

In [26]:
# Plot every metric recorded during training against the epoch, on a 0..1 y-axis with a grid.
history_frame = pd.DataFrame(history.history)
ax = history_frame.plot(figsize=(8, 5))
ax.grid(True)
ax.set_ylim(0, 1)
plt.show()

## 4. Model evaluation

In [27]:
from sklearn import  metrics


# Every fitted model to compare on the held-out test set, as (display name, estimator).
models = [
    ('SVM linear', grid_1.best_estimator_),
    ('SVM poly', grid_2.best_estimator_),
    ('Logistic regression', grid_3.best_estimator_),
    ('Random forest', grid_4.best_estimator_),
    ('MLP Classifier', grid_5.best_estimator_),
    ('Sequential', model),
]

precision_score = []
recall_score = []
f1_score = []
accuracy_score = []
results = []
# The loop variable is deliberately not called `model`, so the Keras network bound to
# that name is not rebound while iterating.
for name, estimator in models:
    # Predict once per model (the original re-ran predict for every single metric call).
    # Flatten the output and threshold at 0.5: this equals round() for 0/1 labels and for
    # scores in [0, 1], and still yields valid binary labels if a model emits scores
    # outside that range.
    y_pred = (np.asarray(estimator.predict(X_test)).ravel() > 0.5).astype(int)

    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)

    print(name)
    print("precision_score: {}".format(precision))
    print("recall_score: {}".format(recall))
    print("f1_score: {}".format(f1))
    print("accuracy_score: {}".format(accuracy))

    precision_score.append(precision)
    recall_score.append(recall)
    f1_score.append(f1)
    accuracy_score.append(accuracy)
    results.append([name, precision, recall, f1, accuracy])

In [28]:
import pandas as pd
# Assemble the per-model test metrics into one comparison table.
d = {'precision_score': precision_score,
     'recall_score': recall_score,
     'f1_score': f1_score,
     'accuracy_score' : accuracy_score
    }
# Use a dedicated name so the dataset held in `df` is not overwritten by the metrics table.
results_df = pd.DataFrame(data=d)
# Take the row labels from `models` itself so they cannot drift from the evaluated list.
results_df.insert(loc=0, column='Method', value=[name for name, _ in models])
results_df