In [2]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [3]:
# Suppress FutureWarning messages from this point on; all other warning
# categories keep their current handling.
from warnings import simplefilter

simplefilter("ignore", category=FutureWarning)

In [4]:
# Single shared encoder. Every label_data() call re-fits it, so after several
# calls it only reflects the values of the most recent call.
label_encoder = LabelEncoder()


def label_data(jobs):
    """Integer-encode the values of one categorical column.

    Parameters
    ----------
    jobs : iterable
        The column values; materialised into a list before encoding.

    Returns
    -------
    tuple
        ``(codes, mapping)``: ``codes`` is the result of
        ``label_encoder.fit_transform`` (one code per input value, in input
        order) and ``mapping`` is a dict from each distinct value to its code.
    """
    values = list(jobs)
    codes = label_encoder.fit_transform(values)
    # Equal values always receive the same code, so repeated keys simply
    # overwrite themselves with an identical code.
    mapping = dict(zip(values, codes))
    return codes, mapping

def is_numerical(value):
    """Report whether ``value`` can be converted with ``float()``.

    Parameters
    ----------
    value : object
        Any object; typically a single cell value.

    Returns
    -------
    bool
        True if ``float(value)`` succeeds, False if it raises ``TypeError``,
        ``ValueError`` or ``OverflowError``.

    Only conversion failures are treated as "not numerical". Other exceptions
    (for example ``KeyboardInterrupt``) propagate instead of being silently
    turned into False.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


In [6]:
# Read the raw CSV, then discard every column that contains at least one
# missing value.
raw_frame = pd.read_csv('term-deposit-marketing-2020.csv', sep=',')
df = raw_frame.dropna(axis="columns", how="any")

# Month abbreviation -> month number (jan = 1 ... dec = 12).
months = {
    name: number
    for number, name in enumerate(
        ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
        start=1,
    )
}

display(df.head())

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no


In [7]:
# Convert every non-numeric column to integer codes and merge month/day into
# a single "time" feature.
# `dictionary` collects, per encoded column, the value -> code mapping
# returned by label_data().
dictionary = {}
for key in df.columns:
    if key == "month":
        # Month abbreviations become month numbers; an abbreviation missing
        # from `months` raises KeyError.
        df[key] = [months[val] for val in df[key]]
    # .iloc[0] reads the first row by position, so this check does not rely
    # on the index containing the label 0.
    if not is_numerical(df[key].iloc[0]):
        df[key], dictionary[key] = label_data(list(df[key]))

# NOTE(review): month*30 + day is only an approximate day-of-year. If `day`
# can be 31, the 31st of one month gets the same value as the 1st of the
# next month -- confirm this is acceptable.
df['time'] = df['month'] * 30 + df['day']
df = df.drop(columns=['day', 'month'])
display(df.head())

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,y,time
0,58,4,1,2,0,2143,1,0,2,261,1,0,155
1,44,9,2,1,0,29,1,0,2,151,1,0,155
2,33,2,1,1,0,2,1,1,2,76,1,0,155
3,47,1,1,3,0,1506,1,0,2,92,1,0,155
4,33,11,2,3,0,1,0,0,2,198,1,0,155


In [8]:
# Build a class-balanced dataset: every row with y == 1 plus an equally sized
# slice of the rows with y == 0, then shuffle with a fixed seed.
df_partial = df[df['y'] == 1]
df_temp = df[df['y'] == 0]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): the y == 0 rows taken are the first len(df_partial) rows in
# their existing order, not a random sample.
df_partial = pd.concat([df_partial, df_temp[:len(df_partial)]], ignore_index=True)
df_partial = df_partial.sample(frac=1, random_state=42)
y = df_partial['y'].to_numpy()
df_partial = df_partial.drop(columns=['y'])
x = df_partial.to_numpy()

# Hold out the last `abs_size` shuffled rows. They are removed from x / y
# here so they are not used for training.
abs_size = 500
abs_test_x = x[-abs_size:]
abs_test_y = y[-abs_size:]
x, y = x[:-abs_size], y[:-abs_size]

print("No  classes count ", np.count_nonzero(y == 0))
print("Yes classes count ", np.count_nonzero(y == 1))
print("Train x shape ", x.shape)
print("Train y shape ", y.shape)

No  classes count  2629
Yes classes count  2663
Train x shape  (5292, 12)
Train y shape  (5292,)


In [13]:
# Split the training data into 5 folds for cross-validation.
from sklearn.model_selection import KFold

kfold = KFold(5)

# One pass over the splitter: the same row indices select both features and
# labels, so the x and y folds are aligned by construction.
train_x, test_x = [], []
train_y, test_y = [], []
for train, test in kfold.split(x):
    train_x.append(x[train])
    test_x.append(x[test])
    train_y.append(y[train])
    test_y.append(y[test])

# The folds stay plain Python lists. When the row count is not a multiple of
# 5 the folds differ in length, and np.array() on such a ragged list raises
# ValueError on NumPy >= 1.24. Indexing (train_x[0]) and zip() over the folds
# work the same on lists.

print(train_x[0].shape)
print(train_y[0].shape)
print(test_x[0].shape)
print(test_y[0].shape)

(4233, 12)
(4233,)
(1059, 12)
(1059,)


In [14]:
from keras import models
from keras import layers

def build_model(input_dim=None):
    """Build and compile a fresh feed-forward binary classifier.

    A function is used because a new, untrained model is needed for every
    cross-validation fold.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, it is taken from the global
        ``train_x`` (``train_x[0].shape[1]``), which is the original
        behaviour.

    Returns
    -------
    keras model
        A compiled ``Sequential`` model: Dense(64, relu) -> Dense(16, relu)
        -> Dense(16, relu) -> Dense(1, sigmoid), using the ``rmsprop``
        optimizer, ``binary_crossentropy`` loss and the metrics
        ``accuracy`` and ``mean_absolute_error`` (in that order).
    """
    if input_dim is None:
        input_dim = train_x[0].shape[1]

    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(input_dim,)))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy', 'mean_absolute_error'],
    )
    return model

In [15]:
from keras import backend as K

K.clear_session()
all_scores = []  # validation accuracy of each fold, in fold order
best_model, best_score = None, 0
epoch_num = 25

folds = zip(train_x, train_y, test_x, test_y)
for i, (partial_train_x, partial_train_y, partial_test_x, partial_test_y) in enumerate(folds):
    print(f"{i+1}th Fold")
    # Build a fresh, already compiled model and train it silently (verbose=0).
    model = build_model()
    model.fit(partial_train_x, partial_train_y, epochs=epoch_num, batch_size=1, verbose=0)
    # evaluate() returns the loss followed by the compiled metrics in order:
    # [loss, accuracy, mean_absolute_error]. The second value is therefore
    # the accuracy (higher is better), which is what the model selection
    # below compares.
    _, val_accuracy, _ = model.evaluate(partial_test_x, partial_test_y, verbose=0)
    if val_accuracy > best_score:
        best_model = model
        best_score = val_accuracy
    all_scores.append(val_accuracy)

1th Fold
2th Fold
3th Fold
4th Fold
5th Fold


In [17]:
# Print the validation score of every fold (numbered from 0), followed by
# the mean over all folds.
for fold, score in enumerate(all_scores, start=0):
    print(f"Fold {fold} Score: {score}")

print(f"Average Performance Score Of All Folds {np.mean(all_scores)}")

Fold 0 Score: 0.9282341599464417
Fold 1 Score: 0.9565628170967102
Fold 2 Score: 0.942344069480896
Fold 3 Score: 0.9083175659179688
Fold 4 Score: 0.9120982885360718
Average Performance Score Of All Folds 0.9295113801956176
