# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support, matthews_corrcoef
from imblearn.over_sampling import SMOTE, ADASYN
#from dataprep.eda import create_report

# Define Useful Variables and Functions

In [None]:
# Experiment-wide settings shared by every model cell below
RANDOM_STATES = list(range(1, 6))    # one seed per repeated run
N_ESTIMATORS = 50                    # n_estimators for the ensemble classifiers
MAX_ITER = 500                       # max_iter for the iterative solvers
HIDDEN_LAYER_SIZE = (64, 32, 16, 8)  # hidden_layer_sizes passed to the MLP

# Compute the evaluation scores that are recorded for every run
def metrics(y_true, y_pred, target_names, iter_number):
    """Score one set of predictions and return the results as percentages.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        target_names: Class display names. Currently unused; it was only
            needed by the confusion-matrix plot, which is disabled.
        iter_number: Identifier of the run. Currently unused for the same
            reason (it was the plot title).

    Returns:
        dict with the keys 'precision', 'recall', 'fmeasure' (all three for
        the class labelled 1), 'macro_fmeasure', 'micro_fmeasure',
        'accuracy' and 'mcc'. Every score is multiplied by 100.
    """
    # Scores of the positive class (label 1) only
    positive = precision_recall_fscore_support(y_true, y_pred, pos_label=1, average='binary')

    # F-measure under the two averaging schemes; index 2 is the f-score slot
    averaged_f = {
        mode: precision_recall_fscore_support(y_true, y_pred, average=mode)[2]
        for mode in ('macro', 'micro')
    }

    raw_scores = {
        'precision': positive[0],
        'recall': positive[1],
        'fmeasure': positive[2],
        'macro_fmeasure': averaged_f['macro'],
        'micro_fmeasure': averaged_f['micro'],
        'accuracy': accuracy_score(y_true, y_pred),
        'mcc': matthews_corrcoef(y_true, y_pred),
    }

    # Report everything on a 0-100 scale
    return {name: score * 100 for name, score in raw_scores.items()}

# Load and Display Data

In [None]:
# Read the training split from its raw GitHub URL; the file is ';'-delimited
train_path = "https://raw.githubusercontent.com/haidiazaman/nus-dsa5101/main/Project/trainingdata.txt"
train_data = pd.read_csv(train_path, sep=';')
# Last expression of the cell: displays the frame in the notebook
train_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,42,technician,divorced,secondary,no,-78,yes,yes,cellular,29,jul,10,23,-1,0,unknown,no
1,50,management,married,tertiary,no,106,yes,no,unknown,15,may,388,2,-1,0,unknown,no
2,47,admin.,married,secondary,no,407,yes,no,unknown,5,jun,67,12,-1,0,unknown,no
3,39,admin.,divorced,secondary,no,952,yes,no,unknown,16,jun,62,1,-1,0,unknown,no
4,30,management,single,tertiary,no,364,no,no,cellular,30,apr,306,2,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36163,30,management,single,tertiary,no,1243,yes,no,telephone,13,nov,86,1,174,1,failure,no
36164,60,blue-collar,married,secondary,no,171,yes,no,unknown,19,may,219,2,-1,0,unknown,no
36165,41,technician,married,secondary,no,-274,yes,no,unknown,27,may,182,3,-1,0,unknown,no
36166,45,blue-collar,married,primary,no,103,no,no,cellular,28,jan,64,3,-1,0,unknown,no


In [None]:
# Read the test split from its raw GitHub URL; the file is ';'-delimited
test_path = "https://raw.githubusercontent.com/haidiazaman/nus-dsa5101/main/Project/testdata.txt"
test_data = pd.read_csv(test_path, sep=';')
# Last expression of the cell: displays the frame in the notebook
test_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,48,housemaid,married,tertiary,no,468,yes,no,unknown,14,may,220,1,-1,0,unknown,no
1,31,management,single,tertiary,no,10215,no,no,cellular,22,aug,139,2,-1,0,unknown,no
2,45,blue-collar,married,primary,no,900,yes,no,unknown,9,jun,213,1,-1,0,unknown,no
3,58,blue-collar,married,primary,no,1231,no,no,unknown,20,jun,21,3,-1,0,unknown,no
4,35,technician,single,tertiary,no,5301,no,no,cellular,21,nov,937,2,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9038,27,services,single,secondary,no,1,yes,yes,unknown,2,jun,162,3,-1,0,unknown,no
9039,34,blue-collar,married,secondary,no,175,yes,no,cellular,14,may,50,2,364,2,failure,no
9040,34,blue-collar,married,primary,no,1873,yes,no,cellular,2,feb,192,1,-1,0,unknown,no
9041,37,technician,single,secondary,no,61,yes,no,unknown,19,may,182,8,-1,0,unknown,no


# Data Pre-Processing

In [None]:
# poutcome has a high number of unknowns, so remove it from both splits in place
for frame in (train_data, test_data):
    frame.drop(columns=['poutcome'], inplace=True)

In [None]:
# Separate features and labels: the label is the last column, everything
# before it is a feature.
# .copy() detaches the results from train_data/test_data. The cells below
# assign new columns into x_train/x_test, and doing that on a plain iloc
# slice raises pandas' SettingWithCopyWarning.
x_train, y_train = train_data.iloc[:, :-1].copy(), train_data.iloc[:, -1].copy()
x_test, y_test = test_data.iloc[:, :-1].copy(), test_data.iloc[:, -1].copy()

### Imputing of data for features: job, contact and education

In [None]:
# Convert the features to impute into integer codes first, since KNNImputer
# only works on numbers. 'unknown' is listed first in every category list so
# that it is always encoded as 0.
ordinal_dict = {
    "job": OrdinalEncoder(categories=[['unknown', 'technician', 'management', 'admin.', 'services', 'blue-collar', 'unemployed', 'retired', 'self-employed', 'housemaid', 'student', 'entrepreneur']], dtype=np.int8),
    "contact": OrdinalEncoder(categories=[['unknown', 'cellular', 'telephone']], dtype=np.int8),
    "education": OrdinalEncoder(categories=[['unknown', 'primary', 'secondary', 'tertiary']], dtype=np.int8),
}

# Encoders are fitted on the training split and re-used for the test split
for key, value in ordinal_dict.items():
    x_train[key] = value.fit_transform(x_train[[key]]).ravel()
    x_test[key] = value.transform(x_test[[key]]).ravel()

# Replace unknown data (code 0) with NaN so the imputer treats it as missing
x_train = x_train.replace({'contact': 0, 'job': 0, 'education': 0}, np.nan)
x_test = x_test.replace({'contact': 0, 'job': 0, 'education': 0}, np.nan)

# Extract only features to impute so nothing else will mess with the imputing
features_to_impute_train = x_train[['job', 'contact', 'education']]
features_to_impute_test = x_test[['job', 'contact', 'education']]

# Impute: fit on the training split only, then apply the fitted imputer to the test split
imputer = KNNImputer(n_neighbors=5)
features_to_impute_train = pd.DataFrame(imputer.fit_transform(features_to_impute_train.values), index=features_to_impute_train.index, columns=features_to_impute_train.columns)
features_to_impute_test = pd.DataFrame(imputer.transform(features_to_impute_test.values), index=features_to_impute_test.index, columns=features_to_impute_test.columns)

# Replace old values with new imputed values.
# An imputed value is a mean of neighbouring codes and is usually fractional.
# Round it to the nearest code here: OrdinalEncoder.inverse_transform casts
# its input to integers, which would otherwise truncate (e.g. 2.8 -> 2).
# NOTE(review): job and contact are nominal, so averaging their codes assumes
# an ordering that the category lists do not really have - consider a
# mode-based imputation instead.
x_train[['job', 'contact', 'education']] = features_to_impute_train.round()
x_test[['job', 'contact', 'education']] = features_to_impute_test.round()

# Reverse encoding of values to get back feature value in string
for key, value in ordinal_dict.items():
    x_train[key] = value.inverse_transform(x_train[[key]]).ravel()
    x_test[key] = value.inverse_transform(x_test[[key]]).ravel()

In [None]:
# Use ordinal encoder to encode month column in calendar order (jan=0 ... dec=11)
all_months = [['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']]
ordinal_encoder_month = OrdinalEncoder(categories=all_months)
x_train[['month']] = ordinal_encoder_month.fit_transform(x_train[['month']])
x_test[['month']] = ordinal_encoder_month.transform(x_test[['month']])

# One hot encoding for nominal columns.
# The category set of every column is taken from the training split and
# imposed on the test split. Encoding the two splits independently would give
# different dummy columns (and a different dropped "first" level) whenever a
# category is absent from one of them.
all_nominal_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact']
for nominal_col in all_nominal_cols:
    train_categorical = x_train[nominal_col].astype('category')
    x_train[nominal_col] = train_categorical
    x_test[nominal_col] = x_test[nominal_col].astype(train_categorical.dtype)
x_train = pd.get_dummies(x_train, columns=all_nominal_cols, drop_first=True, dtype=float)
x_test = pd.get_dummies(x_test, columns=all_nominal_cols, drop_first=True, dtype=float)
# Guarantee identical column order; fails loudly if the splits ever diverge
x_test = x_test[x_train.columns]

# Use label encoder to encode y_train and y_test ('no' -> 0, 'yes' -> 1)
target_names = ['no', 'yes']
label_encoder = LabelEncoder()
label_encoder.fit(target_names)
# to_numpy() replaces Series.ravel(), which pandas has deprecated
y_train = label_encoder.transform(y_train.to_numpy())
y_test = label_encoder.transform(y_test.to_numpy())

# Use min-max scaler to rescale each column to the [0, 1] range of the
# training data. MinMaxScaler works feature by feature, so one fit on the
# whole frame is equivalent to fitting a separate scaler per column.
scaler = MinMaxScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), index=x_train.index, columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test), index=x_test.index, columns=x_test.columns)

In [None]:
# Display the pre-processed training features
x_train

Unnamed: 0,age,balance,day,month,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,...,job_technician,job_unemployed,marital_married,marital_single,education_secondary,education_tertiary,default_yes,housing_yes,loan_yes,contact_telephone
0,0.311688,0.072095,0.933333,0.545455,0.002033,0.354839,0.000000,0.000000,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,0.415584,0.073766,0.466667,0.363636,0.078894,0.016129,0.000000,0.000000,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.376623,0.076498,0.133333,0.454545,0.013623,0.177419,0.000000,0.000000,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.272727,0.081446,0.500000,0.454545,0.012607,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.155844,0.076108,0.966667,0.272727,0.062220,0.016129,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36163,0.155844,0.084088,0.400000,0.909091,0.017487,0.000000,0.200688,0.003636,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
36164,0.545455,0.074356,0.600000,0.363636,0.044530,0.016129,0.000000,0.000000,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
36165,0.298701,0.070316,0.866667,0.363636,0.037007,0.032258,0.000000,0.000000,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
36166,0.350649,0.073738,0.900000,0.000000,0.013013,0.032258,0.000000,0.000000,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# List the feature names after encoding
x_train.columns

Index(['age', 'balance', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'marital_married',
       'marital_single', 'education_secondary', 'education_tertiary',
       'default_yes', 'housing_yes', 'loan_yes', 'contact_telephone'],
      dtype='object')

### Oversampling (Toy Example)

In [None]:
# Resample the training set once with each oversampler (seed fixed to 1)
# With SMOTE
sampler_smote = SMOTE(random_state=1)
x_train_smote, y_train_smote = sampler_smote.fit_resample(x_train, y_train)

# With ADASYN
sampler_adasyn = ADASYN(random_state=1)
x_train_adasyn, y_train_adasyn = sampler_adasyn.fit_resample(x_train, y_train)

# Compare the number of training rows before and after oversampling
print('Length of Training Data\n')
for size_label, feature_frame in (
    ('Without Oversampling:', x_train),
    ('With SMOTE:', x_train_smote),
    ('With ADASYN:', x_train_adasyn),
):
    print(size_label, len(feature_frame))

Length of Training Data

Without Oversampling: 36168
With SMOTE: 63874
With ADASYN: 64087


# Method 1: Random Forest Classifier



In [None]:
# Result accumulator: one empty list per output column; each run appends one
# value to every list
store_dict = {
    column: []
    for column in (
        'model', 'oversampling', 'precision', 'recall', 'fmeasure',
        'macro_fmeasure', 'micro_fmeasure', 'accuracy', 'mcc',
    )
}

# Label written to the 'model' column by the cells of this section
method = "Random Forest"

### Without Oversampling

In [None]:
oversampling_method = "without oversampling"

# One forest per seed, trained on the original (unbalanced) training data
for RANDOM_STATE in RANDOM_STATES:
    classifier = RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        criterion='entropy',
        random_state=RANDOM_STATE,
    ).fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Oversampling Using SMOTE

In [None]:
oversampling_method = 'SMOTE'

# Per seed: rebalance the training data with SMOTE, then fit a forest on it
for RANDOM_STATE in RANDOM_STATES:
    sampler_smote = SMOTE(random_state=RANDOM_STATE)
    x_train_smote, y_train_smote = sampler_smote.fit_resample(x_train, y_train)
    classifier = RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        criterion='entropy',
        random_state=RANDOM_STATE,
    ).fit(x_train_smote, y_train_smote)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Oversampling with ADASYN

In [None]:
oversampling_method = 'ADASYN'

# Per seed: rebalance the training data with ADASYN, then fit a forest on it
for RANDOM_STATE in RANDOM_STATES:
    sampler_adasyn = ADASYN(random_state=RANDOM_STATE)
    x_train_adasyn, y_train_adasyn = sampler_adasyn.fit_resample(x_train, y_train)
    classifier = RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        criterion='entropy',
        random_state=RANDOM_STATE,
    ).fit(x_train_adasyn, y_train_adasyn)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Display Results

In [None]:
# Turn the accumulated lists into a table (one row per run) and display it
results_rf = pd.DataFrame(store_dict)
results_rf

Unnamed: 0,model,oversampling,precision,recall,fmeasure,macro_fmeasure,micro_fmeasure,accuracy,mcc
0,Random Forest,without oversampling,62.541806,35.349716,45.169082,69.821303,89.959084,89.959084,42.092711
1,Random Forest,without oversampling,61.655405,34.499055,44.242424,69.322478,89.826385,89.826385,41.136232
2,Random Forest,without oversampling,62.350937,34.593573,44.49848,69.472645,89.903793,89.903793,41.52018
3,Random Forest,without oversampling,62.457338,34.593573,44.525547,69.489388,89.914851,89.914851,41.569485
4,Random Forest,without oversampling,62.354892,35.538752,45.273931,69.869842,89.948026,89.948026,42.125397
5,Random Forest,SMOTE,52.985075,60.396975,56.448763,75.108467,89.096539,89.096539,50.388061
6,Random Forest,SMOTE,52.874494,61.720227,56.955953,75.353174,89.08548,89.08548,50.948066
7,Random Forest,SMOTE,53.592073,61.342155,57.205818,75.533427,89.262413,89.262413,51.251121
8,Random Forest,SMOTE,52.793522,61.625709,56.868731,75.303231,89.063364,89.063364,50.847875
9,Random Forest,SMOTE,52.085037,60.20794,55.852696,74.740647,88.864315,88.864315,49.689687


In [None]:
# Mean and (population) standard deviation of every score column, per
# oversampling setting. Rows 0-4, 5-9 and 10-14 hold the five runs of each
# setting, in the order the cells above appended them.
a = results_rf
row_groups = (
    ('without oversampling', slice(0, 5)),
    ('smote', slice(5, 10)),
    ('adasyn', slice(10, 15)),
)
for group_position, (group_label, group_rows) in enumerate(row_groups):
    if group_position > 0:
        print('')  # blank separator between groups
    print(group_label + '\n')
    for i in range(2, 9):  # columns 2..8 are the seven score columns
        group_values = a[a.columns[[i]]][group_rows].values
        print(a.columns[i], group_values.mean(), group_values.std())

without oversampling

precision 62.27207569654037 0.3163657528060671
recall 34.91493383742911 0.43764978838526375
fmeasure 44.741893084618255 0.40523563063038753
macro_fmeasure 69.59513132574605 0.21314047574442244
micro_fmeasure 89.91042795532455 0.04670731412571621
accuracy 89.91042795532455 0.04670731412571621
mcc 41.68880095988324 0.37463616982003795

smote

precision 52.86604005663636 0.4809550859066024
recall 61.05860113421551 0.632635180744103
fmeasure 56.666392159571274 0.47439005425776426
macro_fmeasure 75.2077893490002 0.26995263406346387
micro_fmeasure 89.07442220502045 0.12666434646268784
accuracy 89.07442220502045 0.12666434646268784
mcc 50.62496181730929 0.5435114143018094

adasyn

precision 52.54521250245695 0.5284083665762311
recall 61.701323251417776 0.8854471401610864
fmeasure 56.753948066978396 0.5845412297320781
macro_fmeasure 75.22598969916402 0.32709709106711765
micro_fmeasure 88.99922592060157 0.14237290226190924
accuracy 88.99922592060157 0.14237290226190924
mcc

# Method 2: AdaBoost Classifier



In [None]:
# Result accumulator: one empty list per output column; each run appends one
# value to every list
store_dict = {
    column: []
    for column in (
        'model', 'oversampling', 'precision', 'recall', 'fmeasure',
        'macro_fmeasure', 'micro_fmeasure', 'accuracy', 'mcc',
    )
}

# Label written to the 'model' column by the cells of this section
method = "AdaBoost"

### Without Oversampling

In [None]:
oversampling_method = "without oversampling"

# One AdaBoost model per seed, trained on the original training data
for RANDOM_STATE in RANDOM_STATES:
    classifier = AdaBoostClassifier(
        n_estimators=N_ESTIMATORS,
        random_state=RANDOM_STATE,
    ).fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Oversampling using SMOTE

In [None]:
oversampling_method = 'SMOTE'

# Per seed: rebalance the training data with SMOTE, then fit AdaBoost on it
for RANDOM_STATE in RANDOM_STATES:
    sampler_smote = SMOTE(random_state=RANDOM_STATE)
    x_train_smote, y_train_smote = sampler_smote.fit_resample(x_train, y_train)
    classifier = AdaBoostClassifier(
        n_estimators=N_ESTIMATORS,
        random_state=RANDOM_STATE,
    ).fit(x_train_smote, y_train_smote)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Oversampling with ADASYN

In [None]:
oversampling_method = 'ADASYN'

# Per seed: rebalance the training data with ADASYN, then fit AdaBoost on it
for RANDOM_STATE in RANDOM_STATES:
    sampler_adasyn = ADASYN(random_state=RANDOM_STATE)
    x_train_adasyn, y_train_adasyn = sampler_adasyn.fit_resample(x_train, y_train)
    classifier = AdaBoostClassifier(
        n_estimators=N_ESTIMATORS,
        random_state=RANDOM_STATE,
    ).fit(x_train_adasyn, y_train_adasyn)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Display Results

In [None]:
# Turn the accumulated lists into a table (one row per run) and display it
results_adaboost = pd.DataFrame(store_dict)
results_adaboost

Unnamed: 0,model,oversampling,precision,recall,fmeasure,macro_fmeasure,micro_fmeasure,accuracy,mcc
0,AdaBoost,without oversampling,56.912029,29.962193,39.256966,66.650522,89.15183,89.15183,36.038409
1,AdaBoost,without oversampling,56.912029,29.962193,39.256966,66.650522,89.15183,89.15183,36.038409
2,AdaBoost,without oversampling,56.912029,29.962193,39.256966,66.650522,89.15183,89.15183,36.038409
3,AdaBoost,without oversampling,56.912029,29.962193,39.256966,66.650522,89.15183,89.15183,36.038409
4,AdaBoost,without oversampling,56.912029,29.962193,39.256966,66.650522,89.15183,89.15183,36.038409
5,AdaBoost,SMOTE,44.430616,67.485822,53.58349,72.78098,86.320911,86.320911,47.340105
6,AdaBoost,SMOTE,44.40367,68.620038,53.917564,72.927729,86.276678,86.276678,47.801516
7,AdaBoost,SMOTE,44.23676,67.10775,53.32332,72.631964,86.254562,86.254562,47.024106
8,AdaBoost,SMOTE,43.578819,67.674858,53.017401,72.384554,85.967046,85.967046,46.734979
9,AdaBoost,SMOTE,44.641725,66.540643,53.434535,72.746651,86.431494,86.431494,47.103692


In [None]:
# Mean and (population) standard deviation of every score column, per
# oversampling setting. Rows 0-4, 5-9 and 10-14 hold the five runs of each
# setting, in the order the cells above appended them.
a = results_adaboost
row_groups = (
    ('without oversampling', slice(0, 5)),
    ('smote', slice(5, 10)),
    ('adasyn', slice(10, 15)),
)
for group_position, (group_label, group_rows) in enumerate(row_groups):
    if group_position > 0:
        print('')  # blank separator between groups
    print(group_label + '\n')
    for i in range(2, 9):  # columns 2..8 are the seven score columns
        group_values = a[a.columns[[i]]][group_rows].values
        print(a.columns[i], group_values.mean(), group_values.std())

without oversampling

precision 56.91202872531418 0.0
recall 29.96219281663516 0.0
fmeasure 39.256965944272444 0.0
macro_fmeasure 66.65052170688213 0.0
micro_fmeasure 89.15183014486342 0.0
accuracy 89.15183014486342 0.0
mcc 36.03840888334707 0.0

smote

precision 44.25831798623286 0.3633174450483735
recall 67.48582230623819 0.6868015926119098
fmeasure 53.455261873475955 0.29655563619230585
macro_fmeasure 72.694375494433 0.18142312850047235
micro_fmeasure 86.25013822846401 0.1541192205257941
accuracy 86.25013822846401 0.15411922052579402
mcc 47.20087951944921 0.35718477876431265

adasyn

precision 41.92427866288759 0.5600282632735519
recall 65.5765595463138 0.7523534496080351
fmeasure 51.144483147033284 0.4685416461983532
macro_fmeasure 71.26058959070903 0.30144296218133565
micro_fmeasure 85.3411478491651 0.25232264862460885
accuracy 85.3411478491651 0.2523226486246036
mcc 44.50249351850961 0.5470298879140048


# Method 3: Decision Tree Classifier



In [None]:
# Result accumulator: one empty list per output column; each run appends one
# value to every list
store_dict = {
    column: []
    for column in (
        'model', 'oversampling', 'precision', 'recall', 'fmeasure',
        'macro_fmeasure', 'micro_fmeasure', 'accuracy', 'mcc',
    )
}

# Label written to the 'model' column by the cells of this section
method = "Decision Tree"

### Without Oversampling

In [None]:
oversampling_method = "without oversampling"

# One decision tree per seed, trained on the original training data
for RANDOM_STATE in RANDOM_STATES:
    classifier = DecisionTreeClassifier(random_state=RANDOM_STATE).fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Oversampling with SMOTE

In [None]:
oversampling_method = 'SMOTE'

# Per seed: rebalance the training data with SMOTE, then fit a tree on it
for RANDOM_STATE in RANDOM_STATES:
    sampler_smote = SMOTE(random_state=RANDOM_STATE)
    x_train_smote, y_train_smote = sampler_smote.fit_resample(x_train, y_train)
    classifier = DecisionTreeClassifier(random_state=RANDOM_STATE).fit(x_train_smote, y_train_smote)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Oversampling using ADASYN

In [None]:
oversampling_method = 'ADASYN'

# Per seed: rebalance the training data with ADASYN, then fit a tree on it
for RANDOM_STATE in RANDOM_STATES:
    sampler_adasyn = ADASYN(random_state=RANDOM_STATE)
    x_train_adasyn, y_train_adasyn = sampler_adasyn.fit_resample(x_train, y_train)
    classifier = DecisionTreeClassifier(random_state=RANDOM_STATE).fit(x_train_adasyn, y_train_adasyn)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Display Results

In [None]:
# Turn the accumulated lists into a table (one row per run) and display it
results_dt = pd.DataFrame(store_dict)
results_dt

Unnamed: 0,model,oversampling,precision,recall,fmeasure,macro_fmeasure,micro_fmeasure,accuracy,mcc
0,Decision Tree,without oversampling,44.493392,47.731569,46.055632,69.306051,86.918058,86.918058,38.653402
1,Decision Tree,without oversampling,45.132743,48.204159,46.617916,69.63554,87.083932,87.083932,39.307674
2,Decision Tree,without oversampling,45.013477,47.353497,46.153846,69.404287,87.072874,87.072874,38.830024
3,Decision Tree,without oversampling,44.154676,46.408318,45.253456,68.894635,86.862767,86.862767,37.809638
4,Decision Tree,without oversampling,44.573991,46.975425,45.743212,69.167088,86.962291,86.962291,38.357016
5,Decision Tree,SMOTE,42.417417,53.402647,47.280335,69.626406,86.066571,86.066571,39.720851
6,Decision Tree,SMOTE,42.666667,54.442344,47.840532,69.914653,86.110804,86.110804,40.359941
7,Decision Tree,SMOTE,42.58982,53.780718,47.535505,69.765713,86.110804,86.110804,40.014094
8,Decision Tree,SMOTE,41.371515,51.890359,46.037736,68.920403,85.767997,85.767997,38.283913
9,Decision Tree,SMOTE,42.551622,54.536862,47.804474,69.882329,86.066571,86.066571,40.314982


In [None]:
# Mean and (population) standard deviation of every score column, per
# oversampling setting. Rows 0-4, 5-9 and 10-14 hold the five runs of each
# setting, in the order the cells above appended them.
a = results_dt
row_groups = (
    ('without oversampling', slice(0, 5)),
    ('smote', slice(5, 10)),
    ('adasyn', slice(10, 15)),
)
for group_position, (group_label, group_rows) in enumerate(row_groups):
    if group_position > 0:
        print('')  # blank separator between groups
    print(group_label + '\n')
    for i in range(2, 9):  # columns 2..8 are the seven score columns
        group_values = a[a.columns[[i]]][group_rows].values
        print(a.columns[i], group_values.mean(), group_values.std())

without oversampling

precision 44.67365596252963 0.35721358905687894
recall 47.33459357277883 0.6166165002003816
fmeasure 45.964812396806096 0.45291508066647634
macro_fmeasure 69.2815201931954 0.2464867999836983
micro_fmeasure 86.97998451841202 0.08639621800122341
accuracy 86.97998451841204 0.08639621800121809
mcc 38.59155069663212 0.4979800792733511

smote

precision 42.31940831140897 0.48077906067278786
recall 53.61058601134215 0.9572014292001646
fmeasure 47.29971629420622 0.662695737406177
macro_fmeasure 69.62190077376236 0.36505567934539734
micro_fmeasure 86.02454937520734 0.12979233220056363
accuracy 86.02454937520734 0.12979233220056363
mcc 39.73875629974627 0.7629687563452416

adasyn

precision 42.24731725711818 0.5166595329950289
recall 54.7069943289225 1.5092123747608372
fmeasure 47.666462366849764 0.6842435807275727
macro_fmeasure 69.7757118427179 0.35528928133743154
micro_fmeasure 85.94935309078846 0.1935166900738883
accuracy 85.94935309078846 0.1935166900738883
mcc 40.1564

# Method 4: Logistic Regression



In [None]:
# Result accumulator: one empty list per output column; each run appends one
# value to every list
store_dict = {
    column: []
    for column in (
        'model', 'oversampling', 'precision', 'recall', 'fmeasure',
        'macro_fmeasure', 'micro_fmeasure', 'accuracy', 'mcc',
    )
}

# Label written to the 'model' column by the cells of this section
method = "Logistic Regression"

### Without Oversampling

In [None]:
oversampling_method = "without oversampling"

# One logistic regression per seed, trained on the original training data
for RANDOM_STATE in RANDOM_STATES:
    classifier = LogisticRegression(
        random_state=RANDOM_STATE,
        max_iter=MAX_ITER,
    ).fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Oversampling with SMOTE

In [None]:
oversampling_method = 'SMOTE'

# Per seed: rebalance the training data with SMOTE, then fit the regression
for RANDOM_STATE in RANDOM_STATES:
    sampler_smote = SMOTE(random_state=RANDOM_STATE)
    x_train_smote, y_train_smote = sampler_smote.fit_resample(x_train, y_train)
    classifier = LogisticRegression(
        random_state=RANDOM_STATE,
        max_iter=MAX_ITER,
    ).fit(x_train_smote, y_train_smote)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Oversampling with ADASYN

In [None]:
oversampling_method = 'ADASYN'

# Per seed: rebalance the training data with ADASYN, then fit the regression
for RANDOM_STATE in RANDOM_STATES:
    sampler_adasyn = ADASYN(random_state=RANDOM_STATE)
    x_train_adasyn, y_train_adasyn = sampler_adasyn.fit_resample(x_train, y_train)
    classifier = LogisticRegression(
        random_state=RANDOM_STATE,
        max_iter=MAX_ITER,
    ).fit(x_train_adasyn, y_train_adasyn)
    y_pred = classifier.predict(x_test)

    # Record the run: two identifying columns, then the seven scores
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Display Results

In [None]:
# Turn the accumulated lists into a table (one row per run) and display it
results_lr = pd.DataFrame(store_dict)
results_lr

Unnamed: 0,model,oversampling,precision,recall,fmeasure,macro_fmeasure,micro_fmeasure,accuracy,mcc
0,Logistic Regression,without oversampling,58.333333,17.863894,27.351664,60.670564,88.89749,88.89749,27.968621
1,Logistic Regression,without oversampling,58.333333,17.863894,27.351664,60.670564,88.89749,88.89749,27.968621
2,Logistic Regression,without oversampling,58.333333,17.863894,27.351664,60.670564,88.89749,88.89749,27.968621
3,Logistic Regression,without oversampling,58.333333,17.863894,27.351664,60.670564,88.89749,88.89749,27.968621
4,Logistic Regression,without oversampling,58.333333,17.863894,27.351664,60.670564,88.89749,88.89749,27.968621
5,Logistic Regression,SMOTE,35.045317,76.748582,48.118519,68.10793,80.636957,80.636957,42.630751
6,Logistic Regression,SMOTE,35.276207,76.654064,48.31695,68.268734,80.813889,80.813889,42.827546
7,Logistic Regression,SMOTE,35.204526,76.465028,48.212157,68.206554,80.780714,80.780714,42.684883
8,Logistic Regression,SMOTE,35.23768,76.37051,48.224411,68.224862,80.813889,80.813889,42.682746
9,Logistic Regression,SMOTE,35.248042,76.559546,48.271752,68.24314,80.802831,80.802831,42.763909


In [None]:
# Mean and (population) standard deviation of every score column, per
# oversampling setting. Rows 0-4, 5-9 and 10-14 hold the five runs of each
# setting, in the order the cells above appended them.
a = results_lr
row_groups = (
    ('without oversampling', slice(0, 5)),
    ('smote', slice(5, 10)),
    ('adasyn', slice(10, 15)),
)
for group_position, (group_label, group_rows) in enumerate(row_groups):
    if group_position > 0:
        print('')  # blank separator between groups
    print(group_label + '\n')
    for i in range(2, 9):  # columns 2..8 are the seven score columns
        group_values = a[a.columns[[i]]][group_rows].values
        print(a.columns[i], group_values.mean(), group_values.std())

without oversampling

precision 58.333333333333336 0.0
recall 17.86389413988658 0.0
fmeasure 27.35166425470333 0.0
macro_fmeasure 60.67056392811795 0.0
micro_fmeasure 88.89748977109366 0.0
accuracy 88.89748977109366 0.0
mcc 27.968621092231608 3.552713678800501e-15

smote

precision 35.20235432247518 0.08179832573156978
recall 76.55954631379961 0.13366857867420898
fmeasure 48.22875764312331 0.06644318633622791
macro_fmeasure 68.21024401808798 0.05513361257898787
micro_fmeasure 80.76965608758157 0.06744642566394693
accuracy 80.76965608758155 0.06744642566394321
mcc 42.7179669746816 0.06936856177153708

adasyn

precision 35.14973784111668 0.06232728801912943
recall 76.33270321361059 0.1753046974573794
fmeasure 48.13444520868903 0.0854731075505255
macro_fmeasure 68.15970965276117 0.05176806686193866
micro_fmeasure 80.75417449961296 0.036071008360716306
accuracy 80.75417449961296 0.036071008360716306
mcc 42.58039990364568 0.11654611088410147


# Method 5: Multi-Layer Perceptron (MLP) Classifier



In [None]:
# Fresh accumulator for the MLP experiments: one independent, empty list per result column.
store_dict = {
    column: []
    for column in (
        'model',
        'oversampling',
        'precision',
        'recall',
        'fmeasure',
        'macro_fmeasure',
        'micro_fmeasure',
        'accuracy',
        'mcc',
    )
}

# Label written to the 'model' column for every run in this section.
method = 'MLP'

### Without Oversampling

In [None]:
oversampling_method = "without oversampling"

# One MLP per seed, fitted on the training data as-is; the test-set metrics are appended to store_dict.
for RANDOM_STATE in RANDOM_STATES:
    classifier = MLPClassifier(
        hidden_layer_sizes=HIDDEN_LAYER_SIZE,
        random_state=RANDOM_STATE,
        max_iter=MAX_ITER,
        early_stopping=True,
    )
    y_pred = classifier.fit(x_train, y_train).predict(x_test)

    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Oversampling with SMOTE

In [None]:
oversampling_method = 'SMOTE'

# For each seed: resample the training data with SMOTE, fit an MLP on the resampled data,
# then evaluate on x_test (which is not resampled) and append the metrics to store_dict.
for RANDOM_STATE in RANDOM_STATES:
    sampler_smote = SMOTE(random_state=RANDOM_STATE)
    x_train_smote, y_train_smote = sampler_smote.fit_resample(x_train, y_train)

    classifier = MLPClassifier(
        hidden_layer_sizes=HIDDEN_LAYER_SIZE,
        random_state=RANDOM_STATE,
        max_iter=MAX_ITER,
        early_stopping=True,
    )
    y_pred = classifier.fit(x_train_smote, y_train_smote).predict(x_test)

    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Oversampling with ADASYN

In [None]:
oversampling_method = 'ADASYN'

# For each seed: resample the training data with ADASYN, fit an MLP on the resampled data,
# then evaluate on x_test (which is not resampled) and append the metrics to store_dict.
for RANDOM_STATE in RANDOM_STATES:
    sampler_adasyn = ADASYN(random_state=RANDOM_STATE)
    x_train_adasyn, y_train_adasyn = sampler_adasyn.fit_resample(x_train, y_train)

    classifier = MLPClassifier(
        hidden_layer_sizes=HIDDEN_LAYER_SIZE,
        random_state=RANDOM_STATE,
        max_iter=MAX_ITER,
        early_stopping=True,
    )
    y_pred = classifier.fit(x_train_adasyn, y_train_adasyn).predict(x_test)

    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

### Display Results

In [None]:
# Collect the per-seed MLP metrics into one table and display it.
results_mlp = pd.DataFrame(store_dict)
results_mlp

Unnamed: 0,model,oversampling,precision,recall,fmeasure,macro_fmeasure,micro_fmeasure,accuracy,mcc
0,MLP,without oversampling,56.644518,32.230624,41.084337,67.565181,89.185005,89.185005,37.343376
1,MLP,without oversampling,57.167235,31.663516,40.754258,67.415202,89.229238,89.229238,37.237002
2,MLP,without oversampling,55.389222,17.485822,26.58046,60.22925,88.698441,88.698441,26.619448
3,MLP,without oversampling,57.335582,32.136106,41.187159,67.639518,89.262413,89.262413,37.613009
4,MLP,without oversampling,56.699029,27.599244,37.12651,65.568645,89.063364,89.063364,34.404774
5,MLP,SMOTE,41.355754,74.385633,53.157717,71.993735,84.66217,84.66217,47.633941
6,MLP,SMOTE,39.766082,70.699433,50.901667,70.687514,84.042906,84.042906,44.750364
7,MLP,SMOTE,40.945728,72.022684,52.209661,71.50603,84.573703,84.573703,46.318048
8,MLP,SMOTE,39.35743,74.102079,51.409836,70.776746,83.611633,83.611633,45.737113
9,MLP,SMOTE,41.036955,70.321361,51.828631,71.369459,84.706403,84.706403,45.706936


In [None]:
# Print the mean and NumPy standard deviation of every metric column (positions 2-8)
# for each oversampling setting. Rows are addressed by position: 0-4, 5-9 and 10-14.
a = results_mlp
row_blocks = [
    ('without oversampling\n', slice(0, 5)),
    ('smote\n', slice(5, 10)),
    ('adasyn\n', slice(10, 15)),
]
for position, (heading, rows) in enumerate(row_blocks):
    if position:
        # blank separator line between consecutive groups
        print('')
    print(heading)
    for i in range(2, 9):
        column_values = a[a.columns[[i]]][rows].values
        print(a.columns[i], column_values.mean(), column_values.std())

without oversampling

precision 56.64711724758534 0.6825700814440037
recall 28.22306238185255 5.637138520428943
fmeasure 37.34654483564664 5.590901961786241
macro_fmeasure 65.68355917194776 2.8329334224649685
micro_fmeasure 89.08769213756497 0.20598100545172932
accuracy 89.08769213756497 0.20598100545172932
mcc 34.643521644380364 4.177941416494977

smote

precision 40.49238981771551 0.7827033519327645
recall 72.3062381852552 1.6823134564300195
fmeasure 51.90150245371226 0.7641666153993829
macro_fmeasure 71.26669664730517 0.4841340877617033
micro_fmeasure 84.31936304323786 0.42645396586196677
accuracy 84.31936304323787 0.42645396586196843
mcc 46.029280325039096 0.9469128366398671

adasyn

precision 37.62557528007078 1.6948125776273386
recall 75.51984877126654 1.8492698830087844
fmeasure 50.17787912963424 1.2048667552202694
macro_fmeasure 69.7494809395814 1.0029012587112363
micro_fmeasure 82.41955103394892 1.186989511476037
accuracy 82.41955103394892 1.186989511476037
mcc 44.644043693727

# Compare best model: Random Forest without and with feature selection
- explore different feature selection methods
- select best 10 features for "with feature selection" method

## Feature Selection exploration

In [None]:
# Candidate feature groups to score one at a time. Single-column features form their own
# group; columns sharing a prefix (job_*, marital_*, education_*) are kept together.
job_columns = [
    'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management',
    'job_retired', 'job_self-employed', 'job_services', 'job_student',
    'job_technician', 'job_unemployed',
]
subsets = [
    *([name] for name in ('age', 'balance', 'day', 'month', 'duration',
                          'campaign', 'pdays', 'previous')),
    job_columns,
    ['marital_married', 'marital_single'],
    ['education_secondary', 'education_tertiary'],
    *([name] for name in ('default_yes', 'housing_yes', 'loan_yes', 'contact_telephone')),
]

len(subsets)

15

In [None]:
# Feature selection using linear regression: fit one model per feature group on an 80/20
# split of the training data and record the score it reaches on the held-out 20%.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm

linear_scores = []
for subset in tqdm(subsets):
    train_x, test_x, train_y, test_y = train_test_split(
        x_train[subset], y_train, test_size=0.2, shuffle=True, random_state=42
    )
    model = LinearRegression()
    clf = model.fit(train_x.to_numpy(), train_y)
    linear_scores.append([subset, clf.score(test_x.to_numpy(), test_y)])

# Show the groups ordered by the magnitude of their score, weakest first.
sorted(linear_scores, key=lambda entry: abs(entry[1]))
# Linear regression is not good enough to tell us which features to use; it also means
# each feature on its own has low linearity with the target y.

100%|██████████| 15/15 [00:00<00:00, 91.90it/s]


[[['contact_telephone'], -7.970339405649618e-05],
 [['age'], -0.00030878515982291077],
 [['default_yes'], 0.00037682079400802593],
 [['day'], 0.0005379690174296403],
 [['balance'], 0.000723224129287714],
 [['month'], 0.0008019785347432862],
 [['education_secondary', 'education_tertiary'], 0.0028663158529979027],
 [['campaign'], 0.004254446722601557],
 [['loan_yes'], 0.004751166239698179],
 [['marital_married', 'marital_single'], 0.006186618422328816],
 [['previous'], 0.009872794539235952],
 [['pdays'], 0.011940358023751818],
 [['housing_yes'], 0.014894753008829142],
 [['job_blue-collar',
   'job_entrepreneur',
   'job_housemaid',
   'job_management',
   'job_retired',
   'job_self-employed',
   'job_services',
   'job_student',
   'job_technician',
   'job_unemployed'],
  0.0174435261444007],
 [['duration'], 0.16018050138462492]]

Linear regression is not good enough to tell us which features to use. It also means that each feature on its own has low linearity with the target y.

In [None]:
# Feature selection using XGBoost: fit one classifier per feature group on an 80/20 split
# of the training data and keep the score it reaches on the held-out 20%, so the groups
# that produce the best-scoring models can be selected.
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm

boost_scores = []
for subset in tqdm(subsets):
    train_x, test_x, train_y, test_y = train_test_split(
        x_train[subset], y_train, test_size=0.2, shuffle=True, random_state=42
    )
    model = XGBClassifier(n_estimators=100, random_state=42)
    clf = model.fit(train_x.to_numpy(), train_y)
    boost_scores.append([subset, clf.score(test_x.to_numpy(), test_y)])

# Show the groups ordered by the magnitude of their score, weakest first.
sorted(boost_scores, key=lambda entry: abs(entry[1]))

100%|██████████| 15/15 [00:05<00:00,  2.95it/s]


[[['age'], 0.8858169753939729],
 [['balance'], 0.8858169753939729],
 [['day'], 0.8859552115012441],
 [['campaign'], 0.8859552115012441],
 [['previous'], 0.8859552115012441],
 [['job_blue-collar',
   'job_entrepreneur',
   'job_housemaid',
   'job_management',
   'job_retired',
   'job_self-employed',
   'job_services',
   'job_student',
   'job_technician',
   'job_unemployed'],
  0.8859552115012441],
 [['marital_married', 'marital_single'], 0.8859552115012441],
 [['education_secondary', 'education_tertiary'], 0.8859552115012441],
 [['default_yes'], 0.8859552115012441],
 [['housing_yes'], 0.8859552115012441],
 [['loan_yes'], 0.8859552115012441],
 [['contact_telephone'], 0.8859552115012441],
 [['month'], 0.8874758086812276],
 [['pdays'], 0.888443461432126],
 [['duration'], 0.8887199336466685]]

In [None]:
# Look at the 10 best-scoring feature groups from the random forest classifier.
# NOTE(review): forest_scores is not defined in the cells visible around here — confirm
# the cell that builds it still exists. It is presumably a list of [feature_list, score]
# pairs like linear_scores / boost_scores.
# Such pairs are ragged, so dtype=object is passed explicitly: without it NumPy emits a
# VisibleDeprecationWarning on older releases and raises ValueError from NumPy 1.24 on.
np.array(sorted(forest_scores, key=lambda entry: entry[1])[-10:], dtype=object)[:, 0]

  np.array(sorted(forest_scores,key=lambda x: x[1])[-10:])[:,0]


array([list(['campaign']),
       list(['job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed']),
       list(['marital_married', 'marital_single']),
       list(['education_secondary', 'education_tertiary']),
       list(['default_yes']), list(['housing_yes']), list(['loan_yes']),
       list(['contact_telephone']), list(['month']), list(['pdays'])],
      dtype=object)

In [None]:
# The 10 original (pre-encoding) feature names behind the top-10 groups from the random
# forest ranking. The last entry was misspelled "pdsays"; the feature is "pdays".
best_10_from_forest = [
    "campaign", "job", "marital", "education", "default",
    "housing", "loan", "contact", "month", "pdays",
]

In [None]:
# Look at the 10 best-scoring feature groups from the XGBoost classifier (boost_scores).
# Let's take this ranking, because duration also has the highest score in the linear
# regression experiment, so it should be included. It also makes sense: if you are on the
# call longer, you are more likely to subscribe to a term deposit (variable y).
# Each entry is a ragged [feature_list, score] pair, so dtype=object is passed explicitly:
# without it NumPy emits a VisibleDeprecationWarning on older releases and raises
# ValueError from NumPy 1.24 on.
np.array(sorted(boost_scores, key=lambda entry: entry[1])[-10:], dtype=object)[:, 0]

  np.array(sorted(boost_scores,key=lambda x: x[1])[-10:])[:,0]


array([list(['job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed']),
       list(['marital_married', 'marital_single']),
       list(['education_secondary', 'education_tertiary']),
       list(['default_yes']), list(['housing_yes']), list(['loan_yes']),
       list(['contact_telephone']), list(['month']), list(['pdays']),
       list(['duration'])], dtype=object)

In [None]:
# The 10 original (pre-encoding) feature names behind the top-10 groups from the boost ranking.
best_10_from_boost = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "pdays",
    "duration",
]

Shortlist the final 10 features to use (listed below by their encoded column names, so the 10 features expand to 21 columns).

In [None]:
# Encoded column names of the shortlisted features, grouped by the original feature.
shortlisted_features = [
    # job
    'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management',
    'job_retired', 'job_self-employed', 'job_services', 'job_student',
    'job_technician', 'job_unemployed',
    # marital
    "marital_married", "marital_single",
    # education
    "education_tertiary", "education_secondary",
    # binary flags
    "default_yes", "housing_yes", "loan_yes", "contact_telephone",
    # single-column features
    "month", "pdays", "duration",
]

## comparison using Random Forest Classifier
- our best model
- the only method for which we experimented with feature selection, as a baseline comparison

In [None]:
# Fresh accumulator for the Random Forest comparison: one independent, empty list per
# result column, including the extra 'feature_selection' flag.
store_dict = {
    column: []
    for column in (
        'model',
        'oversampling',
        'feature_selection',
        'precision',
        'recall',
        'fmeasure',
        'macro_fmeasure',
        'micro_fmeasure',
        'accuracy',
        'mcc',
    )
}

# Seeds for the repeated runs, and the label written to the 'model' column.
RANDOM_STATES = list(range(1, 6))
method = "Random Forest"

### Without Oversampling

In [None]:
oversampling_method = "without oversampling"
feature_selection = "no"

# Baseline: one Random Forest per seed, fitted on the training data as-is with all features.
# Iterating RANDOM_STATES directly (instead of zipping with range(5)) lets tqdm know the total.
for RANDOM_STATE in tqdm(RANDOM_STATES):
    classifier = RandomForestClassifier(n_estimators=N_ESTIMATORS, criterion='entropy', random_state=RANDOM_STATE)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # metrics() is declared as (y_true, y_pred, target_names, iter_number) with no default for
    # iter_number, so the seed is passed as the iteration id, as in the other sections.
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    store_dict['feature_selection'].append(feature_selection)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

# pd.DataFrame.from_dict(store_dict)

5it [00:10,  2.10s/it]


### Oversampling Using SMOTE without feature selection
- use all features

In [None]:
oversampling_method = "SMOTE"
feature_selection = "no"

# One Random Forest per seed, fitted on SMOTE-resampled training data with all features.
# NOTE(review): x_train_smote / y_train_smote are not recomputed here; they are whatever the
# last SMOTE resampling cell left behind (the final-seed resample of the MLP section if the
# cells are run in order), so only the forest's seed varies between runs.
for RANDOM_STATE in tqdm(RANDOM_STATES):
    classifier = RandomForestClassifier(n_estimators=N_ESTIMATORS, criterion='entropy', random_state=RANDOM_STATE)
    classifier.fit(x_train_smote, y_train_smote)
    y_pred = classifier.predict(x_test)

    # metrics() is declared as (y_true, y_pred, target_names, iter_number) with no default for
    # iter_number, so the seed is passed as the iteration id, as in the other sections.
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    store_dict['feature_selection'].append(feature_selection)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

# pd.DataFrame.from_dict(store_dict)

5it [00:25,  5.16s/it]


### Oversampling with ADASYN without feature selection
- use all features

In [None]:
oversampling_method = "ADASYN"
feature_selection = "no"

# One Random Forest per seed, fitted on ADASYN-resampled training data with all features.
# NOTE(review): x_train_adasyn / y_train_adasyn are not recomputed here; they are whatever the
# last ADASYN resampling cell left behind (the final-seed resample of the MLP section if the
# cells are run in order), so only the forest's seed varies between runs.
for RANDOM_STATE in tqdm(RANDOM_STATES):
    classifier = RandomForestClassifier(n_estimators=N_ESTIMATORS, criterion='entropy', random_state=RANDOM_STATE)
    # Fix: this cell used to fit on x_train / y_train, so the rows labelled "ADASYN" were
    # really un-oversampled runs. Fit on the ADASYN-resampled data instead.
    classifier.fit(x_train_adasyn, y_train_adasyn)
    y_pred = classifier.predict(x_test)

    # metrics() is declared as (y_true, y_pred, target_names, iter_number) with no default for
    # iter_number, so the seed is passed as the iteration id, as in the other sections.
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    store_dict['feature_selection'].append(feature_selection)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

# pd.DataFrame.from_dict(store_dict)

5it [00:10,  2.08s/it]


### Oversampling Using SMOTE with feature selection

In [None]:
oversampling_method = "SMOTE"
feature_selection = "yes"

# Restrict both the SMOTE-resampled training data and the test data to the shortlisted columns.
x_train_smote_short = x_train_smote[shortlisted_features]
x_test_short = x_test[shortlisted_features]

# One Random Forest per seed on the reduced feature set.
for RANDOM_STATE in tqdm(RANDOM_STATES):
    classifier = RandomForestClassifier(n_estimators=N_ESTIMATORS, criterion='entropy', random_state=RANDOM_STATE)
    classifier.fit(x_train_smote_short, y_train_smote)
    y_pred = classifier.predict(x_test_short)

    # metrics() is declared as (y_true, y_pred, target_names, iter_number) with no default for
    # iter_number, so the seed is passed as the iteration id, as in the other sections.
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    store_dict['feature_selection'].append(feature_selection)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

# pd.DataFrame.from_dict(store_dict)

5it [00:16,  3.28s/it]


### Oversampling with ADASYN with feature selection

In [None]:
oversampling_method = "ADASYN"
feature_selection = "yes"

# Restrict both the ADASYN-resampled training data and the test data to the shortlisted columns.
x_train_adasyn_short = x_train_adasyn[shortlisted_features]
x_test_short = x_test[shortlisted_features]

# One Random Forest per seed on the reduced feature set.
for RANDOM_STATE in tqdm(RANDOM_STATES):
    classifier = RandomForestClassifier(n_estimators=N_ESTIMATORS, criterion='entropy', random_state=RANDOM_STATE)
    classifier.fit(x_train_adasyn_short, y_train_adasyn)
    y_pred = classifier.predict(x_test_short)

    # metrics() is declared as (y_true, y_pred, target_names, iter_number) with no default for
    # iter_number, so the seed is passed as the iteration id, as in the other sections.
    metrics_ = metrics(y_test, y_pred, target_names, RANDOM_STATE)
    store_dict['model'].append(method)
    store_dict['oversampling'].append(oversampling_method)
    store_dict['feature_selection'].append(feature_selection)
    for metric_name in ('precision', 'recall', 'fmeasure', 'macro_fmeasure',
                        'micro_fmeasure', 'accuracy', 'mcc'):
        store_dict[metric_name].append(metrics_[metric_name])

# pd.DataFrame.from_dict(store_dict)

5it [00:16,  3.21s/it]


## compare results with and without feature selection
- Conclusion: our best model, the Random Forest classifier, performs better without feature selection, i.e. when all the features are used to train the model

In [None]:
# Display every Random Forest run collected so far (with and without feature selection) as one table.
pd.DataFrame(store_dict)

Unnamed: 0,model,oversampling,feature_selection,precision,recall,fmeasure,macro_fmeasure,micro_fmeasure,accuracy,mcc
0,Random Forest,without oversampling,no,61.835749,36.294896,45.741513,70.094502,89.92591,89.92591,42.356581
1,Random Forest,without oversampling,no,63.504823,37.334594,47.02381,70.799482,90.158133,90.158133,43.804494
2,Random Forest,without oversampling,no,61.111111,35.349716,44.790419,69.586974,89.804268,89.804268,41.418659
3,Random Forest,without oversampling,no,61.047463,35.255198,44.697424,69.537601,89.79321,89.79321,41.329047
4,Random Forest,without oversampling,no,62.541254,35.822306,45.552885,70.017948,89.981201,89.981201,42.392957
5,Random Forest,SMOTE,no,51.729819,59.357278,55.28169,74.428502,88.76479,88.76479,49.042859
6,Random Forest,SMOTE,no,52.784504,61.814745,56.94384,75.339993,89.063364,89.063364,50.932023
7,Random Forest,SMOTE,no,52.384868,60.20794,56.024626,74.850158,88.941723,88.941723,49.892806
8,Random Forest,SMOTE,no,51.483561,60.680529,55.704989,74.617592,88.709499,88.709499,49.503618
9,Random Forest,SMOTE,no,53.097345,62.381853,57.366362,75.575801,89.15183,89.15183,51.415799
