# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report, recall_score, precision_score, accuracy_score, f1_score
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict
import plotly.graph_objects as go
import numpy as np
encoder = LabelEncoder()
smote = SMOTE(sampling_strategy="minority")
data = pd.read_csv('C:/Users/joao_/Desktop/4 ANO JOAO/SINO/VS CODE PROJETO/Jupyter Notebook/Data Transformation/bank-full-transformed.csv')
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,campaign,pdays,previous,y
0,50,management,married,tertiary,0,0,1,0,0,Primavera,1,-1,0,0
1,40,technician,single,secondary,0,0,1,0,0,Primavera,1,-1,0,0
2,30,entrepreneur,married,secondary,0,0,1,1,0,Primavera,1,-1,0,0
3,40,blue-collar,married,other,0,0,1,0,0,Primavera,1,-1,0,0
4,30,other,single,other,0,0,0,0,0,Primavera,1,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45185,50,technician,married,tertiary,0,0,0,0,1,Outono,2,-1,0,1
45186,60,retired,divorced,primary,0,0,0,0,1,Outono,2,-1,0,1
45187,60,retired,married,secondary,0,2,0,0,1,Outono,2,4,2,1
45188,50,blue-collar,married,secondary,0,0,0,0,1,Outono,2,-1,0,0


# Preprocessing Data

X = df[["marital", "education", "default", "housing", "loan", "age_range","season","period" ,"poutcome_binary", "balance_normal", "duration_normal", "campaign_normal"]]
X['marital'] = encoder.fit_transform(X['marital'])
X["education"] = encoder.fit_transform(X["education"])
X["default"] = encoder.fit_transform(X["default"])
X["housing"] = encoder.fit_transform(X["housing"])
X["loan"] = encoder.fit_transform(X["loan"])
X["season"] = encoder.fit_transform(X["season"])
X["period"] = encoder.fit_transform(X["period"])
X["poutcome_binary"] = encoder.fit_transform(X["poutcome_binary"])

In [2]:
def encode(data_train, col):
    return pd.concat([data, pd.get_dummies(col, prefix=col.name)], axis=1)

data = encode(data, data.job)
data = encode(data, data.marital)
data = encode(data, data.education)
data = encode(data, data.month)


data.drop(['job', "marital", "education", "month"], axis=1, inplace=True)

In [3]:
X = data.drop(["y", "pdays"], axis =1)
y = data["y"]

# Models Creation

In [4]:
models = []
models.append(('Decision tree', DecisionTreeClassifier(criterion = 'entropy', max_depth = 3, min_samples_leaf = 13)))
models.append(("Logistic Regression", LogisticRegression(C = 0.01, max_iter = 100, penalty =  'l1', solver =  'liblinear')))
models.append(("MLP", MLPClassifier(max_iter = 100, solver = 'sgd',learning_rate =  'adaptive', hidden_layer_sizes =  (20,), alpha =  0.05, activation =  'relu')))
models.append(("Random Forest", RandomForestClassifier(n_estimators = 30, min_samples_split = 0.6, max_features =  'auto', max_depth=16)))
models.append(("Gaussian Naive Bayes", GaussianNB(var_smoothing = 0.3511191734215131)))
models.append(("K-Nearest Neighbours", KNeighborsClassifier(n_neighbors = 27, leaf_size = 25, p = 1)))
knn = KNeighborsClassifier(n_neighbors = 27, leaf_size = 25, p = 1)

# Model evaluation

## Train test split

### Metrics

In [5]:
def get_confusion_matrix_values(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    return(cm, cm[0][0], cm[0][1], cm[1][0], cm[1][1])

In [6]:
X = X.to_numpy()
y = y.to_numpy()

In [7]:
# cv = StratifiedKFold(n_splits=5)

# #cv = KFold(n_splits=10)
# for train_idx, test_idx, in cv.split(X, y):
#     X_train, y_train = X[train_idx], y[train_idx]
#     X_test, y_test = X[test_idx], y[test_idx]
#     X_train, y_train = smote.fit_sample(X_train, y_train)
#     model = knn
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     print(f'For fold {fold}:')
#     print(f'Accuracy: {model.score(X_test, y_test)}')
#     print(f'f-score: {f1_score(y_test, y_pred)}')

In [8]:
kf = KFold(n_splits=10)
accuracy = []
sensivity = []
specifity = []
precision = []
f_measure = []
accuracy_module = []
sensivity_module = []
precision_module = []
f_measure_module = []
auc_list = []
for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    X_train_oversampled, y_train_oversampled = smote.fit_sample(X_train, y_train)
    model = knn 
    model.fit(X_train_oversampled, y_train_oversampled)
    y_pred = model.predict(X_test)
    confusion, TP, FP, FN, TN = get_confusion_matrix_values(y_test, y_pred)
    accuracy_matrix = round(((TP + TN) / float(TP + FN + FP + TN)) * 100, 2)
    specificity = round(TN / float(TN + FP) * 100, 2)
    sensibility_matrix = round((TP / float(TP + FN)) * 100, 2)
    precision_matrix = round((TP / float(TP + FP))  * 100, 2)
    f_measure_matrix = (2 * precision_matrix * sensibility_matrix) / (precision_matrix + sensibility_matrix)
    f_measure_matrix = round(f_measure_matrix, 2)
    accuracy_mod = round(accuracy_score(y_test, y_pred) * 100 , 2)
    sensibility_mod = round(recall_score(y_test, y_pred, average = 'weighted') * 100, 2)
    precision_mod = round(precision_score(y_test, y_pred, average = 'weighted') * 100, 2)
    f_measure_mod = round(f1_score(y_test, y_pred, average = 'weighted') * 100 , 2)
    y_pred_proba = model.predict_proba(X_test)[::,1]
    auc = roc_auc_score(y_test, y_pred_proba)
    auc_list.append(auc)
    accuracy.append(accuracy_matrix)
    sensivity.append(sensibility_matrix)
    specifity.append(specificity)
    precision.append(precision_matrix)
    f_measure.append(f_measure_matrix)
    accuracy_module.append(accuracy_mod)
    sensivity_module.append(sensibility_mod)
    precision_module.append(precision_mod)
    f_measure_module.append(f_measure_mod)

In [9]:
# scores = [("Acuidade", np.mean(accuracy)), ("Sensibilidade", np.mean(sensivity)), ("Especificidade", np.mean(specifity)), ("Precisão", np.mean(precision)), ("F-Measure", np.mean(f_measure)),("Acuidade Modulo", np.mean(accuracy_module)), ("Sensibilidade Modulo", np.mean(sensivity_module)), ("Precisão Modulo", np.mean(precision_module)), ("F-Measure Modulo", np.mean(f_measure_module))]
# scores

In [10]:
scores = [("Acuidade", np.mean(accuracy)), ("Sensibilidade", np.mean(sensivity)), ("Especificidade", np.mean(specifity)), ("Precisão", np.mean(precision)), ("F-Measure", np.mean(f_measure)), ("AUC", np.mean(auc_list))]
scores

[('Acuidade', 69.811),
 ('Sensibilidade', 89.53),
 ('Especificidade', 14.459999999999999),
 ('Precisão', 71.551),
 ('F-Measure', 78.92699999999999),
 ('AUC', 0.5576790141008238)]

In [11]:
scores = [("Acuidade Modulo", np.mean(accuracy_module)), ("Sensibilidade Modulo", np.mean(sensivity_module)), ("Precisão Modulo", np.mean(precision_module)), ("F-Measure Modulo", np.mean(f_measure_module))]
scores

[('Acuidade Modulo', 69.811),
 ('Sensibilidade Modulo', 69.811),
 ('Precisão Modulo', 83.855),
 ('F-Measure Modulo', 74.90100000000001)]