In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

datos = pd.read_csv('data.csv')

# 'PRED' is the target column; every other column is used as a feature.
X = datos.drop(['PRED'], axis=1)
y = datos['PRED']

# Split BEFORE scaling. Fitting the scaler on the full data set would let the
# mean/variance of the held-out rows leak into the training features and make
# the test metrics optimistic. The split itself is unchanged (same sizes and
# random_state), only the order of the two steps differs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)                  # statistics come from the training rows only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)    # test rows reuse the training statistics

# X stays a standardized array, as it was before this change, but it is now
# scaled with the training-set statistics.
X = scaler.transform(X)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier


# The base learners are only configured here, not fitted. VotingClassifier.fit()
# clones every estimator and fits the clone, so fitting them up front would
# train each model twice and store both fitted copies in the pickle below.
model1 = RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=15,
                       max_leaf_nodes=50, min_samples_leaf=4,
                       min_samples_split=10, n_estimators=800, random_state=42)

# NOTE(review): per the scikit-learn docs, beta_1, beta_2 and epsilon are only
# used by solver='adam'; with solver='sgd' they look inert - confirm they are
# intentional. They are kept so the estimator's parameters stay unchanged.
model2 = MLPClassifier(alpha=0.012273800987852959, beta_1=0.2788441133807552,
              beta_2=0.10551659500647881, epsilon=8.254614284548341e-08,
              hidden_layer_sizes=(850,), learning_rate='adaptive',
              learning_rate_init=0.014477786306928656,
              momentum=0.6453639773029103, random_state=42, solver='sgd')

# Soft voting averages the predicted class probabilities of both learners.
model = ensemble_clf = VotingClassifier(estimators=[('rf', model1), ('ann', model2)], voting='soft').fit(X_train, y_train)

# Keep model1/model2 usable as fitted estimators by pointing them at the clones
# the ensemble trained (same hyper-parameters, same random_state, same data).
# NOTE(review): the ensemble label-encodes y before fitting its members, so
# these clones predict encoded labels - identical to the originals only if the
# classes are already 0..n-1; confirm against the 'PRED' column.
model1 = model.named_estimators_['rf']
model2 = model.named_estimators_['ann']

# Save model
# Only the ensemble is persisted; the StandardScaler applied to the features
# earlier in the notebook is not part of this file.
import joblib
joblib.dump(model, 'model.pkl')

# Cross-validation
def confusion_matrix_scorer(model, X_train, y_train):
    """Score a fitted classifier by the four cells of its confusion matrix.

    Written as a scikit-learn scoring callable: it is handed a fitted
    estimator plus the samples and labels to score, and returns a dict so that
    every cell is reported as its own metric.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``.
    X_train : samples to score. Despite the name, this is whatever split the
        caller passes in.
    y_train : true labels for ``X_train``.

    Returns
    -------
    dict with keys ``'tn'``, ``'fp'``, ``'fn'``, ``'tp'`` taken from the
    top-left 2x2 corner of the confusion matrix.
    """
    y_pred = model.predict(X_train)
    # Pin the label order to the classifier's own classes. Without it, a split
    # that contains a single class produces a 1x1 matrix and the indexing
    # below raises IndexError. Falls back to the default when the estimator
    # has no classes_ attribute.
    cm = confusion_matrix(y_train, y_pred, labels=getattr(model, 'classes_', None))
    return {'tn': cm[0, 0], 'fp': cm[0, 1],
            'fn': cm[1, 0], 'tp': cm[1, 1]}
        
# 10-fold cross-validation on the training split; the scorer reports the
# confusion-matrix cells of each fold under 'test_<cell>'.
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    scoring=confusion_matrix_scorer,
    cv=10,
)

# Average every cell over the folds: true positives, false negatives,
# false positives and true negatives, in that order.
TP, FN, FP, TN = (
    cv_results[cell_key].mean()
    for cell_key in ('test_tp', 'test_fn', 'test_fp', 'test_tn')
)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
# Out-of-fold class probabilities for the training split, one row per sample.
y_scores = cross_val_predict(model, X_train, y_train, cv=10, method='predict_proba')

# cross_val_predict returns a single pooled set of scores, so there is exactly
# one ROC curve to compute. Repeating the identical computation once per fold
# and averaging the copies only reproduces this same curve.
# Column 1 is treated as the positive class, matching pos_label=1.
fpr, tpr, _ = roc_curve(y_train, y_scores[:, 1], pos_label=1)
roc_auc = auc(fpr, tpr)

# The mean_* names are kept because the plot below and the training-metrics
# cell (which prints mean_auc) read them.
mean_fpr, mean_tpr, mean_auc = fpr, tpr, roc_auc


fig, ax = plt.subplots()
ax.plot(mean_fpr, mean_tpr, color='darkblue', label=f'Mean ROC (AUC = {mean_auc:.2f})')
# Chance diagonal. The colour is given once: combining the 'k--' format string
# with color="red" defines it twice and makes matplotlib emit a warning.
ax.plot([0, 1], [0, 1], linestyle='--', color="red")
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curve on training dataset')
ax.legend()
plt.savefig('hy_paac_training_remove.png', dpi=1200)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Probability of the second class (column 1) for every held-out sample.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# ROC operating points on the test split and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Draw the curve together with the chance diagonal, then write it to disk.
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, color='darkgreen', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='red', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curve on testing dataset')
ax.legend(loc="lower right")
fig.savefig('hy_paac_testing_remove.png', dpi=1200)
plt.show()

In [None]:
####TRAINING###
import numpy as np

# Metrics derived from the fold-averaged confusion-matrix cells (TP, TN, FP, FN).
correct = TP+TN
total = correct+FP+FN
errors = FP + FN

acurracy = correct / total                # share of correct predictions
precision = TP / (TP + FP)                # predicted positives that are real
sensitivity_recall = TP / (TP + FN)       # real positives that were found
specificity = TN / (FP + TN)              # real negatives that were found
F1_score = 2*TP / ((2*TP) + errors)       # harmonic mean of precision and recall

def cohen_kappa(TP, FP, TN, FN):
    """Cohen's kappa for a 2x2 confusion matrix.

    Parameters
    ----------
    TP, FP, TN, FN : numbers
        True-positive, false-positive, true-negative and false-negative
        counts (fold averages are accepted as well).

    Returns
    -------
    float
        ``(Po - Pe) / (1 - Pe)``, where ``Po`` is the observed agreement and
        ``Pe`` the agreement expected by chance. Returns NaN when kappa is
        undefined: no samples at all, or chance agreement of exactly 1
        (every sample falls in a single cell).
    """
    N = TP + FP + TN + FN
    if N == 0:
        # No samples: both agreement rates are undefined.
        return float('nan')
    Po = (TP + TN) / N
    Pe = ((TP + FP) * (TP + FN) + (TN + FP) * (TN + FN)) / N**2
    chance_gap = 1 - Pe
    if chance_gap == 0:
        # Plain Python numbers would raise ZeroDivisionError here; NaN is what
        # NumPy scalars already produce for this 0/0 case.
        return float('nan')
    return (Po - Pe) / chance_gap

# Agreement beyond chance, from the fold-averaged confusion-matrix cells.
kappa = cohen_kappa(TP=TP, FP=FP, TN=TN, FN=FN)

import math
# Matthews correlation coefficient: covariance of predictions and labels
# divided by the geometric mean of the four marginal totals.
mcc_numerator = (TP*TN) - (FP*FN)
mcc_denominator = math.sqrt(((TP+FP)*(TP+FN))*((TN+FP)*(TN+FN)))
MCC = mcc_numerator / mcc_denominator

# Report every training (cross-validation) metric, one per line.
for label, value in [
    ("Accuracy: ", acurracy),
    ("F1_score: ", F1_score),
    ("Precision: ", precision),
    ("Specificity: ", specificity),
    ("Sensitivity/Recall: ", sensitivity_recall),
    ("Kappa: ", kappa),
    ("MCC: ", MCC),
    ("Mean_AUC: ", mean_auc),
]:
    print(label, value)

In [None]:
####TESTING###
from sklearn.metrics import classification_report

# Hard class predictions for the held-out split.
pred_test = model.predict(X_test)

# Confusion matrix with true labels on the rows and predictions on the columns.
conf = confusion_matrix(y_test, pred_test)
TNt, FPt, FNt, TPt = conf[0, 0], conf[0, 1], conf[1, 0], conf[1, 1]

# Test-set metrics derived from the four cells.
correct_t = TPt+TNt
total_t = correct_t+FPt+FNt
errors_t = FPt + FNt

acurracy = correct_t / total_t               # share of correct predictions
precision = TPt / (TPt + FPt)                # predicted positives that are real
sensitivity_recall = TPt / (TPt + FNt)       # real positives that were found
specificity = TNt / (FPt + TNt)              # real negatives that were found
F1_score = 2*TPt / ((2*TPt) + errors_t)      # harmonic mean of precision and recall

def cohen_kappa(TPt, FPt, TNt, FNt):
    """Cohen's kappa for a 2x2 confusion matrix of test-set counts.

    Parameters
    ----------
    TPt, FPt, TNt, FNt : numbers
        True-positive, false-positive, true-negative and false-negative
        counts.

    Returns
    -------
    float
        ``(Po - Pe) / (1 - Pe)``, where ``Po`` is the observed agreement and
        ``Pe`` the agreement expected by chance. Returns NaN when kappa is
        undefined: no samples at all, or chance agreement of exactly 1
        (every sample falls in a single cell).
    """
    N = TPt + FPt + TNt + FNt
    if N == 0:
        # No samples: both agreement rates are undefined.
        return float('nan')
    Po = (TPt + TNt) / N
    Pe = ((TPt + FPt) * (TPt + FNt) + (TNt + FPt) * (TNt + FNt)) / N**2
    chance_gap = 1 - Pe
    if chance_gap == 0:
        # Plain Python numbers would raise ZeroDivisionError here; NaN is what
        # NumPy scalars already produce for this 0/0 case.
        return float('nan')
    return (Po - Pe) / chance_gap

# Agreement beyond chance on the held-out split.
kappa = cohen_kappa(TPt, FPt, TNt, FNt)

import math
# The cells come from sklearn's confusion_matrix, i.e. fixed-width NumPy
# integers (presumably int64 - confirm). The MCC denominator multiplies four
# marginal totals, which can silently wrap around on large test sets, so the
# arithmetic is done with Python's arbitrary-precision ints instead.
tp, fp, tn, fn = (int(cell) for cell in (TPt, FPt, TNt, FNt))
mcc_numerator = (tp*tn) - (fp*fn)
mcc_denominator = math.sqrt(((tp+fp)*(tp+fn))*((tn+fp)*(tn+fn)))
# An empty row or column of the confusion matrix makes the denominator zero
# and MCC undefined; report NaN (the value the NumPy expression produced for
# this 0/0 case) rather than dividing by zero.
MCC = mcc_numerator / mcc_denominator if mcc_denominator else float('nan')

print("Accuracy: ", acurracy)
print("F1_score: ", F1_score)
print("Precision: ", precision)
print("Specificity: ", specificity)
print("Sensitivity/Recall: ", sensitivity_recall)
print("Kappa: ", kappa)
print("MCC: ", MCC)
print("AUC: ", roc_auc)