In [None]:
# Headers
import pandas as pd # pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu, wilcoxon, friedmanchisquare # mannwhitneyu and wilcoxon
import scikit_posthocs as sp
from sklearn.linear_model import Perceptron         # perceptron
from sklearn.neighbors import KNeighborsClassifier  # knn
from sklearn.tree import DecisionTreeClassifier     # decision tree
from sklearn.linear_model import LogisticRegression # logistic regression
from xgboost import XGBClassifier			        # xgboost
from sklearn.model_selection import cross_val_score, StratifiedKFold # cross-validation and kfold
from sklearn.pipeline import Pipeline # pipeline
from sklearn.preprocessing import MinMaxScaler # minmaxscaler

In [None]:
# Loads the CSV file "treated_train.csv" (path relative to the working directory) into a DataFrame
dataset = pd.read_csv(filepath_or_buffer="treated_train.csv")

In [None]:
# Shows the loaded table: a bare trailing expression is rendered by the notebook with its rich repr
dataset

In [None]:
# Repeated stratified k-fold cross-validation of a scaled model; returns one mean score per run
def run_validation(model, X, y, scoring_metric, k, n_runs):
    """Score `model` with repeated, shuffled stratified k-fold cross-validation.

    The model is placed behind a MinMaxScaler in a single pipeline, and that
    pipeline (not the bare model) is what gets cross-validated.

    Parameters
    ----------
    model : estimator used as the final step of the pipeline.
    X, y : features and target, forwarded unchanged to cross_val_score.
    scoring_metric : value passed as `scoring` to cross_val_score.
    k : number of folds of each StratifiedKFold.
    n_runs : number of repetitions; run number i shuffles with random_state=i.

    Returns
    -------
    numpy.ndarray holding n_runs values, each one the mean of the fold
    scores obtained in the corresponding run.
    """
    scaled_model = Pipeline(steps=[("scaler", MinMaxScaler()), ("model", model)])

    def mean_fold_score(seed):
        # Each run uses its own seed, so the shuffled fold assignment differs between runs.
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        fold_scores = cross_val_score(scaled_model, X, y, cv=folds, scoring=scoring_metric)
        return fold_scores.mean()

    return np.array([mean_fold_score(seed) for seed in range(n_runs)])

In [None]:
# Seed shared by every estimator below that accepts one. Accuracy and F1 are
# computed in separate cross-validation passes, so the estimators must be
# deterministic for both metrics to describe the same fitted models and for
# a re-run of the notebook to reproduce the same numbers.
MODEL_SEED = 0

# KNN with 7 neighbours (no random_state is passed: none is needed here)
knn = KNeighborsClassifier(n_neighbors=7)

# Logistic Regression with max_iter = 5000
reglog = LogisticRegression(max_iter=5000)

# Perceptron with max_iter = 2000 and tol = 1e-3; the seed is spelled out
# explicitly (0 is also scikit-learn's documented default for Perceptron)
perceptron = Perceptron(max_iter=2000, tol=1e-3, random_state=MODEL_SEED)

# Decision Tree with max_depth = 5; seeded because an unseeded tree may break
# ties between equally good splits differently from one fit to the next
arvdec = DecisionTreeClassifier(max_depth=5, random_state=MODEL_SEED)

# XGBoost with n_estimators=2, max_depth=2, learning_rate=1 and
# objective='binary:logistic', seeded like the other estimators
bst = XGBClassifier(
    n_estimators=2,
    max_depth=2,
    learning_rate=1,
    objective='binary:logistic',
    random_state=MODEL_SEED,
)


In [None]:
# Separates features and target: every column except 'route_changed' is a feature
X = dataset.drop('route_changed', axis='columns')
y = dataset['route_changed']

# Cross-validation parameters: number of folds and number of repeated runs
k, n_runs = 10, 101

In [None]:
# Cross-validates KNN once per metric; results unpack in the order (accuracy, F1)
knn_accuracy, knn_f1 = [
    run_validation(knn, X, y, metric, k, n_runs) for metric in ('accuracy', 'f1')
]

In [None]:
# Cross-validates logistic regression once per metric; results unpack in the order (accuracy, F1)
reglog_accuracy, reglog_f1 = [
    run_validation(reglog, X, y, metric, k, n_runs) for metric in ('accuracy', 'f1')
]

In [None]:
# Cross-validates the perceptron once per metric; results unpack in the order (accuracy, F1)
perceptron_accuracy, perceptron_f1 = [
    run_validation(perceptron, X, y, metric, k, n_runs) for metric in ('accuracy', 'f1')
]


In [None]:
# Cross-validates the decision tree once per metric; results unpack in the order (accuracy, F1)
arvdec_accuracy, arvdec_f1 = [
    run_validation(arvdec, X, y, metric, k, n_runs) for metric in ('accuracy', 'f1')
]

In [None]:
# Cross-validates XGBoost once per metric; results unpack in the order (accuracy, F1)
bst_accuracy, bst_f1 = [
    run_validation(bst, X, y, metric, k, n_runs) for metric in ('accuracy', 'f1')
]

In [None]:
# Plots the accuracy distributions of all models as overlaid histograms on one set of axes
fig, ax = plt.subplots()

# (scores, legend label) pairs, drawn in this order
accuracy_histograms = [
    (knn_accuracy, "Acurácia do KNN"),
    (reglog_accuracy, "Acurácia da Reg. Log."),
    (perceptron_accuracy, "Acurácia do Perceptron"),
    (arvdec_accuracy, "Acurácia da Árvore de Decisão"),
    (bst_accuracy, "Acurácia do XGBoost"),
]
for model_scores, legend_label in accuracy_histograms:
    # Same bin count and transparency for every model so overlapping bars stay visible
    ax.hist(model_scores, bins=30, alpha=0.5, label=legend_label)

# Title and axis labels let the figure be read on its own; the text is in
# Portuguese to match the legend entries
ax.set(title="Distribuição da acurácia por modelo", xlabel="Acurácia", ylabel="Frequência")
ax.legend()
plt.show()

In [None]:
# Plots the F1-score distributions of all models as overlaid histograms on one set of axes
fig, ax = plt.subplots()

# (scores, legend label) pairs, drawn in this order
f1_histograms = [
    (knn_f1, "f1-score do KNN"),
    (reglog_f1, "f1-score da Reg. Log."),
    (perceptron_f1, "f1-score do Perceptron"),
    (arvdec_f1, "f1-score da Árvore de Decisão"),
    (bst_f1, "f1-score do XGBoost"),
]
for model_scores, legend_label in f1_histograms:
    # Same bin count and transparency for every model so overlapping bars stay visible
    ax.hist(model_scores, bins=30, alpha=0.5, label=legend_label)

# Title and axis labels let the figure be read on its own; the text is in
# Portuguese to match the legend entries
ax.set(title="Distribuição do F1-score por modelo", xlabel="F1-score", ylabel="Frequência")
ax.legend()
plt.show()

In [None]:
# Displays the mean ± standard deviation of the accuracy scores of each model
accuracy_by_model = {
    "KNN": knn_accuracy,
    "Logistic Regression": reglog_accuracy,
    "Perceptron": perceptron_accuracy,
    "Decision Tree": arvdec_accuracy,
    "XGBoost": bst_accuracy,
}
for model_name, model_accuracy in accuracy_by_model.items():
    print(f"{model_name} Accuracy: {model_accuracy.mean():.4f} ± {model_accuracy.std():.4f}")