In [None]:
# imports
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, \
                            roc_auc_score, precision_score, f1_score, recall_score, \
                            mean_squared_error, mean_absolute_error, max_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from datetime import timedelta
from time import time
import pandas as pd
import numpy as np
import sys

In [None]:
# Execution-environment switch: True when running on Google Colab,
# False when running from a local checkout.
colab_env = False

# Model tag passed to the model_utils output helpers further down.
model = "knn"
results_dir = ""

# NOTE(review): the two environments read differently named CSV files
# ("no_filtered_analysis.csv" vs "no_filtered.csv") - confirm this is intended.
if not colab_env:
    # Local run: every path is relative to the notebook's working directory.
    results_dir = "./"
    dataframe_path = "../../pre_processing/datasets/preprocessed_data/no_filtered.csv"
    sys.path.insert(1, "../../utils/")
else:
    # Colab run: Google Drive must be mounted before any shared-drive path is used.
    from google.colab import drive
    drive.mount('/content/drive')

    results_dir = "/content/drive/Shareddrives/tcc_pos/results_/"
    dataframe_path = "/content/drive/Shareddrives/tcc_pos/datasets/no_filtered_analysis.csv"
    sys.path.insert(1, "/content/drive/Shareddrives/tcc_pos/utils")

# Project helper module; only importable once its folder has been put on sys.path above.
import model_utils

In [None]:
# Load the comma-separated dataset located at `dataframe_path`.
df = pd.read_csv(dataframe_path, sep=",")

In [None]:
# Discard the "RA" column (in place) when the loaded CSV contains it.
if "RA" in df.columns:
    print("Removing column RA")
    df.drop(columns=["RA"], inplace=True)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
# Run all predictions based on fold n_splits
def run_prediction(X_train, X_test, y_train, y_test, splits):
    """Grid-search a scaled KNN classifier and score it on the hold-out set.

    A ``StandardScaler`` + ``KNeighborsClassifier`` pipeline is tuned with
    ``GridSearchCV`` using ``StratifiedKFold(n_splits=splits)`` on the training
    data. The refitted search object then predicts ``X_test`` and a set of
    metrics is computed from those hard predictions.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting / evaluating.
    y_train, y_test
        Target labels matching ``X_train`` / ``X_test``.
    splits : int
        Number of stratified folds used by the grid search.

    Returns
    -------
    dict
        Confusion matrix, classification report, accuracy, F1, precision,
        ROC AUC, recall, MSE, MAE, the best hyper-parameters
        (``"epoch_params"``), the fit+predict wall time
        (``"train_test_duration"``, a ``timedelta``), the fold count
        (``"n_splits"``) and the train/test sizes. ``"database"`` and
        ``"log_loss"`` are placeholders (``""`` and ``"na"``).
    """
    # Hyper-parameter grid; the "clf__" prefix routes each entry to the
    # KNeighborsClassifier step of the pipeline.
    model_params = {
        'clf__weights': ['uniform', 'distance'],
        'clf__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'clf__p': [1, 2],  # Minkowski power: 1 = manhattan, 2 = euclidean
        'clf__algorithm': ["auto"],
        'clf__leaf_size': [10, 20, 30, 40, 50],
    }

    # Scaling lives inside the pipeline so it is re-fitted on every CV fold.
    model_pipeline = Pipeline([('scl', StandardScaler()),
                               ('clf', KNeighborsClassifier())])

    # No `scoring` is passed, so candidates are ranked by the pipeline's own
    # `score` method; the string given to `refit` only enables refitting of
    # the best candidate on the whole training set.
    clf = GridSearchCV(model_pipeline,
                       n_jobs=4,
                       verbose=1,
                       cv=StratifiedKFold(n_splits=splits),
                       param_grid=model_params,
                       refit='accuracy'
                       )

    # The timed section covers both the grid search and the hold-out prediction.
    start_time = time()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    end_time = time()
    total_time = timedelta(seconds=end_time - start_time)
    best_param = clf.best_params_

    # Use every class seen in either the truth or the predictions, so a class
    # the model never predicts still gets its row/column in the matrix.
    labels = np.union1d(y_test, predictions)

    # NOTE(review): precision/recall/F1/ROC AUC are called with their defaults,
    # which presumably expects a binary target - confirm against the dataset.
    predict_data = {
        "database": "",
        "n_splits": splits,
        "confusion_matrix": confusion_matrix(y_test, predictions, labels=labels),
        "classification_report": classification_report(y_test, predictions),
        "model_accuracy": accuracy_score(y_test, predictions),
        "f1_score": f1_score(y_test, predictions, labels=labels),
        "precision_score": precision_score(y_test, predictions),
        "roc_score": roc_auc_score(y_test, predictions),
        "recall_score": recall_score(y_test, predictions),
        "log_loss": "na",
        "epoch_params": best_param,
        "mean_squared_error": mean_squared_error(y_test, predictions),
        "mean_absolute_error": mean_absolute_error(y_test, predictions),
        "train_test_duration": total_time,
        "train_size": len(X_train),
        "test_size": len(X_test)
    }

    return predict_data

In [None]:
def main():
    """Run the repeated KNN grid-search experiment and write its results.

    Separates the ``EVADIDO`` target from the remaining columns of the global
    ``df``, makes one stratified 67/33 train/test split, and then, for each
    run and each fold count, appends the ``run_prediction`` result to a
    shared list. ``model_utils.generate_output`` is called after every run and
    ``model_utils.generate_output_csv`` once at the end, both with the full
    accumulated list.
    """
    target_col = "EVADIDO"
    X = df.iloc[:, df.columns != target_col].values
    y = df[target_col]

    n_runs = 10
    max_splits = 10
    # NOTE(review): range() stops before `max_splits`, so the fold counts are
    # 2, 4, 6, 8 - confirm whether 10 was meant to be included.
    fold_counts = range(2, max_splits, 2)

    # One fixed split (random_state=42) is shared by every run.
    # NOTE(review): with fixed seeds the runs presumably differ only in
    # measured duration - confirm the repetition is intended.
    split_parts = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
    X_train, X_test, y_train, y_test = split_parts

    all_predictions = []
    for run in range(n_runs):
        print(f"Running {run+1} of {n_runs} \n\n")

        for split in fold_counts:
            print(f"\nStratifiedKFold config: {split} \n")
            fold_result = run_prediction(X_train, X_test, y_train, y_test, split)
            all_predictions.append(fold_result)

        # Per-run output; receives everything accumulated so far.
        model_utils.generate_output(all_predictions, results_dir, model, run)
        print(f"############################ FINISHED RUN {run+1} ############################")

    # Final CSV export covering every run.
    model_utils.generate_output_csv(all_predictions, results_dir, model)
    print(f"############################ FINISHED ALL ############################")

In [None]:
# Entry point: start the full experiment when this cell/script is executed directly.
if __name__ == "__main__":
    main()