In [1]:
## IMPORTS
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, \
                            roc_auc_score, precision_score, f1_score, recall_score, \
                            mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from ast import literal_eval as make_tuple
from sklearn.pipeline import Pipeline
from datetime import timedelta
from sklearn.svm import SVC
from time import time
import pandas as pd
import numpy as np
import sys

In [2]:
# Put the shared utils directory on the import path; this has to run before
# the `import model_utils` line below, which is resolved through that path.
sys.path.insert(1, "../../utils/")
import model_utils

# Directory handed to model_utils.generate_output_csv when results are written.
results_dir = "./"
# CSV read below; the file name suggests it holds the SVC grid-search results
# (one row per evaluated configuration) -- confirm against the grid-search notebook.
best_results_path = "../grid_search_results/svc_params.csv"
best_results = pd.read_csv(best_results_path)

In [3]:
# Get the best params according to the grid-search results.
#
# Rows are ranked by accuracy, then F1, then ROC score (best first). Only the
# best-ranked row of each distinct `epoch_best_params` value is kept, and the
# first three of those are selected. De-duplicating *before* truncating is what
# guarantees up to three different parameter sets: taking the three largest rows
# first and dropping duplicates afterwards can leave fewer than three.
ranking_metrics = ["model_accuracy", "f1_score", "roc_score"]
top3_best_params = (
    best_results
    .sort_values(ranking_metrics, ascending=False)
    .drop_duplicates(["epoch_best_params"])
    .head(3)
)

# list top 3 results in which epoch_best_params are different
top3_best_params

Unnamed: 0,database,n_splits,model_accuracy,f1_score,precision_score,roc_score,recall_score,max_error,mean_squared_error,mean_absolute_error,train_test_duration,epoch_best_params
2,backward,6,0.934132,0.912,0.982759,0.920373,0.850746,1,0.065868,0.065868,0 days 00:00:07.336496,"{'clf__C': 2, 'clf__decision_function_shape': ..."


In [4]:
# Re-evaluate every selected hyper-parameter set: for each row of
# `top3_best_params`, load the dataset it was tuned on, rebuild the
# scaler + SVC pipeline with those parameters, and score it with repeated,
# shuffled stratified k-fold cross-validation. One record per (repetition, fold)
# is collected in `all_predictions` and written out at the end.

# Base seed for the fold shuffling so a Restart & Run All reproduces the results.
RANDOM_SEED = 42
# How many times the whole k-fold procedure is repeated per parameter set.
N_REPETITIONS = 10

all_predictions = []
for _, best_row in top3_best_params.iterrows():
    # load correct database in which best results were saved
    dbname = best_row["database"]
    dataframe_path = f"../../pre_processing/datasets/preprocessed_data/{dbname}.csv"
    df = pd.read_csv(dataframe_path, sep = ",")

    # Drop RA column if exists
    if "RA" in df.columns:
        print("Removing column RA")
        df = df.drop(columns="RA")

    # Features are every column except the target; both are plain NumPy arrays
    # so the fold indices below are applied positionally, never by index label.
    X = df.iloc[:, df.columns != "EVADIDO"].values
    y = df["EVADIDO"].values
    # Fixed label set so every fold's confusion matrix has the same shape, even
    # when the model never predicts one of the classes in that fold.
    class_labels = np.unique(y)

    n_splits = int(best_row["n_splits"])
    # Look the column up by name instead of by position in the row.
    best_param = best_row["epoch_best_params"]

    # The stored string is a dict literal keyed like the grid search
    # ("clf__C", "clf__kernel", ...). Strip the pipeline-step prefix and pass
    # every SVC setting through as a keyword argument.
    svc_params = {
        name.split("__", 1)[1]: value
        for name, value in make_tuple(best_param).items()
        if name.startswith("clf__")
    }

    # Apply best params to model to check accuracy
    clf = SVC(**svc_params)
    estimator = Pipeline([("scl", StandardScaler()),
                          ("clf", clf)])

    for i in range(1, N_REPETITIONS + 1):
        # A fresh splitter per repetition with its own seed: an integer
        # random_state makes every split() call return the same folds, so
        # reusing one seeded splitter would repeat identical folds each time.
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True,
                              random_state = RANDOM_SEED + i)
        for index, (train_index, test_index) in enumerate(skf.split(X, y)):
            print(f"{i}'st run for split {index}")
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # The measured duration covers fitting plus predicting on the held-out fold.
            start_time = time()
            estimator.fit(X_train, y_train)
            predictions = estimator.predict(X_test)
            end_time = time()
            total_time = timedelta(seconds=end_time-start_time)

            predict_data = {
                "database": dbname,
                # "<repetition>_<fold>_<total folds>"
                "n_splits": f"{i}_{index}_{n_splits}",
                "confusion_matrix": confusion_matrix(y_test, predictions, labels=class_labels),
                "classification_report": classification_report(y_test, predictions),
                "model_accuracy": accuracy_score(y_test, predictions),
                "f1_score": f1_score(y_test, predictions),
                "precision_score": precision_score(y_test, predictions),
                # Computed from hard class predictions, not decision scores.
                "roc_score": roc_auc_score(y_test, predictions),
                "recall_score": recall_score(y_test, predictions),
                "log_loss": "na",
                "epoch_params": best_param,
                "mean_squared_error": mean_squared_error(y_test, predictions),
                "mean_absolute_error": mean_absolute_error(y_test, predictions),
                "train_test_duration": total_time,
                "train_size": len(X_train),
                "test_size": len(X_test)
            }

            all_predictions.append(predict_data)

# CSV Output results
model_utils.generate_output_csv(all_predictions, results_dir, "svc_results")
print("FINISHED")

1'st run for split 0
1'st run for split 1
1'st run for split 2
1'st run for split 3
1'st run for split 4
1'st run for split 5
2'st run for split 0
2'st run for split 1
2'st run for split 2
2'st run for split 3
2'st run for split 4
2'st run for split 5
3'st run for split 0
3'st run for split 1
3'st run for split 2
3'st run for split 3
3'st run for split 4
3'st run for split 5
4'st run for split 0
4'st run for split 1
4'st run for split 2
4'st run for split 3
4'st run for split 4
4'st run for split 5
5'st run for split 0
5'st run for split 1
5'st run for split 2
5'st run for split 3
5'st run for split 4
5'st run for split 5
6'st run for split 0
6'st run for split 1
6'st run for split 2
6'st run for split 3
6'st run for split 4
6'st run for split 5
7'st run for split 0
7'st run for split 1
7'st run for split 2
7'st run for split 3
7'st run for split 4
7'st run for split 5
8'st run for split 0
8'st run for split 1
8'st run for split 2
8'st run for split 3
8'st run for split 4
8'st run for 