# Import modules

In [25]:
import os 
import glob
import csv
import shutil
import seaborn as sb
import numpy as np
import pandas as pd
from sklearn.metrics import plot_confusion_matrix
from matplotlib import pyplot as plt
import matplotlib.colors as mcolor
import random
import matplotlib.patches as mpatches
from Robert_functions import *

# Code

In [26]:
# General parameters for the workflow


# Number of folds (k) for k-fold cross validation. This value is passed as the
# `cv_kfold` argument of train_and_evaluate_models(), which forwards it to
# cross_val_calc().
cv_kfold = 5



Excluded parameters:
x1 : R**2 = 1.0 with x3
x1 : R**2 = 0.96 with x6
x3 : R**2 = 0.95 with x6

Successfully created 37 datapoints.


Descriptors used after correlation filters:
x2
x5
x6
x7
x8
x9
x10
x11
Csub-Csub
Csub-H
Csub-O
H-O


In [28]:
# Separate the fixed (non-descriptor) columns from the filtered dataset.
# The response column is excluded explicitly so that re-running this cell yields
# the same `fixed_data` even after `response_value` has been added to
# `fixed_descriptors` below (the cell is now idempotent).
fixed_data = DFT_parameters_filtered[[name for name in fixed_descriptors if name != response_value]]

# The shuffle-test cell copies `fixed_descriptors` to build the list of columns it
# drops from the final dataset, so the response column has to be part of the list.
# Guard the append: an unconditional append would add a duplicate on every re-run.
if response_value not in fixed_descriptors:
    fixed_descriptors.append(response_value)


In [32]:
# Reset the output folders: any existing folder (and everything inside it) is
# removed, then an empty folder with the same name is created.
folder_names = [
    'Benchmark_methods',
    'Raw_data/Best_Model',
    'Raw_data/Model_params',
]

for folder in folder_names:
    try:
        # Remove the folder first when it is already there
        folder_is_present = os.path.exists(folder)
        if folder_is_present:
            shutil.rmtree(folder)
        # The folder is (re)created in every case, not only when it was missing
        os.makedirs(folder)
    except Exception as e:
        # Any failure is reported and the loop moves on to the next folder
        print(f'Error while deleting/creating folder "{folder}": {e}')

from Robert_functions import *
def train_and_evaluate_models(X, y, train, split, model, mode, seed, w_dir, csv_params, cv_kfold):
    """Benchmark every requested model at every requested training size.

    For each training size the data is split with ``data_split`` and standardized
    with the mean/std of the training set. For each model the function then:

    1. runs ``run_hyperopt`` and reads the parameters from ``csv_params + '.csv'``;
    2. scores the model with ``predictor_workflow`` and ``cross_val_calc``;
    3. writes ``Raw_data/Model_params/<MODEL>_<size>.csv``;
    4. runs ``PFI_workflow`` and keeps only the descriptors it returns;
    5. scores the model again on the reduced descriptor set and writes
       ``Raw_data/Model_params/<MODEL>_<size>_PFI.csv``.

    Besides its arguments, the function reads these notebook-level names:
    ``fixed_data``, ``DFT_parameters_filtered``, ``response_value``, ``epochs``,
    ``n_repeats``, ``PFI_threshold`` and ``PFI``.

    Parameters
    ----------
    X, y
        Descriptor table and response values, forwarded to ``data_split``.
    train : list
        Training sizes to test; ``['all']`` expands to 90, 80, ..., 10.
    split : str
        Splitting method, ``'KN'`` or ``'RND'``.
    model : list
        Model codes to test; ``['all']`` expands to RF, GB, AdaB, MVL, NN and VR.
    mode : str
        ``'reg'`` (scores are R2, MAE, RMSE) or ``'clas'`` (accuracy, F1 score, MCC).
    seed, w_dir
        Forwarded unchanged to the helper functions.
    csv_params : str
        Path (without ``.csv``) of the parameter file read after each hyperopt run.
    cv_kfold : int
        Number of folds forwarded to ``cross_val_calc``.

    Returns
    -------
    list
        ``[[size, models_data], ...]`` where ``models_data`` holds one list per
        model with this layout: 0 MODEL, 1 best parameters, 2-4 training scores
        (PFI), 5-7 validation scores (PFI), 8 third validation score without PFI,
        9 X_train_PFI_scaled, 10 X_train_scaled, 11 y_pred_train_PFI,
        12 y_pred_validation_PFI, 13 cv_score, 14 X_validation_PFI_scaled, 15 mode,
        16 cv_kfold, 17 results file name, 18 y_train_PFI, 19 y_validation_PFI,
        20 X_PFI, 21 fixed_data_train, 22 fixed_data_validation.
        (The original built this list but never returned it.)

    Raises
    ------
    SystemExit
        If ``split`` is not ``'KN'`` or ``'RND'`` (a message is printed first).
    ValueError
        If ``mode`` is not ``'reg'`` or ``'clas'``.
    """
    if train == ['all']:
        train = [90,80,70,60,50,40,30,20,10]
    if model == ['all']:
        model = ['RF','GB','AdaB','MVL','NN','VR']

    if split not in ['KN','RND']:
        print('x  Select a valid method for splitting data (options: KN, RND)!')
        # same effect as sys.exit(), without relying on `sys` being in scope
        raise SystemExit

    # Names of the three scores that predictor_workflow returns in each mode.
    # They are used to build the CSV column names, so both modes share one code path.
    if mode == 'reg':
        metrics = ['r2', 'mae', 'rmse']
    elif mode == 'clas':
        metrics = ['accuracy', 'f1score', 'mcc']
    else:
        raise ValueError(f"Unknown mode {mode!r} (options: 'reg', 'clas')")

    size_data = []
    for size in train:
        X_train, y_train, X_validation, y_validation, training_points = data_split(X,y,size,seed,split)

        # NOTE(review): `training_points` is used positionally (iloc) and as labels
        # (drop); both agree only if `fixed_data` has a 0..n-1 index - confirm.
        fixed_data_train = fixed_data.iloc[training_points]
        fixed_data_validation = fixed_data.drop(training_points)

        # standardizes the data sets using the mean and standard dev from the train set
        Xmean = X_train.mean(axis=0)
        Xstd = X_train.std(axis=0)
        X_train_scaled = (X_train - Xmean) / Xstd
        X_validation_scaled = (X_validation - Xmean) / Xstd

        models_data = []
        for MODEL in model:
            # hyperopt process; its best parameters are read back from the csv file
            run_hyperopt(epochs, MODEL, X, size, mode, seed, w_dir, X_train_scaled, y_train, X_validation_scaled, y_validation, csv_params)
            best_parameters_df = pd.read_csv(csv_params+'.csv')

            # three training scores, three validation scores, two prediction sets (unused here)
            (train_1, train_2, train_3,
             validation_1, validation_2, validation_3,
             _, _) = predictor_workflow(seed,MODEL,best_parameters_df,X_train_scaled,y_train,X_validation_scaled,y_validation,mode,size)
            # k-fold cross validation on the full descriptor set
            cv_score = cross_val_calc(seed,MODEL,best_parameters_df,X_train_scaled,y_train,mode,cv_kfold)

            dict_model = {
                "MODEL": MODEL,
                "size": size,
                "best_parameters_df": best_parameters_df,
                f"{metrics[0]}_train": train_1,
                f"{metrics[1]}_train": train_2,
                f"{metrics[2]}_train": train_3,
                f"{metrics[0]}_validation": validation_1,
                f"{metrics[1]}_validation": validation_2,
                f"{metrics[2]}_validation": validation_3,
                "X_validation_scaled": X_validation_scaled,
                "X_train_scaled": X_train_scaled,
                "cv_score": cv_score,
                "X_validation": X_validation,
                "mode": mode,
                "cv_kfold": cv_kfold,
                "Robert_results": "Robert_results.txt",
                "y_train": y_train,
                "y_validation": y_validation,
                "X": X,
                "fixed_data_train": fixed_data_train,
                "fixed_data_validation": fixed_data_validation
            }
            dict_model_pd = pd.DataFrame.from_dict(dict_model, orient='index').transpose()
            dict_model_pd.to_csv(f'Raw_data/Model_params/{MODEL}_{size}.csv', index = None, header=True)

            # permutation feature importance (PFI): list of descriptors to keep
            combined_descriptor_list = PFI_workflow(X,MODEL,best_parameters_df,X_train_scaled,y_train,X_validation_scaled,y_validation,n_repeats,PFI_threshold,False,mode,PFI)

            # creates a database with the response and the descriptors kept after PFI
            df_PFI_model = pd.DataFrame()
            df_PFI_model[response_value] = DFT_parameters_filtered[response_value]
            for column in DFT_parameters_filtered.columns:
                if column in combined_descriptor_list:
                    df_PFI_model[column] = DFT_parameters_filtered[column]

            X_PFI = df_PFI_model.drop([response_value], axis=1)
            y_PFI = df_PFI_model[response_value]

            # data splitting using the previous training points
            X_train_PFI = X_PFI.iloc[training_points]
            y_train_PFI = y_PFI.iloc[training_points]
            X_validation_PFI = X_PFI.drop(training_points)
            y_validation_PFI = y_PFI.drop(training_points)

            # standardizes the data sets using the mean and standard dev from the train set
            Xmean_PFI = X_train_PFI.mean(axis=0)
            Xstd_PFI = X_train_PFI.std(axis=0)
            X_train_PFI_scaled = (X_train_PFI - Xmean_PFI) / Xstd_PFI
            X_validation_PFI_scaled = (X_validation_PFI - Xmean_PFI) / Xstd_PFI

            # max_features cannot exceed the number of descriptors left after PFI;
            # parameter files without a 'max_features' column are left untouched
            try:
                if int(best_parameters_df['max_features'][0]) > len(X_PFI.columns):
                    best_parameters_df.at[0,'max_features'] = len(X_PFI.columns)
                    # replace the value in the parameters csv
                    best_parameters_df.to_csv(csv_params+'.csv', index = None, header=True)
            except KeyError:
                pass

            # scores of the tuned model using only the descriptors kept after PFI
            (train_1_PFI, train_2_PFI, train_3_PFI,
             validation_1_PFI, validation_2_PFI, validation_3_PFI,
             y_pred_train_PFI, y_pred_validation_PFI) = predictor_workflow(seed,MODEL,best_parameters_df,X_train_PFI_scaled,y_train_PFI,X_validation_PFI_scaled,y_validation_PFI,mode,size)
            # from here on cv_score refers to the PFI-filtered descriptor set
            cv_score = cross_val_calc(seed,MODEL,best_parameters_df,X_train_PFI_scaled,y_train_PFI,mode,cv_kfold)

            # built for both modes (the original only defined it for 'reg', so the
            # csv export below failed or reused stale data in 'clas' mode)
            dict_model_PFI = {
                "MODEL": MODEL,
                "size": size,
                "best_parameters_df": best_parameters_df,
                f"{metrics[0]}_train_PFI": train_1_PFI,
                f"{metrics[1]}_train_PFI": train_2_PFI,
                f"{metrics[2]}_train_PFI": train_3_PFI,
                f"{metrics[0]}_validation_PFI": validation_1_PFI,
                f"{metrics[1]}_validation_PFI": validation_2_PFI,
                f"{metrics[2]}_validation_PFI": validation_3_PFI,
                f"{metrics[2]}_validation": validation_3,
                "X_train_PFI_scaled": X_train_PFI_scaled,
                "X_train_scaled": X_train_scaled,
                "y_pred_train_PFI": y_pred_train_PFI,
                "y_pred_validation_PFI": y_pred_validation_PFI,
                "cv_score": cv_score,
                "X_validation_PFI_scaled": X_validation_PFI_scaled,
                "mode": mode,
                "cv_kfold": cv_kfold,
                "Robert_results": "Robert_results.txt",
                "y_train_PFI": y_train_PFI,
                "y_validation_PFI": y_validation_PFI,
                "X_PFI": X_PFI,
                "fixed_data_train": fixed_data_train,
                "fixed_data_validation": fixed_data_validation
            }
            # same positional layout in both modes (see "Returns" in the docstring)
            models_data_indiv = [MODEL, best_parameters_df,
                                 train_1_PFI, train_2_PFI, train_3_PFI,
                                 validation_1_PFI, validation_2_PFI, validation_3_PFI,
                                 validation_3, X_train_PFI_scaled, X_train_scaled,
                                 y_pred_train_PFI, y_pred_validation_PFI, cv_score,
                                 X_validation_PFI_scaled, mode, cv_kfold, 'Robert_results.txt',
                                 y_train_PFI, y_validation_PFI, X_PFI,
                                 fixed_data_train, fixed_data_validation]

            # create the csv file for this model and training size
            dict_model_PFI_pd = pd.DataFrame.from_dict(dict_model_PFI, orient='index').transpose()
            dict_model_PFI_pd.to_csv(f'Raw_data/Model_params/{MODEL}_{size}_PFI.csv', index = None, header=True)

            models_data.append(models_data_indiv)

        size_data.append([size, models_data])

    return size_data
        
# Run the benchmark with the notebook-level settings. The function writes its
# per-model CSV files into Raw_data/Model_params, which the code below reads back.
train_and_evaluate_models(X, y, train, split, model, mode, seed, w_dir, csv_params, cv_kfold)
def find_min_column_value(csv_files, column_name, output_directory):
    """Find the CSV file holding the smallest value of a column and copy it.

    Every row of every file is inspected. Rows where `column_name` is missing or
    cannot be converted to a float are skipped instead of aborting the search, so
    a file with a different set of columns no longer raises ``KeyError``.

    Parameters
    ----------
    csv_files : iterable of str
        Paths of the CSV files to inspect.
    column_name : str
        Header of the column to minimise.
    output_directory : str
        Destination handed to ``shutil.copy`` (a folder or a full file path).

    Returns
    -------
    tuple
        ``(min_value, min_file)``: the smallest value found and the file it came
        from. On ties the first file encountered wins.

    Raises
    ------
    ValueError
        If no file contains a usable value in `column_name`.
    """
    min_value = float('inf')
    min_file = None
    for csv_file in csv_files:
        # newline='' lets the csv module handle line breaks inside quoted fields
        with open(csv_file, 'r', newline='') as f:
            for row in csv.DictReader(f):
                try:
                    value = float(row.get(column_name))
                except (TypeError, ValueError):
                    # column absent (None) or not numeric: nothing to compare
                    continue
                if value < min_value:
                    min_value = value
                    min_file = csv_file

    if min_file is None:
        # previously this surfaced as an obscure TypeError from shutil.copy(None, ...)
        raise ValueError(f"No usable '{column_name}' value found in the given CSV files")

    shutil.copy(min_file, output_directory)
    return min_value, min_file

# Collect every per-model CSV once and split the list by file name.
# The previous pattern '*[!_PFI]*.csv' did not exclude the PFI files: in a glob,
# '[!_PFI]' matches ONE character that is not '_', 'P', 'F' or 'I', so virtually
# every file name matched. The list is sorted to make tie-breaking reproducible.
param_files = sorted(glob.glob('Raw_data/Model_params/*.csv'))

# Best model without the PFI filter (lowest validation RMSE)
csv_files = [path for path in param_files if '_PFI' not in os.path.basename(path)]
column_name = 'rmse_validation'
min_value, min_file = find_min_column_value(csv_files, column_name, 'Raw_data/Best_Model/Best_Model.csv')

# Best model with the PFI filter (lowest validation RMSE after PFI)
csv_files_PFI = [path for path in param_files if '_PFI' in os.path.basename(path)]
column_name_PFI = 'rmse_validation_PFI'
min_value_PFI, min_file_PFI = find_min_column_value(csv_files_PFI, column_name_PFI, 'Raw_data/Best_Model/Best_Model_PFI.csv')

# 'MODEL' and 'size' of the best model without PFI (first row of its CSV)
df_min = pd.read_csv(min_file)
model_value = df_min['MODEL'].iloc[0]
size_value = df_min['size'].iloc[0]

# 'MODEL' and 'size' of the best model with PFI (first row of its CSV)
df_min_PFI = pd.read_csv(min_file_PFI)
model_value_PFI = df_min_PFI['MODEL'].iloc[0]
size_value_PFI = df_min_PFI['size'].iloc[0]

# Warn when the best model without PFI has a lower RMSE than the best model with PFI
if min_value < min_value_PFI:
    print('\n'f"x  Warning! Error lower without PFI filter (no PFI: RMSE = {round(min_value,2)} using {model_value}_{size_value} ; with PFI filter: {round(min_value_PFI,2)} using {model_value_PFI}_{size_value_PFI}) consider using PFI=False")

print('\n'f"The optimal model using PFI={PFI} is {model_value_PFI} with training size {size_value_PFI}%"'\n')

#Obtain the best model (<rmse_validation value)
# NOTE(review): later cells index `best_model`, but the line that defines it is
# disabled and `size_data` only exists inside train_and_evaluate_models - confirm
# how `best_model` is meant to be provided.
#best_model = optimal_model(size_data)

# Validation RMSE of every model/size without the PFI filter
rmse_list_1 = []
# Validation RMSE of every model/size with the PFI filter
rmse_list_2 = []

# os.listdir returns entries in arbitrary order, while create_dataframe reshapes
# these lists assuming they are grouped by alphabetically sorted model name, so
# the listing is sorted first.
# NOTE(review): within one model the sizes then follow file-name order; this
# matches the row labels only if `train` lists the sizes in that same order.
for filename in sorted(os.listdir("Raw_data/Model_params")):
    # skip anything that is not a results file (checkpoint folders, hidden files)
    if not filename.endswith('.csv'):
        continue
    df = pd.read_csv(f"Raw_data/Model_params/{filename}")
    if "_PFI" not in filename:
        # file without PFI filter: take rmse_validation from its first row
        rmse = df["rmse_validation"].values[0]
        rmse_list_1.append(rmse)
    else:
        # file with PFI filter: take rmse_validation_PFI from its first row
        rmse = df["rmse_validation_PFI"].values[0]
        rmse_list_2.append(rmse)

def create_dataframe(data, column_name, models=None, sizes=None):
    """Arrange a flat list of values as a (training size x model) table.

    `data` must be ordered model by model, following the alphabetically sorted
    model names, with one value per training size inside each model.

    Parameters
    ----------
    data : sequence
        Flat sequence with ``len(models) * len(sizes)`` values.
    column_name : str
        Name given to the column axis of the result.
    models : list, optional
        Model names. Defaults to the notebook-level ``model`` list.
    sizes : list, optional
        Training sizes, in the order they appear in `data`. Defaults to the
        notebook-level ``train`` list.

    Returns
    -------
    pandas.DataFrame
        Rows are the training sizes, columns are the sorted model names.

    Raises
    ------
    ValueError
        If `data` does not hold exactly one value per model/size pair.
    """
    # fall back to the notebook globals so existing two-argument calls keep working
    if models is None:
        models = model
    if sizes is None:
        sizes = train

    column_names = sorted(models)
    values = np.array(data)
    expected = len(column_names) * len(sizes)
    if values.size != expected:
        raise ValueError(
            f"Expected {expected} values ({len(column_names)} models x {len(sizes)} sizes), got {values.size}"
        )

    # one row per model first, then transpose so that sizes become the rows
    values_matrix = values.reshape(len(column_names), len(sizes))
    df = pd.DataFrame(data=values_matrix, columns=sizes, index=column_names).transpose()
    df.columns.name = column_name
    return df
# Arrange both RMSE lists (without / with PFI) as tables for the heatmaps
plot_data_1, plot_data_2 = [
    create_dataframe(rmse_values, 'Model Type')
    for rmse_values in (rmse_list_1, rmse_list_2)
]

def create_heatmap(data, title, output_file):
    """Draw an annotated heatmap of validation RMSE values and save it as an image.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Table with one row per training size and one column per model. A
        DataFrame keeps its own row/column labels; anything else is labelled
        with the notebook-level ``train`` (rows) and ``model`` (columns) lists.
    title : str
        Title of the figure.
    output_file : str
        Path of the image written with ``plt.savefig``.
    """
    if isinstance(data, pd.DataFrame):
        # Keep the labels the table already has. Overwriting them with `model`
        # mislabels the columns whenever `model` is not in the (sorted) order
        # used by create_dataframe.
        df_plot = data.copy()
    else:
        df_plot = pd.DataFrame(data)
        df_plot.columns = [model]
        df_plot.index = [train]
    # largest training size at the top
    df_plot = df_plot.sort_index(ascending=False)

    fig, ax = plt.subplots(figsize=(7.45,6))
    sb.set(font_scale=1.2, style='ticks')
    # only the lower 80% of the 'Blues' colormap is used, sampled in 512 steps
    cmap_blues_75_percent_512 = [mcolor.rgb2hex(c) for c in plt.cm.Blues(np.linspace(0, 0.8, 512))]
    ax = sb.heatmap(df_plot, annot=True, linewidth=1, cmap=cmap_blues_75_percent_512,
                    cbar_kws={'label': 'RMSE Validation'}, ax=ax)
    ax.set(xlabel="Model Type", ylabel="Training Size")
    plt.title(title)
    # despine with top/right disabled keeps the full frame around the heatmap
    sb.despine(top=False, right=False)
    plt.savefig(output_file, dpi=600, bbox_inches='tight')

# The heatmap without the PFI filter is always drawn; the PFI one only when PFI is on
heatmap_jobs = [(plot_data_1, 'NO_PFI', 'Benchmark_methods/NO_PFI.png')]
if PFI:
    heatmap_jobs.append((plot_data_2, 'PFI', 'Benchmark_methods/PFI.png'))

for heatmap_data, heatmap_title, heatmap_path in heatmap_jobs:
    create_heatmap(heatmap_data, heatmap_title, heatmap_path)

100%|██████████| 5/5 [00:00<00:00, 13.33trial/s, best loss: 0.2222170828214671]
100%|██████████| 5/5 [00:00<00:00, 11.03trial/s, best loss: 0.1996192977283523] 
100%|██████████| 5/5 [00:00<00:00, 23.84trial/s, best loss: 0.129822432847353]  
100%|██████████| 5/5 [00:00<00:00, 12.26trial/s, best loss: 0.12330528822327]   


KeyError: 'rmse_validation'

In [24]:
# Load the CSV of the best PFI-filtered model selected in the benchmarking cell
df_model_PFI = pd.read_csv(min_file_PFI)

# NOTE(review): the recorded output of this cell reports 758 training points for a
# dataset of 37 points, which suggests that the table-valued columns
# (X_train_PFI_scaled, X_validation_PFI_scaled, ...) come back from the CSV as
# text and print_model_stats measures the length of that text - confirm, and
# consider passing the in-memory tables instead of the CSV round trip.

dict_model_PFI = {}

# cv_score is stored as text such as '[0.9 0.8 0.5]': strip the brackets and
# convert every entry back to a float
cv_score = df_model_PFI["cv_score"][0].replace('[','').replace(']','')
dict_model_PFI['cv_score'] = [float(x) for x in cv_score.split()]

print(df_model_PFI["X_validation_PFI_scaled"][0])
print(np.asarray(df_model_PFI["cv_score"][0]))

# Print the statistics of the best model. The parsed float scores are passed for
# the cross validation; the raw text (previously passed) is not a numeric array.
print_model_stats(df_model_PFI["MODEL"][0], df_model_PFI["X_train_PFI_scaled"][0], df_model_PFI["X_validation_PFI_scaled"][0],
                  float(df_model_PFI["r2_train_PFI"][0]), float(df_model_PFI["mae_train_PFI"][0]), float(df_model_PFI["rmse_train_PFI"][0]),
                  float(df_model_PFI["r2_validation_PFI"][0]), float(df_model_PFI["mae_validation_PFI"][0]), float(df_model_PFI["rmse_validation_PFI"][0]),
                  df_model_PFI["mode"][0], np.asarray(dict_model_PFI['cv_score']), int(df_model_PFI["cv_kfold"][0]), df_model_PFI["Robert_results"][0])

# Calculate the permutation feature importance (PFI) of the final model and save the data.
# NOTE(review): `best_model` is not defined anywhere in the visible notebook (this
# cell's recorded output ends in "NameError: name 'best_model' is not defined").
# The indices used below follow the per-model list built in
# train_and_evaluate_models: 0 MODEL, 1 best parameters, 9 X_train_PFI_scaled,
# 11/12 predicted train/validation values, 14 X_validation_PFI_scaled,
# 18/19 y train/validation, 20 X_PFI, 21/22 fixed data train/validation.
combined_descriptor_list = PFI_workflow(best_model[20],best_model[0],best_model[1],best_model[9],best_model[18],best_model[14],best_model[19],n_repeats,0,True,mode,PFI)

# Build the final table: fixed columns, response, predicted response, scaled
# descriptors and a 'Set' column telling training and validation rows apart
X_train_csv = best_model[21].copy()
X_validation_csv = best_model[22].copy()

X_train_csv[response_value] = best_model[18]
X_validation_csv[response_value] = best_model[19]

X_train_csv[f'Predicted {response_value}'] = best_model[11]
X_validation_csv[f'Predicted {response_value}'] = best_model[12]

X_train_csv = pd.concat([X_train_csv, best_model[9]], axis=1)
X_validation_csv = pd.concat([X_validation_csv, best_model[14]], axis=1)

X_train_csv['Set'] = 'Training'
X_validation_csv['Set'] = 'Validation'

df_csv = pd.concat([X_train_csv, X_validation_csv], axis=0)

# The folder-reset cell does not create 'Final_dataset', so make sure it exists
os.makedirs('Final_dataset', exist_ok=True)

# creates a csv database with only the most important descriptors used by the model
export_param_excel = df_csv.to_csv(f'Final_dataset/{csv_name}_final_dataset.csv', index = None, header=True)

# Plot training and validation sets of the best model
# (best_model[18]/[19]: y train/validation, best_model[11]/[12]: predicted values)
if mode == 'reg':
    sb.set(font_scale=1.2, style="ticks") #set styling preferences

    y_train_best = best_model[18]
    y_validation_best = best_model[19]

    Plotdata_train_PFI = {'y_train_PFI': y_train_best, 'y_pred_train_PFI': best_model[11]}
    Plotdata_validation_PFI = {'y_validation_PFI': y_validation_best, 'y_pred_validation_PFI': best_model[12]}

    df_train_PFI = pd.DataFrame.from_dict(Plotdata_train_PFI)
    df_validation_PFI = pd.DataFrame.from_dict(Plotdata_validation_PFI)

    # Features used to plot the dots
    color_train = 'b'
    color_validation = 'orange'
    point_size = 30
    alpha = 1 # from 0 (transparent) to 1 (opaque)

    # Create subplot with a certain size
    fig, ax = plt.subplots(figsize=(5,5))

    # Set styling preferences
    sb.set(font_scale=1.2, style="ticks")
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    # Title of the graph. Percentages are rounded after scaling to 100:
    # round(p, 2)*100 produced float artefacts such as 57.99999999999999.
    total_points = len(y_train_best)+len(y_validation_best)
    train_proportion = len(y_train_best)/total_points
    validation_proportion = len(y_validation_best)/total_points
    ratios = f'{round(train_proportion*100)}:{round(validation_proportion*100)}'
    title_text = best_model[0]+' model with train:validation ('+ratios+') of '+str(total_points)+' datapoints'

    plt.text(0.5, 1.08, title_text, horizontalalignment='center',
         fontsize=14, fontweight='bold', transform = ax.transAxes)

    # Plot the data
    points_train = ax.scatter(df_train_PFI["y_train_PFI"], df_train_PFI["y_pred_train_PFI"],
                c = color_train, s = point_size, edgecolor = 'k', linewidths = 0.8, alpha = alpha, zorder=2)

    points_validation = ax.scatter(df_validation_PFI["y_validation_PFI"], df_validation_PFI["y_pred_validation_PFI"],
                c = color_validation, s = point_size, edgecolor = 'k', linewidths = 0.8, alpha = alpha, zorder=2)

    # Put a legend below current axis
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.17),
            fancybox=True, shadow=True, ncol=5, labels=['Training','Validation'])

    # Add the regression line with a confidence interval based on the training set.
    # x and y are passed by keyword: recent seaborn versions reject them positionally.
    plot = sb.regplot(x="y_train_PFI", y="y_pred_train_PFI", data=df_train_PFI, scatter=False, color=".1",
                    truncate = True, ax=ax)

    # Title of the axis
    plot = ax.set(ylabel=f'Predicted {response_value}', xlabel=f'{response_value} from database')

    # Add gridlines
    ax.grid(linestyle='--', linewidth=1)

    # Same limits on both axes: overall min/max of the database values plus a
    # margin of 10% of the training range
    size_space = 0.1*abs(min(y_train_best)-max(y_train_best))
    min_value_graph = min(min(y_train_best), min(y_validation_best))-size_space
    max_value_graph = max(max(y_train_best), max(y_validation_best))+size_space

    plt.xlim(min_value_graph, max_value_graph)
    plt.ylim(min_value_graph, max_value_graph)

    # save the plot as a png image
    plt.savefig('Predicted vs database values.png', dpi=400, bbox_inches='tight')

    plt.show()

    # NOTE(review): the image is saved with a relative path (current working
    # directory); the message is only accurate if that directory is w_dir - confirm.
    print('\nThe corresponding graph was saved in '+w_dir+'.')

elif mode == 'clas':
    predictor_model = predictor_model_fun(best_model[0], best_model[1], seed, mode)

    # fit on the scaled training descriptors (index 9) and training response (index 18)
    predictor_model.fit(best_model[9], best_model[18])

    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
    # versions provide ConfusionMatrixDisplay.from_estimator - confirm the target version.
    plot_confusion_matrix(predictor_model, best_model[14], best_model[19],cmap='Blues')
    plt.show()


          x6        x7       x10
3   0.560956 -1.115352 -0.093580
7   0.330019  0.176108 -0.093580
8   0.323469  0.176108 -0.093580
11 -0.771414 -1.115352  0.935804
13  1.193091  1.467569 -1.122965
14  1.179330  1.467569 -1.122965
15  1.167189  1.467569 -1.122965
17  1.207031  1.467569 -1.122965
21  1.068913  1.467569 -1.122965
22  1.061932  1.467569 -1.122965
25  0.221717  0.176108 -0.093580
26  1.402781  0.176108 -1.122965
31  0.226432  0.176108 -0.093580
34 -1.851322 -1.115352  1.965189
35 -1.854284 -1.115352  1.965189
[0.90748125 0.82181002 0.5219487  0.28883196 0.71309385]
Model: RF
k-neighbours-based training, validation and test sets have been created with this distribution:
Training points: 758
Validation points: 527

k-neighbours-based training: R2 = 0.97; MAE = 0.08; RMSE = 0.12
5.0-fold cross validation: 0.65 ± 0.22
k-neighbours-based validation: R2 = 0.96; MAE = 0.11; RMSE = 0.16


NameError: name 'best_model' is not defined

In [49]:
# Run x- and y-shuffle statistical tests
random.seed(a=seed)

# load the final dataset written by the previous cell
df_tests_model = pd.read_csv(f'Final_dataset/{csv_name}_final_dataset.csv')

training_data = df_tests_model[df_tests_model.Set == 'Training']
validation_data = df_tests_model[df_tests_model.Set == 'Validation']

# columns of the csv that are not model inputs
shuffle_drops = fixed_descriptors.copy()
shuffle_drops.append('Set')
shuffle_drops.append('Predicted '+response_value)

X_train_tests = training_data.drop(shuffle_drops, axis=1)
X_validation_tests = validation_data.drop(shuffle_drops, axis=1)

y_train_tests = training_data[response_value]
y_validation_tests = validation_data[response_value]

# Standardizes the data sets using the mean and standard dev from the train set.
# The statistics computed here are the ones applied: the previous code divided by
# `Xmean`/`Xstd`, which at this point are undefined or left over from an earlier run.
Xmean_tests = X_train_tests.mean(axis=0)
Xstd_tests = X_train_tests.std(axis=0)
X_train_tests_scaled = (X_train_tests - Xmean_tests) / Xstd_tests
X_validation_tests_scaled = (X_validation_tests - Xmean_tests) / Xstd_tests

# x shuffle test: same descriptor columns, converted to plain arrays
X_train_shuffled = np.asarray(training_data.drop(shuffle_drops, axis=1))
X_validation_shuffled = np.asarray(validation_data.drop(shuffle_drops, axis=1))

# standardizes the arrays using the mean and standard dev from the train set
Xmean = X_train_shuffled.mean(axis=0)
Xstd = X_train_shuffled.std(axis=0)
X_train_shuffled_scaled = (X_train_shuffled - Xmean) / Xstd
X_validation_shuffled_scaled = (X_validation_shuffled - Xmean) / Xstd

# shuffle the values inside every row three times (same number of random draws as before)
for row in X_train_shuffled_scaled:
    for _ in range(3):
        random.shuffle(row)

for row in X_validation_shuffled_scaled:
    for _ in range(3):
        random.shuffle(row)

# NOTE(review): `model` and `train` are the benchmark lists and `best_parameters_df`
# is not defined at notebook level in the visible code; the recorded output of this
# cell ends in an UnboundLocalError raised inside the helper. These calls probably
# need the selected model, its training size and its parameters - confirm.
print('\nResults from the x-shuffle test')
r2_train_tests,mae_train_tests,rmse_train_tests,r2_validation_tests,mae_validation_tests,rmse_validation_tests,y_pred_train_tests,y_pred_validation_tests = predictor_workflow(seed,model,best_parameters_df,X_train_shuffled_scaled,y_train_tests,X_validation_shuffled_scaled,y_validation_tests,mode,train)
cv_score_x_shuffle = cross_val_calc(seed,model,best_parameters_df,X_train_shuffled_scaled,y_train_tests,mode,cv_kfold)
print_model_stats(model,X_train_tests_scaled,X_validation_tests_scaled,r2_train_tests,mae_train_tests,rmse_train_tests,r2_validation_tests,mae_validation_tests,rmse_validation_tests,mode,cv_score_x_shuffle,cv_kfold,'Robert_results_x-shuffle.txt')

# y shuffle test: the response values are shuffled, the descriptors stay untouched
y_train_shuffled = np.asarray(y_train_tests.copy())
y_validation_shuffled = np.asarray(y_validation_tests.copy())

random.shuffle(y_train_shuffled)
random.shuffle(y_validation_shuffled)

print('\nResults from the y-shuffle test')
r2_train_tests,mae_train_tests,rmse_train_tests,r2_validation_tests,mae_validation_tests,rmse_validation_tests,y_pred_train_tests,y_pred_validation_tests = predictor_workflow(seed,model,best_parameters_df,X_train_tests_scaled,y_train_shuffled,X_validation_tests_scaled,y_validation_shuffled,mode,train)
cv_score_y_shuffle = cross_val_calc(seed,model,best_parameters_df,X_train_tests_scaled,y_train_shuffled,mode,cv_kfold)
print_model_stats(model,X_train_tests_scaled,X_validation_tests_scaled,r2_train_tests,mae_train_tests,rmse_train_tests,r2_validation_tests,mae_validation_tests,rmse_validation_tests,mode,cv_score_y_shuffle,cv_kfold,'Robert_results_y-shuffle.txt')



Results from the x-shuffle test


UnboundLocalError: local variable 'predictor_model_fun' referenced before assignment