# 1 Read CNN predictions
        

In [1]:
import os
import pandas as pd
import numpy as np
import re
import pickle
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer, scale
from IPython.display import display, Markdown
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import warnings
from auxiliary_functions import *
warnings.filterwarnings('ignore')

# ------------------------------------------------------------
# Configuration: output/input locations and the RF search grid
# ------------------------------------------------------------
plots_dir = "plots"                # passed as output_dir to the experiment helpers
PATH = "results/"                  # directory holding the CNN summary / prediction files
feat_file = 'cloud_features.csv'   # features table read from the data directory

# Random-forest grid: n_estimators = 100, 200, ..., 1500
parameters_rf = dict(n_estimators=[100 * step for step in range(1, 16)])

##############################################################
# Obtaining best CNN execution (in validation set)
##############################################################

# Collect every per-run accuracy summary written to PATH.
summary_files = [each for each in os.listdir(PATH) if each.endswith('_summary_acc.csv')]
if not summary_files:
    raise FileNotFoundError("No '*_summary_acc.csv' files found in %r" % PATH)

# The text between the leading numeric id and the '_summary_acc.csv' suffix
# carries the model name followed by the set name (train, valid, test).
summary_name_re = re.compile(r'[0-9]_+(.+?)_summary_acc\.csv')

# Generating dataframe with all summary files. New columns with file name,
# model name and set name (train, valid, test).
summary_frames = []
for summary_file in summary_files:
    model_name = summary_name_re.search(summary_file).group(1).split("_")[0]
    # "no_ceil_" sits between the model and the set name in some file names;
    # strip it so the set name is always the second "_"-separated token.
    set_name = summary_name_re.search(summary_file.replace("no_ceil_", "")).group(1).split("_")[1]
    summary_frames.append(
        pd.read_csv(os.path.join(PATH, summary_file)).assign(file=summary_file, model=model_name, set=set_name)
    )
df = pd.concat(summary_frames)

# Extracting validation results to find best execution
df_val = df[df["set"] == "valid"]
if df_val.empty:
    raise ValueError("No validation summary ('valid') found among the files in %r" % PATH)

# Series.max() skips NaN accuracies, which the builtin max() does not handle.
best_exec_acc_val = df_val[df_val["accuracy"] == df_val["accuracy"].max()]

# pd.concat keeps each file's own row labels, so labels repeat across files.
# Positional access picks the first best run even when several runs tie;
# a label lookup ([0]) would return a Series in that case and break re.search.
file_best_exec = best_exec_acc_val["file"].iloc[0]
file_best_exec_id = re.search(r'(.+?)_[a-z]+_summary_acc\.csv', file_best_exec).group(1)

# ------------------------------------------------------------
# Split indices (train / valid / test) of the best CNN execution
# ------------------------------------------------------------
# The indices file shares the prefix of the best validation summary file.
indices_file = file_best_exec.replace("valid_summary_acc.csv", "indices.pickle")
indices_path = os.path.join(PATH, indices_file)

# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by a trusted training run.
with open(indices_path, 'rb') as f:
    loaded_indices = pickle.load(f)

# Stored order is train, valid, test.
in_train, in_valid, in_test = loaded_indices

# ------------------------------------------------------------
# Images and predictions of the best CNN execution
# ------------------------------------------------------------
# Images: stored under the 'arr_0' key of the .npz archive, then split with
# the indices of the best execution.
img_file = 'images.npz'
file_dir = "data/"
images = np.load('%s/%s' % (file_dir, img_file), mmap_mode='r', allow_pickle=True)['arr_0']
img_train = images[in_train]
img_test = images[in_test]
img_valid = images[in_valid]

# Predictions: one CSV per split, named after the id of the best execution.
# The observation column ("obs") is dropped from each frame.
preds_file = file_best_exec.replace("valid_summary_acc.csv", "preds.csv")
pred_templates = ("%s_train_preds.csv", "%s_test_preds.csv", "%s_valid_preds.csv")
preds_cnn_train, preds_cnn_test, preds_cnn_valid = [
    pd.read_csv(os.path.join(PATH, template % file_best_exec_id), index_col=0).drop(columns=["obs"])
    for template in pred_templates
]

# Test-set accuracy summary of the selected run, shown for reference.
test_summary_path = os.path.join(PATH, "%s_test_summary_acc.csv" % file_best_exec_id)
test_summary_data_cnn = pd.read_csv(test_summary_path, index_col=0)
print("Best CNN model found: %s" % file_best_exec_id)
display(test_summary_data_cnn)


Best CNN model found: 1584291022619663_inceptionresnetv2_no_ceil


Unnamed: 0_level_0,macro_avg-precision,macro_avg-recall,macro_avg-f1-score,macro_avg-support,weighted_avg-precision,weighted_avg-recall,weighted_avg-f1-score,weighted_avg-support
accuracy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.934995,0.939045,0.947362,0.941528,923,0.938303,0.934995,0.934377,923


# 2 Obtaining ceil features and estimators

In [2]:
# Features table (semicolon-separated, comma as decimal mark)
features = pd.read_csv('%s/%s' % ("data", feat_file), sep=';', decimal=',')

# Target: binarised cloud type, split with the indices of the best CNN execution
cloud_type = np.array(features['cloud.type'])
encoder = LabelBinarizer()
cloud_encoded = encoder.fit_transform(cloud_type)
y_train = cloud_encoded[in_train]
y_test = cloud_encoded[in_test]
y_valid = cloud_encoded[in_valid]
y_test_dec = encoder.inverse_transform(y_test)

# Ceil features, standardised column-wise.
# NOTE(review): scale() is applied to all rows before splitting, so test and
# validation rows contribute to the scaling statistics — confirm this is intended.
ceil_columns = ["ceil.height0", "ceil.height1", "ceil.height2", "ceil.depth0",
                "ceil.depth1", "ceil.depth2", "ceil.layers"]
ceil_info = scale(np.array(features[ceil_columns]), copy=False)

# Applying indices retrieved from the best execution to obtain ceil
ceil_train = ceil_info[in_train]
ceil_test = ceil_info[in_test]
ceil_valid = ceil_info[in_valid]

# Estimators: every remaining column once identifiers, target and ceil columns are removed
non_estimator_columns = ["date", "file", "camnum", "cloud.type"] + ceil_columns
features_estimators = features.drop(non_estimator_columns, axis=1)

# Standardise the estimators (same whole-dataset scaling as above), keeping the column names
estimator_columns = features_estimators.columns
features_estimators = pd.DataFrame(scale(features_estimators, copy=False), columns=estimator_columns)

# Positional split with the indices of the best execution
estimators_train = features_estimators.iloc[in_train, :]
estimators_test = features_estimators.iloc[in_test, :]
estimators_valid = features_estimators.iloc[in_valid, :]

### Summary of features

- **preds_cnn_train, preds_cnn_test, preds_cnn_valid**: predictions of the best CNN model
- **estimators_train, estimators_test, estimators_valid**: estimators to train RF classifier
- **ceil_train, ceil_test, ceil_valid**: features to combine with outputs of RF and CNN classifiers



## Experiment 2: RF on estimators classification

In [None]:
# ------------------------------------------------------------
# Experiment 2: RF on estimators classification
# ------------------------------------------------------------
experiment_name = "EXP_2_RF_estimators"

# Binarise the CNN predictions of every split with the fitted label encoder.
# These arrays are reused by the later experiments.
preds_cnn_train_hot, preds_cnn_valid_hot, preds_cnn_test_hot = (
    encoder.transform(cnn_preds)
    for cnn_preds in (preds_cnn_train, preds_cnn_valid, preds_cnn_test)
)

# --- Classic classifiers comparison (on decoded class labels) ---
train_labels = encoder.inverse_transform(y_train)
test_labels = encoder.inverse_transform(y_test)
train_classifiers_on_set(
    X_train=estimators_train,
    Y_train=train_labels,
    X_test=estimators_test,
    Y_test=test_labels,
    output_file_id=file_best_exec_id,
    experiment_name=experiment_name,
    output_dir=plots_dir,
    encoder=encoder,
)

# --- Training RF with different no. estimators ---
# grid_search_rf returns the test predictions first, then the train predictions.
preds_estimators_test_RF_hot, preds_estimators_train_RF_hot = grid_search_rf(
    X_train=estimators_train,
    Y_train=y_train,
    X_test=estimators_test,
    Y_test=y_test,
    encoder=encoder,
    parameters_rf=parameters_rf,
    file_best_exec_id=file_best_exec_id,
    experiment_name=experiment_name,
    output_dir=plots_dir,
)



## Experiment 3: Average of RF over estimators + CNN predictions

In [None]:
# ------------------------------------------------------------
# Experiment 3: Average of RF over estimators + CNN predictions
# ------------------------------------------------------------
experiment_name = "EXP_3_RF_estimators_CNN"

# Element-wise mean of the RF and CNN test outputs, decoded back to class labels.
# Relies on preds_estimators_test_RF_hot / preds_cnn_test_hot from the Experiment 2 cell.
# NOTE(review): if both inputs are hard one-hot predictions, a disagreement
# yields a tie between two classes — confirm how inverse_transform resolves it.
averaged_test_hot = (preds_estimators_test_RF_hot + preds_cnn_test_hot) / 2
preds_estimators_test_RF_CNN = encoder.inverse_transform(averaged_test_hot)

print(classification_report(y_true=y_test_dec, y_pred=preds_estimators_test_RF_CNN, digits=3))
generate_confusion_matrix_and_report(
    y_pred=preds_estimators_test_RF_CNN,
    y_test_dec=y_test_dec,
    output_file_id=file_best_exec_id,
    experiment_name=experiment_name,
    output_dir=plots_dir,
)


## Experiment 4: Standard classifiers on CNN predictions + RF over estimators

In [None]:
##############################################################
# Experiment 4: Standard classifiers on CNN predictions + RF over estimators
##############################################################
experiment_name = "EXP_4_standard_classifiers_CNN_RF"
print("EXPERIMENT %s" % experiment_name)

# Feature matrix = RF predictions + CNN predictions only.
# The ceil features are deliberately left out: including them made this
# experiment identical to Experiment 5 (CNN + RF + CEIL).
# Relies on preds_estimators_*_RF_hot and preds_cnn_*_hot from the Experiment 2 cell.
x_train_cnn_rf_predictions = np.concatenate((preds_estimators_train_RF_hot, preds_cnn_train_hot), axis=1)
x_test_cnn_rf_predictions = np.concatenate((preds_estimators_test_RF_hot, preds_cnn_test_hot), axis=1)

# Train and evaluate the standard classifiers on the stacked predictions,
# using decoded class labels as targets.
train_classifiers_on_set(X_train=x_train_cnn_rf_predictions, Y_train=encoder.inverse_transform(y_train),
                         X_test=x_test_cnn_rf_predictions, Y_test=encoder.inverse_transform(y_test),
                         output_file_id=file_best_exec_id, experiment_name=experiment_name, output_dir=plots_dir, encoder=encoder)



## Experiment 5: Standard classifiers on CNN predictions + RF over estimators + CEIL features

In [None]:
# Experiment 5: standard classifiers on RF predictions + CNN predictions + ceil features
experiment_name = "EXP_5_standard_classifiers_ceil_CNN_RF"

# Stack the three feature groups column-wise, once per split.
# Relies on preds_estimators_*_RF_hot and preds_cnn_*_hot from the Experiment 2 cell.
train_feature_groups = (preds_estimators_train_RF_hot, preds_cnn_train_hot, ceil_train)
test_feature_groups = (preds_estimators_test_RF_hot, preds_cnn_test_hot, ceil_test)
x_train_ceil_cnn_rf_predictions = np.concatenate(train_feature_groups, axis=1)
x_test_ceil_cnn_rf_predictions = np.concatenate(test_feature_groups, axis=1)

# Targets are the decoded class labels.
train_classifiers_on_set(
    X_train=x_train_ceil_cnn_rf_predictions,
    Y_train=encoder.inverse_transform(y_train),
    X_test=x_test_ceil_cnn_rf_predictions,
    Y_test=encoder.inverse_transform(y_test),
    output_file_id=file_best_exec_id,
    experiment_name=experiment_name,
    output_dir=plots_dir,
    encoder=encoder,
)