In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import BaggingClassifier as BC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.neighbors.nearest_centroid import NearestCentroid as NC
from sklearn.gaussian_process import GaussianProcessClassifier as GPC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import ExtraTreeClassifier as ExTC1
from sklearn.ensemble import ExtraTreesClassifier as ExTC2
from sklearn.neural_network import MLPClassifier as NNC
from sklearn.gaussian_process import kernels

In [2]:
# Registry mapping a short label to the scikit-learn classifier class imported
# under that alias, so a selector type can be picked by name.
classifiers = dict(
    BC=BC,
    SVC=SVC,
    KNC=KNC,
    NC=NC,
    GPC=GPC,
    DTC=DTC,
    NNC=NNC,
    ExTC1=ExTC1,
    ExTC2=ExTC2,
)

In [3]:
# Benchmark inputs: the surrogate R^2 table and the training features.
# Both files use their first column as the row index.
R2, features = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './surrogate_performance/benchmark_test_R^2.csv',
        './features/benchmark_train.csv',
    )
)

In [4]:
# Clean the benchmark feature table and derive the selector targets.

# Drop columns containing NaNs
features = features.dropna(axis=1)

# Drop numeric columns with only one unique value (zero standard deviation)
cols = features.select_dtypes([np.number]).columns
std = features[cols].std()
cols_to_drop = std[std == 0].index
features = features.drop(cols_to_drop, axis=1)

# Drop columns containing +/-inf: turn them into NaN, then drop as above
features = features.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

# Calculating Loss and Targets. Loss is the gap to the best R^2 in each row;
# the target is the modelling method with the lowest loss.
loss = -R2.sub(R2.max(axis=1), axis=0)
targets = loss.idxmin(axis=1)

# Keep only the rows that are available in both targets and features.
# `features` must be restricted and ordered the same way as `targets`,
# because later cells pass `features.values` and `targets.values` to `fit`
# side by side, where rows are matched purely by position.
features_available_for = targets.index.intersection(features.index)
targets = targets.loc[features_available_for]
loss = loss.loc[features_available_for]
features = features.loc[features_available_for]

assert features.index.equals(targets.index), "features and targets are not row-aligned"

In [5]:
# Engineering features: read the table, keep only the columns that are present
# in the benchmark `features` frame, then discard every row that still has a
# missing value.

engineering_features = (
    pd.read_csv("./features/engineering_train.csv", index_col=0)
    [features.columns]
    .dropna()
)

In [6]:
# Engineering R2

engineering_R2 = pd.read_csv('./surrogate_performance/engineering_test_R^2.csv', index_col=0)

# Calculating Loss and Targets. Loss is the gap to the best R^2 in each row;
# the target is the modelling method with the lowest loss.
engineering_loss = -engineering_R2.sub(engineering_R2.max(axis=1), axis=0)
engineering_targets = engineering_loss.idxmin(axis=1)

# Keep only the rows that are available in both targets and features.
# NOTE: this rebinds `features_available_for` to the engineering rows; the
# CV-score tables are sliced with it further down.
features_available_for = engineering_targets.index.intersection(engineering_features.index)
engineering_targets = engineering_targets.loc[features_available_for]
engineering_loss = engineering_loss.loc[features_available_for]
# The features must share this index as well: later cells filter them with
# boolean masks built from `engineering_loss` and look rows up by label.
engineering_features = engineering_features.loc[features_available_for]

# One column per selection strategy. The comma after "selector_original" matters:
# without it Python concatenates the adjacent string literals into a single
# "selector_originalCVmean" column.
predictions = pd.DataFrame(
    columns=["selector_e", "selector_e_b", "selector_original", "CVmean", "CVmax"],
    index=engineering_loss.index,
)

In [7]:
# To get the problem characteristics out of the filename:
# the part of every index label before the first '.' is split on '_', and each
# characteristic is read from the position given in `columns`.

columns = {'ProblemName':0, 'num_var':1, 'num_samples':2, 'distribution':3}

# Build the table in one pass instead of assigning cell by cell through
# `characteristics[column][index] = ...`; that chained assignment writes to a
# temporary object under pandas Copy-on-Write and would leave the frame empty.
name_parts = [label.split('.')[0].split('_') for label in engineering_loss.index]
characteristics = pd.DataFrame(
    [[parts[position] for position in columns.values()] for parts in name_parts],
    index=engineering_loss.index,
    columns=list(columns),
)
characteristics = characteristics.astype({'num_var': int, 'num_samples': int})

# Put the characteristics in front of the loss columns.
engineering_loss = characteristics.join(engineering_loss)

In [8]:
# Train the model selectors and record which surrogate each one picks.
classifier_type = "ExTC2"
# Fixed seed so the randomized tree ensembles produce the same selectors on
# every Restart & Run All.
selector_seed = 0

# Leave-one-problem-out: every problem is predicted by selectors that never saw
# any row of that problem during training.
problem_names = engineering_loss['ProblemName']
for prob_id in problem_names.unique():
    is_test = problem_names == prob_id
    test_features = engineering_features[is_test]
    train_features = engineering_features[~is_test]
    train_targets = engineering_targets[~is_test]

    selector_e = classifiers[classifier_type](n_estimators=100, random_state=selector_seed)
    selector_e_b = classifiers[classifier_type](n_estimators=100, random_state=selector_seed)
    # Just Engineering
    selector_e.fit(train_features.values, train_targets.values)
    # Engineering + Benchmark
    selector_e_b.fit(
        np.vstack((train_features.values, features.values)),
        np.hstack((train_targets.values, targets.values)),
    )
    # Write with a single .loc call: the chained form
    # `predictions[col][mask] = ...` assigns into a temporary and is silently
    # lost under pandas Copy-on-Write.
    # Predict on plain arrays, matching what the selectors were fitted on.
    predictions.loc[is_test, "selector_e"] = selector_e.predict(test_features.values)
    predictions.loc[is_test, "selector_e_b"] = selector_e_b.predict(test_features.values)

# Whole selector: trained on the benchmark data only, applied to all engineering rows.

selector = classifiers[classifier_type](n_estimators=100, random_state=selector_seed)
selector.fit(features.values, targets.values)

engineering_predictions = selector.predict(engineering_features.values)
# Attach the row labels so the values are aligned by index, not by position.
predictions["selector_original"] = pd.Series(
    engineering_predictions, index=engineering_features.index
)

In [9]:
# For every selection strategy, look up the loss of the surrogate it chose.
loss_prediction = pd.DataFrame(columns=["selector_e", "selector_e_b", "selector_original", "CVmean", "CVmax"], index=engineering_features.index, dtype=float)

cv_scores_mean = pd.read_csv("./surrogate_performance/engineeringCV-score-mean.csv", index_col=0)
cv_scores_mean = cv_scores_mean.loc[features_available_for]
cv_scores_max = pd.read_csv("./surrogate_performance/engineeringCV-score-max.csv", index_col=0)
cv_scores_max = cv_scores_max.loc[features_available_for]

# The CV baselines pick, per row, the model with the highest CV score.
cv_mean_best = cv_scores_mean.idxmax(axis=1)
cv_max_best = cv_scores_max.idxmax(axis=1)

# Strategy name -> Series holding the model name chosen for each row.
chosen_models = {
    "selector_e": predictions["selector_e"],
    "selector_e_b": predictions["selector_e_b"],
    "selector_original": predictions["selector_original"],
    "CVmax": cv_max_best,
    "CVmean": cv_mean_best,
}

for comp_id in engineering_features.index:
    for strategy, chosen in chosen_models.items():
        # Single-step .at write: the chained `loss_prediction[col].at[id] = ...`
        # form updates a temporary and is lost under pandas Copy-on-Write.
        loss_prediction.at[comp_id, strategy] = engineering_loss.loc[comp_id, chosen.loc[comp_id]]

engineering_loss = engineering_loss.join(loss_prediction)

In [10]:
# Box plot of the loss per surrogate model and per selection strategy,
# zoomed in to losses between 0 and 0.2.
# Draw on a dedicated figure: without `ax`, DataFrame.boxplot uses the current
# axes, which under `%matplotlib notebook` may still belong to an earlier cell.
fig, ax = plt.subplots()
engg_loss_chart = engineering_loss[['ExtraTrees_10','ExtraTrees_100', 'GPR_matern3/2','GPR_matern5/2', 'selector_e', 'selector_e_b', "selector_original", 'CVmax', 'CVmean']].boxplot(rot=90, ax=ax)
engg_loss_chart.set_title('Loss of Surrogate Models and predicted models,\n when selector trained on Benchmark+Engineering')
engg_loss_chart.set_ylim([0,0.2])
fig.tight_layout()
engg_loss_chart

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f56fba62a20>

In [12]:
# Same box plot as the zoomed version, but with the y-axis left at its
# automatic limits so the full range of losses is visible.
# Draw on a dedicated figure: without `ax`, DataFrame.boxplot uses the current
# axes, which under `%matplotlib notebook` may still belong to an earlier cell.
fig, ax = plt.subplots()
engg_loss_chart = engineering_loss[['ExtraTrees_10','ExtraTrees_100', 'GPR_matern3/2','GPR_matern5/2', 'selector_e', 'selector_e_b', "selector_original", 'CVmax', 'CVmean']].boxplot(rot=90, ax=ax)
engg_loss_chart.set_title('Loss of Surrogate Models and predicted models,\n when selector trained on Benchmark+Engineering')
fig.tight_layout()
engg_loss_chart

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f56f94ef1d0>