In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import ExtraTreesClassifier as ExTC
from pprint import pprint
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
alt.renderers.enable('notebook')
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
R2 = pd.read_csv('./surrogate_performance/R^2.csv', index_col=0)
features = pd.read_csv('./features/benchmark.csv', index_col=0)

In [3]:
# Drop NANs
features.dropna(axis = 1, inplace=True)

In [4]:
# Drop columns with only one unique value
cols = features.select_dtypes([np.number]).columns
std = features[cols].std()
cols_to_drop = std[std==0].index
features = features.drop(cols_to_drop, axis=1)

In [5]:
# Drop columns with inf
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.dropna(axis = 1, inplace=True)

In [6]:
# Calculating Loss and Targets. Target is the modelling method with lowest loss.
loss = -R2.sub(R2.max(axis=1), axis=0)
targets = loss.idxmin(axis=1)

In [7]:
# Getting data that is avalable in both targets and features
features_available_for = targets.index.intersection(features.index)
targets = targets.loc[features_available_for]

In [8]:
# Train test split
train_features, test_features, train_targets, test_targets = tts(features, targets)

In [9]:
# Training the selector
selector = ExTC()
selector.fit(train_features.values, train_targets.values)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [10]:
predicted_targets = selector.predict(test_features)
predicted_targets = pd.DataFrame(predicted_targets, index=test_features.index, columns=['Prediction'])

In [11]:
con_mat = metrics.confusion_matrix(test_targets, predicted_targets)
con_mat = pd.DataFrame(con_mat, index=selector.classes_, columns= selector.classes_)
sns.set_style("whitegrid")
sns.heatmap(con_mat,cmap="RdBu",annot=True,annot_kws={"size": 7},linewidths=0.5, linecolor='black')
plt.tight_layout()

<IPython.core.display.Javascript object>

In [12]:
# For loss comparison
comparison_index = loss.index.intersection(predicted_targets.index)
loss_compare = loss.loc[comparison_index]
loss_prediction = pd.DataFrame(columns=['Prediction'], index=comparison_index, dtype=float)

In [13]:
for comp_id in comparison_index:
    loss_prediction['Prediction'].at[comp_id] = loss_compare[predicted_targets.loc[comp_id]].loc[comp_id]

In [14]:
loss_compare.sort_index(axis=1, inplace=True)
loss_compare = loss_compare.join(loss_prediction)

In [15]:
loss_chart = loss_compare.boxplot(rot=90)
loss_chart.set_title('Loss of Surrogate Models and predicted models')
loss_chart.set_ylim([0,1])
plt.tight_layout()
loss_prediction.quantile(q=0.95)

<IPython.core.display.Javascript object>

Prediction    0.029876
Name: 0.95, dtype: float64

In [17]:
loss_compare.describe()

Unnamed: 0,AdaBoost_10,AdaBoost_100,DecisionTree,ExtraTrees_10,ExtraTrees_100,GPR_matern3/2,GPR_matern5/2,GPR_rbf,MLP,RandomForest_10,RandomForest_100,svm_linear,svm_rbf,Prediction
count,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0
mean,0.261121,0.223786,0.292845,0.19087,0.175202,0.67839,1.406194,35.315442,3.95988,0.201364,0.187623,0.370124,0.630301,0.005894
std,0.284821,0.271559,0.286134,0.268952,0.270656,2.62893,6.436998,121.456628,6.27681,0.2754,0.276875,0.406453,0.522038,0.032623
min,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
25%,0.06988,0.050612,0.105093,0.021562,0.001105,3.8e-05,0.000237,0.020145,0.321681,0.029396,0.014175,0.072465,0.126823,-0.0
50%,0.168471,0.133653,0.204599,0.073335,0.048658,0.043867,0.030835,0.151021,1.074229,0.086731,0.062612,0.205593,0.674629,-0.0
75%,0.346644,0.278067,0.386189,0.264516,0.238615,0.189841,0.285861,3.081299,4.424742,0.280876,0.256722,0.62222,0.937543,-0.0
max,1.723352,1.650829,1.600574,1.593804,1.602479,56.856788,58.379491,640.163801,42.903156,1.629075,1.629392,3.66386,2.613276,0.691752


## Feature Importances

In [18]:
importances = selector.feature_importances_
std = np.std([tree.feature_importances_ for tree in selector.estimators_],
             axis=0)
feat_importance = pd.DataFrame(np.vstack((importances, std)), columns=features.columns, index=['mean', 'std'])

In [19]:
feat_importance.sort_index(by=['mean'], axis=1, inplace=True)
fig = feat_importance.loc['mean'].plot.bar(yerr=feat_importance.loc['std'])
plt.title('Feature Importances')
plt.tight_layout()

  """Entry point for launching an IPython kernel.


<IPython.core.display.Javascript object>

In [20]:
engineering_features = pd.read_csv("./features/engineering.csv", index_col=0)
engineering_features = engineering_features[features.columns]
engineering_features = engineering_features.dropna()

In [21]:
engineering_R2 = pd.read_csv('./surrogate_performance/engineeringR^2.csv', index_col=0)
# Calculating Loss and Targets. Target is the modelling method with lowest loss.
engineering_loss = -engineering_R2.sub(engineering_R2.max(axis=1), axis=0)
engineering_targets = engineering_loss.idxmin(axis=1)
# Getting data that is avalable in both targets and features
features_available_for = engineering_targets.index.intersection(engineering_features.index)
engineering_targets = engineering_targets.loc[features_available_for]
engineering_loss = engineering_loss.loc[features_available_for]

In [22]:
engineering_R2.plot.box(rot=45)
plt.ylim([0,1])
plt.ylabel('R²')
plt.title('R² Values of various modelling techniques on engineering problems')
plt.tight_layout()

<IPython.core.display.Javascript object>

In [23]:
selector = ExTC()
selector.fit(features.values, targets.values)
engineering_predictions = selector.predict(engineering_features)
engineering_predictions = pd.DataFrame(engineering_predictions, index=engineering_features.index, columns=['Prediction'])



In [24]:
metrics.confusion_matrix(engineering_targets, engineering_predictions)

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  0,  0,  0,  0,  0],
       [ 1, 15, 10,  0,  0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  1, 11,  6,  1,  0,  0,  0,  7],
       [ 4,  1,  6,  7,  0,  0,  0,  0,  0],
       [ 0,  0,  3,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [25]:
engineering_loss_prediction = pd.DataFrame(columns=['Prediction'], index=engineering_features.index, dtype=float)
for comp_id in engineering_features.index:
    engineering_loss_prediction['Prediction'].at[comp_id] = engineering_loss[engineering_predictions.loc[comp_id]].loc[comp_id]

In [26]:
engineering_loss.sort_index(axis=1, inplace=True)
engineering_loss = engineering_loss.join(engineering_loss_prediction)

In [27]:
engg_loss_chart = engineering_loss.boxplot(rot=90)
engg_loss_chart.set_title('Loss of Surrogate Models and predicted models')
plt.ylabel('loss')
engg_loss_chart.set_ylim([0,1])
plt.tight_layout()
engg_loss_chart

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fdc8c098048>

In [28]:
engg_loss_chart = engineering_loss[['ExtraTrees_10','ExtraTrees_100', 'GPR_matern3/2','GPR_matern5/2', 'Prediction']].boxplot(rot=90)
engg_loss_chart.set_title('Loss of Surrogate Models and predicted models')
engg_loss_chart.set_ylim([0,0.4])
plt.tight_layout()
engg_loss_chart

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fdc86cb2390>

In [29]:
engineering_loss.describe()

Unnamed: 0,AdaBoost_10,AdaBoost_100,DecisionTree,ExtraTrees_10,ExtraTrees_100,GPR_matern3/2,GPR_matern5/2,GPR_rbf,MLP,RandomForest_10,RandomForest_100,svm_linear,svm_rbf,Prediction
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,0.119753,0.100426,0.11145,0.026402,0.01672,0.151598,0.176945,32.97168,66.272585,0.058712,0.048652,0.531101,0.703219,0.017241
std,0.095361,0.152892,0.116977,0.036064,0.028565,0.406823,0.49096,111.351412,220.510202,0.071614,0.060653,0.398577,0.436696,0.028684
min,0.005125,0.001085,3.6e-05,-0.0,-0.0,2e-06,-0.0,-0.0,-0.0,1.6e-05,1e-05,0.002114,0.002207,-0.0
25%,0.067919,0.039794,0.034042,0.003758,-0.0,0.000107,-0.0,3.1e-05,0.323454,0.013309,0.009045,0.149653,0.27939,4.9e-05
50%,0.101976,0.067813,0.079759,0.013654,0.005676,0.00326,5e-05,0.028301,3.723865,0.033138,0.024081,0.525263,0.948181,0.003953
75%,0.140208,0.099816,0.13686,0.030999,0.019397,0.063207,0.080757,1.54761,26.708943,0.074922,0.060918,0.93219,1.038348,0.018159
max,0.432549,1.05399,0.501111,0.197298,0.148233,2.214465,2.702947,413.478576,1259.316729,0.34575,0.25617,1.23766,1.306356,0.147296


In [30]:
engineering_loss.median()

AdaBoost_10         0.101976
AdaBoost_100        0.067813
DecisionTree        0.079759
ExtraTrees_10       0.013654
ExtraTrees_100      0.005676
GPR_matern3/2       0.003260
GPR_matern5/2       0.000050
GPR_rbf             0.028301
MLP                 3.723865
RandomForest_10     0.033138
RandomForest_100    0.024081
svm_linear          0.525263
svm_rbf             0.948181
Prediction          0.003953
dtype: float64