In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import ExtraTreesClassifier as ExTC
from pprint import pprint
from sklearn import metrics
import matplotlib.pyplot as plt
import altair as alt
alt.renderers.enable('notebook')
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
R2 = pd.read_csv('./surrogate_performance/R^2.csv', index_col=0)
features = pd.read_csv('./features/benchmark.csv', index_col=0)

In [3]:
# Drop NANs
features.dropna(axis = 1, inplace=True)

In [4]:
# Drop columns with only one unique value
cols = features.select_dtypes([np.number]).columns
std = features[cols].std()
cols_to_drop = std[std==0].index
features = features.drop(cols_to_drop, axis=1)

In [5]:
# Drop columns with inf
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.dropna(axis = 1, inplace=True)

In [6]:
# Calculating Loss and Targets. Target is the modelling method with lowest loss.
loss = -R2.sub(R2.max(axis=1), axis=0)
targets = loss.idxmin(axis=1)

In [7]:
# Getting data that is avalable in both targets and features
features_available_for = targets.index.intersection(features.index)
targets = targets.loc[features_available_for]

In [8]:
# Train test split
train_features, test_features, train_targets, test_targets = tts(features, targets)

In [9]:
# Training the selector
selector = ExTC()
selector.fit(train_features.values, train_targets.values)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [10]:
predicted_targets = selector.predict(test_features)
predicted_targets = pd.DataFrame(predicted_targets, index=test_features.index, columns=['Prediction'])

In [11]:
metrics.confusion_matrix(test_targets, predicted_targets)

array([[  2,   1,   0,   0,   0,   0,   0,   0,   0,   1,   4,   0,   0],
       [  2,   6,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   0],
       [  1,   0,   0,   1,   2,   0,   0,   0,   0,   1,   0,   0,   0],
       [  0,   0,   0,   2,   8,   0,   0,   0,   0,   0,   0,   1,   0],
       [  1,   0,   0,   3, 117,   3,   0,   0,   0,   2,   3,   1,   0],
       [  0,   0,   0,   0,   2, 156,   4,   0,   0,   0,   0,   5,   0],
       [  0,   0,   0,   0,   1,  10, 111,  12,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   2,   0,   9,  57,   0,   0,   0,   1,   0],
       [  0,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0,   2,   0],
       [  2,   0,   0,   0,   2,   0,   0,   0,   0,   2,   2,   1,   0],
       [  2,   2,   1,   0,   9,   0,   0,   0,   0,   2,  35,   1,   0],
       [  0,   0,   0,   0,   0,   6,   0,   0,   0,   0,   0,  72,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2]])

In [12]:
# For loss comparison
comparison_index = loss.index.intersection(predicted_targets.index)
loss_compare = loss.loc[comparison_index]
loss_prediction = pd.DataFrame(columns=['Prediction'], index=comparison_index, dtype=float)

In [13]:
for comp_id in comparison_index:
    loss_prediction['Prediction'].at[comp_id] = loss_compare[predicted_targets.loc[comp_id]].loc[comp_id]

In [14]:
loss_compare.sort_index(axis=1, inplace=True)
loss_compare = loss_compare.join(loss_prediction)

In [15]:
loss_chart = loss_compare.boxplot(rot=90)
loss_chart.set_title('Loss of Surrogate Models and predicted models')
loss_chart.set_ylim([0,1])
plt.tight_layout()
loss_prediction.quantile(q=0.95)

<IPython.core.display.Javascript object>

Prediction    0.018972
Name: 0.95, dtype: float64

In [16]:
loss_compare.describe()

Unnamed: 0,AdaBoost_10,AdaBoost_100,DecisionTree,ExtraTrees_10,ExtraTrees_100,GPR_matern3/2,GPR_matern5/2,GPR_rbf,MLP,RandomForest_10,RandomForest_100,svm_linear,svm_rbf,Prediction
count,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0
mean,0.302519,0.257625,0.335469,0.225791,0.208891,0.5805587,1.244498,31.863212,4.09605,0.237824,0.223715,0.399252,0.670064,0.004784
std,0.31628,0.300155,0.320133,0.295688,0.297819,1.567469,5.403534,110.103599,6.240539,0.303686,0.305213,0.443529,0.519653,0.02364
min,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
25%,0.075993,0.051207,0.116901,0.026786,0.003101,1.922015e-12,0.000277,0.01496,0.435432,0.031604,0.017308,0.067816,0.14165,-0.0
50%,0.188598,0.158534,0.227756,0.096892,0.07043,0.04065041,0.021697,0.142328,1.190084,0.103041,0.083685,0.229345,0.76234,-0.0
75%,0.434674,0.329977,0.463401,0.319902,0.296361,0.1633445,0.206284,2.961122,4.394774,0.348347,0.314133,0.657408,0.958364,-0.0
max,1.667175,1.6197,1.632588,1.53595,1.520473,11.2868,58.328864,640.142307,39.102894,1.610471,1.592976,3.666497,2.584234,0.378694


## Feature Importances

In [17]:
importances = selector.feature_importances_
std = np.std([tree.feature_importances_ for tree in selector.estimators_],
             axis=0)
feat_importance = pd.DataFrame(np.vstack((importances, std)), columns=features.columns, index=['mean', 'std'])

In [18]:
feat_importance.sort_index(by=['mean'], axis=1, inplace=True)
feat_importance.loc['mean'].plot.bar(yerr=feat_importance.loc['std'])
plt.tight_layout()

  """Entry point for launching an IPython kernel.


<IPython.core.display.Javascript object>

In [19]:
engineering_features = pd.read_csv("./features/engineering.csv", index_col=0)
engineering_features = engineering_features[features.columns]
engineering_features = engineering_features.dropna()

In [20]:
engineering_R2 = pd.read_csv('./surrogate_performance/engineeringR^2.csv', index_col=0)
# Calculating Loss and Targets. Target is the modelling method with lowest loss.
engineering_loss = -engineering_R2.sub(engineering_R2.max(axis=1), axis=0)
engineering_targets = engineering_loss.idxmin(axis=1)
# Getting data that is avalable in both targets and features
features_available_for = engineering_targets.index.intersection(engineering_features.index)
engineering_targets = engineering_targets.loc[features_available_for]
engineering_loss = engineering_loss.loc[features_available_for]

In [21]:
engineering_loss.plot.box(rot=45)
plt.ylim([0,1])
plt.tight_layout()

<IPython.core.display.Javascript object>

In [22]:
selector = ExTC()
selector.fit(features.values, targets.values)
engineering_predictions = selector.predict(engineering_features)
engineering_predictions = pd.DataFrame(engineering_predictions, index=engineering_features.index, columns=['Prediction'])



In [23]:
metrics.confusion_matrix(engineering_targets, engineering_predictions)

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0, 13,  2,  0,  0,  0, 13,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  4,  7,  5,  3,  0,  1,  6],
       [ 2,  0,  4,  0,  5,  2,  0,  2,  3],
       [ 0,  0,  0,  3,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [24]:
engineering_loss_prediction = pd.DataFrame(columns=['Prediction'], index=engineering_features.index, dtype=float)
for comp_id in engineering_features.index:
    engineering_loss_prediction['Prediction'].at[comp_id] = engineering_loss[engineering_predictions.loc[comp_id]].loc[comp_id]

In [25]:
engineering_loss.sort_index(axis=1, inplace=True)
engineering_loss = engineering_loss.join(engineering_loss_prediction)

In [26]:
engg_loss_chart = engineering_loss.boxplot(rot=90)
engg_loss_chart.set_title('Loss of Surrogate Models and predicted models')
engg_loss_chart.set_ylim([0,1])
plt.tight_layout()
engg_loss_chart

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fdd53529518>

In [27]:
engg_loss_chart = engineering_loss[['ExtraTrees_10','ExtraTrees_100', 'GPR_matern3/2','GPR_matern5/2', 'Prediction']].boxplot(rot=90)
engg_loss_chart.set_title('Loss of Surrogate Models and predicted models')
engg_loss_chart.set_ylim([0,0.4])
plt.tight_layout()
engg_loss_chart

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fdd51960e80>

In [28]:
engineering_loss.describe()

Unnamed: 0,AdaBoost_10,AdaBoost_100,DecisionTree,ExtraTrees_10,ExtraTrees_100,GPR_matern3/2,GPR_matern5/2,GPR_rbf,MLP,RandomForest_10,RandomForest_100,svm_linear,svm_rbf,Prediction
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,0.119753,0.100426,0.11145,0.026402,0.01672,0.151598,0.176945,32.97168,66.272585,0.058712,0.048652,0.531101,0.703219,16.166711
std,0.095361,0.152892,0.116977,0.036064,0.028565,0.406823,0.49096,111.351412,220.510202,0.071614,0.060653,0.398577,0.436696,80.52218
min,0.005125,0.001085,3.6e-05,-0.0,-0.0,2e-06,-0.0,-0.0,-0.0,1.6e-05,1e-05,0.002114,0.002207,-0.0
25%,0.067919,0.039794,0.034042,0.003758,-0.0,0.000107,-0.0,3.1e-05,0.323454,0.013309,0.009045,0.149653,0.27939,0.0
50%,0.101976,0.067813,0.079759,0.013654,0.005676,0.00326,5e-05,0.028301,3.723865,0.033138,0.024081,0.525263,0.948181,0.004854
75%,0.140208,0.099816,0.13686,0.030999,0.019397,0.063207,0.080757,1.54761,26.708943,0.074922,0.060918,0.93219,1.038348,0.052277
max,0.432549,1.05399,0.501111,0.197298,0.148233,2.214465,2.702947,413.478576,1259.316729,0.34575,0.25617,1.23766,1.306356,413.478576


In [29]:
engineering_loss.median()

AdaBoost_10         0.101976
AdaBoost_100        0.067813
DecisionTree        0.079759
ExtraTrees_10       0.013654
ExtraTrees_100      0.005676
GPR_matern3/2       0.003260
GPR_matern5/2       0.000050
GPR_rbf             0.028301
MLP                 3.723865
RandomForest_10     0.033138
RandomForest_100    0.024081
svm_linear          0.525263
svm_rbf             0.948181
Prediction          0.004854
dtype: float64