### New knee plots

Lets draw some new knee plots using already formed clusterings that are saved to files

In [1]:
%pylab inline 
pylab.rcParams['figure.figsize'] = (15,12)
import numpy as np
import simplejson as json
import os
from gradutil import *

In [2]:
ide = ideal(False)

In [3]:
new_clustering = dict()
for nclust in range(100, 8501, 200):
    try:
        with open('clusterings/new_{}.json'.format(nclust), 'r') as rfile:
            clustering = json.load(rfile)
        n_clustering = dict()
        for seedn in clustering.keys():
            n_clustering[eval(seedn)] = dict()
            for key in clustering[seedn].keys():
                n_clustering[eval(seedn)][key] = np.array(clustering[seedn][key])
        new_clustering[nclust] = n_clustering
    except FileNotFoundError:
        break

This takes some time. Total 42 files approx. 20 Mb each.

We want to extract the intracluster distance values from the dictionary.

In [4]:
nclusts = []
dists = []
for nclust in sorted(new_clustering.keys()):
    nclusts.append(nclust)
    dists_clust = []
    for seedn in new_clustering[nclust].keys():
        dists_clust.append(np.nansum(new_clustering[nclust][seedn]['dist']))
    dists.append(dists_clust)

Then let's draw the actual picture. In this it has to be remembered that these distances are calculated using the abstarct mathematical centers for the clusters! So not the stand closest to the center, as is really used. (With given clustering data it is not possible to calculate those distances...)

In [9]:
pylab.rcParams['figure.figsize'] = (10,8)
plt.suptitle('Number of clusters and average sum of intra cluster distances.\nValues from 10 independent runs',
            fontsize=15)
plt.plot(nclusts, np.mean(dists, axis=1))
plt.xlabel('Number of clusters', fontsize=12)
plt.ylabel('Sum of intra cluster distances', fontsize=12)
# plt.axvline(x=1500, ymin=0, ymax=250)

## Plots for optimization results using clustering

In [47]:
new_optims = dict()
for nclust in range(100, 8301, 200):
    try:
        with open('optimizations/hope_{}.json'.format(nclust), 'r') as rfile:
            optims = json.load(rfile)
        n_optims = dict()
        for seedn in optims.keys():
            n_optims[eval(seedn)] = dict()
            for name in optims[seedn].keys():
                n_optims[eval(seedn)][name] = dict()
                for key in optims[seedn][name].keys():
                    n_optims[eval(seedn)][name][key] = float(optims[seedn][name][key])
        new_optims[nclust] = n_optims
    except FileNotFoundError:
        break

Extract the single objective optimal values from the dictionary

In [48]:
indices = []
r_revenue = []
r_carbon = []
r_deadwood = []
r_ha = []
s_revenue = []
s_carbon = []
s_deadwood = []
s_ha = []
for nclust in sorted(new_optims.keys()):
    o_rev = []
    o_car = []
    o_dea = []
    o_ha = []
    q_rev = []
    q_car = []
    q_dea = []
    q_ha = []
    for seedn in new_optims[nclust].keys():
        o_rev.append(new_optims[nclust][seedn]['revenue']['real'])
        o_car.append(new_optims[nclust][seedn]['carbon']['real'])
        o_dea.append(new_optims[nclust][seedn]['deadwood']['real'])
        o_ha.append(new_optims[nclust][seedn]['ha']['real'])
        q_rev.append(new_optims[nclust][seedn]['revenue']['surrogate'])
        q_car.append(new_optims[nclust][seedn]['carbon']['surrogate'])
        q_dea.append(new_optims[nclust][seedn]['deadwood']['surrogate'])
        q_ha.append(new_optims[nclust][seedn]['ha']['surrogate'])
    r_revenue.append(o_rev)
    r_carbon.append(o_car)
    r_deadwood.append(o_dea)
    r_ha.append(o_ha)
    s_revenue.append(q_rev)
    s_carbon.append(q_car)
    s_deadwood.append(q_dea)
    s_ha.append(q_ha)
    indices.append(nclust)
r_revenue = np.array(r_revenue)
r_carbon = np.array(r_carbon)
r_deadwood = np.array(r_deadwood)
r_ha = np.array(r_ha)
s_revenue = np.array(s_revenue)
s_carbon = np.array(s_carbon)
s_deadwood = np.array(s_deadwood)
s_ha = np.array(s_ha)
indices = np.array(indices)

In [49]:
names = np.array([['Revenue', 'Carbon'],['Deadwood', 'Habitat']])
optims = np.array([ideal(False)[:2], ideal(False)[2:]])
ymaxs = np.array([[3.5e+8,6.1e+6],[3.1e+5, 2.8e+4]])

In [50]:
pylab.rcParams['figure.figsize'] = (15,12)

fig, ax = plt.subplots(2,2)
fig.suptitle('Optimization results using original variable values\nwhen clustering based surrogate mapped to original variables.\nValues from 10 independent runs',
             fontsize=20)
data = np.array([[r_revenue, r_carbon], [r_deadwood, r_ha]])
for i in range(np.shape(ax)[0]):
    for j in range(np.shape(ax)[1]):
        ax[i,j].plot(indices, np.max(data[i,j], axis=1), color='g')
        ax[i,j].plot(indices, np.mean(data[i,j], axis=1), color='y')
        ax[i,j].plot(indices, np.min(data[i,j], axis=1), color='r')
        # ax[i,j].plot(indices[:maximum], data[i,j][:maximum])
        ax[i,j].plot((min(indices[:maximum]), max(indices)),(optims[i,j], optims[i,j]), color='b')
        ax[i,j].set_title(names[i,j], fontsize=15)
        ax[i,j].set_ylim(ymin=0, ymax=ymaxs[i,j])
        ax[i,j].set_xlabel('Number of clusters', fontsize=12)
        ax[i,j].set_ylabel('Optimization results', fontsize=12)
        # ax[i,j].axvline(x=1300, ymin=0, ymax=250)

 We want to inspect at which point the minimum values drop rapidly. The inspections are based on visual clues:

In [51]:
indices[np.min(r_revenue, axis=1) > 2.46e+8]

In [52]:
indices[np.min(r_carbon, axis=1) > 4.41e+6]

In [53]:
indices[np.min(r_deadwood, axis=1) > 2.13e+5]

In [54]:
indices[np.min(r_ha, axis=1) > 1.98e+4]

So it looks like the results stay quite stabil until 1300 clusters.
We also want to know up to which point the maximum values keep still increasing:

In [55]:
indices[np.max(r_revenue, axis=1) > 2.46e+8]

In [56]:
indices[np.max(r_carbon, axis=1) > 4.41e+6]

In [57]:
indices[np.max(r_deadwood, axis=1) > 2.13e+5]

In [58]:
indices[np.max(r_ha, axis=1) > 1.98e+4]

So it looks like that 3500 is the maximum number of clusters that produces best solutions for all the objectives.
We only want to know if these clusterings are the same for all the objectives. We know that 3500 clusters is index 18 in indices and so also in all the optims arrays, so lets compare them.

In [59]:
np.argmax(r_revenue[18]),np.argmax(r_carbon[18]),np.argmax(r_deadwood[18]),np.argmax(r_ha[18])

For all the objectives the clustering no.2 seems the best! Thats surprising actually. Anyway it is good news. We could then use it in the multiobjective optimizations also.

Whats the difference from attained values to the real values?

In [61]:
((np.array((np.max(r_revenue), np.max(r_carbon), np.max(r_deadwood), np.max(r_ha))) - ide)/ide)*100

The objectives are then 1.0%, 0.3%, 1.0% and 0.8% less than the real values

When considering "the best for all" options the values are then:

In [62]:
(np.array((np.max(r_revenue[18]), np.max(r_carbon[18]), np.max(r_deadwood[18]), np.max(r_ha[18]))) - ide)/ide

This makes then 1.5%, 1.6%, 3.9% and 3.4%. The differences are anyway quite minor.

In [64]:
pylab.rcParams['figure.figsize'] = (15,12)

fig, ax = plt.subplots(2,2)
fig.suptitle('Optimization results using values from previously formed clustering surrogate.\nValues from 10 independent runs',
            fontsize=20)

data = np.array([[s_revenue, s_carbon], [s_deadwood, s_ha]])
for i in range(np.shape(ax)[0]):
    for j in range(np.shape(ax)[1]):
        ax[i,j].plot(indices, np.max(data[i,j], axis=1), color='g')
        ax[i,j].plot(indices, np.mean(data[i,j], axis=1), color='y')
        ax[i,j].plot(indices, np.min(data[i,j], axis=1), color='r')
        # ax[i,j].plot(indices[:maximum], data[i,j][:maximum])
        ax[i,j].plot((min(indices), max(indices)),(optims[i,j], optims[i,j]), color='b')
        ax[i,j].set_title(names[i,j], fontsize=15)
        ax[i,j].set_ylim(ymin=0, ymax=ymaxs[i,j])
        ax[i,j].set_xlabel('Number of clusters', fontsize=12)
        ax[i,j].set_ylabel('Optimization results', fontsize=12)
        #ax[i,j].axvline(x=1500, ymin=0, ymax=250)
        ax[i,j].axvline(x=1700, ymin=0, ymax=250)
        #ax[i,j].axvline(x=2000, ymin=0, ymax=250)

It looks like the results stay quite stabel all the time. We could still inspect when the results start clearly to decrease:

In [65]:
indices[np.max(s_revenue, axis=1) > 2.85e+8]

In [66]:
indices[np.max(s_carbon, axis=1) > 4.65e+6]

In [67]:
indices[np.max(s_deadwood, axis=1) > 2.3e+5]

In [68]:
indices[np.max(s_ha, axis=1) > 2.0e+4]

It seems we are getting worse and worse results all the time when the number of clusters decrease. We could still try to inspect how the best possible results differ from the real values:

In [69]:
(np.array((np.max(s_revenue), np.max(s_carbon), np.max(s_deadwood), np.max(s_ha))) - ide)/ide

All the objectives are 32.3%, 8.7%, 18.7% and 24.3% better than the real results...

In [70]:
np.argmax(np.max(s_revenue, axis=1)), np.argmax(np.max(s_carbon, axis=1)), np.argmax(np.max(s_deadwood, axis=1)), np.argmax(np.max(s_ha, axis=1))

In [71]:
indices[0], indices[1], indices[4]

## Comparing different value assigning methods

In [72]:
pylab.rcParams['figure.figsize'] = (15,12)

fig, ax = plt.subplots(2,2)
fig.suptitle('Comparing best values attainable whether using values of\nproxy variables or original variables.\nValues from 10 independent runs',
            fontsize=20)

data_surr = np.array([[s_revenue, s_carbon], [s_deadwood, s_ha]])
data_real = np.array([[r_revenue, r_carbon], [r_deadwood, r_ha]])
for i in range(np.shape(ax)[0]):
    for j in range(np.shape(ax)[1]):
        ax[i,j].plot(indices, np.max(data_real[i,j], axis=1), color='b')
        ax[i,j].plot(indices, np.max(data_surr[i,j], axis=1), color='r')
        ax[i,j].plot((min(indices), max(indices)),(optims[i,j], optims[i,j]), color='g')
        ax[i,j].set_title(names[i,j], fontsize=15)
        ax[i,j].set_ylim(ymin=0, ymax=ymaxs[i,j])
        ax[i,j].set_xlabel('Number of clusters', fontsize=12)
        ax[i,j].set_ylabel('Optimization results', fontsize=12)
        ax[i,j].axvline(x=1100, ymin=0, ymax=250)
        ax[i,j].axvline(x=1700, ymin=0, ymax=250)
        ax[i,j].axvline(x=2400, ymin=0, ymax=250)
        ax[i,j].axvline(x=5300, ymin=0, ymax=250)