## Knee plots for up to 2000 clusters

In [1]:
# %pylab populates the notebook namespace with numpy/matplotlib names; later
# cells rely on it for `plt`, `pylab` and an axis-aware `sum`, none of which
# are imported explicitly below.
%pylab inline 
# Default figure size for the 2x2 panel figures.
pylab.rcParams['figure.figsize'] = (15,12)
import numpy as np
import simplejson as json
import os
from gradutil import *
# Reference values for the four objectives; later cells reshape this to (4, 1)
# and use it as the denominator of the relative differences.
# NOTE(review): ideal() is presumably provided by the gradutil star import and
# presumably returns the ideal objective vector - confirm against gradutil.
ide = ideal(False)

This again takes quite some time, because we have a lot of files with a lot of contents.

In [2]:
def load_optimizations(path_template='optimizations/hope_{}.json',
                       cluster_counts=range(50, 8301, 50)):
    """Read the per-cluster-count optimization results into nested dicts.

    Parameters
    ----------
    path_template : str
        Format string with one ``{}`` placeholder for the number of clusters.
    cluster_counts : iterable of int
        Cluster counts to try, in order.

    Returns
    -------
    dict
        ``{nclust: {seed: {name: {key: float}}}}`` where the seed keys are
        converted from the JSON string keys to ``int`` and every leaf value
        is converted to ``float``.

    Notes
    -----
    Reading stops at the first cluster count whose file does not exist, so
    files after a gap are not loaded.
    """
    results = dict()
    for nclust in cluster_counts:
        try:
            with open(path_template.format(nclust), 'r') as result_file:
                optimizations = json.load(result_file)
        except FileNotFoundError:
            # The files are expected to form a contiguous series; the first
            # missing one marks the end of the available results.
            break
        # JSON object keys are always strings. int() restores the numeric
        # seed without evaluating file contents as Python code (the previous
        # eval() would execute whatever expression the file contained).
        results[nclust] = {
            int(seedn): {
                name: {key: float(value) for key, value in values.items()}
                for name, values in objectives.items()
            }
            for seedn, objectives in optimizations.items()
        }
    return results


new_optims = load_optimizations()

In [3]:
# Spot-check the parsed structure: the results stored for 100 clusters, seed 2.
new_optims[100][2]

In [4]:
# Cluster counts in ascending order; every array below has one row per entry.
inds = np.array(sorted(new_optims.keys()))


def _objective_matrix(objective, source):
    """Collect one objective into an array of rows (cluster counts) x columns (seeds).

    `objective` is the objective key ('revenue', 'carbon', 'deadwood' or
    'ha') and `source` is 'real' or 'surrogate'. Rows follow ascending
    cluster count (the same order as `inds`); columns follow the seed order
    of each per-cluster-count dict.
    """
    return np.array([
        [run[objective][source] for run in new_optims[count].values()]
        for count in sorted(new_optims.keys())
    ])


# Values obtained with the original variables.
real_revenue = _objective_matrix('revenue', 'real')
real_carbon = _objective_matrix('carbon', 'real')
real_deadwood = _objective_matrix('deadwood', 'real')
real_ha = _objective_matrix('ha', 'real')

# Values obtained with the clustering surrogate.
surr_revenue = _objective_matrix('revenue', 'surrogate')
surr_carbon = _objective_matrix('carbon', 'surrogate')
surr_deadwood = _objective_matrix('deadwood', 'surrogate')
surr_ha = _objective_matrix('ha', 'surrogate')

In [5]:
pylab.rcParams['figure.figsize'] = (15,12)

fig, ax = plt.subplots(2,2)
fig.suptitle('Optimization results using values from previously formed clustering surrogate.\nValues from 10 independent runs',
            fontsize=20)

# All four lookups below are laid out like the 2x2 axes grid:
# [[revenue, carbon], [deadwood, habitat]].
data = np.array([[surr_revenue, surr_carbon], [surr_deadwood, surr_ha]])
names = np.array([['Revenue', 'Carbon'],['Deadwood', 'Habitat']])
optims = np.array([ideal(False)[:2], ideal(False)[2:]])
ymaxs = np.array([[3.5e+8,6.1e+6],[3.1e+5, 2.8e+4]])

# Per cluster count, summarise the runs: max (green), mean (yellow), min (red).
summary_curves = ((np.max, 'g'), (np.mean, 'y'), (np.min, 'r'))
n_rows, n_cols = np.shape(ax)
for i in range(n_rows):
    for j in range(n_cols):
        panel = ax[i,j]
        for reducer, colour in summary_curves:
            panel.plot(inds, reducer(data[i,j], axis=1), color=colour)
        # Horizontal blue line at the reference value taken from ideal(False).
        panel.plot((min(inds), max(inds)), (optims[i,j], optims[i,j]), color='b')
        panel.set_title(names[i,j], fontsize=15)
        panel.set_ylim(ymin=0, ymax=ymaxs[i,j])
        panel.set_xlabel('Number of clusters', fontsize=12)
        panel.set_ylabel('Optimization results', fontsize=12)

In [6]:
# Stack the four surrogate objectives along a new last axis, in the order
# revenue, carbon, deadwood, habitat.
surrogate_objectives = [surr_revenue, surr_carbon, surr_deadwood, surr_ha]
surr_all_stack = np.dstack(surrogate_objectives)

In [7]:
# Smallest absolute relative difference (in percent) per objective, taken over
# the first two axes of the stack.
np.abs((surr_all_stack - ide) / ide).min(axis=(0, 1)) * 100

These are then the smallest relative differences we can attain for Timber revenue, Carbon storage, Deadwood volume and Habitat suitability: 0.148%, 0.017%, 0.007% and 0.030%. In practice this means there is no difference...

In [8]:
pylab.rcParams['figure.figsize'] = (15,12)

fig, ax = plt.subplots(2,2)
fig.suptitle('Optimization results using original variable values\nwhen clustering based surrogate mapped to original variables.\nValues from 10 independent runs',
            fontsize=20)

# Same 2x2 layout as the axes grid: [[revenue, carbon], [deadwood, habitat]].
# `names`, `optims` and `ymaxs` are reused from the earlier surrogate plot cell.
data = np.array([[real_revenue, real_carbon], [real_deadwood, real_ha]])

# Per cluster count, summarise the runs: max (green), mean (yellow), min (red).
summary_curves = ((np.max, 'g'), (np.mean, 'y'), (np.min, 'r'))
n_rows, n_cols = np.shape(ax)
for i in range(n_rows):
    for j in range(n_cols):
        panel = ax[i,j]
        for reducer, colour in summary_curves:
            panel.plot(inds, reducer(data[i,j], axis=1), color=colour)
        # Horizontal blue line at the reference value stored in `optims`.
        panel.plot((min(inds), max(inds)), (optims[i,j], optims[i,j]), color='b')
        panel.set_title(names[i,j], fontsize=15)
        panel.set_ylim(ymin=0, ymax=ymaxs[i,j])
        panel.set_xlabel('Number of clusters', fontsize=12)
        panel.set_ylabel('Optimization results', fontsize=12)

In [9]:
# Stack the four original-variable objectives along a new last axis, in the
# order revenue, carbon, deadwood, habitat.
real_objectives = [real_revenue, real_carbon, real_deadwood, real_ha]
real_all_stack = np.dstack(real_objectives)

In [10]:
# Smallest absolute relative difference (in percent) per objective, taken over
# the first two axes of the stack.
np.abs((real_all_stack - ide) / ide).min(axis=(0, 1)) * 100

## Finding the best compromise in proxy based results


Calculate the best values for all the objectives, for every number of clusters

In [11]:
# Repeat the reference values so that there is one column per cluster count.
n_cluster_counts = len(surr_revenue)
ide_reshape = ide.reshape(4, 1) * np.ones((4, n_cluster_counts))

# Row-wise maximum of every surrogate objective, one row per objective in the
# order revenue, carbon, deadwood, habitat.
best_per_count = np.array([
    objective.max(axis=1)
    for objective in (surr_revenue, surr_carbon, surr_deadwood, surr_ha)
])
# Relative difference of those maxima from the reference values.
max_all = (best_per_count - ide_reshape) / ide_reshape

Let's then inspect the relative differences of the best attainable values for all the objectives.
The sum of the absolute relative differences is also plotted (red).

In [12]:
pylab.rcParams['figure.figsize'] = (10,8)
plt.suptitle('Relative differences in objectives', fontsize=15)
# One curve per row of max_all (one row per objective).
plt.plot(inds, max_all.transpose())
# Sum of the absolute relative differences over the objectives (red).
# np.sum / np.abs are called explicitly so the cell does not depend on
# %pylab having replaced the builtin sum with numpy's axis-aware version.
plt.plot(inds, np.sum(np.abs(max_all), axis=0), color='r')
# Zero reference line.
plt.plot(inds, np.zeros(len(inds)))
plt.xlabel('Number of clusters', fontsize=12)
plt.ylabel('Relative differences', fontsize=12)
# Marker at 1350 clusters. axvline's ymin/ymax are axes fractions in [0, 1];
# the defaults already span the full height, so the former ymax=250 is dropped.
plt.axvline(x=1350)

So far, different clusterings for the same number of clusters (different random initializations) have not been considered separately.
Now plot the sums of relative differences for all the different initializations.

In [13]:
# Dimensions of the stacked surrogate results.
surr_all_stack.shape

In [14]:
# Sum of the absolute relative differences over the last axis of the stack.
# NOTE(review): only the first 20 rows of the stack are used here, while the
# scatter plot further down uses all of them - confirm this limit is intended.
sums_all = np.abs((surr_all_stack[:20] - ide) / ide).sum(axis=2)

In [15]:
# Row index of sums_all whose smallest entry is the smallest overall.
nc = np.argmin(sums_all.min(axis=1))

In [16]:
# Column index of the smallest entry within the selected row.
sn = sums_all[nc].argmin()

In [17]:
# Cluster count and seed of the selected run. The columns of the result arrays
# follow the key order of new_optims[nclust], so the seed is looked up from
# those keys instead of assuming that the seeds are exactly 2..11 in order.
inds[nc], list(new_optims[int(inds[nc])])[sn]

In [18]:
# Smallest entry of sums_all, i.e. the value at the selected (row, column) pair.
sums_all[nc,sn]

In [19]:
pylab.rcParams['figure.figsize'] = (10,8)
plt.suptitle('Sums of relative optimization errors of the four objectives,\nfor all the clusterings.\nUsing the proxy variable based results.', fontsize=15)
# Sum of absolute relative differences over the objectives for every entry of
# the stack. np.sum / np.abs are explicit so the cell does not depend on
# %pylab having replaced the builtin sum.
rel_error_sums = np.sum(np.abs((surr_all_stack - ide) / ide), axis=2)
# Selected run, drawn first as a large blue marker.
plt.scatter(inds[nc], rel_error_sums[nc, sn], color='b', s=120)
# All runs in red: each cluster count is repeated once per column of the
# results, so the number of runs is no longer hard-coded to 10.
cluster_counts = np.repeat(inds.reshape(-1, 1), rel_error_sums.shape[1], axis=1)
plt.scatter(cluster_counts, rel_error_sums, color='r')
plt.xlabel('Number of clusters', fontsize=12)
plt.ylabel('Sums of relative differences', fontsize=12)

The best result is now attained with {inds[nc]} clusters, so we want to inspect that clustering more thoroughly.

In [20]:
pylab.rcParams['figure.figsize'] = (10,8)
plt.suptitle('Relative differences in objectives, {} clusters'.format(inds[nc]), fontsize=15)
# Seeds in the same order as the columns of the result arrays (the key order
# of new_optims[nclust]), instead of assuming they are exactly 2..11.
seeds = list(new_optims[int(inds[nc])])
# Relative differences of the selected cluster count: one row per seed, one
# column per objective.
rel_diffs = (surr_all_stack[nc] - ide) / ide
plt.plot(seeds, rel_diffs)
# Sum of absolute relative differences per seed. This now uses the selected
# row `nc`; the former hard-coded index 11 could disagree with the title and
# with the per-objective curves above.
plt.plot(seeds, np.sum(np.abs(rel_diffs), axis=1))
plt.xlabel('Clustering initialization seed', fontsize=12)
plt.ylabel('Relative differences', fontsize=12)

The best possible results are then attained with {inds[nc]} clusters and initialization seed {range(2,12)[sn]}.

In [21]:
# (row, column) index pair of the selected run, used below to index the
# per-objective result arrays.
bestind = (nc,sn)

In [22]:
# Surrogate-based values of the selected run, in the order revenue, carbon,
# deadwood, habitat.
tuple(objective[bestind] for objective in (surr_revenue, surr_carbon, surr_deadwood, surr_ha))

In [23]:
# Relative difference (in percent) of the selected run's surrogate-based values
# from the reference values in `ide`.
best_surrogate = np.array([
    objective[bestind]
    for objective in (surr_revenue, surr_carbon, surr_deadwood, surr_ha)
])
(best_surrogate - ide) / ide * 100

Now when inspecting the relative differences in this best clustering the relative differences for Timber revenue, Carbon storage, Deadwood volume and Habitat suitability are 2.5%, -1.4%, -0.8% and -2.9%

In [24]:
# Original-variable values of the selected run, in the order revenue, carbon,
# deadwood, habitat.
tuple(objective[bestind] for objective in (real_revenue, real_carbon, real_deadwood, real_ha))

In [25]:
# Relative difference (in percent) of the selected run's original-variable
# values from the reference values in `ide`.
best_real = np.array([
    objective[bestind]
    for objective in (real_revenue, real_carbon, real_deadwood, real_ha)
])
(best_real - ide) / ide * 100

For the original variables the results are then -1.5%, -0.9%, -2.3% and -2.3%. These are a bit smaller for all except
for the deadwood values, which are worse.