## Knee plots for up to 2000 clusters

In [1]:
%pylab inline 
pylab.rcParams['figure.figsize'] = (15,12)
import numpy as np
import simplejson as json
import os
from gradutil import *

In [2]:
# Load the four objective tables (helpers come from the gradutil star import).
revenue, carbon, deadwood, ha = init_boreal()

# Pass every table through nan_to_bau, in the fixed objective order
# revenue, carbon, deadwood, ha.
# NOTE(review): presumably this fills NaNs with business-as-usual values -- confirm in gradutil.
filled_tables = tuple(nan_to_bau(table) for table in (revenue, carbon, deadwood, ha))
n_revenue, n_carbon, n_deadwood, n_ha = filled_tables

# Normalized copy of each table's underlying value array.
revenue_norm, carbon_norm, deadwood_norm, ha_norm = (
    normalize(table.values) for table in filled_tables
)

# Reference points and solver handle used by later cells.
ide = ideal(False)
nad = nadir(False)
opt = SolverFactory('cplex')

# All objectives side by side (column-wise concat) and stacked along a third axis.
x = pd.concat(filled_tables, axis=1)
x_stack = np.dstack(filled_tables)
x_norm = normalize(x.values)
x_norm_stack = normalize(x_stack)

This again takes quite some time, because we have a lot of files with a lot of contents.

In [3]:
# Parse the stored optimization results straight from the open file handle.
with open('optimizations/opt50_2001_50.json','r') as results_file:
    optimizations = json.load(results_file)

In [4]:
# Rebuild the nested results with typed keys and values:
#   new_optims[cluster count][seed][objective name][key] -> float
# JSON object keys always arrive as strings, so the two outer levels are
# converted back to integers. int() replaces the former eval(): it yields the
# same keys for integer strings without executing text that was read from a file.
new_optims = {
    int(nclust): {
        int(seedn): {
            name: {key: float(value) for key, value in values.items()}
            for name, values in by_name.items()
        }
        for seedn, by_name in by_seed.items()
    }
    for nclust, by_seed in optimizations.items()
}

In [5]:
# Spot check of the converted structure: the entry for cluster count 2000, seed 2.
new_optims[2000][2]

In [6]:
def _collect_results(objective, source):
    """Gather one objective's results into an array indexed [cluster count, seed].

    Rows follow ascending cluster count and columns follow ascending seed, so
    column ``k`` refers to the same seed on every row and agrees with
    ``sorted(new_optims[nclust].keys())[k]``. Iterating the seed dict without
    sorting gave no such guarantee.

    objective -- 'revenue', 'carbon', 'deadwood' or 'ha'
    source    -- 'real' or 'surrogate'
    """
    return np.array([
        [new_optims[nclust][seedn][objective][source]
         for seedn in sorted(new_optims[nclust].keys())]
        for nclust in sorted(new_optims.keys())
    ])


# Cluster counts in ascending order; row r of every array below belongs to indices[r].
indices = np.array(sorted(new_optims.keys()))

# Results stored under the 'real' key.
real_revenue = _collect_results('revenue', 'real')
real_carbon = _collect_results('carbon', 'real')
real_deadwood = _collect_results('deadwood', 'real')
real_ha = _collect_results('ha', 'real')

# Results stored under the 'surrogate' key.
surr_revenue = _collect_results('revenue', 'surrogate')
surr_carbon = _collect_results('carbon', 'surrogate')
surr_deadwood = _collect_results('deadwood', 'surrogate')
surr_ha = _collect_results('ha', 'surrogate')

In [280]:
pylab.rcParams['figure.figsize'] = (15,12)

fig, ax = plt.subplots(2,2)
fig.suptitle('Optimization results using values from previously formed clustering surrogate.\nValues from 10 independent runs',
            fontsize=20)

# Per-panel inputs, laid out to match the 2x2 axes grid.
maximum = 1000  # upper bound on how many leading entries of `indices` are drawn
data = np.array([[surr_revenue, surr_carbon], [surr_deadwood, surr_ha]])
names = np.array([['Revenue', 'Carbon'],['Deadwood', 'Habitat']])
optims = np.array([ideal(False)[:2], ideal(False)[2:]])
ymaxs = np.array([[2.6e+8,4.6e+6],[2.4e+5, 2.2e+4]])

# Row-major walk over the panels (same order as a nested i/j loop).
for i, j in np.ndindex(*np.shape(ax)):
    panel = ax[i,j]
    # Max (green), mean (yellow) and min (red) over axis 1 of this panel's data.
    for reducer, colour in ((np.max, 'g'), (np.mean, 'y'), (np.min, 'r')):
        panel.plot(indices[:maximum], reducer(data[i,j], axis=1)[:maximum], color=colour)
    # Horizontal blue line at the value taken from ideal(False) for this objective.
    panel.plot((min(indices[:maximum]), max(indices[:maximum])),(optims[i,j], optims[i,j]), color='b')
    panel.set_title(names[i,j], fontsize=15)
    panel.set_ylim(ymin=0, ymax=ymaxs[i,j])
    panel.set_xlabel('Number of clusters', fontsize=12)
    panel.set_ylabel('Optimization results', fontsize=12)

In [78]:
# Stack the four surrogate arrays depth-wise; the last axis then runs over the
# objectives in the order revenue, carbon, deadwood, ha.
surr_all_stack = np.dstack((surr_revenue, surr_carbon, surr_deadwood, surr_ha))

In [268]:
# Absolute relative difference to `ide` for every entry of the stack,
# then the minimum over axis 1 followed by the minimum over axis 0,
# leaving one value per objective.
relative_gap = abs((surr_all_stack-ide)/ide)
smallest_over_axis1 = np.min(relative_gap, axis=1)
np.min(smallest_over_axis1, axis=0)

These are then the smallest relative differences we can attain for Timber revenue, Carbon storage, Deadwood volume and Habitat suitability: 9.2%, 8.1%, 28.1% and 4.4%, respectively.

In [299]:
pylab.rcParams['figure.figsize'] = (15,12)

fig, ax = plt.subplots(2,2)
fig.suptitle('Optimization results using original variable values\nwhen clustering based surrogate mapped to original variables.\nValues from 10 independent runs',
            fontsize=20)

# 2x2 grid of the 'real' result arrays. maximum, names, optims and ymaxs are
# reused from the earlier surrogate plotting cell.
data = np.array([[real_revenue, real_carbon], [real_deadwood, real_ha]])
for i in range(np.shape(ax)[0]):
    for j in range(np.shape(ax)[1]):
        # Max (green), mean (yellow) and min (red) over axis 1 of this panel's data.
        ax[i,j].plot(indices[:maximum], np.max(data[i,j], axis=1)[:maximum], color='g')
        ax[i,j].plot(indices[:maximum], np.mean(data[i,j], axis=1)[:maximum], color='y')
        ax[i,j].plot(indices[:maximum], np.min(data[i,j], axis=1)[:maximum], color='r')
        # Horizontal blue line at the reference value stored in optims.
        ax[i,j].plot((min(indices[:maximum]), max(indices[:maximum])),(optims[i,j], optims[i,j]), color='b')
        ax[i,j].set_title(names[i,j], fontsize=15)
        ax[i,j].set_ylim(ymin=0, ymax=ymaxs[i,j])
        ax[i,j].set_xlabel('Number of clusters', fontsize=12)
        ax[i,j].set_ylabel('Optimization results', fontsize=12)
        # Vertical marker at 600 clusters. axvline's ymin/ymax are fractions of
        # the axes height (0..1), not data values, so the former ymax=250 was
        # out of range; the defaults already span the full height.
        ax[i,j].axvline(x=600)

## Finding the best compromise in proxy based results


Calculate the best values for all the objectives, for every number of clusters

In [45]:
# Best (maximum over axis 1) surrogate value per objective, one row per
# objective and one column per row of the surr_* arrays.
best_surrogate = np.array((np.max(surr_revenue, axis=1), np.max(surr_carbon, axis=1), np.max(surr_deadwood, axis=1), np.max(surr_ha, axis=1)))
# `ide` as a column, repeated to the same shape. The column count is taken from
# the data instead of the former hard-coded 40, so the cell keeps working when
# the number of cluster counts changes.
ide_reshape = np.ones((4, best_surrogate.shape[1]))*ide.reshape(4,1)
# Relative difference of the best values to `ide`.
max_all = (best_surrogate-ide_reshape)/ide_reshape

Let's then inspect the relative differences of the best attainable values for all the objectives.
The sum of the absolute values of these differences is also plotted (violet).

In [206]:
pylab.rcParams['figure.figsize'] = (10,8)
plt.suptitle('Relative differences in objectives', fontsize=15)
# One row per entry of `indices`, one column per objective.
per_cluster_count = max_all.transpose()
plt.plot(indices, per_cluster_count)
# Extra curve: sum of the absolute relative differences across the objectives.
plt.plot(indices, np.sum(np.abs(per_cluster_count), axis=1))
plt.xlabel('Number of clusters', fontsize=12)
plt.ylabel('Relative differences', fontsize=12)

It looks like the smallest sum of differences is attained with 600 clusters. Let's find that.

In [68]:
# Row 11 of the result arrays: its cluster count, followed by the column
# position of the largest surrogate value for each objective.
row = 11
(indices[row],) + tuple(
    np.argmax(values[row])
    for values in (surr_revenue, surr_carbon, surr_deadwood, surr_ha)
)

In [118]:
# Sum of absolute relative differences across the objectives, read at row 11.
abs_difference_sums = np.sum(np.abs(max_all.transpose()), axis=1)
abs_difference_sums[11]

So far, the different clusterings for the same number of clusters (different random initializations) have not been considered separately.
Now plot the sums of relative differences for every initialization.

In [294]:
# Shape check for the scatter inputs: repeated cluster counts vs. the error
# sums taken over the last axis of the stack.
x_positions = np.ones((40,10))*indices.reshape(40,1)
error_sums = np.sum(abs((surr_all_stack-ide)/ide), axis=2)
np.shape(x_positions), np.shape(error_sums)

In [296]:
pylab.rcParams['figure.figsize'] = (10,8)
plt.suptitle('Sums of relative errors for all the clusterings', fontsize=15)
# One value per clustering: absolute relative differences to `ide`, summed over
# the last axis of surr_all_stack.
error_sums = np.sum(abs((surr_all_stack-ide)/ide), axis=2)
# Repeat each cluster count along its row so x matches error_sums element-wise.
# The shape is taken from the data instead of the former hard-coded (40,10).
x_positions = np.ones_like(error_sums)*indices.reshape(-1,1)
plt.scatter(x_positions, error_sums, color='r')
plt.xlabel('Number of clusters', fontsize=12)
plt.ylabel('Sums of relative differences', fontsize=12)

In [199]:
# Absolute relative differences to `ide`, summed over the last axis of the stack.
relative_errors = abs((surr_all_stack-ide)/ide)
sums_all = np.sum(relative_errors, axis=2)

In [196]:
# Row whose smallest error sum is the smallest of all rows.
np.argmin(np.min(sums_all, axis=1))

In [197]:
# Column position of the smallest error sum within row 11.
np.argmin(sums_all[11])

In [198]:
# Error sum at row 11, column 8.
sums_all[11,8]

In [200]:
# Smallest error sum over the whole array, for comparison with the entry above.
np.min(sums_all)

In [214]:
# Translate positions back to labels: the cluster count at row 11 and the
# ninth-smallest seed key stored for 600 clusters.
seeds_for_600 = sorted(new_optims[600].keys())
(indices[11], seeds_for_600[8])

It is still 600 clusters. So we want to inspect that more thoroughly.

In [205]:
pylab.rcParams['figure.figsize'] = (10,8)
plt.suptitle('Relative differences in objectives, 600 clusters', fontsize=15)
# x values used for the ten columns of row 11.
seed_axis = range(2,12)
# Relative differences to `ide` for row 11 of the stack.
relative_row = (surr_all_stack[11]-ide)/ide
plt.plot(seed_axis, relative_row)
# Extra curve: sum of the absolute relative differences across the objectives.
plt.plot(seed_axis, np.sum(np.abs(relative_row), axis=1))
plt.xlabel('Clustering initialization seed', fontsize=12)
plt.ylabel('Relative differences', fontsize=12)

The best possible results are then attained with 600 clusters and initialization seed 10.

In [297]:
# Surrogate values of all four objectives at row 11, column 8.
tuple(values[11,8] for values in (surr_revenue, surr_carbon, surr_deadwood, surr_ha))

In [217]:
# Relative difference to `ide` of the surrogate values at row 11, column 8.
chosen_surrogate = np.array([
    values[11,8]
    for values in (surr_revenue, surr_carbon, surr_deadwood, surr_ha)
])
(chosen_surrogate - ide)/ide

Inspecting this best clustering, the relative differences for Timber revenue, Carbon storage, Deadwood volume and Habitat suitability are -11.7%, -10.5%, -39.7% and -5.4%, respectively.

In [None]:
# NOTE(review): unexecuted leftover cell (its execution count is None). It holds
# only the bare name `real`, which no visible cell assigns, so it can resolve
# only through a star import -- presumably an unfinished draft; confirm and remove.
real

In [277]:
# real_all_stack is not assigned in any cell of this notebook, so this cell
# failed on a fresh kernel. Build it here the same way surr_all_stack is built:
# the four 'real' arrays stacked depth-wise in the order revenue, carbon,
# deadwood, ha.
real_all_stack = np.dstack((real_revenue, real_carbon, real_deadwood, real_ha))
# Smallest absolute relative difference to `ide` per objective within row 11.
np.min(abs((real_all_stack[11]-ide)/ide), axis=0)

In [220]:
# Relative difference to `ide` of the 'real' values at row 11, column 8.
chosen_real = np.array([
    values[11,8]
    for values in (real_revenue, real_carbon, real_deadwood, real_ha)
])
(chosen_real - ide)/ide

In [221]:
# Same comparison for the 'real' values at row 3, column 8.
chosen_real_row3 = np.array([
    values[3,8]
    for values in (real_revenue, real_carbon, real_deadwood, real_ha)
])
(chosen_real_row3 - ide)/ide